Browse Source

merge trunk

Samuel Thibault 8 years ago
parent
commit
c2d386b786
100 changed files with 914 additions and 25880 deletions
  1. 11 2
      ChangeLog
  2. 37 23
      configure.ac
  3. 1 1
      doc/Makefile.am
  4. 32 61
      doc/doxygen/Makefile.am
  5. 3 2
      doc/doxygen/chapters/210_check_list_performance.doxy
  6. 35 4
      doc/doxygen/chapters/410_mpi_support.doxy
  7. 40 1
      doc/doxygen/chapters/501_environment_variables.doxy
  8. 58 18
      doc/doxygen/chapters/510_configure_options.doxy
  9. 13 0
      doc/doxygen/chapters/api/clustering_machine.doxy
  10. 26 1
      doc/doxygen/chapters/api/codelet_and_tasks.doxy
  11. 33 1
      doc/doxygen/chapters/api/data_interfaces.doxy
  12. 25 1
      doc/doxygen/chapters/api/initialization.doxy
  13. 17 3
      doc/doxygen/chapters/api/modularized_scheduler.doxy
  14. 11 1
      doc/doxygen/chapters/api/mpi.doxy
  15. 5 0
      doc/doxygen/chapters/api/scheduling_contexts.doxy
  16. 16 3
      doc/doxygen/chapters/api/workers.doxy
  17. 0 0
      doc/doxygen/chapters/images/data_trace.eps
  18. 0 0
      doc/doxygen/chapters/images/data_trace.pdf
  19. 0 0
      doc/doxygen/chapters/images/data_trace.png
  20. 0 0
      doc/doxygen/chapters/images/distrib_data.eps
  21. 0 0
      doc/doxygen/chapters/images/distrib_data.pdf
  22. 0 0
      doc/doxygen/chapters/images/distrib_data.png
  23. 0 0
      doc/doxygen/chapters/images/distrib_data_histo.eps
  24. 0 0
      doc/doxygen/chapters/images/distrib_data_histo.pdf
  25. 0 0
      doc/doxygen/chapters/images/distrib_data_histo.png
  26. 0 0
      doc/doxygen/chapters/images/paje_draw_histogram.eps
  27. 0 0
      doc/doxygen/chapters/images/paje_draw_histogram.pdf
  28. 0 0
      doc/doxygen/chapters/images/paje_draw_histogram.png
  29. 0 0
      doc/doxygen/chapters/images/parallel_worker2.eps
  30. 0 0
      doc/doxygen/chapters/images/parallel_worker2.pdf
  31. 0 0
      doc/doxygen/chapters/images/parallel_worker2.png
  32. 0 0
      doc/doxygen/chapters/images/runtime-par.eps
  33. 0 0
      doc/doxygen/chapters/images/runtime-par.pdf
  34. 0 0
      doc/doxygen/chapters/images/runtime-par.png
  35. 0 0
      doc/doxygen/chapters/images/starpu_chol_model_11_type.eps
  36. 0 0
      doc/doxygen/chapters/images/starpu_chol_model_11_type.pdf
  37. 0 0
      doc/doxygen/chapters/images/starpu_chol_model_11_type.png
  38. 0 0
      doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based.eps
  39. 0 0
      doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based.pdf
  40. 0 0
      doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based.png
  41. 0 0
      doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_2.eps
  42. 0 0
      doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_2.pdf
  43. 0 0
      doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_2.png
  44. 0 0
      doc/doxygen/chapters/images/starpu_starpu_slu_lu_model_11.eps
  45. 0 0
      doc/doxygen/chapters/images/starpu_starpu_slu_lu_model_11.pdf
  46. 0 0
      doc/doxygen/chapters/images/starpu_starpu_slu_lu_model_11.png
  47. 0 0
      doc/doxygen/chapters/images/tasks_size_overhead.eps
  48. 0 0
      doc/doxygen/chapters/images/tasks_size_overhead.pdf
  49. 0 0
      doc/doxygen/chapters/images/tasks_size_overhead.png
  50. 0 0
      doc/doxygen/chapters/images/temanejo.png
  51. 0 1166
      doc/doxygen/chapters/parallel_worker1.eps
  52. BIN
      doc/doxygen/chapters/parallel_worker1.pdf
  53. BIN
      doc/doxygen/chapters/parallel_worker1.png
  54. 0 24085
      doc/doxygen/chapters/runtime-seq.eps
  55. BIN
      doc/doxygen/chapters/runtime-seq.pdf
  56. BIN
      doc/doxygen/chapters/runtime-seq.png
  57. 3 2
      doc/doxygen/doxygen-config.cfg.in
  58. 2 1
      doc/doxygen/refman.tex
  59. 15 12
      examples/Makefile.am
  60. 14 10
      examples/cholesky/cholesky.h
  61. 11 8
      examples/lu/lu_example.c
  62. 3 2
      examples/sched_ctx/gpu_partition.c
  63. 5 6
      examples/sched_ctx/nested_sched_ctxs.c
  64. 23 151
      examples/sched_ctx/parallel_code.c
  65. 7 1
      examples/stencil/Makefile.am
  66. 2 2
      include/starpu.h
  67. 4 0
      include/starpu_sched_component.h
  68. 2 13
      include/starpu_stdlib.h
  69. 2 2
      include/starpu_worker.h
  70. 15 8
      mpi/src/starpu_mpi.c
  71. 3 3
      mpi/src/starpu_mpi_comm.c
  72. 3 3
      mpi/src/starpu_mpi_early_data.c
  73. 3 3
      mpi/src/starpu_mpi_early_request.c
  74. 2 2
      mpi/src/starpu_mpi_private.h
  75. 6 6
      mpi/src/starpu_mpi_sync_data.c
  76. 2 4
      mpi/tests/insert_task_recv_cache.c
  77. 3 5
      mpi/tests/insert_task_sent_cache.c
  78. 2 2
      mpi/tests/policy_selection.c
  79. 24 11
      src/Makefile.am
  80. 1 2
      src/common/rbtree.h
  81. 3 4
      src/common/thread.c
  82. 2 2
      src/core/debug.c
  83. 3 3
      src/core/perfmodel/perfmodel.c
  84. 17 16
      src/core/perfmodel/perfmodel_bus.c
  85. 4 4
      src/core/perfmodel/perfmodel_history.c
  86. 44 30
      src/core/sched_ctx.c
  87. 10 3
      src/core/sched_policy.c
  88. 2 1
      src/core/sched_policy.h
  89. 41 26
      src/core/simgrid.c
  90. 4 1
      src/core/task.c
  91. 117 129
      src/core/topology.c
  92. 12 11
      src/core/workers.c
  93. 4 3
      src/core/workers.h
  94. 109 9
      src/datawizard/coherency.c
  95. 4 2
      src/datawizard/coherency.h
  96. 4 0
      src/datawizard/datawizard.c
  97. 3 2
      src/datawizard/interfaces/data_interface.c
  98. 1 1
      src/datawizard/interfaces/vector_interface.c
  99. 10 1
      src/datawizard/memory_nodes.c
  100. 0 0
      src/datawizard/memory_nodes.h

+ 11 - 2
ChangeLog

@@ -1,8 +1,8 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2009-2017  Université de Bordeaux
-# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
-# Copyright (C) 2014, 2016 INRIA
+# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
+# Copyright (C) 2014, 2016, 2017  INRIA
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -23,6 +23,8 @@ New features:
   * Support priorities for data transfers.
   * Add support for Ayudame version 2.x debugging library.
   * Add support for multiple linear regression performance models
+  * Add MPI Master-Slave support to use the cores of remote nodes. Use the
+    --enable-mpi-master-slave option to activate it.
 
 Small features:
   * Scheduling contexts may now be associated a user data pointer at creation
@@ -30,10 +32,16 @@ Small features:
   * Add STARPU_SIMGRID_TASK_SUBMIT_COST to simulate the cost of task submission
     in simgrid mode. This provides more accurate simgrid predictions, especially
     for the beginning of the execution.
+  * New configure option --enable-mpi-pedantic-isend (disabled by
+    default) to acquire data in STARPU_RW (instead of STARPU_R) before
+    performing MPI_Isend call
 
 Changes:
   * Vastly improve simgrid simulation time.
 
+Small changes:
+  * Use asynchronous transfers for task data fetches which were not prefetched.
+
 StarPU 1.2.1 (svn revision xxx)
 ==============================================
 New features:
@@ -46,6 +54,7 @@ New features:
     execution of just one MPI node.
   * Add STARPU_PERF_MODEL_HOMOGENEOUS_CUDA/OPENCL/MIC/SCC to share performance
     models between devices, making calibration much faster.
+  * Add modular-heft-prio scheduler.
 
 StarPU 1.2.0 (svn revision 18521)
 ==============================================

+ 37 - 23
configure.ac

@@ -83,14 +83,6 @@ AC_PROG_EGREP
 AC_CHECK_PROGS(PROG_STAT,gstat stat)
 AC_CHECK_PROGS(PROG_DATE,gdate date)
 AC_OPENMP
-#c++11 detection
-AX_CXX_COMPILE_STDCXX(11,noext,optional)
-
-AC_SUBST([STARPU_HAVE_CXX11], $HAVE_CXX11)
-AM_CONDITIONAL([STARPU_HAVE_CXX11], [test "$HAVE_CXX11" -eq 1])
-if test $HAVE_CXX11 -eq 1; then
-  AC_DEFINE(STARPU_HAVE_CXX11, [1], [compiler supports cxx11])
-fi
 
 if test x$enable_perf_debug = xyes; then
     enable_shared=no
@@ -306,7 +298,7 @@ AC_ARG_WITH(mpicxx, [AS_HELP_STRING([--with-mpicxx[=<path to mpicxx>]],
        fi
        # nothing was specified: default value is used
        AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
-       
+
        # try with mpic++ if mpicxx was not found
        if test x$mpicxx_path = xno ; then
             DEFAULT_MPICXX=mpic++
@@ -349,6 +341,13 @@ if  test x$enable_mpi_progression_hook = xyes; then
 	AC_DEFINE(STARPU_MPI_ACTIVITY, [1], [enable StarPU MPI activity polling method])
 fi
 
+AC_ARG_ENABLE(mpi-pedantic-isend, [AS_HELP_STRING([--enable-mpi-pedantic-isend],
+				   [Enable StarPU MPI pedantic isend])],
+				   enable_mpi_pedantic_isend=$enableval, enable_mpi_pedantic_isend=no)
+if  test x$enable_mpi_pedantic_isend = xyes; then
+	AC_DEFINE(STARPU_MPI_PEDANTIC_ISEND, [1], [enable StarPU MPI pedantic isend])
+fi
+
 #We can only build MPI Master Slave if User wants it and MPI is available
 if test x$use_mpi_master_slave = xyes -a x$use_mpi = xyes -a x$use_mpicxx = xyes; then
     build_mpi_master_slave=yes
@@ -363,10 +362,10 @@ fi
 
 if test x$build_mpi_master_slave = xyes; then
     AC_DEFINE(STARPU_USE_MPI_MASTER_SLAVE, [1], [MPI Master Slave support is enabled])
-    CC=$mpicc_path    
-    CCLD=$mpicc_path      
-    CXX=$mpicxx_path      
-    CXXLD=mpicxx_path    
+    CC=$mpicc_path
+    CCLD=$mpicc_path
+    CXX=$mpicxx_path
+    CXXLD=mpicxx_path
 fi
 
 AC_ARG_WITH(mpi-master-slave-multiple-thread, [AS_HELP_STRING([--with-mpi-master-slave-multiple-thread])],
@@ -396,6 +395,15 @@ AC_DEFINE_UNQUOTED(STARPU_MAXMPIDEVS, [$nmaxmpidev], [maximum number of MPI devi
 #                                                                             #
 ###############################################################################
 
+#c++11 detection
+AX_CXX_COMPILE_STDCXX(11,noext,optional)
+
+AC_SUBST([STARPU_HAVE_CXX11], $HAVE_CXX11)
+AM_CONDITIONAL([STARPU_HAVE_CXX11], [test "$HAVE_CXX11" -eq 1])
+if test $HAVE_CXX11 -eq 1; then
+  AC_DEFINE(STARPU_HAVE_CXX11, [1], [compiler supports cxx11])
+fi
+
 LT_PREREQ([2.2])
 LT_INIT([win32-dll])
 
@@ -630,7 +638,7 @@ then
     INCLUDE_PTHREAD_H='#include <pthread.h>'
 fi
 
-AC_CHECK_TYPE([struct timespec], 
+AC_CHECK_TYPE([struct timespec],
 	       AC_DEFINE(STARPU_HAVE_STRUCT_TIMESPEC,[1],[struct timespec is defined]),
 	       [], [
 #include <sys/types.h>
@@ -1357,7 +1365,7 @@ if test x$enable_opencl = xyes -o x$enable_opencl = xmaybe; then
           [AC_MSG_RESULT(no)
              enable_opencl=no])
           LIBS=$SAVED_LIBS
-          ;;        
+          ;;
         *)
 	  STARPU_LOOK_FOR_OPENCL()
 	  # in case OpenCL was explicitely required, but is not available, this is an error
@@ -2163,7 +2171,7 @@ AC_MSG_RESULT($nmaxworkers)
 AC_DEFINE_UNQUOTED(STARPU_NMAXWORKERS, [$nmaxworkers], [Maximum number of workers])
 
 # Computes the maximun number of combined worker
-nmaxcombinedworkers=`expr $maxcpus + $nmaxmicthreads`  
+nmaxcombinedworkers=`expr $maxcpus + $nmaxmicthreads`
 AC_MSG_CHECKING(Maximum number of workers combinations)
 AC_MSG_RESULT($nmaxcombinedworkers)
 AC_DEFINE_UNQUOTED(STARPU_NMAX_COMBINEDWORKERS,
@@ -2472,9 +2480,9 @@ fi
 if test x$build_mpi_master_slave = xyes; then
     #Check if we can compile fortran cases
     if test x$use_mpi_fort = xyes ; then
-        F77LD=$mpifort_path    
+        F77LD=$mpifort_path
         FCLD=$mpifort_path
-        F77=$mpifort_path    
+        F77=$mpifort_path
         FC=$mpifort_path
     else
         enable_build_fortran=no
@@ -2837,7 +2845,7 @@ if test x$enable_mlr = xyes -a "$starpu_windows" != "yes" ; then
 		LDFLAGS="-llapack $LDFLAGS"
 	else
 		if test x$blas_lib = xmkl; then
-		   	AC_DEFINE(STARPU_MLR_MODEL, [1], [use mkl library])			
+		   	AC_DEFINE(STARPU_MLR_MODEL, [1], [use mkl library])
 		else
 			AC_MSG_CHECKING(whether min-dgels is linked)
 			if test x"$DGELS_LIBS" != x; then
@@ -3058,15 +3066,20 @@ if test "$enable_build_doc" = "yes" ; then
    if test "$epstopdfcommand" = "" ; then
 	enable_build_doc="no"
    fi
-   if test -f "$srcdir/doc/doxygen/starpu.pdf"
-   then
-	enable_build_doc="no"
-   fi
+fi
+available_doc="no"
+if test -f "$srcdir/doc/doxygen/starpu.pdf"
+then
+   enable_build_doc="no"
+   available_doc="yes"
 fi
 AC_MSG_CHECKING(whether documentation should be compiled)
 AC_MSG_RESULT($enable_build_doc)
+AC_MSG_CHECKING(whether documentation is available)
+AC_MSG_RESULT($available_doc)
 
 AM_CONDITIONAL(BUILD_DOC, [test x$enable_build_doc != xno])
+AM_CONDITIONAL(AVAILABLE_DOC, [test x$available_doc != xno])
 
 ###############################################################################
 #                                                                             #
@@ -3088,6 +3101,7 @@ AC_SUBST([LIBSTARPU_LINK])
 if test "x$enable_shared" = xno; then
         # No .so, so application will unexpected have to know which -l to
         # use. Give them in .pc file.
+	AC_DEFINE(STARPU_STATIC_ONLY, [1], [Only static compilation was made])
 	STARPU_EXPORTED_LIBS="$LDFLAGS $LIBS $LIBSTARPU_LDFLAGS"
 fi
 AC_SUBST(STARPU_EXPORTED_LIBS)

+ 1 - 1
doc/Makefile.am

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2013, 2014, 2016  CNRS
+# Copyright (C) 2013, 2014, 2016, 2017  CNRS
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by

+ 32 - 61
doc/doxygen/Makefile.am

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2009, 2011, 2013-2014  Université de Bordeaux
-# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
+# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
 # Copyright (C) 2014  INRIA
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -19,12 +19,7 @@ DOXYGEN = doxygen
 PDFLATEX = pdflatex
 MAKEINDEX = makeindex
 
-if BUILD_DOC
 DOX_DIR = $(top_builddir)/doc/doxygen
-else
-DOX_DIR = $(top_srcdir)/doc/doxygen
-endif
-
 DOX_CONFIG = $(top_srcdir)/doc/doxygen/doxygen.cfg
 
 DOX_HTML_DIR = html
@@ -32,6 +27,33 @@ DOX_LATEX_DIR = latex
 DOX_PDF = $(DOX_DIR)/starpu.pdf
 DOX_TAG = starpu.tag
 
+txtdir   = $(docdir)/manual
+
+if BUILD_DOC
+all: $(DOX_HTML_DIR) $(DOX_PDF)
+EXTRA_DIST = $(DOX_HTML_DIR) $(DOX_PDF)
+txt_DATA = $(DOX_PDF)
+DOX_HTML_SRCDIR=$(DOX_HTML_DIR)
+install-exec-hook:
+	$(MKDIR_P) $(DESTDIR)$(docdir)/manual/html
+	(cd $(DOX_HTML_SRCDIR) && find . -type f -exec $(INSTALL) -c -m 644 {} $(DESTDIR)$(docdir)/manual/html \;)
+uninstall-hook:
+	rm -rf $(DESTDIR)$(docdir)/manual/html
+else
+if AVAILABLE_DOC
+EXTRA_DIST = $(top_srcdir)/doc/doxygen/html $(top_srcdir)/doc/doxygen/starpu.pdf
+txt_DATA = $(top_srcdir)/doc/doxygen/starpu.pdf
+DOX_HTML_SRCDIR=$(top_srcdir)/doc/doxygen/html
+install-exec-hook:
+	$(MKDIR_P) $(DESTDIR)$(docdir)/manual/html
+	(cd $(DOX_HTML_SRCDIR) && find . -type f -exec $(INSTALL) -c -m 644 {} $(DESTDIR)$(docdir)/manual/html \;)
+uninstall-hook:
+	rm -rf $(DESTDIR)$(docdir)/manual/html
+endif
+endif
+
+
+if BUILD_DOC
 chapters =	\
 	chapters/000_introduction.doxy		\
 	chapters/101_building.doxy		\
@@ -120,7 +142,8 @@ chapters =	\
 	chapters/api/toolbox.doxy \
 	chapters/api/sc_hypervisor/sc_hypervisor.doxy \
 	chapters/api/sc_hypervisor/sc_hypervisor_usage.doxy \
-	chapters/api/modularized_scheduler.doxy
+	chapters/api/modularized_scheduler.doxy \
+	chapters/api/clustering_machine.doxy
 
 starpu_config.h: $(top_srcdir)/include/starpu_config.h.in
 	@$(SED) 's/#undef \(.*\)/#define \1 1/' $< > $@
@@ -162,55 +185,6 @@ chapters/version.html: $(chapters)
 		if test -f $$f ; then $(RM) $$f ; fi ;\
 	done
 
-if BUILD_DOC
-EXTRA_DIST	= 					\
-	$(chapters) 					\
-	chapters/version.sty				\
-	chapters/version.html				\
-	chapters/data_trace.eps				\
-	chapters/data_trace.pdf				\
-	chapters/data_trace.png				\
-	chapters/distrib_data.png	\
-	chapters/distrib_data.eps	\
-	chapters/distrib_data.pdf	\
-	chapters/distrib_data_histo.png	\
-	chapters/distrib_data_histo.eps	\
-	chapters/distrib_data_histo.pdf	\
-	chapters/paje_draw_histogram.eps	\
-	chapters/paje_draw_histogram.png	\
-	chapters/paje_draw_histogram.pdf	\
-	chapters/parallel_worker1.eps		\
-	chapters/parallel_worker1.pdf		\
-	chapters/parallel_worker1.png		\
-	chapters/parallel_worker2.eps		\
-	chapters/parallel_worker2.pdf		\
-	chapters/parallel_worker2.png		\
-	chapters/runtime-par.eps			\
-	chapters/runtime-par.pdf			\
-	chapters/runtime-par.png			\
-	chapters/runtime-seq.eps			\
-	chapters/runtime-seq.pdf			\
-	chapters/runtime-seq.png			\
-	chapters/starpu_chol_model_11_type.png	\
-	chapters/starpu_chol_model_11_type.eps	\
-	chapters/starpu_chol_model_11_type.pdf	\
-	chapters/starpu_non_linear_memset_regression_based.png	\
-	chapters/starpu_non_linear_memset_regression_based.eps	\
-	chapters/starpu_non_linear_memset_regression_based.pdf	\
-	chapters/starpu_non_linear_memset_regression_based_2.png	\
-	chapters/starpu_non_linear_memset_regression_based_2.eps	\
-	chapters/starpu_non_linear_memset_regression_based_2.pdf	\
-	chapters/starpu_starpu_slu_lu_model_11.png	\
-	chapters/starpu_starpu_slu_lu_model_11.eps	\
-	chapters/starpu_starpu_slu_lu_model_11.pdf	\
-	chapters/tasks_size_overhead.png		\
-	chapters/tasks_size_overhead.eps		\
-	chapters/tasks_size_overhead.pdf		\
-	chapters/temanejo.png		\
-	doxygen.cfg 					\
-	refman.tex					\
-	$(DOX_HTML_DIR)
-endif
 
 dox_inputs = $(DOX_CONFIG) 				\
 	$(chapters) 					\
@@ -278,12 +252,9 @@ $(DOX_TAG): $(dox_inputs)
 	@$(SED) -i '/\\begin{titlepage}/,$$d' $(DOX_LATEX_DIR)/refman.tex
 	@cat $(top_srcdir)/doc/doxygen/refman.tex >> $(DOX_LATEX_DIR)/refman.tex
 
-if BUILD_DOC
-EXTRA_DIST += $(DOX_PDF)
-
 $(DOX_PDF): $(DOX_TAG) refman.tex
 	@cp $(top_srcdir)/doc/doxygen/chapters/version.sty $(DOX_LATEX_DIR)
-	@cp $(top_srcdir)/doc/doxygen/chapters/*pdf $(DOX_LATEX_DIR)
+	@cp $(top_srcdir)/doc/doxygen/chapters/images/*pdf $(DOX_LATEX_DIR)
 	@echo $(PDFLATEX) $(DOX_LATEX_DIR)/refman.tex
 	@cd $(DOX_LATEX_DIR) ;\
 	rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out ;\
@@ -310,8 +281,8 @@ CLEANFILES = $(DOX_TAG) starpu_config.h \
     $(DOX_HTML_DIR) \
     $(DOX_LATEX_DIR) \
     $(DOX_PDF)
-endif
 
+endif
 # Rule to update documentation on web server. Should only be used locally.
 PUBLISHHOST	?= gforge
 update-web: $(DOX_PDF)

+ 3 - 2
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -2,7 +2,7 @@
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
- * Copyright (C) 2011, 2012 INRIA
+ * Copyright (C) 2011, 2012, 2017  INRIA
  * See the file version.doxy for copying conditions.
  */
 
@@ -264,7 +264,8 @@ having to calibrate performance models for each GPU of a homogeneous set of GPU
 devices for instance, the model can be shared by setting
 <c>export STARPU_PERF_MODEL_HOMOGENEOUS_CUDA=1</c> ,
 <c>export STARPU_PERF_MODEL_HOMOGENEOUS_OPENCL=1</c> ,
-<c>export STARPU_PERF_MODEL_HOMOGENEOUS_MIC=1</c> , or
+<c>export STARPU_PERF_MODEL_HOMOGENEOUS_MIC=1</c> ,
+<c>export STARPU_PERF_MODEL_HOMOGENEOUS_MPI_MS=1</c> , or
 <c>export STARPU_PERF_MODEL_HOMOGENEOUS_SCC=1</c> (depending on your GPU device type).
 
 To force continuing calibration,

+ 35 - 4
doc/doxygen/chapters/410_mpi_support.doxy

@@ -1,8 +1,8 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
- * Copyright (C) 2011, 2012 INRIA
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
+ * Copyright (C) 2011, 2012, 2017  INRIA
  * See the file version.doxy for copying conditions.
  */
 
@@ -208,6 +208,12 @@ automatically released. This mechanism is similar to the pthread
 detach state attribute which determines whether a thread will be
 created in a joinable or a detached state.
 
+For send communications, data is acquired with the mode ::STARPU_R.
+When using the configure option
+\ref enable-mpi-pedantic-isend "--enable-mpi-pedantic-isend", the mode
+::STARPU_RW is used to make sure there is no more than 1 concurrent
+MPI_Isend call accessing a data.
+
 Internally, all communication are divided in 2 communications, a first
 message is used to exchange an envelope describing the data (i.e its
 tag and its size), the data itself is sent in a second message. All
@@ -681,8 +687,6 @@ for(x = 0; x < nblocks ;  x++) {
 starpu_mpi_gather_detached(data_handles, nblocks, 0, MPI_COMM_WORLD);
 \endcode
 
-*/
-
 Other collective operations would be easy to define, just ask starpu-devel for
 them!
 
@@ -726,3 +730,30 @@ uses implicit MPI data transfers, <c>plu_outofcore_example</c> uses implicit MPI
 data transfers and supports data matrices which do not fit in memory (out-of-core).
 </li>
 </ul>
+
+\section MPIMasterSlave MPI Master Slave Support
+
+StarPU includes another way to execute the application across many nodes. The Master
+Slave support permits to use remote cores without thinking about data distribution. This
+support can be activated with the \ref enable-mpi-master-slave "--enable-mpi-master-slave" option. However, you should not activate
+both MPI support and MPI Master-Slave support.
+
+If a codelet contains a kernel for CPU devices, it is automatically eligible to be executed
+on an MPI Slave device. However, you can decide to execute the codelet on an MPI Slave by filling
+the \ref starpu_codelet::mpi_ms_funcs variable. The functions have to be globally-visible (i.e. not static) for
+StarPU to be able to look them up, and <c>-rdynamic</c> must be passed to gcc (or <c>-export-dynamic</c> to ld)
+so that symbols of the main program are visible.
+
+By default, one core is dedicated on the master to manage the entire set of slaves. If MPI
+has a good multiple threads support, you can use \ref with-mpi-master-slave-multiple-thread "--with-mpi-master-slave-multiple-thread"  to
+dedicate one core per slave.
+
+If you want to choose the number of cores on the slave device, use the \ref STARPU_NMPIMSTHREADS "STARPU_NMPIMSTHREADS=\<number\>"
+where <c>\<number\></c> is the number of cores wanted. The default value is all the slave's cores. To select
+the number of slave nodes, change the <c>-n</c> parameter when executing the application with mpirun
+or mpiexec.
+
+The node chosen by default is the one with MPI rank 0. To modify this, use the environment variable
+\ref STARPU_MPI_MASTER_NODE "STARPU_MPI_MASTER_NODE=\<number\>" where <c>\<number\></c> is the MPI rank wanted.
+
+*/

+ 40 - 1
doc/doxygen/chapters/501_environment_variables.doxy

@@ -2,7 +2,7 @@
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
- * Copyright (C) 2011, 2012, 2016 INRIA
+ * Copyright (C) 2011, 2012, 2016, 2017  INRIA
  * Copyright (C) 2016 Uppsala University
  * See the file version.doxy for copying conditions.
  */
@@ -123,6 +123,28 @@ MIC devices to use.
 Number of threads to use on the MIC devices.
 </dd>
 
+<dt>STARPU_NMPI_MS</dt>
+<dd>
+\anchor STARPU_NMPI_MS
+\addindex __env__STARPU_NMPI_MS
+MPI Master Slave equivalent of the environment variable \ref STARPU_NCUDA, i.e. the number of
+MPI Master Slave devices to use.
+</dd>
+
+<dt>STARPU_NMPIMSTHREADS</dt>
+<dd>
+\anchor STARPU_NMPIMSTHREADS
+\addindex __env__STARPU_NMPIMSTHREADS
+Number of threads to use on the MPI Slave devices.
+</dd>
+
+<dt>STARPU_MPI_MASTER_NODE</dt>
+<dd>
+\anchor STARPU_MPI_MASTER_NODE
+\addindex __env__STARPU_MPI_MASTER_NODE
+This variable allows to choose which MPI node (with the MPI ID) will be the master.
+</dd>
+
 <dt>STARPU_NSCC</dt>
 <dd>
 \anchor STARPU_NSCC
@@ -310,6 +332,13 @@ it is therefore necessary to disable asynchronous data transfers.
 Disable asynchronous copies between CPU and MIC devices.
 </dd>
 
+<dt>STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY</dt>
+<dd>
+\anchor STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY 
+\addindex __env__STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY
+Disable asynchronous copies between CPU and MPI Slave devices.
+</dd>
+
 <dt>STARPU_ENABLE_CUDA_GPU_GPU_DIRECT</dt>
 <dd>
 \anchor STARPU_ENABLE_CUDA_GPU_GPU_DIRECT
@@ -652,6 +681,16 @@ calibration to be much faster, since measurements only have to be once for all
 MIC GPUs.
 </dd>
 
+<dt>STARPU_PERF_MODEL_HOMOGENEOUS_MPI_MS</dt>
+<dd>
+\anchor STARPU_PERF_MODEL_HOMOGENEOUS_MPI_MS
+\addindex __env__STARPU_PERF_MODEL_HOMOGENEOUS_MPI_MS
+When this is set to 1, StarPU will assume that all MPI Slave devices have the same
+performance, and thus share performance models for them, thus allowing kernel
+calibration to be much faster, since measurements only have to be once for all
+MPI Slaves.
+</dd>
+
 <dt>STARPU_PERF_MODEL_HOMEGENEOUS_SCC</dt>
 <dd>
 \anchor STARPU_PERF_MODEL_HOMOGENEOUS_SCC

+ 58 - 18
doc/doxygen/chapters/510_configure_options.doxy

@@ -1,8 +1,8 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
- * Copyright (C) 2011, 2012 INRIA
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
+ * Copyright (C) 2011, 2012, 2017  INRIA
  * See the file version.doxy for copying conditions.
  */
 
@@ -306,6 +306,13 @@ Specify the maximum number of MIC threads
 Disable asynchronous copies between CPU and MIC devices.
 </dd>
 
+<dt>--disable-asynchronous-mpi-master-slave-copy</dt>
+<dd>
+\anchor disable-asynchronous-mpi-master-slave-copy
+\addindex __configure__--disable-asynchronous-mpi-master-slave-copy
+Disable asynchronous copies between CPU and MPI Slave devices.
+</dd>
+
 <dt>--enable-maxnodes=<c>count</c></dt>
 <dd>
 \anchor enable-maxnodes
@@ -321,6 +328,55 @@ used by StarPU data structures.
 
 <dl>
 
+<dt>--disable-mpi</dt>
+<dd>
+\anchor disable-mpi
+\addindex __configure__--disable-mpi
+Disable the build of libstarpumpi. By default, it is enabled when MPI is found.
+</dd>
+
+<dt>--with-mpicc=<c>path</c></dt>
+<dd>
+\anchor with-mpicc
+\addindex __configure__--with-mpicc
+Use the compiler <c>mpicc</c> at <c>path</c>, for StarPU-MPI.
+(\ref MPISupport).
+</dd>
+
+<dt>--enable-mpi-progression-hook</dt>
+<dd>
+\anchor enable-mpi-progression-hook
+\addindex __configure__--enable-mpi-progression-hook
+Enable the activity polling method for StarPU-MPI. This is however experimental,
+do not enable it unless you know what you are doing.
+</dd>
+
+<dt>--enable-mpi-pedantic-isend</dt>
+<dd>
+\anchor enable-mpi-pedantic-isend
+\addindex __configure__--enable-mpi-pedantic-isend
+Before performing any MPI communication, StarPU-MPI waits for the data
+to be available in the main memory of the node submitting the request.
+For send communications, data is acquired with the mode ::STARPU_R.
+When enabling the pedantic mode, data are instead acquired with the
+::STARPU_RW mode, which thus ensures that there is no more than one
+concurrent MPI_Isend call accessing the data.
+</dd>
+
+<dt>--enable-mpi-master-slave</dt>
+<dd>
+\anchor enable-mpi-master-slave
+\addindex __configure__--enable-mpi-master-slave
+Enable the MPI Master-Slave support. By default, it is disabled.
+</dd>
+
+<dt>--with-mpi-master-slave-multiple-thread</dt>
+<dd>
+\anchor with-mpi-master-slave-multiple-thread
+\addindex __configure__--with-mpi-master-slave-multiple-thread
+Create one thread per MPI Slave on the MPI master to manage communications.
+</dd>
+
 <dt>--disable-fortran</dt>
 <dd>
 \anchor disable-fortran
@@ -353,22 +409,6 @@ Disable the GCC plug-in (\ref cExtensions).  By default, it is
 enabled when the GCC compiler provides a plug-in support.
 </dd>
 
-<dt>--with-mpicc=<c>path</c></dt>
-<dd>
-\anchor with-mpicc
-\addindex __configure__--with-mpicc
-Use the compiler <c>mpicc</c> at <c>path</c>, for StarPU-MPI.
-(\ref MPISupport).
-</dd>
-
-<dt>--enable-mpi-progression-hook</dt>
-<dd>
-\anchor enable-mpi-progression-hook
-\addindex __configure__--enable-mpi-progression-hook
-Enable the activity polling method for StarPU-MPI. This is however experimental,
-do not enable it unless you know what you are doing.
-</dd>
-
 <dt>--with-coi-dir</dt>
 <dd>
 \anchor with-coi-dir

+ 13 - 0
doc/doxygen/chapters/api/clustering_machine.doxy

@@ -0,0 +1,13 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2017  Inria
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \defgroup API_Clustering_Machine Clustering Machine
+
+\def STARPU_CLUSTER_AWAKE_WORKERS
+\ingroup API_Clustering_Machine
+TODO
+
+*/

+ 26 - 1
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -2,7 +2,7 @@
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
- * Copyright (C) 2011, 2012 INRIA
+ * Copyright (C) 2011, 2012, 2017  INRIA
  * See the file version.doxy for copying conditions.
  */
 
@@ -83,6 +83,11 @@ specify the codelet may be executed on a OpenCL processing unit.
 This macro is used when setting the field starpu_codelet::where to
 specify the codelet may be executed on a MIC processing unit.
 
+\def STARPU_MPI_MS
+\ingroup API_Codelet_And_Tasks
+This macro is used when setting the field starpu_codelet::where to
+specify the codelet may be executed on a MPI Slave processing unit.
+
 \def STARPU_SCC
 \ingroup API_Codelet_And_Tasks
 This macro is used when setting the field starpu_codelet::where to
@@ -152,6 +157,10 @@ OpenCL implementation of a codelet.
 \ingroup API_Codelet_And_Tasks
 MIC implementation of a codelet.
 
+\typedef starpu_mpi_ms_func_t
+\ingroup API_Codelet_And_Tasks
+MPI Master Slave implementation of a codelet.
+
 \typedef starpu_scc_func_t
 \ingroup API_Codelet_And_Tasks
 SCC implementation of a codelet.
@@ -160,6 +169,10 @@ SCC implementation of a codelet.
 \ingroup API_Codelet_And_Tasks
 MIC kernel for a codelet
 
+\typedef starpu_mpi_ms_kernel_t
+\ingroup API_Codelet_And_Tasks
+MPI Master Slave kernel for a codelet
+
 \typedef starpu_scc_kernel_t
 \ingroup API_Codelet_And_Tasks
 SCC kernel for a codelet
@@ -277,6 +290,18 @@ in the field starpu_codelet::where. It can be null if
 starpu_codelet::cpu_funcs_name is non-NULL, in which case StarPU will
 simply make a symbol lookup to get the implementation.
 
+\var starpu_mpi_ms_func_t starpu_codelet::mpi_ms_funcs[STARPU_MAXIMPLEMENTATIONS]
+Optional array of function pointers to a function which returns the
+MPI Master Slave implementation of the codelet. The functions prototype must be:
+\code{.c}
+starpu_mpi_ms_kernel_t mpi_ms_func(struct starpu_codelet *cl, unsigned nimpl)
+\endcode
+If the field starpu_codelet::where is set, then the field
+starpu_codelet::mpi_ms_funcs is ignored if ::STARPU_MPI_MS does not appear
+in the field starpu_codelet::where. It can be null if
+starpu_codelet::cpu_funcs_name is non-NULL, in which case StarPU will
+simply make a symbol lookup to get the implementation.
+
 \var starpu_scc_func_t starpu_codelet::scc_funcs[STARPU_MAXIMPLEMENTATIONS]
 Optional array of function pointers to a function which returns the
 SCC implementation of the codelet. The functions prototype must be:

+ 33 - 1
doc/doxygen/chapters/api/data_interfaces.doxy

@@ -2,7 +2,7 @@
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
- * Copyright (C) 2011, 2012 INRIA
+ * Copyright (C) 2011, 2012, 2017  INRIA
  * See the file version.doxy for copying conditions.
  */
 
@@ -136,6 +136,19 @@ Must return 0 if the transfer was actually completed completely
 synchronously, or -EAGAIN if at least some transfers are still ongoing
 and should be awaited for by the core.
 
+\var int (*starpu_data_copy_methods::ram_to_mpi_ms)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+Define how to copy data from the \p src_interface interface on the
+\p src_node CPU node to the \p dst_interface interface on the \p dst_node MPI Slave
+node. Return 0 on success.
+\var int (*starpu_data_copy_methods::mpi_ms_to_ram)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+Define how to copy data from the \p src_interface interface on the
+\p src_node MPI Slave node to the \p dst_interface interface on the \p dst_node CPU
+node. Return 0 on success.
+\var int (*starpu_data_copy_methods::mpi_ms_to_mpi_ms)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+Define how to copy data from the \p src_interface interface on the
+\p src_node MPI Slave node to the \p dst_interface interface on the \p dst_node
+MPI Slave node. Return 0 on success.
+
 \var int (*starpu_data_copy_methods::ram_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
 Define how to copy data from the \p src_interface interface on the
 \p src_node CPU node to the \p dst_interface interface on the \p dst_node CUDA
@@ -180,6 +193,25 @@ actually completed completely synchronously, or -EAGAIN if at least
 some transfers are still ongoing and should be awaited for by the
 core.
 
+\var int (*starpu_data_copy_methods::ram_to_mpi_ms_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void * event)
+Define how to copy data from the \p src_interface interface on the
+\p src_node CPU node to the \p dst_interface interface on the \p dst_node MPI Slave
+node, with the given event. Must return 0 if the transfer was
+actually completed completely synchronously, or -EAGAIN if at least
+some transfers are still ongoing and should be awaited for by the core.
+\var int (*starpu_data_copy_methods::mpi_ms_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void * event)
+Define how to copy data from the \p src_interface interface on the
+\p src_node MPI Slave node to the \p dst_interface interface on the \p dst_node CPU
+node, with the given event. Must return 0 if the transfer was
+actually completed completely synchronously, or -EAGAIN if at least
+some transfers are still ongoing and should be awaited for by the core.
+\var int (*starpu_data_copy_methods::mpi_ms_to_mpi_ms_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void * event)
+Define how to copy data from the \p src_interface interface on the
+\p src_node MPI Slave node to the \p dst_interface interface on the \p dst_node MPI Slave
+node, with the given event. Must return 0 if the transfer was
+actually completed completely synchronously, or -EAGAIN if at least
+some transfers are still ongoing and should be awaited for by the core.
+
\var int (*starpu_data_copy_methods::ram_to_mic_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
 Define how to copy data from the \p src_interface interface on the
 \p src_node CPU node to the \p dst_interface interface on the \p dst_node

+ 25 - 1
doc/doxygen/chapters/api/initialization.doxy

@@ -2,7 +2,7 @@
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
- * Copyright (C) 2011, 2012 INRIA
+ * Copyright (C) 2011, 2012, 2017  INRIA
  * See the file version.doxy for copying conditions.
  */
 
@@ -52,6 +52,10 @@ be specified with the environment variable \ref STARPU_NMIC.
 This is the number of SCC devices that StarPU can use. This can also
 be specified with the environment variable \ref STARPU_NSCC.
 (default = -1)
+\var int starpu_conf::nmpi_ms
+This is the number of MPI Master Slave devices that StarPU can use. This can also
+be specified with the environment variable \ref STARPU_NMPI_MS.
+(default = -1)
 
 \var unsigned starpu_conf::use_explicit_workers_bindid
 If this flag is set, the starpu_conf::workers_bindid array indicates
@@ -105,6 +109,14 @@ array contains the logical identifiers of the SCC devices to be used.
 Otherwise, StarPU affects the SCC devices in a round-robin fashion.
 This can also be specified with the environment variable
 \ref STARPU_WORKERS_SCCID.
+\var unsigned starpu_conf::use_explicit_workers_mpi_ms_deviceid
+If this flag is set, the MPI Master Slave workers will be attached to the MPI Master Slave
+devices specified in the array starpu_conf::workers_mpi_ms_deviceid.
+Otherwise, StarPU affects the MPI Master Slave devices in a round-robin fashion.
+(default = 0)
+\var unsigned starpu_conf::workers_mpi_ms_deviceid[STARPU_NMAXWORKERS]
+If the flag starpu_conf::use_explicit_workers_mpi_ms_deviceid is set, the
+array contains the logical identifiers of the MPI Master Slave devices to be used.
 
 \var int starpu_conf::bus_calibrate
 If this flag is set, StarPU will recalibrate the bus.  If this value
@@ -176,6 +188,13 @@ environment variable \ref STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY.
 This can also be specified at compilation time by giving to the
 configure script the option \ref disable-asynchronous-mic-copy "--disable-asynchronous-mic-copy".
 (default = 0).
+\var int starpu_conf::disable_asynchronous_mpi_ms_copy
+This flag should be set to 1 to disable asynchronous copies between
+CPUs and MPI Master Slave devices. This can also be specified with the
+environment variable \ref STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY.
+This can also be specified at compilation time by giving to the
+configure script the option \ref disable-asynchronous-mpi-master-slave-copy "--disable-asynchronous-mpi-master-slave-copy".
+(default = 0).
 
 \var unsigned *starpu_conf::cuda_opengl_interoperability
 Enable CUDA/OpenGL interoperation on these CUDA
@@ -269,6 +288,11 @@ accelerators are disabled.
 Return 1 if asynchronous data transfers between CPU and MIC
 devices are disabled.
 
+\fn int starpu_asynchronous_mpi_ms_copy_disabled(void)
+\ingroup API_Initialization_and_Termination
+Return 1 if asynchronous data transfers between CPU and MPI Slave
+devices are disabled.
+
 \fn void starpu_topology_print(FILE *f)
 \ingroup API_Initialization_and_Termination
 Prints a description of the topology on f.

+ 17 - 3
doc/doxygen/chapters/api/modularized_scheduler.doxy

@@ -2,7 +2,7 @@
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2013        Simon Archipoff
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2014, 2015, 2016        CNRS
+ * Copyright (C) 2014, 2015, 2016, 2017        CNRS
  * Copyright (C) 2013, 2014  INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -227,18 +227,32 @@ The actual scheduler
 \ingroup API_Modularized_Scheduler
 	 return true iff \p component is a combined worker component
 
-\fn void starpu_sched_component_worker_pre_exec_hook(struct starpu_task *task)
+\fn void starpu_sched_component_worker_pre_exec_hook(struct starpu_task *task, unsigned sched_ctx_id)
 \ingroup API_Modularized_Scheduler
 	 compatibility with starpu_sched_policy interface
 	 update predictions for workers
 
-\fn void starpu_sched_component_worker_post_exec_hook(struct starpu_task *task)
+\fn void starpu_sched_component_worker_post_exec_hook(struct starpu_task *task, unsigned sched_ctx_id)
 \ingroup API_Modularized_Scheduler
 	 compatibility with starpu_sched_policy interface
 
 @name Flow-control Fifo Component API
 \ingroup API_Modularized_Scheduler
 
+\fn double starpu_sched_component_estimated_load(struct starpu_sched_component * component);
+\ingroup API_Modularized_Scheduler
+default function for the estimated_load component method, just sums up the loads
+of the children of the component.
+
+\fn double starpu_sched_component_estimated_end_min(struct starpu_sched_component * component);
+\ingroup API_Modularized_Scheduler
+function that can be used for the estimated_end component method, which just computes the minimum completion time of the children.
+
+\fn double starpu_sched_component_estimated_end_average(struct starpu_sched_component * component);
+\ingroup API_Modularized_Scheduler
+default function for the estimated_end component method, which just computes the average completion time of the children.
+
+
 \struct starpu_sched_component_fifo_data
 \ingroup API_Modularized_Scheduler
 \var unsigned starpu_sched_component_fifo_data::ntasks_threshold

+ 11 - 1
doc/doxygen/chapters/api/mpi.doxy

@@ -2,7 +2,7 @@
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
- * Copyright (C) 2011, 2012 INRIA
+ * Copyright (C) 2011, 2012, 2017  INRIA
  * See the file version.doxy for copying conditions.
  */
 
@@ -484,4 +484,14 @@ of the collective communication, the \p rcallback function is called
 with the argument \p rarg on the process root, the \p scallback
 function is called with the argument \p sarg on any other process.
 
+@name MPI Master Slave
+\anchor MPIMasterSlave
+\ingroup API_MPI_Support
+
+\def STARPU_USE_MPI_MASTER_SLAVE
+\ingroup API_MPI_Support
+This macro is defined when StarPU has been installed with MPI Master Slave
+support. It should be used in your code to detect the availability of
+MPI Master Slave.
+
 */

+ 5 - 0
doc/doxygen/chapters/api/scheduling_contexts.doxy

@@ -97,6 +97,11 @@ minimum scheduler priority value.
 This macro is used when calling starpu_sched_ctx_create() to specify a
 maximum scheduler priority value.
 
+\def STARPU_SCHED_CTX_AWAKE_WORKERS
+\ingroup API_Scheduling_Contexts
+This macro is used when calling starpu_sched_ctx_create() to specify
+that the workers of the context should be kept awake
+
 \def STARPU_SCHED_CTX_POLICY_INIT
 \ingroup API_Scheduling_Contexts
 This macro is used when calling starpu_sched_ctx_create() to specify a

+ 16 - 3
doc/doxygen/chapters/api/workers.doxy

@@ -1,8 +1,8 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
- * Copyright (C) 2011, 2012 INRIA
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
+ * Copyright (C) 2011, 2012, 2017  INRIA
  * See the file version.doxy for copying conditions.
  */
 
@@ -39,6 +39,9 @@ TODO
 \var starpu_node_kind::STARPU_OPENCL_RAM
 \ingroup API_Workers_Properties
 TODO
+\var starpu_node_kind::STARPU_DISK_RAM
+\ingroup API_Workers_Properties
+TODO
 \var starpu_node_kind::STARPU_MIC_RAM
 \ingroup API_Workers_Properties
 TODO
@@ -49,6 +52,9 @@ will be useful for MPI.
 \var starpu_node_kind::STARPU_SCC_SHM
 \ingroup API_Workers_Properties
 TODO
+\var starpu_node_kind::STARPU_MPI_MS_RAM
+\ingroup API_Workers_Properties
+TODO
 
 \enum starpu_worker_archtype
 \ingroup API_Workers_Properties
@@ -71,6 +77,9 @@ Intel MIC device
 \var starpu_worker_archtype::STARPU_SCC_WORKER
 \ingroup API_Workers_Properties
 Intel SCC device
+\var starpu_worker_archtype::STARPU_MPI_MS_WORKER
+\ingroup API_Workers_Properties
+MPI Slave device
 
 
 \struct starpu_worker_collection
@@ -147,6 +156,10 @@ This function returns the number of MIC workers controlled by StarPU.
 This function returns the number of MIC devices controlled by StarPU.
 The returned value should be at most \ref STARPU_MAXMICDEVS.
 
+\fn unsigned starpu_mpi_ms_worker_get_count(void)
+\ingroup API_Workers_Properties
+This function returns the number of MPI Master Slave workers controlled by StarPU.
+
 \fn unsigned starpu_scc_worker_get_count(void)
 \ingroup API_Workers_Properties
 This function returns the number of SCC devices controlled by StarPU.
@@ -170,7 +183,7 @@ between 0 and starpu_worker_get_count() - 1.
 This is the same as starpu_worker_get_id, but aborts when called from outside a
 worker (i.e. when starpu_worker_get_id() would return -1).
 
-\fn int starpu_worker_get_ids_by_type(enum starpu_worker_archtype type, int *workerids, int maxsize)
+\fn unsigned starpu_worker_get_ids_by_type(enum starpu_worker_archtype type, int *workerids, unsigned maxsize)
 \ingroup API_Workers_Properties
 This function gets the list of identifiers of workers with the
 given type. It fills the array \p workerids with the identifiers of the

doc/doxygen/chapters/data_trace.eps → doc/doxygen/chapters/images/data_trace.eps


doc/doxygen/chapters/data_trace.pdf → doc/doxygen/chapters/images/data_trace.pdf


doc/doxygen/chapters/data_trace.png → doc/doxygen/chapters/images/data_trace.png


doc/doxygen/chapters/distrib_data.eps → doc/doxygen/chapters/images/distrib_data.eps


doc/doxygen/chapters/distrib_data.pdf → doc/doxygen/chapters/images/distrib_data.pdf


doc/doxygen/chapters/distrib_data.png → doc/doxygen/chapters/images/distrib_data.png


doc/doxygen/chapters/distrib_data_histo.eps → doc/doxygen/chapters/images/distrib_data_histo.eps


doc/doxygen/chapters/distrib_data_histo.pdf → doc/doxygen/chapters/images/distrib_data_histo.pdf


doc/doxygen/chapters/distrib_data_histo.png → doc/doxygen/chapters/images/distrib_data_histo.png


doc/doxygen/chapters/paje_draw_histogram.eps → doc/doxygen/chapters/images/paje_draw_histogram.eps


doc/doxygen/chapters/paje_draw_histogram.pdf → doc/doxygen/chapters/images/paje_draw_histogram.pdf


doc/doxygen/chapters/paje_draw_histogram.png → doc/doxygen/chapters/images/paje_draw_histogram.png


doc/doxygen/chapters/parallel_worker2.eps → doc/doxygen/chapters/images/parallel_worker2.eps


doc/doxygen/chapters/parallel_worker2.pdf → doc/doxygen/chapters/images/parallel_worker2.pdf


doc/doxygen/chapters/parallel_worker2.png → doc/doxygen/chapters/images/parallel_worker2.png


doc/doxygen/chapters/runtime-par.eps → doc/doxygen/chapters/images/runtime-par.eps


doc/doxygen/chapters/runtime-par.pdf → doc/doxygen/chapters/images/runtime-par.pdf


doc/doxygen/chapters/runtime-par.png → doc/doxygen/chapters/images/runtime-par.png


doc/doxygen/chapters/starpu_chol_model_11_type.eps → doc/doxygen/chapters/images/starpu_chol_model_11_type.eps


doc/doxygen/chapters/starpu_chol_model_11_type.pdf → doc/doxygen/chapters/images/starpu_chol_model_11_type.pdf


doc/doxygen/chapters/starpu_chol_model_11_type.png → doc/doxygen/chapters/images/starpu_chol_model_11_type.png


doc/doxygen/chapters/starpu_non_linear_memset_regression_based.eps → doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based.eps


doc/doxygen/chapters/starpu_non_linear_memset_regression_based.pdf → doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based.pdf


doc/doxygen/chapters/starpu_non_linear_memset_regression_based.png → doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based.png


doc/doxygen/chapters/starpu_non_linear_memset_regression_based_2.eps → doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_2.eps


doc/doxygen/chapters/starpu_non_linear_memset_regression_based_2.pdf → doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_2.pdf


doc/doxygen/chapters/starpu_non_linear_memset_regression_based_2.png → doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_2.png


doc/doxygen/chapters/starpu_starpu_slu_lu_model_11.eps → doc/doxygen/chapters/images/starpu_starpu_slu_lu_model_11.eps


doc/doxygen/chapters/starpu_starpu_slu_lu_model_11.pdf → doc/doxygen/chapters/images/starpu_starpu_slu_lu_model_11.pdf


doc/doxygen/chapters/starpu_starpu_slu_lu_model_11.png → doc/doxygen/chapters/images/starpu_starpu_slu_lu_model_11.png


doc/doxygen/chapters/tasks_size_overhead.eps → doc/doxygen/chapters/images/tasks_size_overhead.eps


doc/doxygen/chapters/tasks_size_overhead.pdf → doc/doxygen/chapters/images/tasks_size_overhead.pdf


doc/doxygen/chapters/tasks_size_overhead.png → doc/doxygen/chapters/images/tasks_size_overhead.png


doc/doxygen/chapters/temanejo.png → doc/doxygen/chapters/images/temanejo.png


File diff suppressed because it is too large
+ 0 - 1166
doc/doxygen/chapters/parallel_worker1.eps


BIN
doc/doxygen/chapters/parallel_worker1.pdf


BIN
doc/doxygen/chapters/parallel_worker1.png


File diff suppressed because it is too large
+ 0 - 24085
doc/doxygen/chapters/runtime-seq.eps


BIN
doc/doxygen/chapters/runtime-seq.pdf


BIN
doc/doxygen/chapters/runtime-seq.png


+ 3 - 2
doc/doxygen/doxygen-config.cfg.in

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2009-2013  Université de Bordeaux
-# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
+# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
 # Copyright (C) 2011  Télécom-SudParis
 # Copyright (C) 2011, 2012  INRIA
 #
@@ -74,4 +74,5 @@ INPUT_FILTER           = @top_builddir@/doc/doxygen/doxygen_filter.sh
 
 #LATEX_HEADER           = @top_srcdir@/doc/doxygen/refman.tex
 
-IMAGE_PATH             = @top_srcdir@/doc/doxygen/chapters
+IMAGE_PATH             = @top_srcdir@/doc/doxygen/chapters/images
+

+ 2 - 1
doc/doxygen/refman.tex

@@ -20,7 +20,7 @@ was last updated on \STARPUUPDATED.\\
 
 Copyright © 2009–2013 Université de Bordeaux\\
 
-Copyright © 2010-2016 CNRS
+Copyright © 2010-2017 CNRS
 
 Copyright © 2011, 2012, 2016 INRIA
 
@@ -233,6 +233,7 @@ Documentation License”.
 \input{group__API__SC__Hypervisor__usage}
 \input{group__API__SC__Hypervisor}
 \input{group__API__Modularized__Scheduler}
+\input{group__API__Clustering__Machine}
 
 \chapter{File Index}
 \input{files}

+ 15 - 12
examples/Makefile.am

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2009-2017  Université de Bordeaux
-# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
+# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2017  CNRS
 # Copyright (C) 2011  Télécom-SudParis
 # Copyright (C) 2011-2012  INRIA
 # Copyright (C) 2015-2016  Inria
@@ -78,7 +78,7 @@ EXTRA_DIST = 					\
 	scheduler/schedulers_context.sh			\
 	fortran/Makefile				\
 	sched_ctx/axpy_partition_gpu.h				\
-	sched_ctx/axpy_partition_gpu.cu		
+	sched_ctx/axpy_partition_gpu.cu
 
 
 CLEANFILES = *.gcno *.gcda *.linkinfo *.mod starpu_idle_microsec.log
@@ -145,7 +145,7 @@ noinst_HEADERS = 				\
 	pi/SobolQRNG/sobol_primitives.h         \
 	reductions/dot_product.h                \
 	basic_examples/vector_scal_cpu_template.h \
-	sched_ctx/axpy_partition_gpu.h				
+	sched_ctx/axpy_partition_gpu.h
 
 #####################################
 # What to install and what to check #
@@ -179,14 +179,17 @@ LOADER_BIN		=	$(top_builddir)/examples/loader-cross.sh
 endif
 
 if STARPU_USE_MPI_MASTER_SLAVE
-MPI = $(MPIEXEC) $(MPIEXEC_ARGS) -np 4
+MPI 			= $(MPIEXEC) $(MPIEXEC_ARGS) -np 4
+LOADER_BIN2		= $(MPI) $(LOADER_BIN)
+else
+LOADER_BIN2		= $(LOADER_BIN)
 endif
 
 if STARPU_HAVE_AM111
 TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
-LOG_COMPILER		=	$(MPI) $(LOADER_BIN)
+LOG_COMPILER		=	$(LOADER_BIN2)
 else
-TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
+TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(LOADER_BIN2)
 endif
 
 endif
@@ -240,8 +243,7 @@ STARPU_EXAMPLES +=				\
 	sched_ctx/dummy_sched_with_ctx		\
 	worker_collections/worker_tree_example  \
 	reductions/dot_product			\
-	reductions/minmax_reduction		\
-	sched_ctx/gpu_partition
+	reductions/minmax_reduction		
 
 endif
 
@@ -334,6 +336,7 @@ STARPU_EXAMPLES +=				\
 	sched_ctx/nested_sched_ctxs		\
 	sched_ctx/sched_ctx_without_sched_policy_awake\
 	sched_ctx/parallel_tasks_reuse_handle
+
 if STARPU_LONG_CHECK
 STARPU_EXAMPLES +=				\
 	sched_ctx/parallel_code
@@ -349,11 +352,11 @@ endif
 
 endif !STARPU_SIMGRID
 
-sched_ctx_gpu_partition_SOURCES =		\
-	sched_ctx/gpu_partition.c
-
 if STARPU_USE_CUDA
-sched_ctx_gpu_partition_SOURCES +=		\
+STARPU_EXAMPLES +=				\
+	sched_ctx/gpu_partition
+sched_ctx_gpu_partition_SOURCES =		\
+	sched_ctx/gpu_partition.c		\
 	sched_ctx/axpy_partition_gpu.cu
 endif
 

+ 14 - 10
examples/cholesky/cholesky.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -119,24 +119,28 @@ static unsigned nbigblocks;
 
 static inline void init_sizes(void) {
 	int power = starpu_cpu_worker_get_count() + 32 * starpu_cuda_worker_get_count();
-	int power_sqrt = sqrt(power)/2;
-	if (power_sqrt < 1)
-		power_sqrt = 1;
+	int power_cbrt = cbrt(power);
+#ifndef STARPU_LONG_CHECK
+	power_cbrt /= 2;
+#endif
+
+	if (power_cbrt < 1)
+		power_cbrt = 1;
 
 #ifdef STARPU_QUICK_CHECK
 	if (!size)
-		size = 320*2*power_sqrt;
+		size = 320*2*power_cbrt;
 	if (!nblocks)
-		nblocks = 2*power_sqrt;
+		nblocks = 2*power_cbrt;
 	if (!nbigblocks)
-		nbigblocks = power_sqrt;
+		nbigblocks = power_cbrt;
 #else
 	if (!size)
-		size = 960*8*power_sqrt;
+		size = 960*8*power_cbrt;
 	if (!nblocks)
-		nblocks = 8*power_sqrt;
+		nblocks = 8*power_cbrt;
 	if (!nbigblocks)
-		nbigblocks = 4*power_sqrt;
+		nbigblocks = 4*power_cbrt;
 #endif
 }
 

+ 11 - 8
examples/lu/lu_example.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -314,20 +314,23 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	int power = starpu_cpu_worker_get_count() + 32 * starpu_cuda_worker_get_count();
-	int power_sqrt = sqrt(power)/2;
-	if (power_sqrt < 1)
-		power_sqrt = 1;
+	int power_cbrt = cbrt(power);
+#ifndef STARPU_LONG_CHECK
+	power_cbrt /= 2;
+#endif
+	if (power_cbrt < 1)
+		power_cbrt = 1;
 
 #ifdef STARPU_QUICK_CHECK
 	if (!size)
-		size = 320*2*power_sqrt;
+		size = 320*2*power_cbrt;
 	if (!nblocks)
-		nblocks = 2*power_sqrt;
+		nblocks = 2*power_cbrt;
 #else
 	if (!size)
-		size = 960*8*power_sqrt;
+		size = 960*8*power_cbrt;
 	if (!nblocks)
-		nblocks = 8*power_sqrt;
+		nblocks = 8*power_cbrt;
 #endif
 
 	parse_args(argc, argv);

+ 3 - 2
examples/sched_ctx/gpu_partition.c

@@ -110,7 +110,7 @@ int main(int argc, char **argv)
 	int gpu_devid = -1;
 
 #warning temporary fix: skip test as cuda computation fails
-	return 77;
+ 	return 77;
 
 #ifndef STARPU_HAVE_SETENV
 	return 77;
@@ -118,6 +118,7 @@ int main(int argc, char **argv)
 	/* Have separate threads for streams */
 	setenv("STARPU_CUDA_THREAD_PER_WORKER", "1", 1);
 	setenv("STARPU_NWORKER_PER_CUDA", "2", 1);
+	setenv("STARPU_NCUDA", "1", 1);
 #endif
 
 	/* Initialize StarPU */
@@ -175,7 +176,7 @@ int main(int argc, char **argv)
 	int ncpus = starpu_cpu_worker_get_count();
 	int workers[ncpus+nstreams];
 	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, workers, ncpus);
-
+	
 	int sched_ctxs[nstreams];
 	int nsms[nstreams];
 	nsms[0] = 6;

+ 5 - 6
examples/sched_ctx/nested_sched_ctxs.c

@@ -50,11 +50,10 @@ int parallel_code(int sched_ctx)
 
 static void sched_ctx_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
 {
-	int w = starpu_worker_get_id();
-	(void) w;
 	unsigned sched_ctx = (uintptr_t)arg;
-	int n = parallel_code(sched_ctx);
-	(void) n;
+	int t = parallel_code(sched_ctx);
+	if (sched_ctx > 0 && sched_ctx < 3)
+		tasks_executed[sched_ctx-1] += t;
 	//printf("w %d executed %d it \n", w, n);
 }
 
@@ -232,8 +231,8 @@ int main(int argc, char **argv)
 	starpu_sched_ctx_delete(sched_ctx1);
 	starpu_sched_ctx_delete(sched_ctx2);
 
-	printf("ctx%u: tasks starpu executed %d out of %d\n", sched_ctx1, tasks_executed[0], NTASKS);
-	printf("ctx%u: tasks starpu executed %d out of %d\n", sched_ctx2, tasks_executed[1], NTASKS);
+	printf("ctx%u: tasks starpu executed %d out of %d\n", sched_ctx1, tasks_executed[0], NTASKS*NTASKS);
+	printf("ctx%u: tasks starpu executed %d out of %d\n", sched_ctx2, tasks_executed[1], NTASKS*NTASKS);
 
 #ifdef STARPU_USE_CPU
 	free(procs1);

+ 23 - 151
examples/sched_ctx/parallel_code.c

@@ -19,13 +19,11 @@
 #include <omp.h>
 
 #ifdef STARPU_QUICK_CHECK
-#define NTASKS 64
+#define NTASKS 4
 #else
 #define NTASKS 10
 #endif
 
-int tasks_executed[2];
-
 int parallel_code(unsigned *sched_ctx)
 {
 	int i;
@@ -34,53 +32,36 @@ int parallel_code(unsigned *sched_ctx)
 	int ncpuids = 0;
 	starpu_sched_ctx_get_available_cpuids(*sched_ctx, &cpuids, &ncpuids);
 
-//	printf("execute task of %d threads \n", ncpuids);
-#pragma omp parallel num_threads(ncpuids)
+	/* printf("execute task of %d threads \n", ncpuids); */
+	omp_set_num_threads(ncpuids);
+#pragma omp parallel
 	{
 		starpu_sched_ctx_bind_current_thread_to_cpuid(cpuids[omp_get_thread_num()]);
-// 			printf("cpu = %d ctx%d nth = %d\n", sched_getcpu(), sched_ctx, omp_get_num_threads());
+			/* printf("cpu = %d ctx%d nth = %d\n", sched_getcpu(), *sched_ctx, omp_get_num_threads()); */
 #pragma omp for
 		for(i = 0; i < NTASKS; i++)
-			t++;
+		{
+#pragma omp atomic
+				t++;
+		}
 	}
 
 	free(cpuids);
-	tasks_executed[*sched_ctx-1] = t;
 	return t;
 }
 
-static void sched_ctx_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
-{
-	int w = starpu_worker_get_id();
-	unsigned *sched_ctx = (unsigned*)arg;
-	int n = parallel_code(sched_ctx);
-	printf("w %d executed %d it \n", w, n);
-}
-
-
-static struct starpu_codelet sched_ctx_codelet =
-{
-	.cpu_funcs = {sched_ctx_func},
-	.model = NULL,
-	.nbuffers = 0,
-	.name = "sched_ctx"
-};
-
 void *th(void* p)
 {
 	unsigned* sched_ctx = (unsigned*)p;
-	tasks_executed[*sched_ctx-1] = 0;
-	//here the return of parallel code could be used (as a void*)
-	starpu_sched_ctx_exec_parallel_code((void*)parallel_code, p, *sched_ctx);
-	return &tasks_executed[*sched_ctx-1];
+	void* ret;
+	ret = starpu_sched_ctx_exec_parallel_code((void*)parallel_code, p, *sched_ctx);
+	pthread_exit(ret);
 }
 
 int main(int argc, char **argv)
 {
-	tasks_executed[0] = 0;
-	tasks_executed[1] = 0;
-	int ntasks = NTASKS;
-	int ret, j, k;
+	int ret;
+	void* tasks_executed;
 
 	ret = starpu_init(NULL);
 	if (ret == -ENODEV)
@@ -88,142 +69,33 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	int nprocs1;
-	int nprocs2;
-	int *procs1, *procs2;
+	int *procs1;
 
 #ifdef STARPU_USE_CPU
 	unsigned ncpus =  starpu_cpu_worker_get_count();
 	procs1 = (int*)malloc(ncpus*sizeof(int));
-	procs2 = (int*)malloc(ncpus*sizeof(int));
 	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, ncpus);
-
-	nprocs1 = ncpus/2;
-	nprocs2 =  nprocs1;
-	k = 0;
-	for(j = nprocs1; j < nprocs1+nprocs2; j++)
-		procs2[k++] = j;
+	nprocs1 = ncpus;
 #else
 	nprocs1 = 1;
-	nprocs2 = 1;
 	procs1 = (int*)malloc(nprocs1*sizeof(int));
-	procs2 = (int*)malloc(nprocs2*sizeof(int));
-	procs1[0] = 0;
-	procs2[0] = 0;
 #endif
 
-	if (nprocs1 < 4)
-	{
-		/* Not enough procs */
-		free(procs1);
-		free(procs2);
-		starpu_shutdown();
-		return 77;
-	}
-
-	/*create contexts however you want*/
 	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", STARPU_SCHED_CTX_POLICY_NAME, "dmda", 0);
-	unsigned sched_ctx2 = starpu_sched_ctx_create(procs2, nprocs2, "ctx2", STARPU_SCHED_CTX_POLICY_NAME, "dmda", 0);
-
-	/*indicate what to do with the resources when context 2 finishes (it depends on your application)*/
-//	starpu_sched_ctx_set_inheritor(sched_ctx2, sched_ctx1);
-
-	int nprocs3 = nprocs1/2;
-	int nprocs4 = nprocs1/2;
-	int nprocs5 = nprocs2/2;
-	int nprocs6 = nprocs2/2;
-	int procs3[nprocs3];
-	int procs4[nprocs4];
-	int procs5[nprocs5];
-	int procs6[nprocs6];
-
-	k = 0;
-	for(j = 0; j < nprocs3; j++)
-		procs3[k++] = procs1[j];
-	k = 0;
-	for(j = nprocs3; j < nprocs3+nprocs4; j++)
-		procs4[k++] = procs1[j];
-
-	k = 0;
-	for(j = 0; j < nprocs5; j++)
-		procs5[k++] = procs2[j];
-	k = 0;
-	for(j = nprocs5; j < nprocs5+nprocs6; j++)
-		procs6[k++] = procs2[j];
-
-	int master3 = starpu_sched_ctx_book_workers_for_task(sched_ctx1, procs3, nprocs3);
-	int master4 = starpu_sched_ctx_book_workers_for_task(sched_ctx1, procs4, nprocs4);
-
-	int master5 = starpu_sched_ctx_book_workers_for_task(sched_ctx2, procs5, nprocs5);
-	int master6 = starpu_sched_ctx_book_workers_for_task(sched_ctx2, procs6, nprocs6);
-
-/* 	int master1 = starpu_sched_ctx_book_workers_for_task(procs1, nprocs1); */
-/* 	int master2 = starpu_sched_ctx_book_workers_for_task(procs2, nprocs2); */
-
-
-	int i;
-	for (i = 0; i < ntasks; i++)
-	{
-		struct starpu_task *task = starpu_task_create();
-
-		task->cl = &sched_ctx_codelet;
-		task->cl_arg = &sched_ctx1;
-
-		/*submit tasks to context*/
-		ret = starpu_task_submit_to_ctx(task,sched_ctx1);
-		if (ret == -ENODEV) goto enodev;
-
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
-	}
-
-	for (i = 0; i < ntasks; i++)
-	{
-		struct starpu_task *task = starpu_task_create();
-
-		task->cl = &sched_ctx_codelet;
-		task->cl_arg = &sched_ctx2;
-
-		/*submit tasks to context*/
-		ret = starpu_task_submit_to_ctx(task,sched_ctx2);
-		if (ret == -ENODEV) goto enodev;
-
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
-	}
-
-
-	/* tell starpu when you finished submitting tasks to this context
-	   in order to allow moving resources from this context to the inheritor one
-	   when its corresponding tasks finished executing */
-
-
-enodev:
-
-	/* wait for all tasks at the end*/
-	starpu_task_wait_for_all();
-
-/* 	starpu_sched_ctx_unbook_workers_for_task(sched_ctx1, master1); */
-/* 	starpu_sched_ctx_unbook_workers_for_task(sched_ctx2, master2); */
-
-	starpu_sched_ctx_unbook_workers_for_task(sched_ctx1, master3);
-	starpu_sched_ctx_unbook_workers_for_task(sched_ctx1, master4);
-
-	starpu_sched_ctx_unbook_workers_for_task(sched_ctx2, master5);
-	starpu_sched_ctx_unbook_workers_for_task(sched_ctx2, master6);
 
-	pthread_t mp[2];
-	pthread_create(&mp[0], NULL, th, &sched_ctx1);
-	pthread_create(&mp[1], NULL, th, &sched_ctx2);
+	/* This is the interesting part, we can launch a code to hijack the context and
+		 use its cores to do something else entirely thanks to this */
+	pthread_t mp;
+	pthread_create(&mp, NULL, th, &sched_ctx1);
 
-	pthread_join(mp[0], NULL);
-	pthread_join(mp[1], NULL);
+	pthread_join(mp, &tasks_executed);
 
+	/* Finished, delete the context and print the amount of executed tasks */
 	starpu_sched_ctx_delete(sched_ctx1);
-	starpu_sched_ctx_delete(sched_ctx2);
-	printf("ctx%u: tasks starpu executed %d out of %d\n", sched_ctx1, tasks_executed[0], NTASKS);
-	printf("ctx%u: tasks starpu executed %d out of %d\n", sched_ctx2, tasks_executed[1], NTASKS);
+	printf("ctx%u: tasks starpu executed %ld out of %d\n", sched_ctx1, (intptr_t)tasks_executed, NTASKS);
 	starpu_shutdown();
 
 	free(procs1);
-	free(procs2);
 
 	return (ret == -ENODEV ? 77 : 0);
 }

+ 7 - 1
examples/stencil/Makefile.am

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2010-2012, 2015-2016  Université de Bordeaux
+# Copyright (C) 2010-2012, 2015-2017  Université de Bordeaux
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -64,8 +64,14 @@ check_PROGRAMS	=	$(LOADER) $(STARPU_EXAMPLES)
 endif
 
 if !STARPU_SIMGRID
+if USE_MPI
+if STARPU_MPI_CHECK
 TESTS		=	$(STARPU_EXAMPLES)
 endif
+else
+TESTS		=	$(STARPU_EXAMPLES)
+endif
+endif
 
 if !STARPU_HAVE_WINDOWS
 ## test loader program

+ 2 - 2
include/starpu.h

@@ -107,8 +107,8 @@ struct starpu_conf
 	unsigned use_explicit_workers_scc_deviceid;
 	unsigned workers_scc_deviceid[STARPU_NMAXWORKERS];
 
-	unsigned use_explicit_workers_mpi_deviceid;
-	unsigned workers_mpi_deviceid[STARPU_NMAXWORKERS];
+	unsigned use_explicit_workers_mpi_ms_deviceid;
+	unsigned workers_mpi_ms_deviceid[STARPU_NMAXWORKERS];
 
 	int bus_calibrate;
 	int calibrate;

+ 4 - 0
include/starpu_sched_component.h

@@ -115,6 +115,10 @@ int starpu_sched_component_is_combined_worker(struct starpu_sched_component *com
 void starpu_sched_component_worker_pre_exec_hook(struct starpu_task *task, unsigned sched_ctx_id);
 void starpu_sched_component_worker_post_exec_hook(struct starpu_task *task, unsigned sched_ctx_id);
 
+double starpu_sched_component_estimated_load(struct starpu_sched_component * component);
+double starpu_sched_component_estimated_end_min(struct starpu_sched_component * component);
+double starpu_sched_component_estimated_end_average(struct starpu_sched_component * component);
+
 struct starpu_sched_component_fifo_data
 {
 	unsigned ntasks_threshold;

+ 2 - 13
include/starpu_stdlib.h

@@ -52,21 +52,10 @@ starpu_ssize_t starpu_memory_get_available_all_nodes();
 
 void starpu_memory_wait_available(unsigned node, size_t size);
 
-/**
- * Try to allocate memory on the given node
- *
- * @param size amount of memory to allocate
- * @param node node where the memory is to be allocated
- * @return 1 if the given amount of memory was allocated on the given node
- */
+/* Try to allocate memory on the given node */
 int starpu_memory_allocate(unsigned node, size_t size, int flags);
 
-/**
- * Indicates the given amount of memory is going to be deallocated from the given node
- *
- * @param size amount of memory to be deallocated
- * @param node node where the memory is going to be deallocated
- */
+/* Indicates the given amount of memory is going to be deallocated from the given node */
 void starpu_memory_deallocate(unsigned node, size_t size);
 
 void starpu_sleep(float nb_sec);

+ 2 - 2
include/starpu_worker.h

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009-2013, 2016  Université de Bordeaux
  * Copyright (C) 2010-2014  CNRS
- * Copyright (C) 2016  INRIA
+ * Copyright (C) 2016, 2017  INRIA
  * Copyright (C) 2016  Uppsala University
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -36,7 +36,7 @@ enum starpu_worker_archtype
 	STARPU_OPENCL_WORKER,
 	STARPU_MIC_WORKER,
 	STARPU_SCC_WORKER,
-	STARPU_MPI_WORKER,
+	STARPU_MPI_MS_WORKER,
 	STARPU_ANY_WORKER
 };
 

+ 15 - 8
mpi/src/starpu_mpi.c

@@ -368,7 +368,7 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_irecv_common(starpu_data_handle
  }
 
 #ifdef STARPU_SIMGRID
-int _starpu_mpi_simgrid_mpi_test(int *done, int *flag)
+int _starpu_mpi_simgrid_mpi_test(unsigned *done, int *flag)
 {
 	*flag = 0;
 	if (*done)
@@ -378,7 +378,7 @@ int _starpu_mpi_simgrid_mpi_test(int *done, int *flag)
 	}
 	return MPI_SUCCESS;
 }
-static void* _starpu_mpi_simgrid_wait_req_func(void* arg)
+static void _starpu_mpi_simgrid_wait_req_func(void* arg)
 {
 	struct _starpu_simgrid_mpi_req *sim_req = arg;
 	int ret;
@@ -399,8 +399,6 @@ static void* _starpu_mpi_simgrid_wait_req_func(void* arg)
 	if (--wait_counter == 0)
 		STARPU_PTHREAD_COND_SIGNAL(&wait_counter_cond);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&wait_counter_mutex);
-
-	return NULL;
 }
 void _starpu_mpi_simgrid_wait_req(MPI_Request *request, MPI_Status *status, starpu_pthread_queue_t *queue, unsigned *done)
 {
@@ -535,7 +533,13 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t dat
 							unsigned detached, unsigned sync, void (*callback)(void *), void *arg,
 							int sequential_consistency)
 {
-	return _starpu_mpi_isend_irecv_common(data_handle, dest, data_tag, comm, detached, sync, callback, arg, SEND_REQ, _starpu_mpi_isend_size_func, STARPU_R, sequential_consistency, 0, 0);
+	return _starpu_mpi_isend_irecv_common(data_handle, dest, data_tag, comm, detached, sync, callback, arg, SEND_REQ, _starpu_mpi_isend_size_func,
+#ifdef STARPU_MPI_PEDANTIC_ISEND
+					      STARPU_RW,
+#else
+					      STARPU_R,
+#endif
+					      sequential_consistency, 0, 0);
 }
 
 int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, int data_tag, MPI_Comm comm)
@@ -828,7 +832,8 @@ static void _starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
 	struct _starpu_mpi_req *req = testing_req->other_request;
 
 	_STARPU_MPI_DEBUG(2, "Test request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
-			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
+			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.rank, req->data_handle, req->ptr,
+			  req->datatype_name, (int)req->count, req->registered_datatype);
 
 	_STARPU_MPI_TRACE_UTESTING_BEGIN(req->node_tag.rank, req->node_tag.data_tag);
 
@@ -1268,7 +1273,8 @@ static void _starpu_mpi_handle_ready_request(struct _starpu_mpi_req *req)
 
 	/* submit the request to MPI */
 	_STARPU_MPI_DEBUG(2, "Handling new request %p type %s tag %d src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
-			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
+			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.rank, req->data_handle,
+			  req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
 	req->func(req);
 
 	_STARPU_MPI_LOG_OUT();
@@ -1328,7 +1334,8 @@ static void _starpu_mpi_receive_early_data(struct _starpu_mpi_envelope *envelope
 		//_starpu_mpi_early_data_add(early_data_handle);
 	}
 
-	_STARPU_MPI_DEBUG(20, "Posting internal detached irecv on early_data_handle with tag %d from comm %d src %d ..\n", early_data_handle->node_tag.data_tag, comm, status.MPI_SOURCE);
+	_STARPU_MPI_DEBUG(20, "Posting internal detached irecv on early_data_handle with tag %d from comm %ld src %d ..\n",
+			  early_data_handle->node_tag.data_tag, (long int)comm, status.MPI_SOURCE);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	early_data_handle->req = _starpu_mpi_irecv_common(early_data_handle->handle, status.MPI_SOURCE,
 							  early_data_handle->node_tag.data_tag, comm, 1, 0,

+ 3 - 3
mpi/src/starpu_mpi_comm.c

@@ -94,7 +94,7 @@ void _starpu_mpi_comm_register(MPI_Comm comm)
 	HASH_FIND(hh, _starpu_mpi_comms_cache, &comm, sizeof(MPI_Comm), found);
 	if (found)
 	{
-		_STARPU_MPI_DEBUG(10, "comm %d (%d) already registered\n", comm, MPI_COMM_WORLD);
+		_STARPU_MPI_DEBUG(10, "comm %ld (%ld) already registered\n", (long int)comm, (long int)MPI_COMM_WORLD);
 	}
 	else
 	{
@@ -104,7 +104,7 @@ void _starpu_mpi_comm_register(MPI_Comm comm)
 			_STARPU_MPI_DEBUG(10, "reallocating for %d communicators\n", _starpu_mpi_comm_allocated);
 			_STARPU_MPI_REALLOC(_starpu_mpi_comms, _starpu_mpi_comm_allocated * sizeof(struct _starpu_mpi_comm *));
 		}
-		_STARPU_MPI_DEBUG(10, "registering comm %d (%d) number %d\n", comm, MPI_COMM_WORLD, _starpu_mpi_comm_nb);
+		_STARPU_MPI_DEBUG(10, "registering comm %ld (%ld) number %d\n", (long int)comm, (long int)MPI_COMM_WORLD, _starpu_mpi_comm_nb);
 		struct _starpu_mpi_comm *_comm;
 		_STARPU_MPI_CALLOC(_comm, 1, sizeof(struct _starpu_mpi_comm));
 		_comm->comm = comm;
@@ -136,7 +136,7 @@ void _starpu_mpi_comm_post_recv()
 		struct _starpu_mpi_comm *_comm = _starpu_mpi_comms[i]; // get the ith _comm;
 		if (_comm->posted == 0)
 		{
-			_STARPU_MPI_DEBUG(3, "Posting a receive to get a data envelop on comm %d %d\n", i, _comm->comm);
+			_STARPU_MPI_DEBUG(3, "Posting a receive to get a data envelop on comm %d %ld\n", i, (long int)_comm->comm);
 			_STARPU_MPI_COMM_FROM_DEBUG(sizeof(struct _starpu_mpi_envelope), MPI_BYTE, MPI_ANY_SOURCE, _STARPU_MPI_TAG_ENVELOPE, _STARPU_MPI_TAG_ENVELOPE, _comm->comm);
 			MPI_Irecv(_comm->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, MPI_ANY_SOURCE, _STARPU_MPI_TAG_ENVELOPE, _comm->comm, &_comm->request);
 #ifdef STARPU_SIMGRID

+ 3 - 3
mpi/src/starpu_mpi_early_data.c

@@ -76,7 +76,7 @@ struct _starpu_mpi_early_data_handle *_starpu_mpi_early_data_find(struct _starpu
 	struct _starpu_mpi_early_data_handle *early_data_handle;
 
 	STARPU_PTHREAD_MUTEX_LOCK(&_starpu_mpi_early_data_handle_mutex);
-	_STARPU_MPI_DEBUG(60, "Looking for early_data_handle with comm %d source %d tag %d\n", node_tag->comm, node_tag->rank, node_tag->data_tag);
+	_STARPU_MPI_DEBUG(60, "Looking for early_data_handle with comm %ld source %d tag %d\n", (long int)node_tag->comm, node_tag->rank, node_tag->data_tag);
 	HASH_FIND(hh, _starpu_mpi_early_data_handle_hashmap, node_tag, sizeof(struct _starpu_mpi_node_tag), hashlist);
 	if (hashlist == NULL)
 	{
@@ -94,7 +94,7 @@ struct _starpu_mpi_early_data_handle *_starpu_mpi_early_data_find(struct _starpu
 			early_data_handle = _starpu_mpi_early_data_handle_list_pop_front(hashlist->list);
 		}
 	}
-	_STARPU_MPI_DEBUG(60, "Found early_data_handle %p with comm %d source %d tag %d\n", early_data_handle, node_tag->comm, node_tag->rank, node_tag->data_tag);
+	_STARPU_MPI_DEBUG(60, "Found early_data_handle %p with comm %ld source %d tag %d\n", early_data_handle, (long int)node_tag->comm, node_tag->rank, node_tag->data_tag);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&_starpu_mpi_early_data_handle_mutex);
 	return early_data_handle;
 }
@@ -102,7 +102,7 @@ struct _starpu_mpi_early_data_handle *_starpu_mpi_early_data_find(struct _starpu
 void _starpu_mpi_early_data_add(struct _starpu_mpi_early_data_handle *early_data_handle)
 {
 	STARPU_PTHREAD_MUTEX_LOCK(&_starpu_mpi_early_data_handle_mutex);
-	_STARPU_MPI_DEBUG(60, "Trying to add early_data_handle %p with comm %d source %d tag %d\n", early_data_handle, early_data_handle->node_tag.comm,
+	_STARPU_MPI_DEBUG(60, "Trying to add early_data_handle %p with comm %ld source %d tag %d\n", early_data_handle, (long int)early_data_handle->node_tag.comm,
 			  early_data_handle->node_tag.rank, early_data_handle->node_tag.data_tag);
 
 	struct _starpu_mpi_early_data_handle_hashlist *hashlist;

+ 3 - 3
mpi/src/starpu_mpi_early_request.c

@@ -74,7 +74,7 @@ struct _starpu_mpi_req* _starpu_mpi_early_request_dequeue(int data_tag, int sour
 	node_tag.rank = source;
 	node_tag.data_tag = data_tag;
 
-	_STARPU_MPI_DEBUG(100, "Looking for early_request with comm %d source %d tag %d\n", node_tag.comm, node_tag.rank, node_tag.data_tag);
+	_STARPU_MPI_DEBUG(100, "Looking for early_request with comm %ld source %d tag %d\n", (long int)node_tag.comm, node_tag.rank, node_tag.data_tag);
 	HASH_FIND(hh, _starpu_mpi_early_request_hash, &node_tag, sizeof(struct _starpu_mpi_node_tag), hashlist);
 	if (hashlist == NULL)
 	{
@@ -92,7 +92,7 @@ struct _starpu_mpi_req* _starpu_mpi_early_request_dequeue(int data_tag, int sour
 			_starpu_mpi_early_request_hash_count --;
 		}
 	}
-	_STARPU_MPI_DEBUG(100, "Found early_request %p with comm %d source %d tag %d\n", found, node_tag.comm, node_tag.rank, node_tag.data_tag);
+	_STARPU_MPI_DEBUG(100, "Found early_request %p with comm %ld source %d tag %d\n", found, (long int)node_tag.comm, node_tag.rank, node_tag.data_tag);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&_starpu_mpi_early_request_mutex);
 	return found;
 }
@@ -100,7 +100,7 @@ struct _starpu_mpi_req* _starpu_mpi_early_request_dequeue(int data_tag, int sour
 void _starpu_mpi_early_request_enqueue(struct _starpu_mpi_req *req)
 {
 	STARPU_PTHREAD_MUTEX_LOCK(&_starpu_mpi_early_request_mutex);
-	_STARPU_MPI_DEBUG(100, "Adding request %p with comm %d source %d tag %d in the application request hashmap\n", req, req->node_tag.comm, req->node_tag.rank, req->node_tag.data_tag);
+	_STARPU_MPI_DEBUG(100, "Adding request %p with comm %ld source %d tag %d in the application request hashmap\n", req, (long int)req->node_tag.comm, req->node_tag.rank, req->node_tag.data_tag);
 
 	struct _starpu_mpi_early_request_hashlist *hashlist;
 	HASH_FIND(hh, _starpu_mpi_early_request_hash, &req->node_tag, sizeof(struct _starpu_mpi_node_tag), hashlist);

+ 2 - 2
mpi/src/starpu_mpi_private.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012-2016  Université de Bordeaux
+ * Copyright (C) 2010, 2012-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -42,7 +42,7 @@ struct _starpu_simgrid_mpi_req
 	unsigned *done;
 };
 
-int _starpu_mpi_simgrid_mpi_test(int *done, int *flag);
+int _starpu_mpi_simgrid_mpi_test(unsigned *done, int *flag);
 void _starpu_mpi_simgrid_wait_req(MPI_Request *request, 	MPI_Status *status, starpu_pthread_queue_t *queue, unsigned *done);
 #endif
 	

+ 6 - 6
mpi/src/starpu_mpi_sync_data.c

@@ -60,11 +60,11 @@ void _starpu_mpi_sync_data_handle_display_hash(struct _starpu_mpi_node_tag *node
 
 	if (hashlist == NULL)
 	{
-		_STARPU_MPI_DEBUG(60, "Hashlist for comm %d source %d and tag %d does not exist\n", node_tag->comm, node_tag->rank, node_tag->data_tag);
+		_STARPU_MPI_DEBUG(60, "Hashlist for comm %ld source %d and tag %d does not exist\n", (long int)node_tag->comm, node_tag->rank, node_tag->data_tag);
 	}
 	else if (_starpu_mpi_req_list_empty(hashlist->list))
 	{
-		_STARPU_MPI_DEBUG(60, "Hashlist for comm %d source %d and tag %d is empty\n", node_tag->comm, node_tag->rank, node_tag->data_tag);
+		_STARPU_MPI_DEBUG(60, "Hashlist for comm %ld source %d and tag %d is empty\n", (long int)node_tag->comm, node_tag->rank, node_tag->data_tag);
 	}
 	else
 	{
@@ -73,7 +73,7 @@ void _starpu_mpi_sync_data_handle_display_hash(struct _starpu_mpi_node_tag *node
 		     cur != _starpu_mpi_req_list_end(hashlist->list);
 		     cur = _starpu_mpi_req_list_next(cur))
 		{
-			_STARPU_MPI_DEBUG(60, "Element for comm %d source %d and tag %d: %p\n", node_tag->comm, node_tag->rank, node_tag->data_tag, cur);
+			_STARPU_MPI_DEBUG(60, "Element for comm %ld source %d and tag %d: %p\n", (long int)node_tag->comm, node_tag->rank, node_tag->data_tag, cur);
 		}
 	}
 }
@@ -100,7 +100,7 @@ struct _starpu_mpi_req *_starpu_mpi_sync_data_find(int data_tag, int source, MPI
 	node_tag.rank = source;
 	node_tag.data_tag = data_tag;
 
-	_STARPU_MPI_DEBUG(60, "Looking for sync_data_handle with comm %d source %d tag %d in the hashmap\n", comm, source, data_tag);
+	_STARPU_MPI_DEBUG(60, "Looking for sync_data_handle with comm %ld source %d tag %d in the hashmap\n", (long int)comm, source, data_tag);
 
 	STARPU_PTHREAD_MUTEX_LOCK(&_starpu_mpi_sync_data_handle_mutex);
 	HASH_FIND(hh, _starpu_mpi_sync_data_handle_hashmap, &node_tag, sizeof(struct _starpu_mpi_node_tag), found);
@@ -121,7 +121,7 @@ struct _starpu_mpi_req *_starpu_mpi_sync_data_find(int data_tag, int source, MPI
 		}
 	}
 	STARPU_PTHREAD_MUTEX_UNLOCK(&_starpu_mpi_sync_data_handle_mutex);
-	_STARPU_MPI_DEBUG(60, "Found sync_data_handle %p with comm %d source %d tag %d in the hashmap\n", req, comm, source, data_tag);
+	_STARPU_MPI_DEBUG(60, "Found sync_data_handle %p with comm %ld source %d tag %d in the hashmap\n", req, (long int)comm, source, data_tag);
 	return req;
 }
 
@@ -129,7 +129,7 @@ void _starpu_mpi_sync_data_add(struct _starpu_mpi_req *sync_req)
 {
 	struct _starpu_mpi_sync_data_handle_hashlist *hashlist;
 
-	_STARPU_MPI_DEBUG(2000, "Adding sync_req %p with comm %d source %d tag %d in the hashmap\n", sync_req, sync_req->node_tag.comm, sync_req->node_tag.rank, sync_req->node_tag.data_tag);
+	_STARPU_MPI_DEBUG(2000, "Adding sync_req %p with comm %ld source %d tag %d in the hashmap\n", sync_req, (long int)sync_req->node_tag.comm, sync_req->node_tag.rank, sync_req->node_tag.data_tag);
 
 	STARPU_PTHREAD_MUTEX_LOCK(&_starpu_mpi_sync_data_handle_mutex);
 	HASH_FIND(hh, _starpu_mpi_sync_data_handle_hashmap, &sync_req->node_tag, sizeof(struct _starpu_mpi_node_tag), hashlist);

+ 2 - 4
mpi/tests/insert_task_recv_cache.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016  CNRS
+ * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -63,7 +63,7 @@ void test_cache(int rank, char *enabled, size_t *comm_amount)
 {
 	int i;
 	int ret;
-	unsigned **v;
+	unsigned *v[2];
 	starpu_data_handle_t data_handles[2];
 
 	FPRINTF_MPI(stderr, "Testing with STARPU_MPI_CACHE=%s\n", enabled);
@@ -74,7 +74,6 @@ void test_cache(int rank, char *enabled, size_t *comm_amount)
 	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
-	v = calloc(2, sizeof(unsigned *));
 	for(i = 0; i < 2; i++)
 	{
 		int j;
@@ -125,7 +124,6 @@ void test_cache(int rank, char *enabled, size_t *comm_amount)
 		starpu_data_unregister(data_handles[i]);
 		free(v[i]);
 	}
-	free(v);
 
 	starpu_mpi_comm_amounts_retrieve(comm_amount);
 	starpu_mpi_shutdown();

+ 3 - 5
mpi/tests/insert_task_sent_cache.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016  CNRS
+ * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -63,7 +63,7 @@ void test_cache(int rank, char *enabled, size_t *comm_amount)
 {
 	int i;
 	int ret;
-	unsigned **v;
+	unsigned *v[2];
 	starpu_data_handle_t data_handles[2];
 
 	setenv("STARPU_MPI_CACHE", enabled, 1);
@@ -73,7 +73,6 @@ void test_cache(int rank, char *enabled, size_t *comm_amount)
 	ret = starpu_mpi_init(NULL, NULL, 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
-	v = malloc(2 * sizeof(unsigned *));
 	for(i = 0; i < 2; i++)
 	{
 		int j;
@@ -90,7 +89,7 @@ void test_cache(int rank, char *enabled, size_t *comm_amount)
 		if (mpi_rank == rank)
 		{
 			//FPRINTF(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
-			starpu_vector_data_register(&data_handles[i], STARPU_MAIN_RAM, (uintptr_t)&(v[i]), N, sizeof(unsigned));
+			starpu_vector_data_register(&data_handles[i], STARPU_MAIN_RAM, (uintptr_t)v[i], N, sizeof(unsigned));
 		}
 		else
 		{
@@ -131,7 +130,6 @@ void test_cache(int rank, char *enabled, size_t *comm_amount)
 		starpu_data_unregister(data_handles[i]);
 		free(v[i]);
 	}
-	free(v);
 
 	starpu_mpi_comm_amounts_retrieve(comm_amount);
 	starpu_mpi_shutdown();

+ 2 - 2
mpi/tests/policy_selection.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2015  CNRS
+ * Copyright (C) 2015, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -53,7 +53,7 @@ int main(int argc, char **argv)
 {
 	int ret;
 	int rank, size;
-	int policy;
+	int policy = 12;
 	struct starpu_task *task;
 	starpu_data_handle_t handles[3];
 

+ 24 - 11
src/Makefile.am

@@ -1,7 +1,7 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009-2016  Université de Bordeaux
-# Copyright (C) 2010, 2011, 2012, 2013, 2015  CNRS
+# Copyright (C) 2009-2017  Université de Bordeaux
+# Copyright (C) 2010, 2011, 2012, 2013, 2015, 2017  CNRS
 # Copyright (C) 2011, 2014  INRIA
 # Copyright (C) 2016  Inria
 #
@@ -293,6 +293,7 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 		\
 	sched_policies/modular_random.c				\
 	sched_policies/modular_random_prefetching.c			\
 	sched_policies/modular_heft.c				\
+	sched_policies/modular_heft_prio.c			\
 	sched_policies/modular_heft2.c				\
 	sched_policies/modular_ws.c
 
@@ -371,19 +372,31 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mic/driver_mic_utils.
 endif
 
 #########################################
-#                                       # 	 
-#     MPI Master/Slave compilation      # 	 
-#                                       # 	 
-######################################### 	 
+#                                       #
+#     MPI Master/Slave compilation      #
+#                                       #
+#########################################
 
-if STARPU_USE_MPI_MASTER_SLAVE 	 
-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mpi/driver_mpi_common.c 	 
-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mpi/driver_mpi_source.c 	 
-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mpi/driver_mpi_sink.c 	 
-endif 	 
+if STARPU_USE_MPI_MASTER_SLAVE
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mpi/driver_mpi_common.c
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mpi/driver_mpi_source.c
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mpi/driver_mpi_sink.c
+endif
 
 
 #########################################
 
+# If some external references appear (U), it means the corresponding .c file has
+# only included <starpu.h> and not the internal src/ header which contains the
+# static inline definition
+dist-hook:
+	failed=0 ; \
+	for i in $$( grep "static inline" $$(find $(srcdir) -name \*.h) | sed -e 's/.*static inline //g' | grep -v ENAME | sed -e 's/[^(]* \(\|\*\)\([^ (]*\)(.*/\2/' | grep -v _starpu_spin_init) ; do \
+		for j in .libs/*.o ; do \
+			nm $$j | grep "U $$i" && { echo $$j ; failed=1 ; } ; \
+		done ; \
+	done ; \
+	[ $$failed == 0 ]
+
 showcheck:
 	-cat /dev/null

+ 1 - 2
src/common/rbtree.h

@@ -250,8 +250,7 @@ MACRO_END
  * must not compare equal to an existing node in the tree (i.e. the slot
  * must denote a null node).
  */
-static inline void
-starpu_rbtree_insert_slot(struct starpu_rbtree *tree, unsigned long slot,
+static inline void starpu_rbtree_insert_slot(struct starpu_rbtree *tree, unsigned long slot,
                    struct starpu_rbtree_node *node)
 {
     struct starpu_rbtree_node *parent;

+ 3 - 4
src/common/thread.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012-2016  Université de Bordeaux
+ * Copyright (C) 2010, 2012-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -68,9 +68,8 @@ int starpu_pthread_create(starpu_pthread_t *thread, const starpu_pthread_attr_t
 
 int starpu_pthread_join(starpu_pthread_t thread STARPU_ATTRIBUTE_UNUSED, void **retval STARPU_ATTRIBUTE_UNUSED)
 {
-#if 0 //def HAVE_MSG_PROCESS_JOIN
-	/* https://gforge.inria.fr/tracker/index.php?func=detail&aid=13601&group_id=12&atid=165 */
-	MSG_process_join(thread, 100);
+#if SIMGRID_VERSION_MAJOR > 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR >= 14)
+	MSG_process_join(thread, 1000000);
 #else
 	MSG_process_sleep(1);
 #endif

+ 2 - 2
src/core/debug.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013  Université de Bordeaux
+ * Copyright (C) 2009-2013, 2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2013, 2015, 2016  CNRS
  * Copyright (C) 2016  Inria
  *
@@ -46,7 +46,7 @@ void _starpu_open_debug_logfile(void)
 	}
 
 	logfile = fopen(logfile_name, "w+");
-	STARPU_ASSERT(logfile);
+	STARPU_ASSERT_MSG(logfile, "Could not open file %s for verbose logs (%s). You can specify another file destination with the STARPU_LOGFILENAME environment variable", logfile_name, strerror(errno));
 #endif
 }
 

+ 3 - 3
src/core/perfmodel/perfmodel.c

@@ -3,7 +3,7 @@
  * Copyright (C) 2009-2016  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
  * Copyright (C) 2011  Télécom-SudParis
- * Copyright (C) 2016  Inria
+ * Copyright (C) 2016, 2017  Inria
  * Copyright (C) 2016  Uppsala University
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -123,7 +123,7 @@ double starpu_worker_get_relative_speedup(struct starpu_perfmodel_arch* perf_arc
 			coef = _STARPU_MIC_ALPHA;
 		else if (perf_arch->devices[dev].type == STARPU_SCC_WORKER)
 			coef = _STARPU_SCC_ALPHA;
-		else if (perf_arch->devices[dev].type == STARPU_MPI_WORKER)
+		else if (perf_arch->devices[dev].type == STARPU_MPI_MS_WORKER)
 			coef = _STARPU_MPI_MS_ALPHA;
 
 		speedup += coef * (perf_arch->devices[dev].ncores);
@@ -265,7 +265,7 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 			case STARPU_SCC_WORKER:
 				node_kind = STARPU_SCC_RAM;
 				break;
-			case STARPU_MPI_WORKER:
+			case STARPU_MPI_MS_WORKER:
 				node_kind = STARPU_MPI_MS_RAM;
 				break;
 			default:

+ 17 - 16
src/core/perfmodel/perfmodel_bus.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2016  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  * Copyright (C) 2013 Corentin Salingue
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -747,7 +747,7 @@ static void benchmark_all_gpu_devices(void)
 #endif /* STARPU_USE_MIC */
 
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
-    
+
         _starpu_mpi_common_measure_bandwidth_latency(mpi_time_device_to_device, mpi_latency_device_to_device);
 
 #endif /* STARPU_USE_MPI_MASTER_SLAVE */
@@ -1206,7 +1206,7 @@ static void write_bus_latency_file_content(void)
 #endif
                                 /* TODO Latency MIC */
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
-                                /* Modify MPI src and MPI dst if they contain the master node or not 
+                                /* Modify MPI src and MPI dst if they contain the master node or not
                                  * Because, we only take care about slaves */
                                 int mpi_master = _starpu_mpi_common_get_src_node();
 
@@ -1225,7 +1225,7 @@ static void write_bus_latency_file_content(void)
                                         }
                                         else
                                         {
-                                                /* Only src represents an MPI device 
+                                                /* Only src represents an MPI device
                                                  * So we add latency between src and master */
                                                 latency += mpi_latency_device_to_device[mpi_src][mpi_master];
                                         }
@@ -1234,7 +1234,7 @@ static void write_bus_latency_file_content(void)
                                 {
                                         if (dst > ncuda + nopencl + nmic && dst <= ncuda + nopencl + nmic + nmpi_ms)
                                         {
-                                                /* Only dst identifies an MPI device 
+                                                /* Only dst identifies an MPI device
                                                  * So we add latency between master and dst */
                                                 latency += mpi_latency_device_to_device[mpi_master][mpi_dst];
                                         }
@@ -1476,7 +1476,7 @@ static void write_bus_bandwidth_file_content(void)
 					slowness += mic_time_host_to_device[dst - (ncuda + nopencl)];
 #endif
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
-                                /* Modify MPI src and MPI dst if they contain the master node or not 
+                                /* Modify MPI src and MPI dst if they contain the master node or not
                                  * Because, we only take care about slaves */
                                 int mpi_master = _starpu_mpi_common_get_src_node();
 
@@ -1496,7 +1496,7 @@ static void write_bus_bandwidth_file_content(void)
                                         }
                                         else
                                         {
-                                                /* Only src represents an MPI device 
+                                                /* Only src represents an MPI device
                                                  * So we add bandwidth between src and master */
                                                 slowness += 1.0/mpi_time_device_to_device[mpi_src][mpi_master];
                                         }
@@ -1505,7 +1505,7 @@ static void write_bus_bandwidth_file_content(void)
                                 {
                                         if (dst > ncuda + nopencl + nmic && dst <= ncuda + nopencl + nmic + nmpi_ms)
                                         {
-                                                /* Only dst identifies an MPI device 
+                                                /* Only dst identifies an MPI device
                                                  * So we add bandwidth between master and dst */
                                                 slowness += 1.0/mpi_time_device_to_device[mpi_master][mpi_dst];
                                         }
@@ -1573,7 +1573,7 @@ void starpu_bus_print_bandwidth(FILE *f)
 	for (dst = 0; dst < nmic; dst++)
 		fprintf(f, "MIC%u\t", dst);
 	for (dst = 0; dst < nmpi_ms; dst++)
-		fprintf(f, "MPI_MS%d\t", dst);
+		fprintf(f, "MPI_MS%u\t", dst);
 	fprintf(f, "\n");
 
 	for (src = 0; src <= maxnode; src++)
@@ -1587,7 +1587,7 @@ void starpu_bus_print_bandwidth(FILE *f)
 		else if (src <= ncuda + nopencl + nmic)
 			fprintf(f, "MIC%u\t", src-ncuda-nopencl-1);
                 else
-			fprintf(f, "MPI_MS%d\t", src-ncuda-nopencl-nmic-1);
+			fprintf(f, "MPI_MS%u\t", src-ncuda-nopencl-nmic-1);
 		for (dst = 0; dst <= maxnode; dst++)
 			fprintf(f, "%.0f\t", bandwidth_matrix[src][dst]);
 
@@ -1606,7 +1606,7 @@ void starpu_bus_print_bandwidth(FILE *f)
 		else if (src <= ncuda + nopencl + nmic)
 			fprintf(f, "MIC%u\t", src-ncuda-nopencl-1);
                 else
-			fprintf(f, "MPI_MS%d\t", src-ncuda-nopencl-nmic-1);
+			fprintf(f, "MPI_MS%u\t", src-ncuda-nopencl-nmic-1);
 		for (dst = 0; dst <= maxnode; dst++)
 			fprintf(f, "%.0f\t", latency_matrix[src][dst]);
 
@@ -1662,7 +1662,7 @@ static void generate_bus_bandwidth_file(void)
 {
 	if (!was_benchmarked)
 		benchmark_all_gpu_devices();
-    
+
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
         /* Slaves don't write files */
         if (!_starpu_mpi_common_is_src_node())
@@ -1704,10 +1704,11 @@ static int mpi_check_recalibrate(int my_recalibrate)
 {
         int nb_mpi = _starpu_mpi_src_get_device_count() + 1;
         int mpi_recalibrate[nb_mpi];
+	int i;
 
         MPI_Allgather(&my_recalibrate, 1, MPI_INT, mpi_recalibrate, 1, MPI_INT, MPI_COMM_WORLD);
 
-        for (int i = 0; i < nb_mpi; i++)
+        for (i = 0; i < nb_mpi; i++)
         {
                 if (mpi_recalibrate[i])
                 {
@@ -1858,7 +1859,7 @@ static void write_bus_config_file_content(void)
         fprintf(f, "%u # Number of CUDA devices\n", ncuda);
         fprintf(f, "%u # Number of OpenCL devices\n", nopencl);
         fprintf(f, "%u # Number of MIC devices\n", nmic);
-        fprintf(f, "%d # Number of MPI devices\n", nmpi_ms);
+        fprintf(f, "%u # Number of MPI devices\n", nmpi_ms);
 
 	if (locked)
 		_starpu_fwrunlock(f);
@@ -1869,7 +1870,7 @@ static void generate_bus_config_file(void)
 {
 	if (!was_benchmarked)
 		benchmark_all_gpu_devices();
-    
+
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
         /* Slaves don't write files */
         if (!_starpu_mpi_common_is_src_node())
@@ -2760,7 +2761,7 @@ void _starpu_save_bandwidth_and_latency_disk(double bandwidth_write, double band
 					slowness_main_ram_between_node = 1/bandwidth_matrix[STARPU_MAIN_RAM][j];
 				else
 					slowness_main_ram_between_node = 0;
-				
+
 				bandwidth_matrix[i][j] = 1/(slowness_disk_between_main_ram+slowness_main_ram_between_node);
 			}
 			else if (j == node) /* destination == disk */

+ 4 - 4
src/core/perfmodel/perfmodel_history.c

@@ -3,7 +3,7 @@
  * Copyright (C) 2009-2016  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  * Copyright (C) 2011  Télécom-SudParis
- * Copyright (C) 2016  Inria
+ * Copyright (C) 2016, 2017  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -569,7 +569,7 @@ static enum starpu_worker_archtype _get_enum_type(int type)
         	case 4:
 			return STARPU_SCC_WORKER;
         	case 5:
-			return STARPU_MPI_WORKER;
+			return STARPU_MPI_MS_WORKER;
 		default:
 			STARPU_ABORT();
 	}
@@ -923,7 +923,7 @@ void _starpu_initialize_registered_performance_models(void)
 	ignore_devid[STARPU_CUDA_WORKER] = starpu_get_env_number_default("STARPU_PERF_MODEL_HOMOGENEOUS_CUDA", 0);
 	ignore_devid[STARPU_OPENCL_WORKER] = starpu_get_env_number_default("STARPU_PERF_MODEL_HOMOGENEOUS_OPENCL", 0);
 	ignore_devid[STARPU_MIC_WORKER] = starpu_get_env_number_default("STARPU_PERF_MODEL_HOMOGENEOUS_MIC", 0);
-	ignore_devid[STARPU_MPI_WORKER] = starpu_get_env_number_default("STARPU_PERF_MODEL_HOMOGENEOUS_MPI_MS", 0);
+	ignore_devid[STARPU_MPI_MS_WORKER] = starpu_get_env_number_default("STARPU_PERF_MODEL_HOMOGENEOUS_MPI_MS", 0);
 	ignore_devid[STARPU_SCC_WORKER] = starpu_get_env_number_default("STARPU_PERF_MODEL_HOMOGENEOUS_SCC", 0);
 }
 
@@ -1206,7 +1206,7 @@ char* starpu_perfmodel_get_archtype_name(enum starpu_worker_archtype archtype)
 		case(STARPU_SCC_WORKER):
 			return "scc";
 			break;
-		case(STARPU_MPI_WORKER):
+		case(STARPU_MPI_MS_WORKER):
 			return "mpi_ms";
 			break;
 		default:

+ 44 - 30
src/core/sched_ctx.c

@@ -39,7 +39,8 @@ static int occupied_sms = 0;
 static unsigned _starpu_get_first_free_sched_ctx(struct _starpu_machine_config *config);
 
 static void _starpu_sched_ctx_put_new_master(unsigned sched_ctx_id);
-static void _starpu_sched_ctx_wake_up_workers(unsigned sched_ctx_id);
+static void _starpu_sched_ctx_put_workers_to_sleep(unsigned sched_ctx_id, unsigned all);
+static void _starpu_sched_ctx_wake_up_workers(unsigned sched_ctx_id, unsigned all);
 static void _starpu_sched_ctx_update_parallel_workers_with(unsigned sched_ctx_id);
 static void _starpu_sched_ctx_update_parallel_workers_without(unsigned sched_ctx_id);
 
@@ -1123,7 +1124,7 @@ void starpu_sched_ctx_delete(unsigned sched_ctx_id)
 	if(!_starpu_wait_for_all_tasks_of_sched_ctx(sched_ctx_id))
 	{
 		if(!sched_ctx->sched_policy)
-			_starpu_sched_ctx_wake_up_workers(sched_ctx_id);
+			_starpu_sched_ctx_wake_up_workers(sched_ctx_id, 0);
 		/*if btw the mutex release & the mutex lock the context has changed take care to free all
 		  scheduling data before deleting the context */
 		_starpu_update_workers_without_ctx(workerids, nworkers_ctx, sched_ctx_id, 1);
@@ -2359,18 +2360,23 @@ void _starpu_sched_ctx_signal_worker_woke_up(unsigned sched_ctx_id, int workerid
 	return;
 }
 
-static void _starpu_sched_ctx_put_workers_to_sleep(unsigned sched_ctx_id)
+static void _starpu_sched_ctx_put_workers_to_sleep(unsigned sched_ctx_id, unsigned all)
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	int current_worker_id = starpu_worker_get_id();
-	int master = sched_ctx->main_master;
+	int master, temp_master = 0;
 	struct starpu_worker_collection *workers = sched_ctx->workers;
 	struct starpu_sched_ctx_iterator it;
 	unsigned sleeping[workers->nworkers];
 	int workers_count = 0;
 
-	if (master == -1)
-		return;
+	/* temporarily put a master if needed */
+	if (sched_ctx->main_master == -1)
+	{
+		_starpu_sched_ctx_put_new_master(sched_ctx_id);
+		temp_master = 1;
+	}
+	master = sched_ctx->main_master;
 
     workers->init_iterator(workers, &it);
     while(workers->has_next(workers, &it))
@@ -2379,7 +2385,7 @@ static void _starpu_sched_ctx_put_workers_to_sleep(unsigned sched_ctx_id)
 			sleeping[workers_count] = _worker_sleeping_in_other_ctx(sched_ctx_id, workerid);
 
 			if(starpu_worker_get_type(workerid) == STARPU_CPU_WORKER
-				 && !sched_ctx->parallel_sect[workerid] && workerid != master)
+				 && !sched_ctx->parallel_sect[workerid] && (workerid != master || all))
        {
             if (current_worker_id == -1 || workerid != current_worker_id)
             {
@@ -2397,7 +2403,7 @@ static void _starpu_sched_ctx_put_workers_to_sleep(unsigned sched_ctx_id)
     {
             int workerid = workers->get_next(workers, &it);
             if(starpu_worker_get_type(workerid) == STARPU_CPU_WORKER
-							 && workerid != master
+							 && (workerid != master || all)
                && (current_worker_id == -1 || workerid != current_worker_id)
                && !sleeping[workers_count])
             {
@@ -2406,26 +2412,34 @@ static void _starpu_sched_ctx_put_workers_to_sleep(unsigned sched_ctx_id)
 						workers_count++;
     }
 
+		if (temp_master)
+			sched_ctx->main_master = -1;
+
     return;
 }
 
-static void _starpu_sched_ctx_wake_up_workers(unsigned sched_ctx_id)
+static void _starpu_sched_ctx_wake_up_workers(unsigned sched_ctx_id, unsigned all)
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	int current_worker_id = starpu_worker_get_id();
-	int master = sched_ctx->main_master;
+	int master, temp_master = 0;
 	struct starpu_worker_collection *workers = sched_ctx->workers;
 	struct starpu_sched_ctx_iterator it;
 
-	if (master == -1)
-		return;
+	/* temporarily put a master if needed */
+	if (sched_ctx->main_master == -1)
+	{
+		_starpu_sched_ctx_put_new_master(sched_ctx_id);
+		temp_master = 1;
+	}
+	master = sched_ctx->main_master;
 
 	workers->init_iterator(workers, &it);
 	while(workers->has_next(workers, &it))
 	{
 		int workerid = workers->get_next(workers, &it);
 		if(starpu_worker_get_type(workerid) == STARPU_CPU_WORKER
-			 && sched_ctx->parallel_sect[workerid] && workerid != master)
+			 && sched_ctx->parallel_sect[workerid] && (workerid != master || all))
 		{
 			if((current_worker_id == -1 || workerid != current_worker_id) && sched_ctx->sleeping[workerid])
 			{
@@ -2439,19 +2453,22 @@ static void _starpu_sched_ctx_wake_up_workers(unsigned sched_ctx_id)
 		}
 	}
 
+	if (temp_master)
+		sched_ctx->main_master = -1;
+
 	return;
 }
 
 void* starpu_sched_ctx_exec_parallel_code(void* (*func)(void*), void* param, unsigned sched_ctx_id)
 {
-    _starpu_sched_ctx_put_workers_to_sleep(sched_ctx_id);
+	_starpu_sched_ctx_put_workers_to_sleep(sched_ctx_id, 1);
 
-    /* execute parallel code */
-    void* ret = func(param);
+	/* execute parallel code */
+	void* ret = func(param);
 
-    /* wake up starpu workers */
-    _starpu_sched_ctx_wake_up_workers(sched_ctx_id);
-    return ret;
+	/* wake up starpu workers */
+	_starpu_sched_ctx_wake_up_workers(sched_ctx_id, 1);
+	return ret;
 }
 
 static void _starpu_sched_ctx_update_parallel_workers_with(unsigned sched_ctx_id)
@@ -2466,7 +2483,7 @@ static void _starpu_sched_ctx_update_parallel_workers_with(unsigned sched_ctx_id
 
 	if(!sched_ctx->awake_workers)
 	{
-		_starpu_sched_ctx_put_workers_to_sleep(sched_ctx_id);
+		_starpu_sched_ctx_put_workers_to_sleep(sched_ctx_id, 0);
 	}
 }
 
@@ -2482,7 +2499,7 @@ static void _starpu_sched_ctx_update_parallel_workers_without(unsigned sched_ctx
 
 	if(!sched_ctx->awake_workers)
 	{
-		_starpu_sched_ctx_wake_up_workers(sched_ctx_id);
+		_starpu_sched_ctx_wake_up_workers(sched_ctx_id, 0);
 	}
 }
 
@@ -2516,18 +2533,15 @@ static void _starpu_sched_ctx_put_new_master(unsigned sched_ctx_id)
 	int *workerids;
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	unsigned nworkers = starpu_sched_ctx_get_workers_list_raw(sched_ctx_id, &workerids);
-	int i;
+	unsigned i;
 
 	for (i=0; i<nworkers; i++)
 	{
-			 if (starpu_worker_get_type(workerids[i]) == STARPU_CPU_WORKER)
-			 {
-				 sched_ctx->main_master = workerids[i];
-				 break;
-			 }
-	     else {
-				 sched_ctx->main_master = -1;
-			 }
+		if (starpu_worker_get_type(workerids[i]) == STARPU_CPU_WORKER)
+		{
+			sched_ctx->main_master = workerids[i];
+			break;
+		}
 	}
 }
 

+ 10 - 3
src/core/sched_policy.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2016  Université de Bordeaux
+ * Copyright (C) 2010-2017  Université de Bordeaux
  * Copyright (C) 2010-2017  CNRS
  * Copyright (C) 2011, 2016  INRIA
  * Copyright (C) 2016  Uppsala University
@@ -59,6 +59,7 @@ static struct starpu_sched_policy *predefined_policies[] =
 	&_starpu_sched_modular_random_prio_prefetching_policy,
 	&_starpu_sched_modular_ws_policy,
 	&_starpu_sched_modular_heft_policy,
+	&_starpu_sched_modular_heft_prio_policy,
 	&_starpu_sched_modular_heft2_policy,
 	&_starpu_sched_eager_policy,
 	&_starpu_sched_prio_policy,
@@ -571,8 +572,14 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 				while(workers->has_next(workers, &it))
 				{
 					unsigned workerid = workers->get_next(workers, &it);
-					struct starpu_task *alias = starpu_task_dup(task);
-					alias->destroy = 1;
+					struct starpu_task *alias;
+					if (job->task_size > 1)
+					{
+						alias = starpu_task_dup(task);
+						alias->destroy = 1;
+					}
+					else
+						alias = task;
 					ret |= _starpu_push_task_on_specific_worker(alias, workerid);
 				}
 			}

+ 2 - 1
src/core/sched_policy.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012-2013, 2015-2016  Université de Bordeaux
+ * Copyright (C) 2010, 2012-2013, 2015-2017  Université de Bordeaux
  * Copyright (C) 2011  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -89,6 +89,7 @@ extern struct starpu_sched_policy _starpu_sched_modular_random_prefetching_polic
 extern struct starpu_sched_policy _starpu_sched_modular_random_prio_prefetching_policy;
 extern struct starpu_sched_policy _starpu_sched_modular_ws_policy;
 extern struct starpu_sched_policy _starpu_sched_modular_heft_policy;
+extern struct starpu_sched_policy _starpu_sched_modular_heft_prio_policy;
 extern struct starpu_sched_policy _starpu_sched_modular_heft2_policy;
 extern struct starpu_sched_policy _starpu_sched_graph_test_policy;
 

+ 41 - 26
src/core/simgrid.c

@@ -299,7 +299,7 @@ void _starpu_simgrid_init(int *argc STARPU_ATTRIBUTE_UNUSED, char ***argv STARPU
 	}
 	if (_starpu_simgrid_running_smpi())
 	{
-#ifdef __PIC__
+#ifndef STARPU_STATIC_ONLY
 		_STARPU_ERROR("Simgrid currently does not support privatization for dynamically-linked libraries in SMPI. Please reconfigure and build StarPU with --disable-shared");
 #endif
 		MSG_process_set_data(MSG_process_self(), calloc(MAX_TSD, sizeof(void*)));
@@ -819,41 +819,56 @@ void _starpu_simgrid_count_ngpus(void)
 }
 
 typedef struct{
-  void_f_pvoid_t code;
-  void *userparam;
-  void *father_data;
+	void_f_pvoid_t code;
+	void *userparam;
+	void *father_data;
 } thread_data_t;
 
-static int _starpu_simgrid_xbt_thread_create_wrapper(int argc, char *argv[])
+static int _starpu_simgrid_xbt_thread_create_wrapper(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[] STARPU_ATTRIBUTE_UNUSED)
 {
-  /* FIXME: Ugly work-around for bug in simgrid: the MPI context is not properly set at MSG process startup */
-  MSG_process_sleep(0.000001);
+	/* FIXME: Ugly work-around for bug in simgrid: the MPI context is not properly set at MSG process startup */
+	MSG_process_sleep(0.000001);
 
 #ifdef HAVE_SMX_ACTOR_T
-  smx_actor_t
+	smx_actor_t
 #else
-  smx_process_t
+	smx_process_t
 #endif
-	  self = SIMIX_process_self();
-  thread_data_t *t = SIMIX_process_self_get_data(self);
-  simcall_process_set_data(self, t->father_data);
-  t->code(t->userparam);
-  simcall_process_set_data(self, NULL);
-  free(t);
-  
-  return 0;
+	self = SIMIX_process_self();
+	thread_data_t *t = SIMIX_process_self_get_data(self);
+	simcall_process_set_data(self, t->father_data);
+	t->code(t->userparam);
+	simcall_process_set_data(self, NULL);
+	free(t);
+
+	return 0;
 }
 
 void _starpu_simgrid_xbt_thread_create(const char *name, void_f_pvoid_t code, void *param)
 {
-  thread_data_t *res = malloc(sizeof(thread_data_t));
-  res->userparam = param;
-  res->code = code;
-  res->father_data = SIMIX_process_self_get_data(SIMIX_process_self());
-
-  simcall_process_create(name,
-                           _starpu_simgrid_xbt_thread_create_wrapper, res,
-                           SIMIX_host_self_get_name(), -1.0, 0, NULL,
-                           /*props */ NULL,0);
+#ifdef HAVE_SMX_ACTOR_T
+	smx_actor_t process STARPU_ATTRIBUTE_UNUSED;
+#else
+	smx_process_t process;
+#endif
+	thread_data_t *res = malloc(sizeof(thread_data_t));
+	res->userparam = param;
+	res->code = code;
+	res->father_data = SIMIX_process_self_get_data(SIMIX_process_self());
+
+#if SIMGRID_VERSION_MAJOR < 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR < 12)
+	simcall_process_create(&process,
+#else
+	process = simcall_process_create(
+#endif
+	                         name,
+	                         _starpu_simgrid_xbt_thread_create_wrapper, res,
+#if SIMGRID_VERSION_MAJOR < 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR < 14)
+	                         SIMIX_host_self_get_name(),
+#else
+	                         SIMIX_host_self(),
+#endif
+				 -1.0, 0, NULL,
+	                         /*props */ NULL,0);
 }
 #endif

+ 4 - 1
src/core/task.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011, 2014, 2016  INRIA
@@ -110,6 +110,7 @@ void starpu_task_init(struct starpu_task *task)
 void starpu_task_clean(struct starpu_task *task)
 {
 	STARPU_ASSERT(task);
+	task->magic = 0;
 
 	/* If a buffer was allocated to store the profiling info, we free it. */
 	if (task->profiling_info)
@@ -570,6 +571,8 @@ static int _starpu_task_submit_head(struct starpu_task *task)
 		for (i = 0; i < nbuffers; i++)
 		{
 			starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, i);
+			/* Make sure handles are valid */
+			STARPU_ASSERT_MSG(handle->magic == 42, "data %p is invalid (was it already unregistered?)", handle);
 			/* Make sure handles are not partitioned */
 			STARPU_ASSERT_MSG(handle->nchildren == 0, "only unpartitioned data (or the pieces of a partitioned data) can be used in a task");
 			/* Provide the home interface for now if any,

+ 117 - 129
src/core/topology.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 CNRS
- * Copyright (C) 2011, 2016  INRIA
+ * Copyright (C) 2011, 2016, 2017  INRIA
  * Copyright (C) 2016  Uppsala University
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -415,7 +415,7 @@ static inline int _starpu_get_next_mpi_deviceid(struct _starpu_machine_config *c
 {
 	unsigned i = ((config->current_mpi_deviceid++) % config->topology.nmpidevices);
 
-	return (int)config->topology.workers_mpi_deviceid[i];
+	return (int)config->topology.workers_mpi_ms_deviceid[i];
 }
 
 static void
@@ -623,7 +623,7 @@ _starpu_init_topology (struct _starpu_machine_config *config)
 #ifdef STARPU_USE_SCC
 	config->topology.nhwscc = _starpu_scc_src_get_device_count();
 #endif
-#ifdef STARPU_USE_MPI_MASTER_SLAVE 
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
         config->topology.nhwmpi = _starpu_mpi_src_get_device_count();
 #endif
 
@@ -925,7 +925,7 @@ _starpu_init_mic_config (struct _starpu_machine_config *config,
 	}
 
 	topology->nworkers += topology->nmiccores[mic_idx];
-}  
+}
 
 static COIENGINE mic_handles[STARPU_MAXMICDEVS];
 COIPROCESS _starpu_mic_process[STARPU_MAXMICDEVS];
@@ -975,10 +975,10 @@ _starpu_init_mpi_config (struct _starpu_machine_config *config,
         {
                 int worker_idx = topology->nworkers + mpicore_id;
                 config->workers[worker_idx].set = &mpi_worker_set[mpi_idx];
-                config->workers[worker_idx].arch = STARPU_MPI_WORKER;
+                config->workers[worker_idx].arch = STARPU_MPI_MS_WORKER;
                 _STARPU_MALLOC(config->workers[worker_idx].perf_arch.devices, sizeof(struct starpu_perfmodel_device));
                 config->workers[worker_idx].perf_arch.ndevices = 1;
-                config->workers[worker_idx].perf_arch.devices[0].type = STARPU_MPI_WORKER;
+                config->workers[worker_idx].perf_arch.devices[0].type = STARPU_MPI_MS_WORKER;
                 config->workers[worker_idx].perf_arch.devices[0].devid = mpi_idx;
                 config->workers[worker_idx].perf_arch.devices[0].ncores = 1;
                 config->workers[worker_idx].devid = mpi_idx;
@@ -988,9 +988,10 @@ _starpu_init_mpi_config (struct _starpu_machine_config *config,
         }
 
         topology->nworkers += topology->nmpicores[mpi_idx];
-}  
+}
 #endif
 
+#if defined(STARPU_USE_MIC) || defined(STARPU_USE_MPI_MASTER_SLAVE)
 static void
 _starpu_init_mp_config (struct _starpu_machine_config *config,
 			struct starpu_conf *user_conf, int no_mp_config)
@@ -1002,92 +1003,92 @@ _starpu_init_mp_config (struct _starpu_machine_config *config,
 	 * - configure the workers accordingly.
 	 */
 
-	struct _starpu_machine_topology *topology = &config->topology;
-
 #ifdef STARPU_USE_MIC
-    if (!no_mp_config)
-    {
-        /* Discover and initialize the number of MIC nodes through the mp
-         * infrastructure. */
-        unsigned nhwmicdevices = _starpu_mic_src_get_device_count();
-
-        int reqmicdevices = starpu_get_env_number("STARPU_NMIC");
-        if (reqmicdevices == -1 && user_conf)
-            reqmicdevices = user_conf->nmic;
-        if (reqmicdevices == -1)
-            /* Nothing was specified, so let's use the number of
-             * detected mic devices. ! */
-            reqmicdevices = nhwmicdevices;
-
-	if (reqmicdevices != -1)
+	if (!no_mp_config)
 	{
-		if ((unsigned) reqmicdevices > nhwmicdevices)
-		{
-			/* The user requires more MIC devices than there is available */
-			_STARPU_MSG("# Warning: %d MIC devices requested. Only %d available.\n", reqmicdevices, nhwmicdevices);
+		struct _starpu_machine_topology *topology = &config->topology;
+
+		/* Discover and initialize the number of MIC nodes through the mp
+		 * infrastructure. */
+		unsigned nhwmicdevices = _starpu_mic_src_get_device_count();
+
+		int reqmicdevices = starpu_get_env_number("STARPU_NMIC");
+		if (reqmicdevices == -1 && user_conf)
+			reqmicdevices = user_conf->nmic;
+		if (reqmicdevices == -1)
+			/* Nothing was specified, so let's use the number of
+			 * detected mic devices. ! */
 			reqmicdevices = nhwmicdevices;
-		}
-	}
 
-        topology->nmicdevices = 0;
-        unsigned i;
-        for (i = 0; i < (unsigned) reqmicdevices; i++)
-                if (0 == _starpu_init_mic_node (config, i, &mic_handles[i], &_starpu_mic_process[i]))
-                        topology->nmicdevices++;
+		if (reqmicdevices != -1)
+		{
+			if ((unsigned) reqmicdevices > nhwmicdevices)
+			{
+				/* The user requires more MIC devices than there is available */
+				_STARPU_MSG("# Warning: %d MIC devices requested. Only %d available.\n", reqmicdevices, nhwmicdevices);
+				reqmicdevices = nhwmicdevices;
+			}
+		}
 
+		topology->nmicdevices = 0;
+		unsigned i;
+		for (i = 0; i < (unsigned) reqmicdevices; i++)
+			if (0 == _starpu_init_mic_node (config, i, &mic_handles[i], &_starpu_mic_process[i]))
+				topology->nmicdevices++;
 
-        for (i = 0; i < topology->nmicdevices; i++)
-                _starpu_init_mic_config (config, user_conf, i);
-    }
+		for (i = 0; i < topology->nmicdevices; i++)
+			_starpu_init_mic_config (config, user_conf, i);
+	}
 #endif
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
-    {
-            /* Discover and initialize the number of MPI nodes through the mp
-             * infrastructure. */
-            unsigned nhwmpidevices = _starpu_mpi_src_get_device_count();
-
-            int reqmpidevices = starpu_get_env_number("STARPU_NMPI_MS");
-            if (reqmpidevices == -1 && user_conf)
-                    reqmpidevices = user_conf->nmpi_ms;
-            if (reqmpidevices == -1)
-                    /* Nothing was specified, so let's use the number of
-                     * detected mpi devices. ! */
-                    reqmpidevices = nhwmpidevices;
-
-            if (reqmpidevices != -1)
-            {
-                    if ((unsigned) reqmpidevices > nhwmpidevices)
-                    {
-                            /* The user requires more MPI devices than there is available */
-                            fprintf(stderr,
-                                            "# Warning: %d MPI Master-Slave devices requested. Only %d available.\n",
-                                            reqmpidevices, nhwmpidevices);
-                            reqmpidevices = nhwmpidevices;
-                    }
-            }
-
-            topology->nmpidevices = reqmpidevices;
-
-            /* if user don't want to use MPI slaves, we close the slave processes */
-            if (no_mp_config && topology->nmpidevices == 0)
-            {
-                    _starpu_mpi_common_mp_deinit();
-                    exit(0);
-            }
-
-            if (!no_mp_config)
-            {
-                    unsigned i;
-                    for (i = 0; i < topology->nmpidevices; i++)
-                            mpi_ms_nodes[i] = _starpu_mp_common_node_create(STARPU_NODE_MPI_SOURCE, i);
-
-
-                    for (i = 0; i < topology->nmpidevices; i++)
-                            _starpu_init_mpi_config (config, user_conf, i);
-            }
-    }
+	{
+		struct _starpu_machine_topology *topology = &config->topology;
+
+		/* Discover and initialize the number of MPI nodes through the mp
+		 * infrastructure. */
+		unsigned nhwmpidevices = _starpu_mpi_src_get_device_count();
+
+		int reqmpidevices = starpu_get_env_number("STARPU_NMPI_MS");
+		if (reqmpidevices == -1 && user_conf)
+			reqmpidevices = user_conf->nmpi_ms;
+		if (reqmpidevices == -1)
+			/* Nothing was specified, so let's use the number of
+			 * detected mpi devices. ! */
+			reqmpidevices = nhwmpidevices;
+
+		if (reqmpidevices != -1)
+		{
+			if ((unsigned) reqmpidevices > nhwmpidevices)
+			{
+				/* The user requires more MPI devices than there is available */
+				_STARPU_MSG("# Warning: %d MPI Master-Slave devices requested. Only %d available.\n",
+					    reqmpidevices, nhwmpidevices);
+				reqmpidevices = nhwmpidevices;
+			}
+		}
+
+		topology->nmpidevices = reqmpidevices;
+
+		/* if user don't want to use MPI slaves, we close the slave processes */
+		if (no_mp_config && topology->nmpidevices == 0)
+		{
+			_starpu_mpi_common_mp_deinit();
+			exit(0);
+		}
+
+		if (!no_mp_config)
+		{
+			unsigned i;
+			for (i = 0; i < topology->nmpidevices; i++)
+				mpi_ms_nodes[i] = _starpu_mp_common_node_create(STARPU_NODE_MPI_SOURCE, i);
+
+			for (i = 0; i < topology->nmpidevices; i++)
+				_starpu_init_mpi_config (config, user_conf, i);
+		}
+	}
 #endif
 }
+#endif
 
 #ifdef STARPU_USE_MIC
 static void
@@ -1104,13 +1105,14 @@ _starpu_deinit_mic_node (unsigned mic_idx)
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
 static void _starpu_deinit_mpi_node(int devid)
 {
-        _starpu_mp_common_send_command(mpi_ms_nodes[devid], STARPU_MP_COMMAND_EXIT, NULL, 0);                          
+        _starpu_mp_common_send_command(mpi_ms_nodes[devid], STARPU_MP_COMMAND_EXIT, NULL, 0);
 
         _starpu_mp_common_node_destroy(mpi_ms_nodes[devid]);
 }
 #endif
 
 
+#if defined(STARPU_USE_MIC) || defined(STARPU_USE_MPI_MASTER_SLAVE)
 static void
 _starpu_deinit_mp_config (struct _starpu_machine_config *config)
 {
@@ -1127,6 +1129,7 @@ _starpu_deinit_mp_config (struct _starpu_machine_config *config)
 		_starpu_deinit_mpi_node (i);
 #endif
 }
+#endif
 
 #ifdef STARPU_HAVE_HWLOC
 static unsigned
@@ -1466,8 +1469,13 @@ _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_con
             mpi_ms_busy_cpus = 1; /* we launch one thread to control all slaves */
 #endif
 #endif /* STARPU_USE_MPI_MASTER_SLAVE */
-
-			unsigned already_busy_cpus = mpi_ms_busy_cpus + mic_busy_cpus + topology->ncudagpus
+	    unsigned cuda_busy_cpus = 0;
+#if defined(STARPU_USE_CUDA)
+	    cuda_busy_cpus = th_per_stream ? (nworker_per_cuda * topology->ncudagpus) : 
+		    topology->ncudagpus;
+#endif
+			unsigned already_busy_cpus = mpi_ms_busy_cpus + mic_busy_cpus 
+				+ cuda_busy_cpus
 				+ topology->nopenclgpus + topology->nsccdevices;
 
 			long avail_cpus = (long) topology->nhwcpus - (long) already_busy_cpus;
@@ -1820,14 +1828,9 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 				}
 				_starpu_memory_node_add_nworkers(memory_node);
 
-                                _starpu_worker_drives_memory_node(workerarg->workerid, STARPU_MAIN_RAM);
-                                _starpu_worker_drives_memory_node(workerarg->workerid, memory_node);
-
-#ifdef STARPU_SIMGRID
-				starpu_pthread_queue_register(&workerarg->wait, &_starpu_simgrid_transfer_queue[memory_node]);
+                                _starpu_worker_drives_memory_node(workerarg, STARPU_MAIN_RAM);
 				if (memory_node != STARPU_MAIN_RAM)
-					starpu_pthread_queue_register(&workerarg->wait, &_starpu_simgrid_transfer_queue[STARPU_MAIN_RAM]);
-#endif
+					_starpu_worker_drives_memory_node(workerarg, memory_node);
 				break;
 			}
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
@@ -1916,12 +1919,9 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 				}
 				_starpu_memory_node_add_nworkers(memory_node);
 
-                                _starpu_worker_drives_memory_node(workerarg->workerid, STARPU_MAIN_RAM);
-                                _starpu_worker_drives_memory_node(workerarg->workerid, memory_node);
-#ifdef STARPU_SIMGRID
-				starpu_pthread_queue_register(&workerarg->set->workers[0].wait, &_starpu_simgrid_transfer_queue[memory_node]);
-				starpu_pthread_queue_register(&workerarg->set->workers[0].wait, &_starpu_simgrid_transfer_queue[STARPU_MAIN_RAM]);
-#endif
+                                _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
+				if (memory_node != STARPU_MAIN_RAM)
+					_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
 				break;
 #endif
 
@@ -1959,12 +1959,9 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 				}
 				_starpu_memory_node_add_nworkers(memory_node);
 
-                                _starpu_worker_drives_memory_node(workerarg->workerid, STARPU_MAIN_RAM);
-                                _starpu_worker_drives_memory_node(workerarg->workerid, memory_node);
-#ifdef STARPU_SIMGRID
-				starpu_pthread_queue_register(&workerarg->wait, &_starpu_simgrid_transfer_queue[memory_node]);
-				starpu_pthread_queue_register(&workerarg->wait, &_starpu_simgrid_transfer_queue[STARPU_MAIN_RAM]);
-#endif
+                                _starpu_worker_drives_memory_node(workerarg, STARPU_MAIN_RAM);
+				if (memory_node != STARPU_MAIN_RAM)
+					_starpu_worker_drives_memory_node(workerarg, memory_node);
 				break;
 #endif
 
@@ -1995,12 +1992,9 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 				workerarg->bindid = mic_bindid[devid];
 				_starpu_memory_node_add_nworkers(memory_node);
 
-                                _starpu_worker_drives_memory_node(workerarg->workerid, STARPU_MAIN_RAM);
-                                _starpu_worker_drives_memory_node(workerarg->workerid, memory_node);
-#ifdef STARPU_SIMGRID
-				starpu_pthread_queue_register(&workerarg->set->workers[0].wait, &_starpu_simgrid_transfer_queue[memory_node]);
-				starpu_pthread_queue_register(&workerarg->set->workers[0].wait, &_starpu_simgrid_transfer_queue[STARPU_MAIN_RAM]);
-#endif
+                                _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
+				if (memory_node != STARPU_MAIN_RAM)
+					_starpu_worker_drives_memory_node(workerarg->workerid, memory_node);
 				break;
 #endif /* STARPU_USE_MIC */
 
@@ -2014,18 +2008,15 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 				memory_node = ram_memory_node;
 				_starpu_memory_node_add_nworkers(memory_node);
 
-                                _starpu_worker_drives_memory_node(workerarg->workerid, STARPU_MAIN_RAM);
-                                _starpu_worker_drives_memory_node(workerarg->workerid, memory_node);
-#ifdef STARPU_SIMGRID
-				starpu_pthread_queue_register(&workerarg->wait, &_starpu_simgrid_transfer_queue[memory_node]);
-				starpu_pthread_queue_register(&workerarg->wait, &_starpu_simgrid_transfer_queue[STARPU_MAIN_RAM]);
-#endif
+                                _starpu_worker_drives_memory_node(workerarg, STARPU_MAIN_RAM);
+				if (memory_node != STARPU_MAIN_RAM)
+					_starpu_worker_drives_memory_node(workerarg, memory_node);
 			}
 				break;
 #endif /* STARPU_USE_SCC */
 
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
-			case STARPU_MPI_WORKER:
+			case STARPU_MPI_MS_WORKER:
 			{
 				if (mpi_init[devid])
 				{
@@ -2040,28 +2031,25 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 					_starpu_register_bus(memory_node, STARPU_MAIN_RAM);
 
 				}
-                                _starpu_worker_drives_memory_node(workerarg->workerid, STARPU_MAIN_RAM);
-                                _starpu_worker_drives_memory_node(workerarg->workerid, memory_node);
+                                _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
+				if (memory_node != STARPU_MAIN_RAM)
+					_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
 #ifndef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
                                 /* MPI driver thread can manage all slave memories if we disable the MPI multiple thread */
                                 unsigned findworker;
                                 for (findworker = 0; findworker < worker; findworker++)
                                 {
                                         struct _starpu_worker *findworkerarg = &config->workers[findworker];
-                                        if (findworkerarg->arch == STARPU_MPI_WORKER)
+                                        if (findworkerarg->arch == STARPU_MPI_MS_WORKER)
                                         {
-                                                _starpu_worker_drives_memory_node(workerarg->workerid, findworkerarg->memory_node);
-                                                _starpu_worker_drives_memory_node(findworkerarg->workerid, memory_node);
+                                                _starpu_worker_drives_memory_node(workerarg, findworkerarg->memory_node);
+                                                _starpu_worker_drives_memory_node(findworkerarg, memory_node);
                                         }
                                 }
 #endif
-                
+
 				workerarg->bindid = mpi_bindid[devid];
 				_starpu_memory_node_add_nworkers(memory_node);
-#ifdef STARPU_SIMGRID
-				starpu_pthread_queue_register(&workerarg->set->workers[0].wait, &_starpu_simgrid_transfer_queue[memory_node]);
-				starpu_pthread_queue_register(&workerarg->set->workers[0].wait, &_starpu_simgrid_transfer_queue[STARPU_MAIN_RAM]);
-#endif
 				break;
 			}
 #endif /* STARPU_USE_MPI_MASTER_SLAVE */
@@ -2192,7 +2180,7 @@ _starpu_build_topology (struct _starpu_machine_config *config, int no_mp_config)
 				else if (config->scc_nodeid != (int) starpu_worker_get_memory_node(i))
 					config->scc_nodeid = -2;
 				break;
-			case STARPU_MPI_WORKER:
+			case STARPU_MPI_MS_WORKER:
 				if (config->mpi_nodeid == -1)
 					config->mpi_nodeid = starpu_worker_get_memory_node(i);
 				else if (config->mpi_nodeid != (int) starpu_worker_get_memory_node(i))

+ 12 - 11
src/core/workers.c

@@ -4,7 +4,7 @@
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  * Copyright (C) 2010, 2011  INRIA
  * Copyright (C) 2011  Télécom-SudParis
- * Copyright (C) 2011-2012, 2016  INRIA
+ * Copyright (C) 2011-2012, 2016, 2017  INRIA
  * Copyright (C) 2016  Uppsala University
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -143,7 +143,7 @@ static uint32_t _starpu_worker_exists_and_can_execute(struct starpu_task *task,
 				if (task->cl->cpu_funcs_name[impl] != NULL || task->cl->mic_funcs[impl] != NULL)
 					test_implementation = 1;
 				break;
-                        case STARPU_MPI_WORKER:
+                        case STARPU_MPI_MS_WORKER:
                                 if (task->cl->cpu_funcs_name[impl] != NULL || task->cl->mpi_ms_funcs[impl] != NULL)
                                         test_implementation = 1;
                                 break;
@@ -212,7 +212,7 @@ uint32_t _starpu_worker_exists(struct starpu_task *task)
 #endif
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
 	if ((task->cl->where & STARPU_MPI_MS) &&
-	    _starpu_worker_exists_and_can_execute(task, STARPU_MPI_WORKER))
+	    _starpu_worker_exists_and_can_execute(task, STARPU_MPI_MS_WORKER))
 		return 1;
 #endif
 #ifdef STARPU_USE_SCC
@@ -290,7 +290,7 @@ static inline int _starpu_can_use_nth_implementation(enum starpu_worker_archtype
 
 		return func != NULL || func_name != NULL;
 	}
-	case STARPU_MPI_WORKER:
+	case STARPU_MPI_MS_WORKER:
 	{
 		starpu_mpi_ms_func_t func = _starpu_task_get_mpi_ms_nth_implementation(cl, nimpl);
 		const char *func_name = _starpu_task_get_cpu_name_nth_implementation(cl, nimpl);
@@ -538,8 +538,9 @@ static void _starpu_worker_init(struct _starpu_worker *workerarg, struct _starpu
 	starpu_pthread_wait_init(&workerarg->wait);
 	starpu_pthread_queue_register(&workerarg->wait, &_starpu_simgrid_task_queue[workerarg->workerid]);
 #endif
-        workerarg->task_sending = NULL;
-        workerarg->nb_buffers_sent = 0;
+	workerarg->task_transferring = NULL;
+	workerarg->nb_buffers_transferred = 0;
+	workerarg->nb_buffers_totransfer = 0;
 
 	workerarg->first_task = 0;
 	workerarg->ntasks = 0;
@@ -829,7 +830,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 				break;
 #endif /* STARPU_USE_SCC */
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
-			case STARPU_MPI_WORKER:
+			case STARPU_MPI_MS_WORKER:
 				/* We spawn only one thread
 				 * per MPI device, which will control all MPI
 				 * workers of this device. (by using a worker set). */
@@ -966,7 +967,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 				break;
 #endif
 			case STARPU_MIC_WORKER:
-                        case STARPU_MPI_WORKER:
+                        case STARPU_MPI_MS_WORKER:
 				/* Already waited above */
 				break;
 			case STARPU_SCC_WORKER:
@@ -1025,7 +1026,7 @@ int starpu_conf_init(struct starpu_conf *conf)
 	conf->use_explicit_workers_opencl_gpuid = 0; /* TODO */
 	conf->use_explicit_workers_mic_deviceid = 0; /* TODO */
 	conf->use_explicit_workers_scc_deviceid = 0; /* TODO */
-	conf->use_explicit_workers_mpi_deviceid = 0; /* TODO */
+	conf->use_explicit_workers_mpi_ms_deviceid = 0; /* TODO */
 
 	conf->single_combined_worker = starpu_get_env_number("STARPU_SINGLE_COMBINED_WORKER");
 	if (conf->single_combined_worker == -1)
@@ -1783,7 +1784,7 @@ int starpu_worker_get_count_by_type(enum starpu_worker_archtype type)
 		case STARPU_SCC_WORKER:
 			return _starpu_config.topology.nsccdevices;
 
-                case STARPU_MPI_WORKER:
+                case STARPU_MPI_MS_WORKER:
                         return _starpu_config.topology.nmpidevices;
 
                 case STARPU_ANY_WORKER:
@@ -2390,7 +2391,7 @@ char *starpu_worker_get_type_as_string(enum starpu_worker_archtype type)
 	if (type == STARPU_CUDA_WORKER) return "STARPU_CUDA_WORKER";
 	if (type == STARPU_OPENCL_WORKER) return "STARPU_OPENCL_WORKER";
 	if (type == STARPU_MIC_WORKER) return "STARPU_MIC_WORKER";
-        if (type == STARPU_MPI_WORKER) return "STARPU_MPI_WORKER";
+        if (type == STARPU_MPI_MS_WORKER) return "STARPU_MPI_MS_WORKER";
 	if (type == STARPU_SCC_WORKER) return "STARPU_SCC_WORKER";
 	if (type == STARPU_ANY_WORKER) return "STARPU_ANY_WORKER";
 	return "STARPU_unknown_WORKER";

+ 4 - 3
src/core/workers.h

@@ -116,8 +116,9 @@ LIST_TYPE(_starpu_worker,
 
 	unsigned spinning_backoff ; /* number of cycles to pause when spinning  */
 
-        unsigned nb_buffers_sent; /* number of piece of data already send to remote side */
-        struct starpu_task *task_sending; /* The buffers of this task are being sent */
+	unsigned nb_buffers_transferred; /* number of piece of data already send to worker */
+	unsigned nb_buffers_totransfer; /* number of piece of data already send to worker */
+	struct starpu_task *task_transferring; /* The buffers of this task are being sent */
 
 	/* indicate whether the workers shares tasks lists with other workers*/
 	/* in this case when removing him from a context it disapears instantly */
@@ -306,7 +307,7 @@ struct _starpu_machine_topology
 	 */
 	unsigned workers_scc_deviceid[STARPU_NMAXWORKERS];
 
-	unsigned workers_mpi_deviceid[STARPU_NMAXWORKERS];
+	unsigned workers_mpi_ms_deviceid[STARPU_NMAXWORKERS];
 };
 
 struct _starpu_machine_config

+ 109 - 9
src/datawizard/coherency.c

@@ -932,28 +932,58 @@ struct _starpu_data_replicate *get_replicate(starpu_data_handle_t handle, enum s
 		return &handle->per_node[node];
 }
 
-/* Synchronously fetch data for a given task (if it's not there already) */
-int _starpu_fetch_task_input(struct _starpu_job *j)
+/* Callback used when a buffer is send asynchronously to the sink */
+static void _starpu_fetch_task_input_cb(void *arg)
 {
-	_STARPU_TRACE_START_FETCH_INPUT(NULL);
+   struct _starpu_worker * worker = (struct _starpu_worker *) arg;
+
+   /* increase the number of buffer received */
+   STARPU_WMB();
+   (void)STARPU_ATOMIC_ADD(&worker->nb_buffers_transferred, 1);
+}
+
+
+/* Synchronously or asynchronously fetch data for a given task (if it's not there already) 
+ * Returns the number of data acquired here.  */
+
+/* The synchronous version of _starpu_fetch_task_input must be called before
+ * executing the task. __starpu_push_task_output but be called after the
+ * execution of the task. */
+/* To improve overlapping, the driver can, before calling the synchronous
+ * version of _starpu_fetch_task_input, call _starpu_fetch_task_input with
+ * async==1, then wait for transfers to complete, then call
+ * _starpu_release_fetch_task_input_async to release them before calling the
+ * synchronous version of _starpu_fetch_task_input. */
+int _starpu_fetch_task_input(struct starpu_task *task, struct _starpu_job *j, int async)
+{
+	struct _starpu_worker *worker = _starpu_get_local_worker_key();
+	int workerid = worker->workerid;
+	if (async)
+	{
+		worker->task_transferring = task;
+		worker->nb_buffers_transferred = 0;
+		if (worker->ntasks <= 1)
+			_STARPU_TRACE_WORKER_START_FETCH_INPUT(NULL, workerid);
+	}
+	else
+		_STARPU_TRACE_START_FETCH_INPUT(NULL);
 
 	int profiling = starpu_profiling_status_get();
-	struct starpu_task *task = j->task;
 	if (profiling && task->profiling_info)
 		_starpu_clock_gettime(&task->profiling_info->acquire_data_start_time);
 
 	struct _starpu_data_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
 	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
+	unsigned nacquires;
 
 	unsigned local_memory_node = _starpu_memory_node_get_local_key();
 
-	int workerid = starpu_worker_get_id_check();
-
 #ifdef STARPU_USE_FXT
 	unsigned long total_size = 0;
 #endif
 
 	unsigned index;
+	nacquires = 0;
 	for (index = 0; index < nbuffers; index++)
 	{
 		int ret;
@@ -977,13 +1007,33 @@ int _starpu_fetch_task_input(struct _starpu_job *j)
 
 		local_replicate = get_replicate(handle, mode, workerid, node);
 
-		ret = fetch_data(handle, node, local_replicate, mode, 0);
-		if (STARPU_UNLIKELY(ret))
-			goto enomem;
+		if (async)
+		{
+			ret = _starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, 0, 1,
+					_starpu_fetch_task_input_cb, worker, 0, "_starpu_src_common_worker_internal_work");
+			if (STARPU_UNLIKELY(ret))
+			{
+				/* Ooops, not enough memory, make worker wait for these for now, and the synchronous call will finish by forcing eviction*/
+				worker->nb_buffers_totransfer = nacquires;
+				return 0;
+			}
+		}
+		else
+		{
+			ret = fetch_data(handle, node, local_replicate, mode, 0);
+			if (STARPU_UNLIKELY(ret))
+				goto enomem;
+		}
 
 #ifdef STARPU_USE_FXT
 		total_size += _starpu_data_get_size(handle);
 #endif
+		nacquires++;
+	}
+	if (async)
+	{
+		worker->nb_buffers_totransfer = nacquires;
+		return 0;
 	}
 
 	_STARPU_TRACE_DATA_LOAD(workerid,total_size);
@@ -1044,6 +1094,56 @@ enomem:
 	return -1;
 }
 
+/* This is to be called after having called _starpu_fetch_task_input with async=1 and getting the cb called as many times as there are buffers.  */
+int _starpu_release_fetch_task_input_async(struct _starpu_job *j, struct _starpu_worker *worker)
+{
+	unsigned workerid = worker->workerid;
+	unsigned nbtransfers = worker->nb_buffers_totransfer;
+	STARPU_RMB();
+	if (worker->ntasks <= 1)
+		_STARPU_TRACE_WORKER_END_FETCH_INPUT(NULL, workerid);
+	struct starpu_task *task = j->task;
+
+	struct _starpu_data_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
+	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
+	unsigned local_memory_node = _starpu_memory_node_get_local_key();
+	unsigned index;
+	unsigned nreleases;
+
+	nreleases = 0;
+	for (index = 0; index < nbuffers; index++)
+	{
+		if (nreleases == nbtransfers)
+			/* That was a partial fetch */
+			break;
+		starpu_data_handle_t handle = descrs[index].handle;
+		enum starpu_data_access_mode mode = descrs[index].mode;
+		int node = descrs[index].node;
+		if (node == -1)
+			node = local_memory_node;
+
+		struct _starpu_data_replicate *local_replicate;
+
+		if (index && descrs[index-1].handle == descrs[index].handle)
+			/* We have already took this data, skip it. This
+			 * depends on ordering putting writes before reads, see
+			 * _starpu_compar_handles */
+			continue;
+
+		local_replicate = get_replicate(handle, mode, workerid, node);
+
+		/* Release our refcnt */
+		_starpu_spin_lock(&handle->header_lock);
+		local_replicate->refcnt--;
+		STARPU_ASSERT(local_replicate->refcnt >= 0);
+		STARPU_ASSERT(handle->busy_count > 0);
+		handle->busy_count--;
+		if (!_starpu_data_check_not_busy(handle))
+			_starpu_spin_unlock(&handle->header_lock);
+	}
+	return 0;
+}
+
 /* Release task data dependencies */
 void __starpu_push_task_output(struct _starpu_job *j)
 {

+ 4 - 2
src/datawizard/coherency.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2017  CNRS
  * Copyright (C) 2014-2016  Inria
  *
@@ -300,8 +300,10 @@ void _starpu_push_task_output(struct _starpu_job *j);
 
 void _starpu_release_nowhere_task_output(struct _starpu_job *j);
 
+struct _starpu_worker;
 STARPU_ATTRIBUTE_WARN_UNUSED_RESULT
-int _starpu_fetch_task_input(struct _starpu_job *j);
+int _starpu_fetch_task_input(struct starpu_task *task, struct _starpu_job *j, int async);
+int _starpu_release_fetch_task_input_async(struct _starpu_job *j, struct _starpu_worker *worker);
 void _starpu_fetch_nowhere_task_input(struct _starpu_job *j);
 
 unsigned _starpu_is_data_present_or_requested(struct _starpu_data_state *state, unsigned node);

+ 4 - 0
src/datawizard/datawizard.c

@@ -69,6 +69,10 @@ int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests)
         int current_worker_id = starpu_worker_get_id();
         unsigned memnode;
 
+	if (current_worker_id < 0)
+		/* Call from main application, only make RAM requests progress */
+		return ___starpu_datawizard_progress(STARPU_MAIN_RAM, may_alloc, push_requests);
+
         int ret = 0;
 
         for (memnode = 0; memnode < STARPU_MAXNODES; memnode++)

+ 3 - 2
src/datawizard/interfaces/data_interface.c

@@ -694,8 +694,6 @@ static void _starpu_data_unregister_fetch_data_callback(void *_arg)
 static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned coherent, unsigned nowait)
 {
 	STARPU_ASSERT(handle);
-	/* Prevent any further unregistration */
-	handle->magic = 0;
 	STARPU_ASSERT_MSG(handle->nchildren == 0, "data %p needs to be unpartitioned before unregistration", handle);
 	STARPU_ASSERT_MSG(handle->nplans == 0, "data %p needs its partition plans to be cleaned before unregistration", handle);
 	STARPU_ASSERT_MSG(handle->partitioned == 0, "data %p needs its partitioned plans to be unpartitioned before unregistration", handle);
@@ -808,6 +806,9 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 		}
 	}
 
+	/* Prevent any further unregistration */
+	handle->magic = 0;
+
 	_starpu_spin_lock(&handle->header_lock);
 	if (!coherent)
 	{

+ 1 - 1
src/datawizard/interfaces/vector_interface.c

@@ -119,7 +119,7 @@ void starpu_vector_data_register(starpu_data_handle_t *handleptr, int home_node,
 		.slice_base = 0,
                 .offset = 0
 	};
-#ifndef STARPU_SIMGRID
+#if (!defined(STARPU_SIMGRID) && !defined(STARPU_OPENMP))
 	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
 	{
 		STARPU_ASSERT_ACCESSIBLE(ptr);

+ 10 - 1
src/datawizard/memory_nodes.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -186,3 +186,12 @@ unsigned starpu_worker_get_memory_node(unsigned workerid)
 	return _starpu_worker_get_memory_node(workerid);
 }
 
+/* same utility as _starpu_memory_node_add_nworkers */
+void _starpu_worker_drives_memory_node(struct _starpu_worker *worker, unsigned memnode)
+{
+	_starpu_worker_drives_memory[worker->workerid][memnode] = 1;
+#ifdef STARPU_SIMGRID
+	starpu_pthread_queue_register(&worker->wait, &_starpu_simgrid_transfer_queue[memnode]);
+#endif
+}
+

+ 0 - 0
src/datawizard/memory_nodes.h


Some files were not shown because too many files changed in this diff