5 years ago · bc23f1b516
--- a/ChangeLog
+++ b/ChangeLog
@@ -31,9 +31,12 @@ New features:
 
				     files. This file can be parsed by the new script
			
 
				     starpu_fxt_number_events_to_names.py to convert event keys to event names.
			
 
				   * New STARPU_PER_WORKER perfmodel.
			
 
				+  * Add energy accounting in the simgrid mode: starpu_energy_use() and
			
 
				+    starpu_energy_used().
			
 
				 
			
 
				 Small changes:
			
 
				   * Use the S4U interface of Simgrid instead of xbt and MSG.
			
 
				+  * Add a synthetic energy efficiency testcase.
			
 
				 
			
 
				 StarPU 1.3.4 (git revision xxx)
			
 
				 ==============================================
			
@@ -60,6 +63,11 @@ Small features:
 
				   * New STARPU_BACKOFF_MIN and STARPU_BACKOFF_MAX environment variables to the
			
 
				     exponential backoff limits of the number of cycles to pause while drivers
			
 
				     are spinning.
			
 
				+  * Add STARPU_DISPLAY_BINDINGS environment variable and
			
 
				+    starpu_display_bindings() function to display all bindings on the machine by
			
 
				+    calling hwloc-ps
			
 
				+Small changes:
			
 
				+  * New configure option --disable-build-doc-pdf
			
 
				 
			
 
				 StarPU 1.3.3 (git revision 11afc5b007fe1ab1c729b55b47a5a98ef7f3cfad)
			
 
				 ====================================================================
			
--- a/configure.ac
+++ b/configure.ac
@@ -2245,6 +2245,14 @@ AC_MSG_RESULT($nmaxbuffers)
 
				 AC_DEFINE_UNQUOTED(STARPU_NMAXBUFS, [$nmaxbuffers],
			
 
				 		[how many buffers can be manipulated per task])
			
 
				 
			
 
				+AC_MSG_CHECKING(how many MPI nodes fxt files can be manipulated when generating traces)
			
 
				+AC_ARG_ENABLE(fxt-max-files, [AS_HELP_STRING([--enable-fxt-max-files=<nbuffers>],
			
 
				+			[maximum number of mpi nodes for traces])],
			
 
				+			nmaxfxtfiles=$enableval, nmaxfxtfiles=64)
			
 
				+AC_MSG_RESULT($nmaxfxtfiles)
			
 
				+AC_DEFINE_UNQUOTED(STARPU_FXT_MAX_FILES, [$nmaxfxtfiles],
			
 
				+		[how many MPI nodes fxt files can be manipulated when generating traces])
			
 
				+
			
 
				 AC_MSG_CHECKING(maximum number of memory nodes to use per MPI rank)
			
 
				 AC_ARG_ENABLE(maxnodes, [AS_HELP_STRING([--enable-maxnodes=<nnodes>],
			
 
				 			[maximum number of memory nodes per MPI rank])],
			
@@ -2537,6 +2545,7 @@ if test "x$enable_build_fortran_requested" = "xyes" ; then
 
				                  fi
			
 
				 	else
			
 
				 		if $FC -V 2>&1|grep -q 'Intel(R) Fortran'; then
			
 
				+			enable_build_fortran="yes"
			
 
				 			ifort_fc_version=`$FC -V 2>&1 |head -1|sed 's/.*Version //;s/ Build.*//'`
			
 
				 			ifort_maj_version=`echo $ifort_fc_version|cut -d. -f1`
			
 
				 
			
@@ -2553,38 +2562,28 @@ if test "x$enable_build_fortran_requested" = "xyes" ; then
 
				 				enable_build_fortran="no"
			
 
				 			else
			
 
				 				AC_MSG_WARN(Fortran compiler has not been tested for StarPU native Fortran support)
			
 
				+				 enable_build_fortran="yes"
			
 
				 			fi
			
 
				 		fi
			
 
				 	fi
			
 
				 	if test "x$enable_build_fortran" = "xyes" ; then
			
 
				 		AC_DEFINE(STARPU_HAVE_FC, [1], [Define this if a Fortran compiler is available])
			
 
				-		if test x$build_mpi_lib = xyes -o x$build_mpi_master_slave = xyes ; then
			
 
				-			AC_ARG_WITH(mpifort, [AS_HELP_STRING([--with-mpifort[=<path to mpifort>]],
			
 
				-				    [Path of the mpifort compiler])],
			
 
				-				    [
			
 
				-				     if test x$withval = xyes; then
			
 
				-					     AC_MSG_ERROR(--with-mpifort must be given a pathname)
			
 
				-					     else
			
 
				-						     mpifort_path=$withval
			
 
				-					     fi
			
 
				-					     ],
			
 
				-					     [
			
 
				-					      if test x$enable_simgrid = xyes ; then
			
 
				-						      DEFAULT_MPIFORT=smpifort
			
 
				-					      else
			
 
				-						      DEFAULT_MPIFORT=mpif90
			
 
				-					      fi
			
 
				-					      case $DEFAULT_MPIFORT in
			
 
				-					      	/*) mpifort_path="$DEFAULT_MPIFORT" ;;
			
 
				-					        *)  AC_PATH_PROG(mpifort_path, $DEFAULT_MPIFORT, [no], [$MPIPATH])
			
 
				-					      esac
			
 
				-					      ])
			
 
				-
			
 
				+		if test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes -o x$build_mpi_master_slave = xyes ; then
			
 
				+			#Check MPIFORT
			
 
				+			if test x$enable_simgrid = xyes ; then
			
 
				+				DEFAULT_MPIFORT=smpifort
			
 
				+			else
			
 
				+				DEFAULT_MPIFORT=mpifort
			
 
				+			fi
			
 
				+			AC_ARG_WITH(mpifort, [AS_HELP_STRING([--with-mpifort=<mpifort name or path to mpifort>], [Name or path of the mpifort compiler])], [DEFAULT_MPIFORT=$withval])
			
 
				+			case $DEFAULT_MPIFORT in
			
 
				+				/*) mpifort_path="$DEFAULT_MPIFORT" ;;
			
 
				+				*)  AC_PATH_PROG(mpifort_path, $DEFAULT_MPIFORT, [no], [$simgrid_dir/bin:$PATH]) ;;
			
 
				+			esac
			
 
				 			# We test if the MPIFORT compiler exists
			
 
				 			if test ! -x $mpifort_path; then
			
 
				-				#MPIFORT does not exists or is not executable
			
 
				 				AC_MSG_RESULT(The mpifort compiler '$mpifort_path' does not have the execute permission)
			
 
				-				use_mpi_fort=no
			
 
				+				mpifort_path=no
			
 
				 			else
			
 
				 				OLD_CC=$CC
			
 
				 				CC=$mpicc_path
			
@@ -2599,11 +2598,18 @@ if test "x$enable_build_fortran_requested" = "xyes" ; then
 
				 				CC=$OLD_CC
			
 
				 				if test "x$use_mpi_fort" = xyes; then
			
 
				 					AC_DEFINE([HAVE_MPI_COMM_F2C], [1], [Function MPI_Comm_f2c is available])
			
 
				-					AC_MSG_CHECKING(mpifort path)
			
 
				-					AC_MSG_RESULT($mpifort_path)
			
 
				-					AC_SUBST(MPIFORT, $mpifort_path)
			
 
				 				fi
			
 
				 			fi
			
 
				+
			
 
				+			AC_MSG_CHECKING(whether mpifort is available)
			
 
				+			AC_MSG_RESULT($mpifort_path)
			
 
				+			AC_SUBST(MPIFORT, $mpifort_path)
			
 
				+
			
 
				+			if test x$mpifort_path != xno ; then
			
 
				+				MPIPATH=$(dirname $mpifort_path):$PATH
			
 
				+			else
			
 
				+				MPIPATH=$PATH
			
 
				+			fi
			
 
				 		fi
			
 
				 	fi
			
 
				    fi
			
@@ -3413,34 +3419,51 @@ AC_ARG_ENABLE(build-doc, [AS_HELP_STRING([--disable-build-doc],
 
				 			[disable building of documentation])],
			
 
				 			enable_build_doc=$enableval, enable_build_doc=yes)
			
 
				 
			
 
				-if test "$enable_build_doc" = "yes" ; then
			
 
				-   # Check whether doxygen needed tools are installed
			
 
				-   AC_PATH_PROG(doxygencommand, doxygen)
			
 
				-   if test "$doxygencommand" = "" ; then
			
 
				-      	enable_build_doc="no"
			
 
				-   fi
			
 
				-   AC_PATH_PROG(pdflatexcommand, pdflatex)
			
 
				-   if test "$pdflatexcommand" = "" ; then
			
 
				-	enable_build_doc="no"
			
 
				-   fi
			
 
				-   AC_PATH_PROG(epstopdfcommand, epstopdf)
			
 
				-   if test "$epstopdfcommand" = "" ; then
			
 
				-	enable_build_doc="no"
			
 
				-   fi
			
 
				+AC_ARG_ENABLE(build-doc-pdf, [AS_HELP_STRING([--enable-build-doc-pdf],
			
 
				+			[enable building of PDF documentation])],
			
 
				+			enable_build_doc_pdf=$enableval, enable_build_doc_pdf=no)
			
 
				+
			
 
				+# Check whether doxygen needed tools are installed
			
 
				+AC_PATH_PROG(doxygencommand, doxygen)
			
 
				+if test "$doxygencommand" = "" ; then
			
 
				+   enable_build_doc="no"
			
 
				+   enable_build_doc_pdf="no"
			
 
				+fi
			
 
				+AC_PATH_PROG(pdflatexcommand, pdflatex)
			
 
				+if test "$pdflatexcommand" = "" ; then
			
 
				+   enable_build_doc_pdf="no"
			
 
				 fi
			
 
				+AC_PATH_PROG(epstopdfcommand, epstopdf)
			
 
				+if test "$epstopdfcommand" = "" ; then
			
 
				+   enable_build_doc_pdf="no"
			
 
				+fi
			
 
				+
			
 
				 available_doc="no"
			
 
				-if test -f "$srcdir/doc/doxygen/starpu.pdf" ; then
			
 
				+if test -d "$srcdir/doc/doxygen/html" ; then
			
 
				    enable_build_doc="no"
			
 
				    available_doc="yes"
			
 
				 fi
			
 
				-AC_MSG_CHECKING(whether documentation should be compiled)
			
 
				+available_doc_pdf="no"
			
 
				+if test -f "$srcdir/doc/doxygen/starpu.pdf" ; then
			
 
				+   enable_build_doc="no"
			
 
				+   enable_build_doc_pdf="no"
			
 
				+   available_doc_pdf="yes"
			
 
				+fi
			
 
				+AC_MSG_CHECKING(whether HTML documentation should be compiled)
			
 
				 AC_MSG_RESULT($enable_build_doc)
			
 
				-AC_MSG_CHECKING(whether documentation is available)
			
 
				+AC_MSG_CHECKING(whether HTML documentation is available)
			
 
				 AC_MSG_RESULT($available_doc)
			
 
				+AC_MSG_CHECKING(whether PDF documentation should be compiled)
			
 
				+AC_MSG_RESULT($enable_build_doc_pdf)
			
 
				+AC_MSG_CHECKING(whether PDF documentation is available)
			
 
				+AC_MSG_RESULT($available_doc_pdf)
			
 
				 
			
 
				 AM_CONDITIONAL(STARPU_BUILD_DOC, [test x$enable_build_doc != xno])
			
 
				 AM_CONDITIONAL(STARPU_AVAILABLE_DOC, [test x$available_doc != xno])
			
 
				 
			
 
				+AM_CONDITIONAL(STARPU_BUILD_DOC_PDF, [test x$enable_build_doc_pdf != xno])
			
 
				+AM_CONDITIONAL(STARPU_AVAILABLE_DOC_PDF, [test x$available_doc_pdf != xno])
			
 
				+
			
 
				 ###############################################################################
			
 
				 #                                                                             #
			
 
				 #                                Julia                                        #
			
@@ -3520,6 +3543,9 @@ AC_CONFIG_COMMANDS([executable-scripts], [
 
				   test -e tests/microbenchs/parallel_independent_homogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks.sh tests/microbenchs/
			
 
				   test -e tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh tests/microbenchs/
			
 
				   test -e tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh tests/microbenchs/
			
 
				+  mkdir -p tests/energy
			
 
				+  test -e tests/energy/static.sh || ln -sf $ac_abs_top_srcdir/tests/energy/static.sh tests/energy/
			
 
				+  test -e tests/energy/dynamic.sh || ln -sf $ac_abs_top_srcdir/tests/energy/dynamic.sh tests/energy/
			
 
				   mkdir -p tests/datawizard
			
 
				   test -e tests/datawizard/locality.sh || ln -sf $ac_abs_top_srcdir/tests/datawizard/locality.sh tests/datawizard/
			
 
				   mkdir -p tests/overlap
			
@@ -3672,8 +3698,9 @@ AC_MSG_NOTICE([
 
				 	hwloc:             $have_valid_hwloc
			
 
				 	FxT trace enabled: $use_fxt
			
 
				 
			
 
				-        Documentation:     $enable_build_doc
			
 
				-        Examples:          $enable_build_examples
			
 
				+        Documentation HTML:  $enable_build_doc
			
 
				+        Documentation PDF:   $enable_build_doc_pdf
			
 
				+        Examples:            $enable_build_examples
			
 
				 
			
 
				 	StarPU Extensions:
			
 
				 	       StarPU MPI enabled:                            $build_mpi_lib
			
--- a/contrib/ci.inria.fr/job-0-tarball.sh
+++ b/contrib/ci.inria.fr/job-0-tarball.sh
@@ -21,7 +21,7 @@ export LD_LIBRARY_PATH=/home/ci/usr/local/lib:$LD_LIBRARY_PATH
 
				 ./autogen.sh
			
 
				 if test -d build ; then chmod -R 777 build && rm -rf build ; fi
			
 
				 mkdir build && cd build
			
 
				-../configure
			
 
				+../configure --enable-build-doc-pdf
			
 
				 make V=1
			
 
				 make dist
			
 
				 cp *gz ..
			
--- a/doc/doxygen/Makefile.am
+++ b/doc/doxygen/Makefile.am
@@ -30,9 +30,14 @@ txtdir   = $(docdir)/manual
 
				 EXTRA_DIST =
			
 
				 
			
 
				 if STARPU_BUILD_DOC
			
 
				+if STARPU_BUILD_DOC_PDF
			
 
				 all: $(DOX_HTML_DIR) $(DOX_PDF)
			
 
				 EXTRA_DIST += $(DOX_HTML_DIR) $(DOX_PDF)
			
 
				 txt_DATA = $(DOX_PDF)
			
 
				+else
			
 
				+all: $(DOX_HTML_DIR)
			
 
				+EXTRA_DIST += $(DOX_HTML_DIR)
			
 
				+endif
			
 
				 DOX_HTML_SRCDIR=$(DOX_HTML_DIR)
			
 
				 install-exec-hook:
			
 
				 	$(MKDIR_P) $(DESTDIR)$(docdir)/manual/html
			
@@ -41,8 +46,7 @@ uninstall-hook:
 
				 	rm -rf $(DESTDIR)$(docdir)/manual/html
			
 
				 else
			
 
				 if STARPU_AVAILABLE_DOC
			
 
				-EXTRA_DIST += $(top_srcdir)/doc/doxygen/html $(top_srcdir)/doc/doxygen/starpu.pdf
			
 
				-txt_DATA = $(top_srcdir)/doc/doxygen/starpu.pdf
			
 
				+EXTRA_DIST += $(top_srcdir)/doc/doxygen/html
			
 
				 DOX_HTML_SRCDIR=$(top_srcdir)/doc/doxygen/html
			
 
				 install-exec-hook:
			
 
				 	$(MKDIR_P) $(DESTDIR)$(docdir)/manual/html
			
@@ -50,6 +54,10 @@ install-exec-hook:
 
				 uninstall-hook:
			
 
				 	rm -rf $(DESTDIR)$(docdir)/manual/html
			
 
				 endif
			
 
				+if STARPU_AVAILABLE_DOC_PDF
			
 
				+EXTRA_DIST += $(top_srcdir)/doc/doxygen/starpu.pdf
			
 
				+txt_DATA = $(top_srcdir)/doc/doxygen/starpu.pdf
			
 
				+endif
			
 
				 endif
			
 
				 
			
 
				 chapters =	\
			
@@ -257,6 +265,8 @@ $(DOX_TAG): $(dox_inputs)
 
				 	@$(SED) -i '/\\begin{titlepage}/,$$d' $(DOX_LATEX_DIR)/refman.tex
			
 
				 	@cat $(top_srcdir)/doc/doxygen/refman.tex >> $(DOX_LATEX_DIR)/refman.tex
			
 
				 
			
 
				+$(DOX_HTML_DIR): $(DOX_TAG)
			
 
				+
			
 
				 $(DOX_PDF): $(DOX_TAG) refman.tex
			
 
				 	@cp $(top_srcdir)/doc/doxygen/chapters/version.sty $(DOX_LATEX_DIR)
			
 
				 	@cp $(top_srcdir)/doc/doxygen/chapters/images/*pdf $(DOX_LATEX_DIR)
			
@@ -294,5 +304,5 @@ EXTRA_DIST += doxygen.cfg refman.tex \
 
				 # Rule to update documentation on web server. Should only be used locally.
			
 
				 PUBLISHHOST	?= gforge
			
 
				 update-web: $(DOX_PDF)
			
 
				-	scp -pr starpu.pdf html $(PUBLISHHOST):/home/groups/starpu/htdocs/doc
			
 
				+	scp -pr starpu.pdf html $(PUBLISHHOST):/home/groups/starpu/htdocs/files/doc
			
 
				 
			
--- a/doc/doxygen/chapters/301_tasks.doxy
+++ b/doc/doxygen/chapters/301_tasks.doxy
@@ -118,7 +118,7 @@ to delay the termination of a task until the termination of other tasks.
 
				 
			
 
				 \section SettingManyDataHandlesForATask Setting Many Data Handles For a Task
			
 
				 
			
 
				-The maximum number of data a task can manage is fixed by the environment variable
			
 
				+The maximum number of data a task can manage is fixed by the macro
			
 
				 \ref STARPU_NMAXBUFS which has a default value which can be changed
			
 
				 through the \c configure option \ref enable-maxbuffers "--enable-maxbuffers".
			
 
				 
			
--- a/doc/doxygen/chapters/320_scheduling.doxy
+++ b/doc/doxygen/chapters/320_scheduling.doxy
@@ -185,6 +185,11 @@ already gives the good results that a precise estimation would give.
 
				 
			
 
				 \section Energy-basedScheduling Energy-based Scheduling
			
 
				 
			
 
				+Note: by default StarPU does not let CPU workers sleep, to let them react to
			
 
				+task release as quickly as possible. For idle time to really let CPU cores save
			
 
				+energy, one needs to use the \ref enable-blocking-drivers
			
 
				+"--enable-blocking-drivers" configuration option.
			
 
				+
			
 
				 If the application can provide some energy consumption performance model (through
			
 
				 the field starpu_codelet::energy_model), StarPU will
			
 
				 take it into account when distributing tasks. The target function that
			
--- a/doc/doxygen/chapters/380_offline_performance_tools.doxy
+++ b/doc/doxygen/chapters/380_offline_performance_tools.doxy
@@ -586,19 +586,31 @@ $ starpu_paje_sort paje.trace
 
				 \section PapiCounters PAPI counters
			
 
				 
			
 
				 Performance counter values could be obtained from the PAPI framework if
			
 
				-<c>./configure</c> detected the libpapi. One has to set the \ref STARPU_PROFILING
			
 
				-environment variable to 1 and then specify which events to record with the
			
 
				-\ref STARPU_PROF_PAPI_EVENTS environment variable. For instance:
			
 
				+<c>./configure</c> detected the libpapi.
			
 
				+
			
 
				+In Debian, packages <c>libpapi-dev</c> and <c>libpapi5.7</c> provide the required
			
 
				+files.  Package <c>papi-tools</c> contains a set of useful tools, for example
			
 
				+<c>papi_avail</c> to see which counters are available.
			
 
				+
			
 
				+To be able to use PAPI counters, one may need to reduce the level of the kernel
			
 
				+parameter <c>kernel.perf_event_paranoid</c> to 2 or lower. See
			
 
				+https://www.kernel.org/doc/html/latest/admin-guide/perf-security.html for the
			
 
				+security impact of this parameter.
			
 
				+
			
 
				+Then one has to set the \ref STARPU_PROFILING environment variable to 1 and
			
 
				+specify which events to record with the \ref STARPU_PROF_PAPI_EVENTS
			
 
				+environment variable. For instance:
			
 
				 
			
 
				 \verbatim
			
 
				 export STARPU_PROFILING=1 STARPU_PROF_PAPI_EVENTS="PAPI_TOT_INS PAPI_TOT_CYC"
			
 
				 \endverbatim
			
 
				 
			
 
				+The comma can also be used to separate events to monitor.
			
 
				+
			
 
				 In the current simple implementation, only CPU tasks have their events measured
			
 
				-and require CPUs that support the PAPI events. All events that PAPI support are
			
 
				-available from their documentation (https://icl.cs.utk.edu/projects/papi/wiki/PAPIC:Preset_Event_Definitions).
			
 
				-It is important to note that not all events are available on all systems, and
			
 
				-general PAPI recommendations should be followed.
			
 
				+and require CPUs that support the PAPI events. It is important to note that not
			
 
				+all events are available on all systems, and general PAPI recommendations
			
 
				+should be followed.
			
 
				 
			
 
				 The counter values can be accessed using the profiling interface:
			
 
				 \code{.c}
			
--- a/doc/doxygen/chapters/501_environment_variables.doxy
+++ b/doc/doxygen/chapters/501_environment_variables.doxy
@@ -1366,6 +1366,15 @@ application has crashed. Setting this variable to a value other than 1
 
				 will disable this behaviour. This should be done on JVM systems which
			
 
				 may use these signals for their own needs.
			
 
				 The flag can also be set through the field starpu_conf::catch_signals.
			
 
				+</dd>
			
 
				+
			
 
				+<dt>STARPU_DISPLAY_BINDINGS</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_DISPLAY_BINDINGS
			
 
				+\addindex __env__STARPU_DISPLAY_BINDINGS
			
 
				+Display the binding of all processes and threads running on the machine. If MPI is enabled, display the binding of each node.<br>
			
 
				+Users can manually display the binding by calling starpu_display_bindings().
			
 
				+</dd>
			
 
				 </dl>
			
 
				 
			
 
				 \section ConfiguringTheHypervisor Configuring The Hypervisor
			
--- a/doc/doxygen/chapters/510_configure_options.doxy
+++ b/doc/doxygen/chapters/510_configure_options.doxy
@@ -1,6 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				+ * Copyright (C) 2020       Federal University of Rio Grande do Sul (UFRGS)
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -115,7 +116,19 @@ Specify <c>hwloc</c> should not be used by StarPU.
 
				 \addindex __configure__--disable-build-doc
			
 
				 Disable the creation of the documentation. This should be done on a
			
 
				 machine which does not have the tools <c>doxygen</c> and <c>latex</c>
			
 
				-(plus the packages <c>latex-xcolor</c> and <c>texlive-latex-extra</c>).
			
 
				+(plus the packages <c>latex-xcolor</c> and
			
 
				+<c>texlive-latex-extra</c>).
			
 
				+</dd>
			
 
				+
			
 
				+<dt>--enable-build-doc-pdf</dt>
			
 
				+<dd>
			
 
				+\anchor enable-build-doc-pdf
			
 
				+\addindex __configure__--enable-build-doc-pdf
			
 
				+By default, only the HTML documentation is generated. Use this option
			
 
				+to also enable the generation of the PDF documentation. This should be
			
 
				+done on a machine which has the tools <c>doxygen</c> and <c>latex</c>
			
 
				+(plus the packages <c>latex-xcolor</c> and
			
 
				+<c>texlive-latex-extra</c>).
			
 
				 </dd>
			
 
				 
			
 
				 <dt>--disable-icc</dt>
			
@@ -514,6 +527,15 @@ Define the maximum number of buffers that tasks will be able to take
 
				 as parameters, then available as the macro ::STARPU_NMAXBUFS.
			
 
				 </dd>
			
 
				 
			
 
				+<dt>--enable-fxt-max-files=<c>count</c></dt>
			
 
				+<dd>
			
 
				+\anchor enable-fxt-max-files
			
 
				+\addindex __configure__--enable-fxt-max-files
			
 
				+Use at most <c>count</c> MPI node FxT files when generating traces.  This information is then available as
			
 
				+the macro ::STARPU_FXT_MAX_FILES.  This information is used by FxT tools when considering multi node traces.
			
 
				+Default value is 64.
			
 
				+</dd>
			
 
				+
			
 
				 <dt>--enable-allocation-cache</dt>
			
 
				 <dd>
			
 
				 \anchor enable-allocation-cache
			
--- a/doc/doxygen/chapters/code/vector_scal_opencl.c
+++ b/doc/doxygen/chapters/code/vector_scal_opencl.c
@@ -57,6 +57,7 @@ void scal_opencl_func(void *buffers[], void *_args)
 
				         err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
			
 
				         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				         if (local > global) local=global;
			
 
				+        else global = (global + local-1) / local * local;
			
 
				 
			
 
				         err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
			
 
				         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
--- a/doc/doxygen_dev/Makefile.am
+++ b/doc/doxygen_dev/Makefile.am
@@ -30,9 +30,14 @@ txtdir   = $(docdir)/manual
 
				 EXTRA_DIST =
			
 
				 
			
 
				 if STARPU_BUILD_DOC
			
 
				+if STARPU_BUILD_DOC_PDF
			
 
				 all: $(DOX_HTML_DIR) $(DOX_PDF)
			
 
				 EXTRA_DIST += $(DOX_HTML_DIR) $(DOX_PDF)
			
 
				 txt_DATA = $(DOX_PDF)
			
 
				+else
			
 
				+all: $(DOX_HTML_DIR)
			
 
				+EXTRA_DIST += $(DOX_HTML_DIR)
			
 
				+endif
			
 
				 DOX_HTML_SRCDIR=$(DOX_HTML_DIR)
			
 
				 install-exec-hook:
			
 
				 	$(MKDIR_P) $(DESTDIR)$(docdir)/manual/html_dev
			
@@ -41,8 +46,7 @@ uninstall-hook:
 
				 	rm -rf $(DESTDIR)$(docdir)/manual/html_dev
			
 
				 else
			
 
				 if STARPU_AVAILABLE_DOC
			
 
				-EXTRA_DIST += $(top_srcdir)/doc/doxygen_dev/html_dev $(top_srcdir)/doc/doxygen_dev/starpu_dev.pdf
			
 
				-txt_DATA = $(top_srcdir)/doc/doxygen_dev/starpu_dev.pdf
			
 
				+EXTRA_DIST += $(top_srcdir)/doc/doxygen_dev/html_dev
			
 
				 DOX_HTML_SRCDIR=$(top_srcdir)/doc/doxygen_dev/html_dev
			
 
				 install-exec-hook:
			
 
				 	$(MKDIR_P) $(DESTDIR)$(docdir)/manual/html_dev
			
@@ -50,6 +54,10 @@ install-exec-hook:
 
				 uninstall-hook:
			
 
				 	rm -rf $(DESTDIR)$(docdir)/manual/html_dev
			
 
				 endif
			
 
				+if STARPU_AVAILABLE_DOC_PDF
			
 
				+EXTRA_DIST += $(top_srcdir)/doc/doxygen_dev/starpu_dev.pdf
			
 
				+txt_DATA = $(top_srcdir)/doc/doxygen_dev/starpu_dev.pdf
			
 
				+endif
			
 
				 endif
			
 
				 
			
 
				 chapters =	\
			
@@ -191,7 +199,7 @@ dox_inputs = $(DOX_CONFIG) 				\
 
				 	$(top_srcdir)/src/core/drivers.h	\
			
 
				 	$(top_srcdir)/src/core/workers.h
			
 
				 
			
 
				-$(DOX_HTML_DIR): $(DOX_TAG) refman.tex
			
 
				+$(DOX_HTML_DIR): $(DOX_TAG)
			
 
				 	@$(MKDIR_P) $(DOX_HTML_DIR)
			
 
				 
			
 
				 $(DOX_TAG): $(dox_inputs)
			
@@ -240,5 +248,5 @@ EXTRA_DIST += doxygen.cfg refman.tex \
 
				 # Rule to update documentation on web server. Should only be used locally.
			
 
				 PUBLISHHOST	?= gforge
			
 
				 update-web: $(DOX_PDF)
			
 
				-	scp -pr starpu_dev.pdf html_dev $(PUBLISHHOST):/home/groups/starpu/htdocs/doc
			
 
				+	scp -pr starpu_dev.pdf html_dev $(PUBLISHHOST):/home/groups/starpu/htdocs/files/doc
			
 
				 
			
--- a/examples/axpy/axpy_opencl.c
+++ b/examples/axpy/axpy_opencl.c
@@ -60,6 +60,8 @@ void axpy_opencl(void *buffers[], void *_args)
 
				 			STARPU_OPENCL_REPORT_ERROR(err);
			
 
				                 if (local > global)
			
 
				 			local=global;
			
 
				+                else
			
 
				+                        global = (global + local-1) / local * local;
			
 
				 
			
 
				 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
			
 
				 		if (err != CL_SUCCESS)
			
--- a/examples/basic_examples/multiformat_conversion_codelets_opencl.c
+++ b/examples/basic_examples/multiformat_conversion_codelets_opencl.c
@@ -74,6 +74,8 @@ void cpu_to_opencl_opencl_func(void *buffers[], void *args)
 
				 
			
 
				                 if (local > global)
			
 
				 			local = global;
			
 
				+                else
			
 
				+                        global = (global + local-1) / local * local;
			
 
				 
			
 
				 		err = clEnqueueNDRangeKernel(queue,
			
 
				 					kernel,
			
--- a/examples/basic_examples/multiformat_opencl.c
+++ b/examples/basic_examples/multiformat_opencl.c
@@ -68,6 +68,8 @@ void multiformat_scal_opencl_func(void *buffers[], void *args)
 
				 
			
 
				                 if (local > global)
			
 
				 			local = global;
			
 
				+                else
			
 
				+                        global = (global + local-1) / local * local;
			
 
				 
			
 
				 		err = clEnqueueNDRangeKernel(queue,
			
 
				 					kernel,
			
--- a/examples/basic_examples/vector_scal_opencl.c
+++ b/examples/basic_examples/vector_scal_opencl.c
@@ -57,6 +57,7 @@ void scal_opencl_func(void *buffers[], void *_args)
 
				                 err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
			
 
				                 if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				                 if (local > global) local=global;
			
 
				+                else global = (global + local-1) / local * local;
			
 
				 
			
 
				 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
			
 
				 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
			
--- a/examples/filters/custom_mf/conversion_opencl.c
+++ b/examples/filters/custom_mf/conversion_opencl.c
@@ -76,6 +76,8 @@ void cpu_to_opencl_opencl_func(void *buffers[], void *args)
 
				 
			
 
				                 if (local > global)
			
 
				 			local = global;
			
 
				+                else
			
 
				+                        global = (global + local-1) / local * local;
			
 
				 
			
 
				 		err = clEnqueueNDRangeKernel(
			
 
				 				queue,
			
--- a/examples/filters/custom_mf/custom_opencl.c
+++ b/examples/filters/custom_mf/custom_opencl.c
@@ -75,6 +75,8 @@ void custom_scal_opencl_func(void *buffers[], void *args)
 
				 
			
 
				                 if (local > global)
			
 
				 			local = global;
			
 
				+                else
			
 
				+                        global = (global + local-1) / local * local;
			
 
				 
			
 
				 		err = clEnqueueNDRangeKernel(
			
 
				 				queue,
			
--- a/examples/interface/complex_kernels_opencl.c
+++ b/examples/interface/complex_kernels_opencl.c
@@ -64,6 +64,8 @@ void copy_complex_codelet_opencl(void *buffers[], void *_args)
 
				 			STARPU_OPENCL_REPORT_ERROR(err);
			
 
				                 if (local > global)
			
 
				 			local=global;
			
 
				+                else
			
 
				+                        global = (global + local-1) / local * local;
			
 
				 
			
 
				 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
			
 
				 		if (err != CL_SUCCESS)
			
--- a/examples/mult/double.h
+++ b/examples/mult/double.h
@@ -15,6 +15,7 @@
 
				  */
			
 
				 
			
 
				 #define TYPE	double
			
 
				+#define EPSILON	0.000000000001
			
 
				 
			
 
				 #define CUBLAS_GEMM cublasDgemm
			
 
				 #define CPU_GEMM	STARPU_DGEMM
			
--- a/examples/mult/simple.h
+++ b/examples/mult/simple.h
@@ -15,6 +15,7 @@
 
				  */
			
 
				 
			
 
				 #define TYPE	float
			
 
				+#define EPSILON	0.000001
			
 
				 
			
 
				 #define CUBLAS_GEMM cublasSgemm
			
 
				 #define CPU_GEMM	STARPU_SGEMM
			
--- a/examples/mult/xgemm.c
+++ b/examples/mult/xgemm.c
@@ -75,7 +75,7 @@ static int check_output(void)
 
				 	TYPE err;
			
 
				 	err = CPU_ASUM(xdim*ydim, C, 1);
			
 
				 
			
 
				-	if (err < xdim*ydim*0.001)
			
 
				+	if (err < EPSILON*xdim*ydim*zdim)
			
 
				 	{
			
 
				 		FPRINTF(stderr, "Results are OK\n");
			
 
				 		return 0;
			
--- a/examples/reductions/dot_product.c
+++ b/examples/reductions/dot_product.c
@@ -185,18 +185,12 @@ void redux_opencl_func(void *buffers[], void *args)
 
				 
			
 
				 	{
			
 
				 		size_t global=1;
			
 
				-		size_t local;
			
 
				+                size_t local=1;
			
 
				                 size_t s;
			
 
				                 cl_device_id device;
			
 
				 
			
 
				                 starpu_opencl_get_device(devid, &device);
			
 
				 
			
 
				-                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
			
 
				-                if (err != CL_SUCCESS)
			
 
				-			STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-                if (local > global)
			
 
				-			local=global;
			
 
				-
			
 
				 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
			
 
				 		if (err != CL_SUCCESS)
			
 
				 			STARPU_OPENCL_REPORT_ERROR(err);
			
@@ -306,18 +300,12 @@ void dot_opencl_func(void *buffers[], void *cl_arg)
 
				 
			
 
				 	{
			
 
				 		size_t global=1;
			
 
				-		size_t local;
			
 
				+                size_t local=1;
			
 
				                 size_t s;
			
 
				                 cl_device_id device;
			
 
				 
			
 
				                 starpu_opencl_get_device(devid, &device);
			
 
				 
			
 
				-                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
			
 
				-                if (err != CL_SUCCESS)
			
 
				-			STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-                if (local > global)
			
 
				-			local=global;
			
 
				-
			
 
				 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
			
 
				 		if (err != CL_SUCCESS)
			
 
				 			STARPU_OPENCL_REPORT_ERROR(err);
			
--- a/examples/reductions/dot_product_opencl_kernels.cl
+++ b/examples/reductions/dot_product_opencl_kernels.cl
@@ -31,6 +31,7 @@ __kernel void _dot_opencl(__global float *x,
 
				 			  __global DOT_TYPE *dot,
			
 
				 			  unsigned n)
			
 
				 {
			
 
				+/* FIXME: real parallel implementation */
			
 
				 	unsigned i;
			
 
				 	__local double tmp;
			
 
				 	tmp = 0.0;
			
--- a/include/starpu.h
+++ b/include/starpu.h
@@ -111,6 +111,12 @@ struct starpu_conf
 
				 	int magic;
			
 
				 
			
 
				 	/**
			
 
				+	   @private
			
 
				+	   Tell starpu_init() if MPI will be initialized later.
			
 
				+	*/
			
 
				+	int will_use_mpi;
			
 
				+
			
 
				+	/**
			
 
				 	   Name of the scheduling policy. This can also be specified
			
 
				 	   with the environment variable \ref STARPU_SCHED. (default =
			
 
				 	   <c>NULL</c>).
			
--- a/include/starpu_config.h.in
+++ b/include/starpu_config.h.in
@@ -187,6 +187,15 @@
 
				 #undef STARPU_NMAXBUFS
			
 
				 
			
 
				 /**
			
 
				+   Define the maximum number of fxt mpi files that can be read when
			
 
				+   generating traces. The default value is 64, it can be changed by
			
 
				+   using the configure option \ref enable-fxt-max-files
			
 
				+   "--enable-fxt-max-files".
			
 
				+   @ingroup API_MPI_Support
			
 
				+*/
			
 
				+#undef STARPU_FXT_MAX_FILES
			
 
				+
			
 
				+/**
			
 
				    Define the maximum number of CPU workers managed by StarPU. The
			
 
				    default value can be modified at configure by using the option \ref
			
 
				    enable-maxcpus "--enable-maxcpus".
			
--- a/include/starpu_fxt.h
+++ b/include/starpu_fxt.h
@@ -20,6 +20,7 @@
 
				 #ifndef __STARPU_FXT_H__
			
 
				 #define __STARPU_FXT_H__
			
 
				 
			
 
				+#include <starpu_config.h>
			
 
				 #include <starpu_perfmodel.h>
			
 
				 
			
 
				 #ifdef __cplusplus
			
@@ -32,8 +33,6 @@ extern "C"
 
				    @{
			
 
				 */
			
 
				 
			
 
				-#define STARPU_FXT_MAX_FILES	64
			
 
				-
			
 
				 struct starpu_fxt_codelet_event
			
 
				 {
			
 
				 	char symbol[256];
			
--- a/include/starpu_helper.h
+++ b/include/starpu_helper.h
@@ -182,6 +182,14 @@ double starpu_timing_now(void);
 
				 */
			
 
				 int starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_handle, int asynchronous, void (*callback_func)(void*), void *callback_arg);
			
 
				 
			
 
				+/**
			
 
				+   Call hwloc-ps to display the binding of each process and thread running on
			
 
				+   the machine.<br>
			
 
				+   Use the environment variable \ref STARPU_DISPLAY_BINDINGS to automatically
			
 
				+   call this function at the beginning of the execution of StarPU.
			
 
				+*/
			
 
				+void starpu_display_bindings(void);
			
 
				+
			
 
				 /** @} */
			
 
				 
			
 
				 #ifdef __cplusplus
			
--- a/include/starpu_stdlib.h
+++ b/include/starpu_stdlib.h
@@ -239,9 +239,32 @@ void starpu_memory_deallocate(unsigned node, size_t size);
 
				 */
			
 
				 void starpu_memory_wait_available(unsigned node, size_t size);
			
 
				 
			
 
				+/**
			
 
				+   Sleep for the given \p nb_sec seconds.
			
 
				+   In simgrid mode, this only sleeps within virtual time.
			
 
				+  */
			
 
				 void starpu_sleep(float nb_sec);
			
 
				+
			
 
				+/**
			
 
				+   Sleep for the given \p nb_micro_sec micro-seconds.
			
 
				+   In simgrid mode, this only sleeps within virtual time.
			
 
				+  */
			
 
				 void starpu_usleep(float nb_micro_sec);
			
 
				 
			
 
				+/**
			
 
				+   Account for \p joules J being used.
			
 
				+   This is supported in simgrid mode, to record how much energy was used, and will
			
 
				+   show up in further calls to starpu_energy_used().
			
 
				+  */
			
 
				+void starpu_energy_use(float joules);
			
 
				+
			
 
				+/**
			
 
				+   Return the amount of energy having been used in J.
			
 
				+   This accounts for the amounts passed to starpu_energy_use(), but also for the static
			
 
				+   energy use set by the \ref STARPU_IDLE_POWER environment variable.
			
 
				+  */
			
 
				+double starpu_energy_used(void);
			
 
				+
			
 
				 /** @} */
			
 
				 
			
 
				 #ifdef __cplusplus
			
--- a/include/starpu_task.h
+++ b/include/starpu_task.h
@@ -513,7 +513,7 @@ struct starpu_codelet
 
				 
			
 
				 	/**
			
 
				 	   Optional pointer to the task energy consumption performance
			
 
				-	   model associated to this codelet. This optional field is
			
 
				+	   model associated to this codelet (in J). This optional field is
			
 
				 	   ignored when set to <c>NULL</c> or when its field
			
 
				 	   starpu_perfmodel::symbol is not set. In the case of
			
 
				 	   parallel codelets, this has to account for all processing
			
--- a/include/starpu_util.h
+++ b/include/starpu_util.h
@@ -598,6 +598,17 @@ STARPU_ATOMIC_SOMETHING64(or, old | value)
 
				 #define STARPU_WMB() STARPU_SYNCHRONIZE()
			
 
				 #endif
			
 
				 
			
 
				+#if defined(__i386__) || defined(__x86_64__)
			
 
				+#define STARPU_CACHELINE_SIZE 64
			
 
				+#elif defined(__ppc__) || defined(__ppc64__) || defined(__ia64__)
			
 
				+#define STARPU_CACHELINE_SIZE 128
			
 
				+#elif defined(__s390__) || defined(__s390x__)
			
 
				+#define STARPU_CACHELINE_SIZE 256
			
 
				+#else
			
 
				+/* Conservative default */
			
 
				+#define STARPU_CACHELINE_SIZE 1024
			
 
				+#endif
			
 
				+
			
 
				 #ifdef _WIN32
			
 
				 /* Try to fetch the system definition of timespec */
			
 
				 #include <sys/types.h>
			
--- a/julia/examples/cholesky/cholesky_common.jl
+++ b/julia/examples/cholesky/cholesky_common.jl
@@ -1,3 +1,18 @@
 
				+# StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+#
			
 
				+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				+#
			
 
				+# StarPU is free software; you can redistribute it and/or modify
			
 
				+# it under the terms of the GNU Lesser General Public License as published by
			
 
				+# the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+# your option) any later version.
			
 
				+#
			
 
				+# StarPU is distributed in the hope that it will be useful, but
			
 
				+# WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+#
			
 
				+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+#
			
 
				 # Standard kernels for the Cholesky factorization
			
 
				 # U22 is the gemm update
			
 
				 # U21 is the trsm update
			
--- a/julia/examples/cholesky/cholesky_native.jl
+++ b/julia/examples/cholesky/cholesky_native.jl
@@ -1,3 +1,18 @@
 
				+# StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+#
			
 
				+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				+#
			
 
				+# StarPU is free software; you can redistribute it and/or modify
			
 
				+# it under the terms of the GNU Lesser General Public License as published by
			
 
				+# the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+# your option) any later version.
			
 
				+#
			
 
				+# StarPU is distributed in the hope that it will be useful, but
			
 
				+# WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+#
			
 
				+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+#
			
 
				 using LinearAlgebra
			
 
				 
			
 
				 function check(mat::Matrix{Float32})
			
--- a/julia/src/openblas_ldflags.jl
+++ b/julia/src/openblas_ldflags.jl
@@ -1,3 +1,18 @@
 
				+# StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+#
			
 
				+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				+#
			
 
				+# StarPU is free software; you can redistribute it and/or modify
			
 
				+# it under the terms of the GNU Lesser General Public License as published by
			
 
				+# the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+# your option) any later version.
			
 
				+#
			
 
				+# StarPU is distributed in the hope that it will be useful, but
			
 
				+# WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+#
			
 
				+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+#
			
 
				 import LinearAlgebra.BLAS
			
 
				 import Libdl
			
 
				 
			
--- a/mpi/examples/Makefile.am
+++ b/mpi/examples/Makefile.am
@@ -83,6 +83,10 @@ EXTRA_DIST = 				\
 
				 	matrix_decomposition/mpi_decomposition_params.h	\
			
 
				 	matrix_decomposition/mpi_decomposition_matrix.h	\
			
 
				 	user_datatype/my_interface.h			\
			
 
				+	benchs/abstract_sendrecv_bench.h	\
			
 
				+	benchs/bench_helper.h			\
			
 
				+	benchs/gemm_helper.h			\
			
 
				+	benchs/burst_helper.h			\
			
 
				 	helper.h
			
 
				 
			
 
				 examplebindir = $(libdir)/starpu/mpi
			
@@ -399,3 +403,68 @@ native_fortran/nf_mm_task_build.o: nf_mm_cl.mod fstarpu_mpi_mod.mod fstarpu_mod.
 
				 native_fortran/nf_basic_ring.o: fstarpu_mpi_mod.mod fstarpu_mod.mod
			
 
				 endif
			
 
				 endif
			
 
				+
			
 
				+
			
 
				+##########
			
 
				+# benchs #
			
 
				+##########
			
 
				+
			
 
				+examplebin_PROGRAMS +=		\
			
 
				+	benchs/sendrecv_bench	\
			
 
				+	benchs/burst
			
 
				+
			
 
				+if !STARPU_USE_MPI_MPI
			
 
				+examplebin_PROGRAMS +=		\
			
 
				+	benchs/sendrecv_parallel_tasks_bench
			
 
				+endif
			
 
				+
			
 
				+if !STARPU_NO_BLAS_LIB
			
 
				+examplebin_PROGRAMS +=		\
			
 
				+	benchs/sendrecv_gemm_bench			\
			
 
				+	benchs/burst_gemm
			
 
				+endif
			
 
				+
			
 
				+if !STARPU_SIMGRID
			
 
				+starpu_mpi_EXAMPLES	+=	\
			
 
				+	benchs/sendrecv_bench	\
			
 
				+	benchs/burst
			
 
				+
			
 
				+if !STARPU_USE_MPI_MPI
			
 
				+starpu_mpi_EXAMPLES	+=	\
			
 
				+	benchs/sendrecv_parallel_tasks_bench
			
 
				+endif
			
 
				+
			
 
				+if !STARPU_NO_BLAS_LIB
			
 
				+starpu_mpi_EXAMPLES	+=	\
			
 
				+	benchs/sendrecv_gemm_bench			\
			
 
				+	benchs/burst_gemm
			
 
				+endif
			
 
				+endif
			
 
				+
			
 
				+benchs_sendrecv_bench_SOURCES = benchs/sendrecv_bench.c
			
 
				+benchs_sendrecv_bench_SOURCES += benchs/bench_helper.c
			
 
				+benchs_sendrecv_bench_SOURCES += benchs/abstract_sendrecv_bench.c
			
 
				+
			
 
				+benchs_sendrecv_parallel_tasks_bench_SOURCES = benchs/sendrecv_parallel_tasks_bench.c
			
 
				+benchs_sendrecv_parallel_tasks_bench_SOURCES += benchs/bench_helper.c
			
 
				+benchs_sendrecv_parallel_tasks_bench_SOURCES += benchs/abstract_sendrecv_bench.c
			
 
				+
			
 
				+benchs_burst_SOURCES = benchs/burst.c
			
 
				+benchs_burst_SOURCES += benchs/burst_helper.c
			
 
				+
			
 
				+if !STARPU_NO_BLAS_LIB
			
 
				+benchs_sendrecv_gemm_bench_SOURCES = benchs/sendrecv_gemm_bench.c
			
 
				+benchs_sendrecv_gemm_bench_SOURCES += benchs/bench_helper.c
			
 
				+benchs_sendrecv_gemm_bench_SOURCES += benchs/gemm_helper.c
			
 
				+benchs_sendrecv_gemm_bench_SOURCES += benchs/abstract_sendrecv_bench.c
			
 
				+benchs_sendrecv_gemm_bench_SOURCES += ../../examples/common/blas.c
			
 
				+
			
 
				+benchs_sendrecv_gemm_bench_LDADD = $(STARPU_BLAS_LDFLAGS)
			
 
				+
			
 
				+benchs_burst_gemm_SOURCES = benchs/burst_gemm.c
			
 
				+benchs_burst_gemm_SOURCES += benchs/gemm_helper.c
			
 
				+benchs_burst_gemm_SOURCES += benchs/burst_helper.c
			
 
				+benchs_burst_gemm_SOURCES += ../../examples/common/blas.c
			
 
				+
			
 
				+benchs_burst_gemm_LDADD = $(STARPU_BLAS_LDFLAGS)
			
 
				+endif
			
--- a/mpi/examples/benchs/abstract_sendrecv_bench.c
+++ b/mpi/examples/benchs/abstract_sendrecv_bench.c
--- a/mpi/examples/benchs/abstract_sendrecv_bench.h
+++ b/mpi/examples/benchs/abstract_sendrecv_bench.h
--- a/mpi/examples/benchs/bench_helper.c
+++ b/mpi/examples/benchs/bench_helper.c
--- a/mpi/examples/benchs/bench_helper.h
+++ b/mpi/examples/benchs/bench_helper.h
--- a/mpi/tests/burst.c
+++ b/mpi/tests/burst.c
@@ -49,13 +49,11 @@ void parse_args(int argc, char **argv)
 
				 
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				-	int ret, rank, mpi_init, other_rank;
			
 
				+	int ret, rank, other_rank;
			
 
				 
			
 
				 	parse_args(argc, argv);
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				-
			
 
				-	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
			
 
				+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
			
 
				 
			
 
				 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
@@ -68,8 +66,6 @@ int main(int argc, char **argv)
 
				 	burst_free_data(rank);
			
 
				 
			
 
				 	starpu_mpi_shutdown();
			
 
				-	if (!mpi_init)
			
 
				-		MPI_Finalize();
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/mpi/tests/burst_gemm.c
+++ b/mpi/tests/burst_gemm.c
@@ -90,12 +90,11 @@ void parse_args(int argc, char **argv)
 
				 
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				-	int ret, mpi_init, worldsize, mpi_rank;
			
 
				+	int ret, worldsize, mpi_rank;
			
 
				 
			
 
				 	parse_args(argc, argv);
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				-	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
			
 
				+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
			
 
				 	if (ret == -ENODEV)
			
 
				 		return 77;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
			
@@ -109,8 +108,7 @@ int main(int argc, char **argv)
 
				 			FPRINTF(stderr, "We need 2 processes.\n");
			
 
				 
			
 
				 		starpu_mpi_shutdown();
			
 
				-		if (!mpi_init)
			
 
				-			MPI_Finalize();
			
 
				+
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -203,8 +201,6 @@ enodev:
 
				 	burst_free_data(mpi_rank);
			
 
				 
			
 
				 	starpu_mpi_shutdown();
			
 
				-	if (!mpi_init)
			
 
				-		MPI_Finalize();
			
 
				 
			
 
				 	return ret;
			
 
				 }
			
--- a/mpi/examples/benchs/burst_helper.c
+++ b/mpi/examples/benchs/burst_helper.c
--- a/mpi/examples/benchs/burst_helper.h
+++ b/mpi/examples/benchs/burst_helper.h
--- a/mpi/examples/benchs/gemm_helper.c
+++ b/mpi/examples/benchs/gemm_helper.c
--- a/mpi/examples/benchs/gemm_helper.h
+++ b/mpi/examples/benchs/gemm_helper.h
--- a/mpi/tests/sendrecv_bench.c
+++ b/mpi/tests/sendrecv_bench.c
@@ -26,7 +26,6 @@
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				 	int ret, rank, worldsize;
			
 
				-	int mpi_init;
			
 
				 	int pause_workers = 0;
			
 
				 
			
 
				 
			
@@ -52,8 +51,7 @@ int main(int argc, char **argv)
 
				 	}
			
 
				 
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				-	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
			
 
				+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
			
 
				 
			
 
				 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
@@ -65,8 +63,7 @@ int main(int argc, char **argv)
 
				 			FPRINTF(stderr, "We need 2 processes.\n");
			
 
				 
			
 
				 		starpu_mpi_shutdown();
			
 
				-		if (!mpi_init)
			
 
				-			MPI_Finalize();
			
 
				+
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -85,8 +82,6 @@ int main(int argc, char **argv)
 
				 	}
			
 
				 
			
 
				 	starpu_mpi_shutdown();
			
 
				-	if (!mpi_init)
			
 
				-		MPI_Finalize();
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/mpi/tests/sendrecv_gemm_bench.c
+++ b/mpi/tests/sendrecv_gemm_bench.c
@@ -53,7 +53,7 @@ static void* comm_thread_func(void* arg)
 
				 	{
			
 
				 		char hostname[65];
			
 
				 		gethostname(hostname, sizeof(hostname));
			
 
				-		_STARPU_DISP("[%s] No core was available for the comm thread. You should increase STARPU_RESERVE_NCPU or decrease STARPU_NCPU\n", hostname);
			
 
				+		fprintf(stderr, "[%s] No core was available for the comm thread. You should increase STARPU_RESERVE_NCPU or decrease STARPU_NCPU\n", hostname);
			
 
				 	}
			
 
				 
			
 
				 	sendrecv_bench(mpi_rank, &thread_barrier);
			
@@ -118,7 +118,7 @@ void parse_args(int argc, char **argv)
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				 	double start, end;
			
 
				-	int ret, mpi_init, worldsize;
			
 
				+	int ret, worldsize;
			
 
				 	starpu_pthread_t comm_thread;
			
 
				 
			
 
				 	char hostname[255];
			
@@ -128,8 +128,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_fxt_autostart_profiling(0);
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				-	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
			
 
				+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
			
 
				 	if (ret == -ENODEV)
			
 
				 		return 77;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
			
@@ -143,8 +142,7 @@ int main(int argc, char **argv)
 
				 			FPRINTF(stderr, "We need 2 processes.\n");
			
 
				 
			
 
				 		starpu_mpi_shutdown();
			
 
				-		if (!mpi_init)
			
 
				-			MPI_Finalize();
			
 
				+
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -162,7 +160,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 	if (mpi_rank == 0)
			
 
				 	{
			
 
				-		PRINTF("# node\tx\ty\tz\tms\tGFlops\n");
			
 
				+		printf("# node\tx\ty\tz\tms\tGFlops\n");
			
 
				 	}
			
 
				 
			
 
				 	starpu_pause();
			
@@ -185,7 +183,7 @@ int main(int argc, char **argv)
 
				 	double timing = end - start;
			
 
				 	double flops = 2.0*((unsigned long long)matrix_dim) * ((unsigned long long)matrix_dim)*((unsigned long long)matrix_dim);
			
 
				 
			
 
				-	PRINTF("%s\t%u\t%u\t%u\t%.0f\t%.1f\n", hostname, matrix_dim, matrix_dim, matrix_dim, timing/1000.0, flops/timing/1000.0);
			
 
				+	printf("%s\t%u\t%u\t%u\t%.0f\t%.1f\n", hostname, matrix_dim, matrix_dim, matrix_dim, timing/1000.0, flops/timing/1000.0);
			
 
				 
			
 
				 
			
 
				 enodev:
			
@@ -200,8 +198,6 @@ enodev:
 
				 
			
 
				 	starpu_resume();
			
 
				 	starpu_mpi_shutdown();
			
 
				-	if (!mpi_init)
			
 
				-		MPI_Finalize();
			
 
				 
			
 
				 	return ret;
			
 
				 }
			
--- a/mpi/tests/sendrecv_parallel_tasks_bench.c
+++ b/mpi/tests/sendrecv_parallel_tasks_bench.c
@@ -134,10 +134,8 @@ static struct starpu_codelet cl =
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				 	int ret, rank, worldsize;
			
 
				-	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				-	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
			
 
				+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
			
 
				 
			
 
				 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
@@ -149,8 +147,7 @@ int main(int argc, char **argv)
 
				 			FPRINTF(stderr, "We need 2 processes.\n");
			
 
				 
			
 
				 		starpu_mpi_shutdown();
			
 
				-		if (!mpi_init)
			
 
				-			MPI_Finalize();
			
 
				+
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -162,8 +159,7 @@ int main(int argc, char **argv)
 
				 	else if (rank >= 2)
			
 
				 	{
			
 
				 		starpu_mpi_shutdown();
			
 
				-		if (!mpi_init)
			
 
				-			MPI_Finalize();
			
 
				+
			
 
				 		return 0;
			
 
				 	}
			
 
				 
			
@@ -222,8 +218,6 @@ int main(int argc, char **argv)
 
				 	free(mpi_tags);
			
 
				 
			
 
				 	starpu_mpi_shutdown();
			
 
				-	if (!mpi_init)
			
 
				-		MPI_Finalize();
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/mpi/src/starpu_mpi.c
+++ b/mpi/src/starpu_mpi.c
@@ -431,6 +431,7 @@ void starpu_mpi_data_migrate(MPI_Comm comm, starpu_data_handle_t data, int new_r
 
				 
			
 
				 	/* Flush cache in all other nodes */
			
 
				 	/* TODO: Ideally we'd transmit the knowledge of who owns it */
			
 
				+	/* TODO: or at least remember that the previous owner has the data, that's an easy case to support */
			
 
				 	starpu_mpi_cache_flush(comm, data);
			
 
				 	return;
			
 
				 }
			
--- a/mpi/src/starpu_mpi_datatype.c
+++ b/mpi/src/starpu_mpi_datatype.c
@@ -26,17 +26,16 @@ struct _starpu_mpi_datatype_funcs
 
				 	UT_hash_handle hh;
			
 
				 };
			
 
				 
			
 
				-static starpu_pthread_mutex_t _starpu_mpi_datatype_funcs_table_mutex;
			
 
				+/* We want to allow applications calling starpu_mpi_interface_datatype_register/unregister as constructor/destructor */
			
 
				+static starpu_pthread_mutex_t _starpu_mpi_datatype_funcs_table_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
			
 
				 static struct _starpu_mpi_datatype_funcs *_starpu_mpi_datatype_funcs_table = NULL;
			
 
				 
			
 
				 void _starpu_mpi_datatype_init(void)
			
 
				 {
			
 
				-	STARPU_PTHREAD_MUTEX_INIT(&_starpu_mpi_datatype_funcs_table_mutex, NULL);
			
 
				 }
			
 
				 
			
 
				 void _starpu_mpi_datatype_shutdown(void)
			
 
				 {
			
 
				-	STARPU_PTHREAD_MUTEX_DESTROY(&_starpu_mpi_datatype_funcs_table_mutex);
			
 
				 }
			
 
				 
			
 
				 /*
			
--- a/mpi/src/starpu_mpi_init.c
+++ b/mpi/src/starpu_mpi_init.c
@@ -138,7 +138,38 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi, MPI_Comm
 
				 	_starpu_mpi_do_initialize(argc_argv);
			
 
				 #endif
			
 
				 
			
 
				-	return _mpi_backend._starpu_mpi_backend_progress_init(argc_argv);
			
 
				+	int ret = _mpi_backend._starpu_mpi_backend_progress_init(argc_argv);
			
 
				+
			
 
				+	if (starpu_get_env_number_default("STARPU_DISPLAY_BINDINGS", 0))
			
 
				+	{
			
 
				+		int rank, size, i;
			
 
				+		char hostname[65];
			
 
				+
			
 
				+		starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+		starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+		gethostname(hostname, sizeof(hostname));
			
 
				+
			
 
				+		/* We make a barrier between each node calling hwloc-ps, to avoid mixing
			
 
				+		 * outputs in stdout. */
			
 
				+		for (i = 0; i < size; i++)
			
 
				+		{
			
 
				+			starpu_mpi_barrier(MPI_COMM_WORLD);
			
 
				+			if (rank == i)
			
 
				+			{
			
 
				+				fprintf(stdout, "== Binding for rank %d on node %s ==\n", rank, hostname);
			
 
				+				starpu_display_bindings();
			
 
				+				fflush(stdout);
			
 
				+			}
			
 
				+		}
			
 
				+		starpu_mpi_barrier(MPI_COMM_WORLD);
			
 
				+		if (rank == 0)
			
 
				+		{
			
 
				+			fprintf(stdout, "== End of bindings ==\n");
			
 
				+			fflush(stdout);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	return ret;
			
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_SIMGRID
			
@@ -219,6 +250,8 @@ int starpu_mpi_init_conf(int *argc, char ***argv, int initialize_mpi, MPI_Comm c
 
				 			conf->reserve_ncpus++;
			
 
				 	}
			
 
				 
			
 
				+	conf->will_use_mpi = 1;
			
 
				+
			
 
				 	int ret = starpu_init(conf);
			
 
				 	if (ret < 0)
			
 
				 		return ret;
			
--- a/mpi/tests/Makefile.am
+++ b/mpi/tests/Makefile.am
@@ -62,11 +62,7 @@ BUILT_SOURCES =
 
				 CLEANFILES = *.gcno *.gcda *.linkinfo starpu_idle_microsec.log
			
 
				 
			
 
				 EXTRA_DIST = 				\
			
 
				-	abstract_sendrecv_bench.h	\
			
 
				-	bench_helper.h			\
			
 
				 	helper.h			\
			
 
				-	gemm_helper.h			\
			
 
				-	burst_helper.h			\
			
 
				 	user_defined_datatype_value.h
			
 
				 
			
 
				 examplebindir = $(libdir)/starpu/examples/mpi
			
@@ -142,19 +138,7 @@ starpu_mpi_TESTS +=				\
 
				 	temporary				\
			
 
				 	user_defined_datatype			\
			
 
				 	early_stuff				\
			
 
				-	sendrecv_bench				\
			
 
				-	burst
			
 
				-
			
 
				-if !STARPU_USE_MPI_MPI
			
 
				-starpu_mpi_TESTS +=				\
			
 
				-	sendrecv_parallel_tasks_bench
			
 
				-endif
			
 
				-
			
 
				-if !STARPU_NO_BLAS_LIB
			
 
				-starpu_mpi_TESTS +=				\
			
 
				-	sendrecv_gemm_bench			\
			
 
				-	burst_gemm
			
 
				-endif
			
 
				+	display_bindings
			
 
				 
			
 
				 if !STARPU_SIMGRID
			
 
				 # missing support in simgrid
			
@@ -243,16 +227,8 @@ noinst_PROGRAMS +=				\
 
				 	starpu_redefine				\
			
 
				 	load_balancer				\
			
 
				 	driver					\
			
 
				-	sendrecv_bench				\
			
 
				-	sendrecv_parallel_tasks_bench		\
			
 
				-	burst					\
			
 
				-	nothing
			
 
				-
			
 
				-if !STARPU_NO_BLAS_LIB
			
 
				-noinst_PROGRAMS +=				\
			
 
				-	sendrecv_gemm_bench			\
			
 
				-	burst_gemm
			
 
				-endif
			
 
				+	nothing							\
			
 
				+	display_bindings
			
 
				 
			
 
				 if STARPU_USE_MPI_FT
			
 
				 noinst_PROGRAMS +=  \
			
@@ -288,31 +264,3 @@ mpi_earlyrecv2_SOURCES = mpi_earlyrecv2.c
 
				 mpi_earlyrecv2_SOURCES += ../../examples/interface/complex_interface.c
			
 
				 mpi_earlyrecv2_sync_SOURCES = mpi_earlyrecv2_sync.c
			
 
				 mpi_earlyrecv2_sync_SOURCES += ../../examples/interface/complex_interface.c
			
 
				-
			
 
				-sendrecv_bench_SOURCES = sendrecv_bench.c
			
 
				-sendrecv_bench_SOURCES += bench_helper.c
			
 
				-sendrecv_bench_SOURCES += abstract_sendrecv_bench.c
			
 
				-
			
 
				-sendrecv_parallel_tasks_bench_SOURCES = sendrecv_parallel_tasks_bench.c
			
 
				-sendrecv_parallel_tasks_bench_SOURCES += bench_helper.c
			
 
				-sendrecv_parallel_tasks_bench_SOURCES += abstract_sendrecv_bench.c
			
 
				-
			
 
				-burst_SOURCES = burst.c
			
 
				-burst_SOURCES += burst_helper.c
			
 
				-
			
 
				-if !STARPU_NO_BLAS_LIB
			
 
				-sendrecv_gemm_bench_SOURCES = sendrecv_gemm_bench.c
			
 
				-sendrecv_gemm_bench_SOURCES += bench_helper.c
			
 
				-sendrecv_gemm_bench_SOURCES += gemm_helper.c
			
 
				-sendrecv_gemm_bench_SOURCES += abstract_sendrecv_bench.c
			
 
				-sendrecv_gemm_bench_SOURCES += ../../examples/common/blas.c
			
 
				-
			
 
				-sendrecv_gemm_bench_LDADD = $(STARPU_BLAS_LDFLAGS)
			
 
				-
			
 
				-burst_gemm_SOURCES = burst_gemm.c
			
 
				-burst_gemm_SOURCES += gemm_helper.c
			
 
				-burst_gemm_SOURCES += burst_helper.c
			
 
				-burst_gemm_SOURCES += ../../examples/common/blas.c
			
 
				-
			
 
				-burst_gemm_LDADD = $(STARPU_BLAS_LDFLAGS)
			
 
				-endif
			
--- a/mpi/tests/display_bindings.c
+++ b/mpi/tests/display_bindings.c
@@ -0,0 +1,44 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2011-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <starpu_mpi.h>
			
 
				+#include <stdlib.h>
			
 
				+#include "helper.h"
			
 
				+
			
 
				+#if !defined(STARPU_HAVE_SETENV)
			
 
				+#warning setenv is not defined. Skipping test
			
 
				+int main(void)
			
 
				+{
			
 
				+	return STARPU_TEST_SKIPPED;
			
 
				+}
			
 
				+#else
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+	int ret;
			
 
				+	setenv("STARPU_DISPLAY_BINDINGS", "1", 1);
			
 
				+
			
 
				+	MPI_INIT_THREAD_real(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				+
			
 
				+	ret = starpu_mpi_init_conf(NULL, NULL, 0, MPI_COMM_WORLD, NULL);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
			
 
				+
			
 
				+	starpu_mpi_shutdown();
			
 
				+	MPI_Finalize();
			
 
				+
			
 
				+	return EXIT_SUCCESS;
			
 
				+}
			
 
				+#endif
			
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -406,9 +406,16 @@ endif
 
				 # static inline definition
			
 
				 dist-hook:
			
 
				 	failed=0 ; \
			
 
				-	for i in $$( $(GREP) "static inline" $$(find $(srcdir) -name \*.h) | $(SED) -e 's/.*static inline //g' | $(GREP) -v ENAME | $(SED) -e 's/[^(]* \(\|\*\)\([^ (]*\)(.*/\2/' | $(GREP) -v _starpu_spin_init) ; do \
			
 
				-		for j in $(shell find . -name \*.o) ; do \
			
 
				-			nm $$j | $(GREP) "U $$i$$" && { echo $$j ; failed=1 ; } ; \
			
 
				-		done ; \
			
 
				+	look=""; \
			
 
				+	for i in $$( $(GREP) "static inline" $$(find $(srcdir) -name \*.h) | $(SED) -e 's/.*static inline //g' | $(GREP) -v ENAME\#\# | $(SED) -n -e 's/[^(]* \(\|\*\)\([^ (]*\)(.*/\2/' -e 'p;s/^_*//;p' | $(GREP) -v _starpu_spin_init | $(GREP) -v starpu_sched_ctx_worker_is_master_for_child_ctx) ; do \
			
 
				+		if [ -z "$$look" ] ; then \
			
 
				+			look="$$i" ; \
			
 
				+		else \
			
 
				+			look="$$look\|$$i" ; \
			
 
				+		fi ; \
			
 
				+	done ; \
			
 
				+	echo "$$look" ; \
			
 
				+	for j in $(shell find . -name \*.o) ; do \
			
 
				+		nm $$j | $(GREP) -e "U \($$look\)$$" && { echo $$j ; failed=1 ; } ; \
			
 
				 	done ; \
			
 
				 	[ $$failed == 0 ]
			
--- a/src/common/utils.c
+++ b/src/common/utils.c
@@ -740,3 +740,18 @@ int starpu_get_env_size_default(const char *str, int defval)
 
				 	}
			
 
				 	return val;
			
 
				 }
			
 
				+
			
 
				+void starpu_display_bindings(void)
			
 
				+{
			
 
				+#if defined(STARPU_HAVE_HWLOC) && !defined(STARPU_SIMGRID)
			
 
				+	int hwloc_ret = system("hwloc-ps -a -t -c");
			
 
				+	if (hwloc_ret)
			
 
				+	{
			
 
				+		_STARPU_DISP("hwloc-ps returned %d\n", hwloc_ret);
			
 
				+		fflush(stderr);
			
 
				+	}
			
 
				+	fflush(stdout);
			
 
				+#else
			
 
				+	_STARPU_DISP("hwloc not available to display bindings.\n");
			
 
				+#endif
			
 
				+}
			
--- a/src/core/jobs.c
+++ b/src/core/jobs.c
@@ -24,10 +24,12 @@
 
				 #include <common/config.h>
			
 
				 #include <common/utils.h>
			
 
				 #include <common/graph.h>
			
 
				+#include <datawizard/memory_nodes.h>
			
 
				 #include <profiling/profiling.h>
			
 
				 #include <profiling/bound.h>
			
 
				 #include <core/debug.h>
			
 
				 #include <limits.h>
			
 
				+#include <core/workers.h>
			
 
				 
			
 
				 static int max_memory_use;
			
 
				 static unsigned long njobs, maxnjobs;
			
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -37,6 +37,7 @@
 
				 #include <core/topology.h>
			
 
				 #include <common/utils.h>
			
 
				 #include <drivers/mpi/driver_mpi_common.h>
			
 
				+#include <datawizard/memory_nodes.h>
			
 
				 
			
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 #include <starpu_opencl.h>
			
@@ -177,7 +178,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 
				 	/* Allocate a buffer on the device */
			
 
				 	unsigned char *d_buffer;
			
 
				 	cures = cudaMalloc((void **)&d_buffer, size);
			
 
				-	STARPU_ASSERT(cures == cudaSuccess);
			
 
				+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
 
				 	/* hack to avoid third party libs to rebind threads */
			
 
				 	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
			
@@ -206,7 +207,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 
				 		cudaHostRegister((void *)h_buffer, size, 0);
			
 
				 	}
			
 
				 
			
 
				-	STARPU_ASSERT(cures == cudaSuccess);
			
 
				+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
 
				 	/* hack to avoid third party libs to rebind threads */
			
 
				 	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
			
@@ -331,7 +332,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
				 	/* Allocate a buffer on the device */
			
 
				 	unsigned char *s_buffer;
			
 
				 	cures = cudaMalloc((void **)&s_buffer, size);
			
 
				-	STARPU_ASSERT(cures == cudaSuccess);
			
 
				+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 	cudaMemset(s_buffer, 0, size);
			
 
				 	cudaDeviceSynchronize();
			
 
				 
			
@@ -357,7 +358,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
				 	/* Allocate a buffer on the device */
			
 
				 	unsigned char *d_buffer;
			
 
				 	cures = cudaMalloc((void **)&d_buffer, size);
			
 
				-	STARPU_ASSERT(cures == cudaSuccess);
			
 
				+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 	cudaMemset(d_buffer, 0, size);
			
 
				 	cudaDeviceSynchronize();
			
 
				 
			
--- a/src/core/perfmodel/perfmodel_history.c
+++ b/src/core/perfmodel/perfmodel_history.c
@@ -344,7 +344,10 @@ static void dump_reg_model(FILE *f, struct starpu_perfmodel *model, int comb, in
 
				 	double a = nan(""), b = nan(""), c = nan("");
			
 
				 
			
 
				 	if (model->type == STARPU_NL_REGRESSION_BASED)
			
 
				-		_starpu_regression_non_linear_power(per_arch_model->list, &a, &b, &c);
			
 
				+	{
			
 
				+		if (_starpu_regression_non_linear_power(per_arch_model->list, &a, &b, &c) != 0)
			
 
				+			_STARPU_DISP("Warning: could not compute a non-linear regression for model %s\n", model->symbol);
			
 
				+	}
			
 
				 
			
 
				 	fprintf(f, "# a\t\tb\t\tc\n");
			
 
				 	_starpu_write_double(f, "%-15e", a);
			
@@ -1491,6 +1494,8 @@ int starpu_perfmodel_load_file(const char *filename, struct starpu_perfmodel *mo
 
				 	res = fclose(f);
			
 
				 	STARPU_ASSERT(res == 0);
			
 
				 
			
 
				+	if (ret)
			
 
				+		starpu_perfmodel_unload_model(model);
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
--- a/src/core/perfmodel/perfmodel_print.c
+++ b/src/core/perfmodel/perfmodel_print.c
@@ -19,6 +19,7 @@
 
				 #include <starpu.h>
			
 
				 #include <starpu_perfmodel.h>
			
 
				 #include <common/config.h>
			
 
				+#include <core/workers.h>
			
 
				 #include "perfmodel.h"
			
 
				 
			
 
				 static
			
--- a/src/core/perfmodel/regression.c
+++ b/src/core/perfmodel/regression.c
@@ -20,7 +20,32 @@
 
				 #define MAXREGITER	1000
			
 
				 #define EPS 1.0e-10
			
 
				 
			
 
				-static double compute_b(double c, unsigned n, unsigned *x, double *y)
			
 
				+/* For measurements close to C, we do not want to try to fit, since we are
			
 
				+   fitting the distance to C, which won't actually really get smaller */
			
 
				+#define C_RADIUS 1
			
 
				+
			
 
				+/*
			
 
				+ * smoothly ramp from 0 to 1 between 0 and 1 (two quadratic halves meeting at 0.5)
			
 
				+ * <= 0: stay 0
			
 
				+ * >= 1: stay 1 */
			
 
				+static double level(double x)
			
 
				+{
			
 
				+	if (x <= 0.)
			
 
				+		return 0.;
			
 
				+	if (x >= 1.)
			
 
				+		return 1.;
			
 
				+	if (x < 0.5)
			
 
				+		return 2*x*x; /* lower half: leaves 0 with zero slope, reaches 0.5 at x = 0.5 */
			
 
				+	return -2*x*x+4*x-1; /* upper half, i.e. 1-2*(1-x)^2: reaches 1 with zero slope */
			
 
				+}
			
 
				+
			
 
				+static double fixpop(unsigned pop, double c, double y) /* weight the sample count pop by how far y lies from c */
			
 
				+{
			
 
				+	double distance = (y-c)/c; /* offset of y from c, relative to c */
			
 
				+	return pop * level((distance - C_RADIUS) / C_RADIUS); /* level() yields 0 while distance <= C_RADIUS and 1 once distance >= 2*C_RADIUS */
			
 
				+}
			
 
				+
			
 
				+static double compute_b(double c, unsigned n, size_t *x, double *y, unsigned *pop)
			
 
				 {
			
 
				 	double b;
			
 
				 
			
@@ -29,43 +54,55 @@ static double compute_b(double c, unsigned n, unsigned *x, double *y)
 
				 	double sumx = 0.0;
			
 
				 	double sumx2 = 0.0;
			
 
				 	double sumy = 0.0;
			
 
				+	double nn = 0;
			
 
				 
			
 
				 	unsigned i;
			
 
				 	for (i = 0; i < n; i++)
			
 
				 	{
			
 
				 		double xi = log(x[i]);
			
 
				 		double yi = log(y[i]-c);
			
 
				+		double popi = fixpop(pop[i], c, y[i]);
			
 
				+		if (popi <= 0)
			
 
				+			continue;
			
 
				+
			
 
				+		sumxy += xi*yi*popi;
			
 
				+		sumx += xi*popi;
			
 
				+		sumx2 += xi*xi*popi;
			
 
				+		sumy += yi*popi;
			
 
				 
			
 
				-		sumxy += xi*yi;
			
 
				-		sumx += xi;
			
 
				-		sumx2 += xi*xi;
			
 
				-		sumy += yi;
			
 
				+		nn += popi;
			
 
				 	}
			
 
				 
			
 
				-	b = (n * sumxy - sumx * sumy) / (n*sumx2 - sumx*sumx);
			
 
				+	b = (nn * sumxy - sumx * sumy) / (nn*sumx2 - sumx*sumx);
			
 
				 
			
 
				 	return b;
			
 
				 }
			
 
				 
			
 
				-static double compute_a(double c, double b, unsigned n, unsigned *x, double *y)
			
 
				+static double compute_a(double c, double b, unsigned n, size_t *x, double *y, unsigned *pop)
			
 
				 {
			
 
				 	double a;
			
 
				 
			
 
				 	/* X = log (x) , Y = log (y - c) */
			
 
				 	double sumx = 0.0;
			
 
				 	double sumy = 0.0;
			
 
				+	double nn = 0;
			
 
				 
			
 
				 	unsigned i;
			
 
				 	for (i = 0; i < n; i++)
			
 
				 	{
			
 
				 		double xi = log(x[i]);
			
 
				 		double yi = log(y[i]-c);
			
 
				+		double popi = fixpop(pop[i], c, y[i]);
			
 
				+		if (popi <= 0)
			
 
				+			continue;
			
 
				 
			
 
				-		sumx += xi;
			
 
				-		sumy += yi;
			
 
				+		sumx += xi*popi;
			
 
				+		sumy += yi*popi;
			
 
				+
			
 
				+		nn += popi;
			
 
				 	}
			
 
				 
			
 
				-	a = (sumy - b*sumx) / n;
			
 
				+	a = (sumy - b*sumx) / nn;
			
 
				 
			
 
				 	return a;
			
 
				 }
			
@@ -73,7 +110,7 @@ static double compute_a(double c, double b, unsigned n, unsigned *x, double *y)
 
				 
			
 
				 
			
 
				 /* returns r */
			
 
				-static double test_r(double c, unsigned n, unsigned *x, double *y)
			
 
				+static double test_r(double c, unsigned n, size_t *x, double *y, unsigned *pop)
			
 
				 {
			
 
				 	double r;
			
 
				 
			
@@ -85,20 +122,26 @@ static double test_r(double c, unsigned n, unsigned *x, double *y)
 
				 	double sumx2 = 0.0;
			
 
				 	double sumy = 0.0;
			
 
				 	double sumy2 = 0.0;
			
 
				+	double nn = 0;
			
 
				 
			
 
				 	unsigned i;
			
 
				 	for (i = 0; i < n; i++)
			
 
				 	{
			
 
				 		double xi = log(x[i]);
			
 
				 		double yi = log(y[i]-c);
			
 
				+		double popi = fixpop(pop[i], c, y[i]);
			
 
				+		if (popi <= 0)
			
 
				+			continue;
			
 
				 
			
 
				 	//	printf("Xi = %e, Yi = %e\n", xi, yi);
			
 
				 
			
 
				-		sumxy += xi*yi;
			
 
				-		sumx += xi;
			
 
				-		sumx2 += xi*xi;
			
 
				-		sumy += yi;
			
 
				-		sumy2 += yi*yi;
			
 
				+		sumxy += xi*yi*popi;
			
 
				+		sumx += xi*popi;
			
 
				+		sumx2 += xi*xi*popi;
			
 
				+		sumy += yi*popi;
			
 
				+		sumy2 += yi*yi*popi;
			
 
				+
			
 
				+		nn += popi;
			
 
				 	}
			
 
				 
			
 
				 	//printf("sumxy %e\n", sumxy);
			
@@ -107,7 +150,7 @@ static double test_r(double c, unsigned n, unsigned *x, double *y)
 
				 	//printf("sumy %e\n", sumy);
			
 
				 	//printf("sumy2 %e\n", sumy2);
			
 
				 
			
 
				-	r = (n * sumxy - sumx * sumy) / sqrt( (n* sumx2 - sumx*sumx) * (n*sumy2 - sumy*sumy) );
			
 
				+	r = (nn * sumxy - sumx * sumy) / sqrt( (nn* sumx2 - sumx*sumx) * (nn*sumy2 - sumy*sumy) );
			
 
				 
			
 
				 	return r;
			
 
				 }
			
@@ -127,20 +170,29 @@ static unsigned find_list_size(struct starpu_perfmodel_history_list *list_histor
 
				 	return cnt;
			
 
				 }
			
 
				 
			
 
				-static double find_list_min(double *y, unsigned n)
			
 
				+static int compar(const void *_a, const void *_b) /* comparator handed to qsort(): orders doubles ascending */
			
 
				 {
			
 
				-	double min = DBL_MAX;
			
 
				+	double a = *(double*) _a; /* both arguments point to double elements */
			
 
				+	double b = *(double*) _b;
			
 
				+	if (a < b)
			
 
				+		return -1;
			
 
				+	if (a > b)
			
 
				+		return 1;
			
 
				+	return 0; /* neither smaller nor greater */
			
 
				+}
			
 
				 
			
 
				-	unsigned i;
			
 
				-	for (i = 0; i < n; i++)
			
 
				-	{
			
 
				-		min = STARPU_MIN(min, y[i]);
			
 
				-	}
			
 
				+static double get_list_fourth(double *y, unsigned n) /* returns the element at index n/3 of the n values of y once sorted ascending */
			
 
				+{
			
 
				+	double sorted[n]; /* scratch copy on the stack, so that y itself keeps its order; n must be non-zero since sorted[n/3] is read below */
			
 
				+
			
 
				+	memcpy(sorted, y, n * sizeof(*sorted));
			
 
				+
			
 
				+	qsort(sorted, n, sizeof(*sorted), compar);
			
 
				 
			
 
				-	return min;
			
 
				+	return sorted[n/3]; /* NOTE(review): index n/3 is a third of the way in, while the name says "fourth" - confirm which is intended */
			
 
				 }
			
 
				 
			
 
				-static void dump_list(unsigned *x, double *y, struct starpu_perfmodel_history_list *list_history)
			
 
				+static void dump_list(size_t *x, double *y, unsigned *pop, struct starpu_perfmodel_history_list *list_history)
			
 
				 {
			
 
				 	struct starpu_perfmodel_history_list *ptr = list_history;
			
 
				 	unsigned i = 0;
			
@@ -151,6 +203,7 @@ static void dump_list(unsigned *x, double *y, struct starpu_perfmodel_history_li
 
				 		{
			
 
				 			x[i] = ptr->entry->size;
			
 
				 			y[i] = ptr->entry->mean;
			
 
				+			pop[i] = ptr->entry->nsample;
			
 
				 			i++;
			
 
				 		}
			
 
				 
			
@@ -163,52 +216,72 @@ static void dump_list(unsigned *x, double *y, struct starpu_perfmodel_history_li
 
				  * 	return 0 if success, -1 otherwise
			
 
				  * 	if success, a, b and c are modified
			
 
				  * */
			
 
				+
			
 
				+/* See in Cedric Augonnet's PhD thesis's Appendix B for the rationale
			
 
				+ * Scheduling Tasks over Multicore machines enhanced with Accelerators: a
			
 
				+ * Runtime System’s Perspective */
			
 
				 int _starpu_regression_non_linear_power(struct starpu_perfmodel_history_list *ptr, double *a, double *b, double *c)
			
 
				 {
			
 
				 	unsigned n = find_list_size(ptr);
			
 
				-	STARPU_ASSERT(n);
			
 
				+	if (!n)
			
 
				+		return -1;
			
 
				 
			
 
				-	unsigned *x;
			
 
				-	_STARPU_MALLOC(x, n*sizeof(unsigned));
			
 
				+	size_t *x;
			
 
				+	_STARPU_MALLOC(x, n*sizeof(size_t));
			
 
				 
			
 
				 	double *y;
			
 
				 	_STARPU_MALLOC(y, n*sizeof(double));
			
 
				 	STARPU_ASSERT(y);
			
 
				 
			
 
				-	dump_list(x, y, ptr);
			
 
				+	unsigned *pop; /* per-entry sample counts, filled in by dump_list() */
			
 
				+	_STARPU_MALLOC(pop, n*sizeof(unsigned));
			
 
				+	STARPU_ASSERT(pop); /* check the buffer just allocated, not y a second time */
			
 
				+
			
 
				+	dump_list(x, y, pop, ptr);
			
 
				 
			
 
				 	double cmin = 0.0;
			
 
				-	double cmax = find_list_min(y, n);
			
 
				+	double cmax = get_list_fourth(y, n);
			
 
				 
			
 
				 	unsigned iter;
			
 
				 
			
 
				 	double err = 100000.0;
			
 
				 
			
 
				+/*
			
 
				+	unsigned i;
			
 
				+	for (i = 0; i < 100; i++)
			
 
				+	{
			
 
				+		double ci = cmin + (cmax-cmin)*i/100.;
			
 
				+		fprintf(stderr,"%f: %f\n", ci, 1.0 - test_r(ci, n, x, y, pop));
			
 
				+	}
			
 
				+*/
			
 
				+
			
 
				+	/* Use dichotomy to find c that gives the best matching */
			
 
				 	for (iter = 0; iter < MAXREGITER; iter++)
			
 
				 	{
			
 
				 		double c1, c2;
			
 
				 		double r1, r2;
			
 
				 
			
 
				-		double radius = 0.01;
			
 
				-
			
 
				-		c1 = cmin + (0.5-radius)*(cmax - cmin);
			
 
				-		c2 = cmin + (0.5+radius)*(cmax - cmin);
			
 
				+		c1 = cmin + (0.33)*(cmax - cmin);
			
 
				+		c2 = cmin + (0.67)*(cmax - cmin);
			
 
				 
			
 
				-		r1 = test_r(c1, n, x, y);
			
 
				-		r2 = test_r(c2, n, x, y);
			
 
				+		r1 = test_r(c1, n, x, y, pop);
			
 
				+		r2 = test_r(c2, n, x, y, pop);
			
 
				 
			
 
				 		double err1, err2;
			
 
				 		err1 = fabs(1.0 - r1);
			
 
				 		err2 = fabs(1.0 - r2);
			
 
				 
			
 
				+		//fprintf(stderr,"%f - %f: %f - %f: %f - %f\n", cmin, c1, err1, c2, err2, cmax);
			
 
				+
			
 
				 		if (err1 < err2)
			
 
				 		{
			
 
				-			cmax = (cmin + cmax)/2;
			
 
				+			/* 1 is better */
			
 
				+			cmax = c2;
			
 
				 		}
			
 
				 		else
			
 
				 		{
			
 
				 			/* 2 is better */
			
 
				-			cmin = (cmin + cmax)/2;
			
 
				+			cmin = c1;
			
 
				 		}
			
 
				 
			
 
				 		if (fabs(err - STARPU_MIN(err1, err2)) < EPS)
			
@@ -219,11 +292,12 @@ int _starpu_regression_non_linear_power(struct starpu_perfmodel_history_list *pt
 
				 
			
 
				 	*c = (cmin + cmax)/2;
			
 
				 
			
 
				-	*b = compute_b(*c, n, x, y);
			
 
				-	*a = exp(compute_a(*c, *b, n, x, y));
			
 
				+	*b = compute_b(*c, n, x, y, pop);
			
 
				+	*a = exp(compute_a(*c, *b, n, x, y, pop));
			
 
				 
			
 
				 	free(x);
			
 
				 	free(y);
			
 
				+	free(pop);
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/src/core/sched_ctx.c
+++ b/src/core/sched_ctx.c
@@ -21,6 +21,7 @@
 
				 #include <common/utils.h>
			
 
				 #include <stdarg.h>
			
 
				 #include <core/task.h>
			
 
				+#include <core/workers.h>
			
 
				 
			
 
				 enum _starpu_ctx_change_op
			
 
				 {
			
--- a/src/core/sched_policy.c
+++ b/src/core/sched_policy.c
@@ -22,6 +22,7 @@
 
				 #include <common/utils.h>
			
 
				 #include <core/sched_policy.h>
			
 
				 #include <profiling/profiling.h>
			
 
				+#include <datawizard/memory_nodes.h>
			
 
				 #include <common/barrier.h>
			
 
				 #include <core/debug.h>
			
 
				 #include <core/task.h>
			
--- a/src/core/simgrid.c
+++ b/src/core/simgrid.c
@@ -58,6 +58,8 @@ extern int _starpu_mpi_simgrid_init(int argc, char *argv[]);
 
				 extern void smpi_process_set_user_data(void *);
			
 
				 #endif
			
 
				 
			
 
				+static double _starpu_simgrid_dynamic_energy = 0.0;
			
 
				+
			
 
				 /* 1 when MSG_init was done, 2 when initialized through redirected main, 3 when
			
 
				  * initialized through MSG_process_attach */
			
 
				 static int simgrid_started;
			
@@ -629,6 +631,7 @@ struct task
 
				 #else
			
 
				 	msg_task_t task;
			
 
				 #endif
			
 
				+	double energy;
			
 
				 
			
 
				 	/* communication termination signalization */
			
 
				 	unsigned *finished;
			
@@ -666,6 +669,7 @@ static void *task_execute(void *arg)
 
				 		MSG_task_execute(task->task);
			
 
				 		MSG_task_destroy(task->task);
			
 
				 #endif
			
 
				+		starpu_energy_use(task->energy);
			
 
				 		_STARPU_DEBUG("task %p finished\n", task);
			
 
				 
			
 
				 		*task->finished = 1;
			
@@ -702,7 +706,7 @@ void _starpu_simgrid_wait_tasks(int workerid)
 
				 }
			
 
				 
			
 
				 /* Task execution submitted by StarPU */
			
 
				-void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length, unsigned *finished)
			
 
				+void _starpu_simgrid_submit_job(int workerid, int sched_ctx_id, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length, double energy, unsigned *finished)
			
 
				 {
			
 
				 	struct starpu_task *starpu_task = j->task;
			
 
				 	double flops;
			
@@ -717,13 +721,19 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 
				 
			
 
				 	if (isnan(length))
			
 
				 	{
			
 
				-		length = starpu_task_expected_length(starpu_task, perf_arch, j->nimpl);
			
 
				+		length = starpu_task_worker_expected_length(starpu_task, workerid, sched_ctx_id, j->nimpl);
			
 
				 		STARPU_ASSERT_MSG(!_STARPU_IS_ZERO(length) && !isnan(length),
			
 
				 				  "Codelet %s does not have a perfmodel (in directory %s), or is not calibrated enough, please re-run in non-simgrid mode until it is calibrated",
			
 
				 				  _starpu_job_get_model_name(j), _starpu_get_perf_model_dir_codelet());
			
 
				                 /* TODO: option to add variance according to performance model,
			
 
				                  * to be able to easily check scheduling robustness */
			
 
				 	}
			
 
				+	if (isnan(energy))
			
 
				+	{
			
 
				+		energy = starpu_task_worker_expected_energy(starpu_task, workerid, sched_ctx_id, j->nimpl);
			
 
				+		/* TODO: option to add variance according to performance model,
			
 
				+		 * to be able to easily check scheduling robustness */
			
 
				+	}
			
 
				 
			
 
				 #if defined(HAVE_SG_HOST_SPEED) || defined(sg_host_speed)
			
 
				 #  if defined(HAVE_SG_HOST_SELF) || defined(sg_host_self)
			
@@ -754,6 +764,7 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 
				 		MSG_task_execute(simgrid_task);
			
 
				 		MSG_task_destroy(simgrid_task);
			
 
				 #endif
			
 
				+		starpu_energy_use(energy);
			
 
				 	}
			
 
				 	else
			
 
				 	{
			
@@ -766,6 +777,7 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 
				 #else
			
 
				 		task->task = simgrid_task;
			
 
				 #endif
			
 
				+		task->energy = energy;
			
 
				 		task->finished = finished;
			
 
				 		*finished = 0;
			
 
				 		task->next = NULL;
			
@@ -1391,5 +1403,15 @@ void _starpu_simgrid_data_transfer(size_t size, unsigned src_node, unsigned dst_
 
				 }
			
 
				 #endif
			
 
				 
			
 
				+void starpu_energy_use(float joules) /* record joules as additional consumed dynamic energy */
			
 
				+{
			
 
				+	_starpu_simgrid_dynamic_energy += joules; /* accumulates into the file-level counter; no locking is done here */
			
 
				+}
			
 
				+
			
 
				+double starpu_energy_used(void) /* returns the accumulated dynamic energy plus an idle-power term proportional to the current time */
			
 
				+{
			
 
				+	float idle_power = starpu_get_env_float_default("STARPU_IDLE_POWER", 0.0); /* read from the STARPU_IDLE_POWER environment variable, with 0.0 passed as the default */
			
 
				+	return _starpu_simgrid_dynamic_energy + idle_power * starpu_timing_now() / 1000000; /* presumably starpu_timing_now() is in microseconds, hence the 1000000 divisor - confirm */
			
 
				+}
			
 
				 
			
 
				 #endif
			
--- a/src/core/simgrid.h
+++ b/src/core/simgrid.h
@@ -66,7 +66,7 @@ void _starpu_simgrid_deinit_late(void);
 
				 void _starpu_simgrid_actor_setup(void);
			
 
				 void _starpu_simgrid_wait_tasks(int workerid);
			
 
				 struct _starpu_job;
			
 
				-void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *job, struct starpu_perfmodel_arch* perf_arch, double length, unsigned *finished);
			
 
				+void _starpu_simgrid_submit_job(int workerid, int sched_ctx_id, struct _starpu_job *job, struct starpu_perfmodel_arch* perf_arch, double length, double energy, unsigned *finished);
			
 
				 struct _starpu_data_request;
			
 
				 int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node, struct _starpu_data_request *req);
			
 
				 union _starpu_async_channel_event;
			
--- a/src/core/task.c
+++ b/src/core/task.c
@@ -30,6 +30,7 @@
 
				 #include <common/utils.h>
			
 
				 #include <common/fxt.h>
			
 
				 #include <common/knobs.h>
			
 
				+#include <datawizard/memory_nodes.h>
			
 
				 #include <profiling/profiling.h>
			
 
				 #include <profiling/bound.h>
			
 
				 #include <math.h>
			
--- a/src/core/topology.c
+++ b/src/core/topology.c
@@ -1983,7 +1983,11 @@ int _starpu_bind_thread_on_cpu(int cpuid STARPU_ATTRIBUTE_UNUSED, int workerid S
 
				 		{
			
 
				 			cpu_worker[cpuid] = workerid;
			
 
				 			if (name)
			
 
				+			{
			
 
				+				if (cpu_name[cpuid])
			
 
				+					free(cpu_name[cpuid]);
			
 
				 				cpu_name[cpuid] = strdup(name);
			
 
				+			}
			
 
				 		}
			
 
				 	}
			
 
				 
			
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -1059,6 +1059,7 @@ int starpu_conf_init(struct starpu_conf *conf)
 
				 
			
 
				 	memset(conf, 0, sizeof(*conf));
			
 
				 	conf->magic = 42;
			
 
				+	conf->will_use_mpi = 0;
			
 
				 	conf->sched_policy_name = starpu_getenv("STARPU_SCHED");
			
 
				 	conf->sched_policy = NULL;
			
 
				 	conf->global_sched_ctx_min_priority = starpu_get_env_number("STARPU_MIN_PRIO");
			
@@ -1666,6 +1667,15 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 
				 
			
 
				 	_starpu_catch_signals();
			
 
				 
			
 
				+	/* if MPI is enabled, binding display will be done later, after MPI initialization */
			
 
				+	if (!_starpu_config.conf.will_use_mpi && starpu_get_env_number_default("STARPU_DISPLAY_BINDINGS", 0))
			
 
				+	{
			
 
				+		fprintf(stdout, "== Binding ==\n");
			
 
				+		starpu_display_bindings();
			
 
				+		fprintf(stdout, "== End of binding ==\n");
			
 
				+		fflush(stdout);
			
 
				+	}
			
 
				+
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
@@ -2644,31 +2654,37 @@ int starpu_worker_get_relax_state(void)
 
				 	return _starpu_worker_get_relax_state();
			
 
				 }
			
 
				 
			
 
				+#undef starpu_worker_lock
			
 
				 void starpu_worker_lock(int workerid)
			
 
				 {
			
 
				 	_starpu_worker_lock(workerid);
			
 
				 }
			
 
				 
			
 
				+#undef starpu_worker_trylock
			
 
				 int starpu_worker_trylock(int workerid)
			
 
				 {
			
 
				 	return _starpu_worker_trylock(workerid);
			
 
				 }
			
 
				 
			
 
				+#undef starpu_worker_unlock
			
 
				 void starpu_worker_unlock(int workerid)
			
 
				 {
			
 
				 	_starpu_worker_unlock(workerid);
			
 
				 }
			
 
				 
			
 
				+#undef starpu_worker_lock_self
			
 
				 void starpu_worker_lock_self(void)
			
 
				 {
			
 
				 	_starpu_worker_lock_self();
			
 
				 }
			
 
				 
			
 
				+#undef starpu_worker_unlock_self
			
 
				 void starpu_worker_unlock_self(void)
			
 
				 {
			
 
				 	_starpu_worker_unlock_self();
			
 
				 }
			
 
				 
			
 
				+#undef starpu_wake_worker_relax
			
 
				 int starpu_wake_worker_relax(int workerid)
			
 
				 {
			
 
				 	return _starpu_wake_worker_relax(workerid);
			
--- a/src/core/workers.h
+++ b/src/core/workers.h
@@ -203,6 +203,10 @@ LIST_TYPE(_starpu_worker,
 
				 
			
 
				 	int enable_knob;
			
 
				 	int bindid_requested;
			
 
				+
			
 
				+	/* Keep this last, to make sure to separate worker data in separate
			
 
				+	  cache lines. */
			
 
				+	char padding[STARPU_CACHELINE_SIZE];
			
 
				 );
			
 
				 
			
 
				 struct _starpu_combined_worker
			
@@ -223,6 +227,10 @@ struct _starpu_combined_worker
 
				 #ifdef STARPU_HAVE_HWLOC
			
 
				 	hwloc_bitmap_t hwloc_cpu_set;
			
 
				 #endif
			
 
				+
			
 
				+	/* Keep this last, to make sure to separate worker data in separate
			
 
				+	  cache lines. */
			
 
				+	char padding[STARPU_CACHELINE_SIZE];
			
 
				 };
			
 
				 
			
 
				 /**
			
@@ -389,6 +397,9 @@ struct _starpu_machine_config
 
				 	/** Memory node for MPI, if only one */
			
 
				 	int mpi_nodeid;
			
 
				 
			
 
				+	/* Separate out previous variables from per-worker data. */
			
 
				+	char padding1[STARPU_CACHELINE_SIZE];
			
 
				+
			
 
				 	/** Basic workers : each of this worker is running its own driver and
			
 
				 	 * can be combined with other basic workers. */
			
 
				 	struct _starpu_worker workers[STARPU_NMAXWORKERS];
			
@@ -397,6 +408,11 @@ struct _starpu_machine_config
 
				 	 * that can run parallel tasks together. */
			
 
				 	struct _starpu_combined_worker combined_workers[STARPU_NMAX_COMBINEDWORKERS];
			
 
				 
			
 
				+	starpu_pthread_mutex_t submitted_mutex;
			
 
				+
			
 
				+	/* Separate out previous mutex from the rest of the data. */
			
 
				+	char padding2[STARPU_CACHELINE_SIZE];
			
 
				+
			
 
				 	/** Translation table from bindid to worker IDs */
			
 
				 	struct
			
 
				 	{
			
@@ -432,8 +448,6 @@ struct _starpu_machine_config
 
				 
			
 
				 	/** When >0, StarPU should stop performance counters collection. */
			
 
				 	int perf_counter_pause_depth;
			
 
				-
			
 
				-	starpu_pthread_mutex_t submitted_mutex;
			
 
				 };
			
 
				 
			
 
				 extern int _starpu_worker_parallel_blocks;
			
@@ -1103,6 +1117,7 @@ static inline void _starpu_worker_lock(int workerid)
 
				 		STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
			
 
				 	}
			
 
				 }
			
 
				+#define starpu_worker_lock _starpu_worker_lock
			
 
				 
			
 
				 static inline int _starpu_worker_trylock(int workerid)
			
 
				 {
			
@@ -1133,6 +1148,7 @@ static inline int _starpu_worker_trylock(int workerid)
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&cur_worker->sched_mutex);
			
 
				 	return ret;
			
 
				 }
			
 
				+#define starpu_worker_trylock _starpu_worker_trylock
			
 
				 
			
 
				 static inline void _starpu_worker_unlock(int workerid)
			
 
				 {
			
@@ -1145,6 +1161,7 @@ static inline void _starpu_worker_unlock(int workerid)
 
				 		starpu_worker_relax_off();
			
 
				 	}
			
 
				 }
			
 
				+#define starpu_worker_unlock _starpu_worker_unlock
			
 
				 
			
 
				 static inline void _starpu_worker_lock_self(void)
			
 
				 {
			
@@ -1153,6 +1170,7 @@ static inline void _starpu_worker_lock_self(void)
 
				 	STARPU_ASSERT(worker != NULL);
			
 
				 	STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
			
 
				 }
			
 
				+#define starpu_worker_lock_self _starpu_worker_lock_self
			
 
				 
			
 
				 static inline void _starpu_worker_unlock_self(void)
			
 
				 {
			
@@ -1161,6 +1179,7 @@ static inline void _starpu_worker_unlock_self(void)
 
				 	STARPU_ASSERT(worker != NULL);
			
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
			
 
				 }
			
 
				+#define starpu_worker_unlock_self _starpu_worker_unlock_self
			
 
				 
			
 
				 static inline int _starpu_wake_worker_relax(int workerid)
			
 
				 {
			
@@ -1169,6 +1188,7 @@ static inline int _starpu_wake_worker_relax(int workerid)
 
				 	_starpu_worker_unlock(workerid);
			
 
				 	return ret;
			
 
				 }
			
 
				+#define starpu_wake_worker_relax _starpu_wake_worker_relax
			
 
				 
			
 
				 int starpu_wake_worker_relax_light(int workerid);
			
 
				 
			
--- a/src/datawizard/data_request.c
+++ b/src/datawizard/data_request.c
@@ -25,6 +25,9 @@
 
				 #include <core/simgrid.h>
			
 
				 
			
 
				 /* requests that have not been treated at all */
			
 
				+#ifdef STARPU_DEVEL
			
 
				+#warning split into separate out/in queues for each node, so that MAX_PENDING_REQUESTS_PER_NODE is separate for them, since the links are bidirectionnal
			
 
				+#endif
			
 
				 static struct _starpu_data_request_prio_list data_requests[STARPU_MAXNODES];
			
 
				 static struct _starpu_data_request_prio_list prefetch_requests[STARPU_MAXNODES];
			
 
				 static struct _starpu_data_request_prio_list idle_requests[STARPU_MAXNODES];
			
--- a/src/datawizard/filters.c
+++ b/src/datawizard/filters.c
@@ -21,6 +21,7 @@
 
				 #include <datawizard/filters.h>
			
 
				 #include <datawizard/footprint.h>
			
 
				 #include <datawizard/interfaces/data_interface.h>
			
 
				+#include <datawizard/memory_nodes.h>
			
 
				 #include <core/task.h>
			
 
				 
			
 
				 /*
			
--- a/src/datawizard/interfaces/bcsr_interface.c
+++ b/src/datawizard/interfaces/bcsr_interface.c
@@ -15,6 +15,9 @@
 
				  */
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#ifdef BUILDING_STARPU
			
 
				+#include <datawizard/memory_nodes.h>
			
 
				+#endif
			
 
				 
			
 
				 /*
			
 
				  * BCSR : blocked CSR, we use blocks of size (r x c)
			
--- a/src/datawizard/interfaces/block_interface.c
+++ b/src/datawizard/interfaces/block_interface.c
@@ -15,6 +15,9 @@
 
				  */
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#ifdef BUILDING_STARPU
			
 
				+#include <datawizard/memory_nodes.h>
			
 
				+#endif
			
 
				 
			
 
				 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
			
 
				 
			
--- a/src/datawizard/interfaces/coo_interface.c
+++ b/src/datawizard/interfaces/coo_interface.c
@@ -15,6 +15,9 @@
 
				  */
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#ifdef BUILDING_STARPU
			
 
				+#include <datawizard/memory_nodes.h>
			
 
				+#endif
			
 
				 
			
 
				 static int
			
 
				 copy_any_to_any(void *src_interface, unsigned src_node,
			
--- a/src/datawizard/interfaces/csr_interface.c
+++ b/src/datawizard/interfaces/csr_interface.c
@@ -16,6 +16,9 @@
 
				  */
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#ifdef BUILDING_STARPU
			
 
				+#include <datawizard/memory_nodes.h>
			
 
				+#endif
			
 
				 
			
 
				 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
			
 
				 
			
--- a/src/datawizard/interfaces/matrix_interface.c
+++ b/src/datawizard/interfaces/matrix_interface.c
@@ -15,6 +15,9 @@
 
				  */
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#ifdef BUILDING_STARPU
			
 
				+#include <datawizard/memory_nodes.h>
			
 
				+#endif
			
 
				 
			
 
				 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
			
 
				 
			
--- a/src/datawizard/interfaces/multiformat_interface.c
+++ b/src/datawizard/interfaces/multiformat_interface.c
@@ -15,6 +15,9 @@
 
				  */
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#ifdef BUILDING_STARPU
			
 
				+#include <datawizard/memory_nodes.h>
			
 
				+#endif
			
 
				 
			
 
				 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
			
 
				 #ifdef STARPU_USE_CUDA
			
--- a/src/datawizard/interfaces/tensor_interface.c
+++ b/src/datawizard/interfaces/tensor_interface.c
@@ -15,6 +15,9 @@
 
				  */
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#ifdef BUILDING_STARPU
			
 
				+#include <datawizard/memory_nodes.h>
			
 
				+#endif
			
 
				 
			
 
				 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
			
 
				 
			
--- a/src/datawizard/interfaces/variable_interface.c
+++ b/src/datawizard/interfaces/variable_interface.c
@@ -15,6 +15,9 @@
 
				  */
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#ifdef BUILDING_STARPU
			
 
				+#include <datawizard/memory_nodes.h>
			
 
				+#endif
			
 
				 
			
 
				 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
			
 
				 
			
--- a/src/datawizard/interfaces/vector_interface.c
+++ b/src/datawizard/interfaces/vector_interface.c
@@ -15,6 +15,9 @@
 
				  */
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#ifdef BUILDING_STARPU
			
 
				+#include <datawizard/memory_nodes.h>
			
 
				+#endif
			
 
				 
			
 
				 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
			
 
				 
			
--- a/src/datawizard/interfaces/void_interface.c
+++ b/src/datawizard/interfaces/void_interface.c
@@ -15,6 +15,9 @@
 
				  */
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#ifdef BUILDING_STARPU
			
 
				+#include <datawizard/memory_nodes.h>
			
 
				+#endif
			
 
				 
			
 
				 static int dummy_copy(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
			
 
				 
			
--- a/src/datawizard/memory_manager.c
+++ b/src/datawizard/memory_manager.c
@@ -19,6 +19,7 @@
 
				 #include <common/thread.h>
			
 
				 #include <common/fxt.h>
			
 
				 #include <datawizard/memory_manager.h>
			
 
				+#include <datawizard/memory_nodes.h>
			
 
				 #include <core/workers.h>
			
 
				 #include <starpu_stdlib.h>
			
 
				 
			
--- a/src/datawizard/reduction.c
+++ b/src/datawizard/reduction.c
@@ -22,6 +22,7 @@
 
				 #include <datawizard/datawizard.h>
			
 
				 #include <drivers/mic/driver_mic_source.h>
			
 
				 #include <drivers/mp_common/source_common.h>
			
 
				+#include <datawizard/memory_nodes.h>
			
 
				 
			
 
				 void starpu_data_set_reduction_methods(starpu_data_handle_t handle,
			
 
				 				       struct starpu_codelet *redux_cl,
			
--- a/src/datawizard/user_interactions.c
+++ b/src/datawizard/user_interactions.c
@@ -22,6 +22,7 @@
 
				 #include <datawizard/write_back.h>
			
 
				 #include <core/dependencies/data_concurrency.h>
			
 
				 #include <core/sched_policy.h>
			
 
				+#include <datawizard/memory_nodes.h>
			
 
				 
			
 
				 static void _starpu_data_check_initialized(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
			
 
				 {
			
--- a/src/datawizard/write_back.c
+++ b/src/datawizard/write_back.c
@@ -17,6 +17,7 @@
 
				 #include <datawizard/datawizard.h>
			
 
				 #include <datawizard/write_back.h>
			
 
				 #include <core/dependencies/data_concurrency.h>
			
 
				+#include <datawizard/memory_nodes.h>
			
 
				 
			
 
				 static void wt_callback(void *arg)
			
 
				 {
			
@@ -63,7 +64,7 @@ void _starpu_write_through_data(starpu_data_handle_t handle, unsigned requesting
 
				 
			
 
				 				struct _starpu_data_request *r;
			
 
				 				r = _starpu_create_request_to_fetch_data(handle, &handle->per_node[node],
			
 
				-									 STARPU_R, 1, 1, wt_callback, handle, 0, "_starpu_write_through_data");
			
 
				+									 STARPU_R, 2, 1, wt_callback, handle, 0, "_starpu_write_through_data");
			
 
				 
			
 
				 			        /* If no request was created, the handle was already up-to-date on the
			
 
				 			         * node */
			
--- a/src/debug/traces/starpu_fxt.c
+++ b/src/debug/traces/starpu_fxt.c
@@ -1194,8 +1194,8 @@ static void handle_new_mem_node(struct fxt_ev_64 *ev, struct starpu_fxt_options
 
				  */
			
 
				 static int create_ordered_stream_id (int nodeid, int devid)
			
 
				 {
			
 
				-	static int stable[MAX_MPI_NODES][STARPU_MAXCUDADEVS];
			
 
				-	STARPU_ASSERT(nodeid < MAX_MPI_NODES);
			
 
				+	static int stable[STARPU_FXT_MAX_FILES][STARPU_MAXCUDADEVS];
			
 
				+	STARPU_ASSERT(nodeid < STARPU_FXT_MAX_FILES);
			
 
				 	STARPU_ASSERT(devid < STARPU_MAXCUDADEVS);
			
 
				 	return stable[nodeid][devid]++;
			
 
				 }
			
--- a/src/debug/traces/starpu_fxt.h
+++ b/src/debug/traces/starpu_fxt.h
@@ -41,8 +41,6 @@
 
				 #include <starpu.h>
			
 
				 #include "../../../include/starpu_fxt.h"
			
 
				 
			
 
				-#define MAX_MPI_NODES 64
			
 
				-
			
 
				 extern char _starpu_last_codelet_symbol[STARPU_NMAXWORKERS][(FXT_MAX_PARAMS-5)*sizeof(unsigned long)];
			
 
				 
			
 
				 void _starpu_fxt_dag_init(char *dag_filename);
			
--- a/src/debug/traces/starpu_fxt_mpi.c
+++ b/src/debug/traces/starpu_fxt_mpi.c
@@ -103,27 +103,27 @@ int _starpu_fxt_mpi_find_sync_point(char *filename_in, uint64_t *offset, int *ke
 
				  */
			
 
				 
			
 
				 /* the list of MPI transfers found in the different traces */
			
 
				-static struct mpi_transfer *mpi_sends[MAX_MPI_NODES] = {NULL};
			
 
				-static struct mpi_transfer *mpi_recvs[MAX_MPI_NODES] = {NULL};
			
 
				+static struct mpi_transfer *mpi_sends[STARPU_FXT_MAX_FILES] = {NULL};
			
 
				+static struct mpi_transfer *mpi_recvs[STARPU_FXT_MAX_FILES] = {NULL};
			
 
				 
			
 
				 /* number of available slots in the lists  */
			
 
				-unsigned mpi_sends_list_size[MAX_MPI_NODES] = {0};
			
 
				-unsigned mpi_recvs_list_size[MAX_MPI_NODES] = {0};
			
 
				+unsigned mpi_sends_list_size[STARPU_FXT_MAX_FILES] = {0};
			
 
				+unsigned mpi_recvs_list_size[STARPU_FXT_MAX_FILES] = {0};
			
 
				 
			
 
				 /* number of slots actually used in the list  */
			
 
				-unsigned mpi_sends_used[MAX_MPI_NODES] = {0};
			
 
				-unsigned mpi_recvs_used[MAX_MPI_NODES] = {0};
			
 
				+unsigned mpi_sends_used[STARPU_FXT_MAX_FILES] = {0};
			
 
				+unsigned mpi_recvs_used[STARPU_FXT_MAX_FILES] = {0};
			
 
				 
			
 
				 /* number of slots already matched at the beginning of the list. This permits
			
 
				  * going through the lists from the beginning to match each and every
			
 
				  * transfer, thus avoiding a quadratic complexity. */
			
 
				-unsigned mpi_recvs_matched[MAX_MPI_NODES][MAX_MPI_NODES] = { {0} };
			
 
				-unsigned mpi_sends_matched[MAX_MPI_NODES][MAX_MPI_NODES] = { {0} };
			
 
				+unsigned mpi_recvs_matched[STARPU_FXT_MAX_FILES][STARPU_FXT_MAX_FILES] = { {0} };
			
 
				+unsigned mpi_sends_matched[STARPU_FXT_MAX_FILES][STARPU_FXT_MAX_FILES] = { {0} };
			
 
				 
			
 
				 void _starpu_fxt_mpi_add_send_transfer(int src, int dst STARPU_ATTRIBUTE_UNUSED, long mpi_tag, size_t size, float date, long jobid, unsigned long handle)
			
 
				 {
			
 
				 	STARPU_ASSERT(src >= 0);
			
 
				-	if (src >= MAX_MPI_NODES)
			
 
				+	if (src >= STARPU_FXT_MAX_FILES)
			
 
				 		return;
			
 
				 	unsigned slot = mpi_sends_used[src]++;
			
 
				 
			
@@ -153,7 +153,7 @@ void _starpu_fxt_mpi_add_send_transfer(int src, int dst STARPU_ATTRIBUTE_UNUSED,
 
				 
			
 
				 void _starpu_fxt_mpi_add_recv_transfer(int src STARPU_ATTRIBUTE_UNUSED, int dst, long mpi_tag, float date, long jobid, unsigned long handle)
			
 
				 {
			
 
				-	if (dst >= MAX_MPI_NODES)
			
 
				+	if (dst >= STARPU_FXT_MAX_FILES)
			
 
				 		return;
			
 
				 	unsigned slot = mpi_recvs_used[dst]++;
			
 
				 
			
@@ -220,11 +220,11 @@ static unsigned long mpi_com_id = 0;
 
				 
			
 
				 static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comms_file, unsigned n)
			
 
				 {
			
 
				-	unsigned slot[MAX_MPI_NODES] = { 0 }, node;
			
 
				+	unsigned slot[STARPU_FXT_MAX_FILES] = { 0 }, node;
			
 
				 	unsigned nb_wrong_comm_timing = 0;
			
 
				 	struct mpi_transfer_list pending_receives; /* Sorted list of matches which have not happened yet */
			
 
				-	double current_out_bandwidth[MAX_MPI_NODES] = { 0. };
			
 
				-	double current_in_bandwidth[MAX_MPI_NODES] = { 0. };
			
 
				+	double current_out_bandwidth[STARPU_FXT_MAX_FILES] = { 0. };
			
 
				+	double current_in_bandwidth[STARPU_FXT_MAX_FILES] = { 0. };
			
 
				 #ifdef STARPU_HAVE_POTI
			
 
				 	char mpi_container[STARPU_POTI_STR_LEN];
			
 
				 #endif
			
@@ -246,7 +246,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 
				 		else
			
 
				 			start_date = mpi_transfer_list_front(&pending_receives)->date;
			
 
				 
			
 
				-		src = MAX_MPI_NODES;
			
 
				+		src = STARPU_FXT_MAX_FILES;
			
 
				 		for (node = 0; node < n; node++)
			
 
				 		{
			
 
				 			if (slot[node] < mpi_sends_used[node] && mpi_sends[node][slot[node]].date < start_date)
			
@@ -260,7 +260,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 
				 			/* No event any more, we're finished! */
			
 
				 			break;
			
 
				 
			
 
				-		if (src == MAX_MPI_NODES)
			
 
				+		if (src == STARPU_FXT_MAX_FILES)
			
 
				 		{
			
 
				 			/* Pending match is earlier than all new sends, finish its communication */
			
 
				 			match = mpi_transfer_list_pop_front(&pending_receives);
			
@@ -284,7 +284,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 
				 		size_t size = cur->size;
			
 
				 		unsigned long send_handle = cur->handle;
			
 
				 
			
 
				-		if (dst < MAX_MPI_NODES)
			
 
				+		if (dst < STARPU_FXT_MAX_FILES)
			
 
				 			match = try_to_match_send_transfer(src, dst, mpi_tag);
			
 
				 		else
			
 
				 			match = NULL;
			
@@ -377,10 +377,10 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 
				 
			
 
				 void _starpu_fxt_display_mpi_transfers(struct starpu_fxt_options *options, int *ranks STARPU_ATTRIBUTE_UNUSED, FILE *out_paje_file, FILE* out_comms_file)
			
 
				 {
			
 
				-	if (options->ninputfiles > MAX_MPI_NODES)
			
 
				+	if (options->ninputfiles > STARPU_FXT_MAX_FILES)
			
 
				 	{
			
 
				-		_STARPU_DISP("Warning: %u files given, maximum %u supported, truncating to %u\n", options->ninputfiles, MAX_MPI_NODES, MAX_MPI_NODES);
			
 
				-		options->ninputfiles = MAX_MPI_NODES;
			
 
				+		_STARPU_DISP("Warning: %u files given, maximum %u supported, truncating to %u\n", options->ninputfiles, STARPU_FXT_MAX_FILES, STARPU_FXT_MAX_FILES);
			
 
				+		options->ninputfiles = STARPU_FXT_MAX_FILES;
			
 
				 	}
			
 
				 
			
 
				 	/* display the MPI transfers if possible */
			
--- a/src/drivers/cpu/driver_cpu.c
+++ b/src/drivers/cpu/driver_cpu.c
@@ -108,7 +108,10 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 
				 				_SIMGRID_TIMER_END;
			
 
				 			}
			
 
				 			else
			
 
				-				_starpu_simgrid_submit_job(cpu_args->workerid, j, perf_arch, NAN, NULL);
			
 
				+			{
			
 
				+				struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(cpu_args, j);
			
 
				+				_starpu_simgrid_submit_job(cpu_args->workerid, sched_ctx->id, j, perf_arch, NAN, NAN, NULL);
			
 
				+			}
			
 
				 #else
			
 
				 #  ifdef STARPU_PAPI
			
 
				 			_starpu_profiling_papi_task_start_counters(task);
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -531,8 +531,11 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worke
 
				 				_SIMGRID_TIMER_END;
			
 
				 			}
			
 
				 		else
			
 
				-			_starpu_simgrid_submit_job(workerid, j, &worker->perf_arch, NAN,
			
 
				+		{
			
 
				+			struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
			
 
				+			_starpu_simgrid_submit_job(workerid, sched_ctx->id, j, &worker->perf_arch, NAN, NAN,
			
 
				 				async ? &task_finished[workerid][pipeline_idx] : NULL);
			
 
				+		}
			
 
				 #else
			
 
				 #ifdef HAVE_LIBNVIDIA_ML
			
 
				 		unsigned long long energy_start = 0;
			
--- a/src/drivers/disk/driver_disk.c
+++ b/src/drivers/disk/driver_disk.c
@@ -21,6 +21,7 @@
 
				 #include <drivers/disk/driver_disk.h>
			
 
				 #include <drivers/cpu/driver_cpu.h>
			
 
				 #include <datawizard/coherency.h>
			
 
				+#include <datawizard/memory_nodes.h>
			
 
				 
			
 
				 int _starpu_disk_copy_src_to_disk(void * src, unsigned src_node, void * dst, size_t dst_offset, unsigned dst_node, size_t size, void * async_channel)
			
 
				 {
			
--- a/src/drivers/driver_common/driver_common.c
+++ b/src/drivers/driver_common/driver_common.c
@@ -27,6 +27,7 @@
 
				 #include <core/sched_policy.h>
			
 
				 #include <core/debug.h>
			
 
				 #include <core/task.h>
			
 
				+#include <datawizard/memory_nodes.h>
			
 
				 
			
 
				 
			
 
				 void _starpu_driver_start_job(struct _starpu_worker *worker, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, int rank, int profiling)
			
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -948,6 +948,7 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 
				 		_STARPU_TRACE_START_EXECUTING();
			
 
				 #ifdef STARPU_SIMGRID
			
 
				 		double length = NAN;
			
 
				+		double energy = NAN;
			
 
				 		int async = task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC;
			
 
				 		int simulate = 1;
			
 
				 		if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE && !async)
			
@@ -976,6 +977,7 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 
				 #else
			
 
				 			length = ((double) profiling_info->used_cycles)/MSG_get_host_speed(MSG_host_self());
			
 
				 #endif
			
 
				+			energy = info->energy_consumed;
			
 
				 			/* And give the simulated time to simgrid */
			
 
				 			simulate = 1;
			
 
				 #endif
			
@@ -989,8 +991,11 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 
				 			}
			
 
				 
			
 
				 		if (simulate)
			
 
				-			_starpu_simgrid_submit_job(worker->workerid, j, &worker->perf_arch, length,
			
 
				+		{
			
 
				+			struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
			
 
				+			_starpu_simgrid_submit_job(worker->workerid, sched_ctx->id, j, &worker->perf_arch, length, energy,
			
 
				 						   async ? &task_finished[worker->devid][pipeline_idx] : NULL);
			
 
				+		}
			
 
				 #else
			
 
				 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
			
 
				 
			
--- a/src/profiling/bound.c
+++ b/src/profiling/bound.c
@@ -26,6 +26,7 @@
 
				 #include <profiling/bound.h>
			
 
				 #include <core/jobs.h>
			
 
				 #include <core/workers.h>
			
 
				+#include <datawizard/memory_nodes.h>
			
 
				 
			
 
				 #ifdef STARPU_HAVE_GLPK_H
			
 
				 #include <glpk.h>
			
--- a/src/profiling/profiling.c
+++ b/src/profiling/profiling.c
@@ -29,6 +29,8 @@
 
				 #include <papi.h>
			
 
				 #endif
			
 
				 
			
 
				+/* TODO: move to worker structure */
			
 
				+
			
 
				 static struct starpu_profiling_worker_info worker_info[STARPU_NMAXWORKERS];
			
 
				 /* TODO: rather use rwlock */
			
 
				 static starpu_pthread_mutex_t worker_info_mutex[STARPU_NMAXWORKERS];
			
@@ -44,6 +46,7 @@ static struct timespec executing_start_date[STARPU_NMAXWORKERS];
 
				 #ifdef STARPU_PAPI
			
 
				 static int papi_events[PAPI_MAX_HWCTRS];
			
 
				 static int papi_nevents = 0;
			
 
				+static int warned_component_unavailable = 0;
			
 
				 #endif
			
 
				 
			
 
				 /* Store the busid of the different (src, dst) pairs. busid_matrix[src][dst]
			
@@ -158,7 +161,7 @@ void _starpu_profiling_init(void)
 
				 		conf_papi_events = starpu_getenv("STARPU_PROF_PAPI_EVENTS");
			
 
				 		if (conf_papi_events != NULL)
			
 
				 		{
			
 
				-			while ((papi_event_name = strtok_r(conf_papi_events, " ", &conf_papi_events)))
			
 
				+			while ((papi_event_name = strtok_r(conf_papi_events, " ,", &conf_papi_events)))
			
 
				 			{
			
 
				 				_STARPU_DEBUG("Loading PAPI Event:%s\n", papi_event_name);
			
 
				 				retval = PAPI_event_name_to_code ((char*)papi_event_name, &papi_events[papi_nevents]);
			
@@ -186,7 +189,12 @@ void _starpu_profiling_papi_task_start_counters(struct starpu_task *task)
 
				 		PAPI_create_eventset(&profiling_info->papi_event_set);
			
 
				 		for(int i=0; i<papi_nevents; i++)
			
 
				 		{
			
 
				-			PAPI_add_event(profiling_info->papi_event_set, papi_events[i]);
			
 
				+			int ret = PAPI_add_event(profiling_info->papi_event_set, papi_events[i]);
			
 
				+			if (ret == PAPI_ECMP_DISABLED && !warned_component_unavailable)
			
 
				+			{
			
 
				+				_STARPU_MSG("Error while registering Papi event: Component containing event is disabled. Try running `papi_component_avail` to get more information.\n");
			
 
				+				warned_component_unavailable = 1;
			
 
				+			}
			
 
				 			profiling_info->papi_values[i]=0;
			
 
				 		}
			
 
				 		PAPI_reset(profiling_info->papi_event_set);
			
--- a/src/profiling/profiling_helpers.c
+++ b/src/profiling/profiling_helpers.c
@@ -99,8 +99,9 @@ void _starpu_profiling_worker_helper_display_summary(FILE *stream)
 
				 	for (workerid = 0; workerid < worker_cnt; workerid++)
			
 
				 	{
			
 
				 		struct starpu_profiling_worker_info info;
			
 
				-		starpu_profiling_worker_get_info(workerid, &info);
			
 
				+		int ret = starpu_profiling_worker_get_info(workerid, &info);
			
 
				 		char name[64];
			
 
				+		STARPU_ASSERT(!ret);
			
 
				 
			
 
				 		starpu_worker_get_name(workerid, name, sizeof(name));
			
 
				 
			
--- a/src/sched_policies/component_best_implementation.c
+++ b/src/sched_policies/component_best_implementation.c
@@ -19,7 +19,9 @@
 
				 
			
 
				 #include <starpu_sched_component.h>
			
 
				 #include <starpu_scheduler.h>
			
 
				+#ifdef BUILDING_STARPU
			
 
				 #include <core/workers.h>
			
 
				+#endif
			
 
				 
			
 
				 /* return true if workerid can execute task, and fill task->predicted and task->predicted_transfer
			
 
				  *  according to best implementation predictions
			
@@ -39,12 +41,11 @@ static int find_best_impl(unsigned sched_ctx_id, struct starpu_task * task, int
 
				 	}
			
 
				 	else
			
 
				 	{
			
 
				-		struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid, sched_ctx_id);
			
 
				 		for(impl = 0; impl < STARPU_MAXIMPLEMENTATIONS; impl++)
			
 
				 		{
			
 
				 			if(starpu_worker_can_execute_task(workerid, task, impl))
			
 
				 			{
			
 
				-				double d = starpu_task_expected_length(task, archtype, impl);
			
 
				+				double d = starpu_task_worker_expected_length(task, workerid, sched_ctx_id, impl);
			
 
				 				if(isnan(d))
			
 
				 				{
			
 
				 					best_impl = impl;
			
--- a/src/sched_policies/component_eager.c
+++ b/src/sched_policies/component_eager.c
@@ -16,6 +16,9 @@
 
				 
			
 
				 #include <starpu_sched_component.h>
			
 
				 #include <starpu_scheduler.h>
			
 
				+#ifdef BUILDING_STARPU
			
 
				+#include <core/workers.h>
			
 
				+#endif
			
 
				 
			
 
				 struct _starpu_eager_data
			
 
				 {
			
--- a/src/sched_policies/component_heft.c
+++ b/src/sched_policies/component_heft.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2013-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				  * Copyright (C) 2013       Simon Archipoff
			
 
				+ * Copyright (C) 2020       Télécom-Sud Paris
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/src/sched_policies/component_heteroprio.c
+++ b/src/sched_policies/component_heteroprio.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2013-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				  * Copyright (C) 2013       Simon Archipoff
			
 
				+ * Copyright (C) 2020       Télécom-Sud Paris
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/src/sched_policies/component_mct.c
+++ b/src/sched_policies/component_mct.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2013-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
			
 
				  * Copyright (C) 2013       Simon Archipoff
			
 
				+ * Copyright (C) 2020       Télécom-Sud Paris
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
--- a/src/sched_policies/component_sched.c
+++ b/src/sched_policies/component_sched.c