5 år sedan · bc23f1b516
--- a/ChangeLog
+++ b/ChangeLog
@@ -31,9 +31,12 @@ New features:
 
																     files. This file can be parsed by the new script
															
 
																     starpu_fxt_number_events_to_names.py to convert event keys to event names.
															
 
																   * New STARPU_PER_WORKER perfmodel.
															
 
																+  * Add energy accounting in the simgrid mode: starpu_energy_use() and
															
 
																+    starpu_energy_used().
															
 
																 Small changes:
															
 
																   * Use the S4U interface of Simgrid instead of xbt and MSG.
															
 
																+  * Add a synthetic energy efficiency testcase.
															
 
																 StarPU 1.3.4 (git revision xxx)
															
 
																 ==============================================
															
@@ -60,6 +63,11 @@ Small features:
 
																   * New STARPU_BACKOFF_MIN and STARPU_BACKOFF_MAX environment variables to set the
															
 
																     exponential backoff limits of the number of cycles to pause while drivers
															
 
																     are spinning.
															
 
																+  * Add STARPU_DISPLAY_BINDINGS environment variable and
															
 
																+    starpu_display_bindings() function to display all bindings on the machine by
															
 
																+    calling hwloc-ps.
															
 
																+Small changes:
															
 
																+  * New configure option --disable-build-doc-pdf
															
 
																 StarPU 1.3.3 (git revision 11afc5b007fe1ab1c729b55b47a5a98ef7f3cfad)
															
 
																 ====================================================================
															
--- a/configure.ac
+++ b/configure.ac
@@ -2245,6 +2245,14 @@ AC_MSG_RESULT($nmaxbuffers)
 
																 AC_DEFINE_UNQUOTED(STARPU_NMAXBUFS, [$nmaxbuffers],
															
 
																 		[how many buffers can be manipulated per task])
															
 
																+AC_MSG_CHECKING(how many MPI nodes fxt files can be manipulated when generating traces)
															
 
																+AC_ARG_ENABLE(fxt-max-files, [AS_HELP_STRING([--enable-fxt-max-files=<nbuffers>],
															
 
																+			[maximum number of mpi nodes for traces])],
															
 
																+			nmaxfxtfiles=$enableval, nmaxfxtfiles=64)
															
 
																+AC_MSG_RESULT($nmaxfxtfiles)
															
 
																+AC_DEFINE_UNQUOTED(STARPU_FXT_MAX_FILES, [$nmaxfxtfiles],
															
 
																+		[how many MPI nodes fxt files can be manipulated when generating traces])
															
 
																+
															
 
																 AC_MSG_CHECKING(maximum number of memory nodes to use per MPI rank)
															
 
																 AC_ARG_ENABLE(maxnodes, [AS_HELP_STRING([--enable-maxnodes=<nnodes>],
															
 
																 			[maximum number of memory nodes per MPI rank])],
															
@@ -2537,6 +2545,7 @@ if test "x$enable_build_fortran_requested" = "xyes" ; then
 
																                  fi
															
 
																 	else
															
 
																 		if $FC -V 2>&1|grep -q 'Intel(R) Fortran'; then
															
 
																+			enable_build_fortran="yes"
															
 
																 			ifort_fc_version=`$FC -V 2>&1 |head -1|sed 's/.*Version //;s/ Build.*//'`
															
 
																 			ifort_maj_version=`echo $ifort_fc_version|cut -d. -f1`
															
@@ -2553,38 +2562,28 @@ if test "x$enable_build_fortran_requested" = "xyes" ; then
 
																 				enable_build_fortran="no"
															
 
																 			else
															
 
																 				AC_MSG_WARN(Fortran compiler has not been tested for StarPU native Fortran support)
															
 
																+				 enable_build_fortran="yes"
															
 
																 			fi
															
 
																 		fi
															
 
																 	fi
															
 
																 	if test "x$enable_build_fortran" = "xyes" ; then
															
 
																 		AC_DEFINE(STARPU_HAVE_FC, [1], [Define this if a Fortran compiler is available])
															
 
																-		if test x$build_mpi_lib = xyes -o x$build_mpi_master_slave = xyes ; then
															
 
																-			AC_ARG_WITH(mpifort, [AS_HELP_STRING([--with-mpifort[=<path to mpifort>]],
															
 
																-				    [Path of the mpifort compiler])],
															
 
																-				    [
															
 
																-				     if test x$withval = xyes; then
															
 
																-					     AC_MSG_ERROR(--with-mpifort must be given a pathname)
															
 
																-					     else
															
 
																-						     mpifort_path=$withval
															
 
																-					     fi
															
 
																-					     ],
															
 
																-					     [
															
 
																-					      if test x$enable_simgrid = xyes ; then
															
 
																-						      DEFAULT_MPIFORT=smpifort
															
 
																-					      else
															
 
																-						      DEFAULT_MPIFORT=mpif90
															
 
																-					      fi
															
 
																-					      case $DEFAULT_MPIFORT in
															
 
																-					      	/*) mpifort_path="$DEFAULT_MPIFORT" ;;
															
 
																-					        *)  AC_PATH_PROG(mpifort_path, $DEFAULT_MPIFORT, [no], [$MPIPATH])
															
 
																-					      esac
															
 
																-					      ])
															
 
																-
															
 
																+		if test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes -o x$build_mpi_master_slave = xyes ; then
															
 
																+			#Check MPIFORT
															
 
																+			if test x$enable_simgrid = xyes ; then
															
 
																+				DEFAULT_MPIFORT=smpifort
															
 
																+			else
															
 
																+				DEFAULT_MPIFORT=mpifort
															
 
																+			fi
															
 
																+			AC_ARG_WITH(mpifort, [AS_HELP_STRING([--with-mpifort=<mpifort name or path to mpifort>], [Name or path of the mpifort compiler])], [DEFAULT_MPIFORT=$withval])
															
 
																+			case $DEFAULT_MPIFORT in
															
 
																+				/*) mpifort_path="$DEFAULT_MPIFORT" ;;
															
 
																+				*)  AC_PATH_PROG(mpifort_path, $DEFAULT_MPIFORT, [no], [$simgrid_dir/bin:$PATH]) ;;
															
 
																+			esac
															
 
																 			# We test if the MPIFORT compiler exists
															
 
																 			if test ! -x $mpifort_path; then
															
 
																-				#MPIFORT does not exists or is not executable
															
 
																 				AC_MSG_RESULT(The mpifort compiler '$mpifort_path' does not have the execute permission)
															
 
																-				use_mpi_fort=no
															
 
																+				mpifort_path=no
															
 
																 			else
															
 
																 				OLD_CC=$CC
															
 
																 				CC=$mpicc_path
															
@@ -2599,11 +2598,18 @@ if test "x$enable_build_fortran_requested" = "xyes" ; then
 
																 				CC=$OLD_CC
															
 
																 				if test "x$use_mpi_fort" = xyes; then
															
 
																 					AC_DEFINE([HAVE_MPI_COMM_F2C], [1], [Function MPI_Comm_f2c is available])
															
 
																-					AC_MSG_CHECKING(mpifort path)
															
 
																-					AC_MSG_RESULT($mpifort_path)
															
 
																-					AC_SUBST(MPIFORT, $mpifort_path)
															
 
																 				fi
															
 
																 			fi
															
 
																+
															
 
																+			AC_MSG_CHECKING(whether mpifort is available)
															
 
																+			AC_MSG_RESULT($mpifort_path)
															
 
																+			AC_SUBST(MPIFORT, $mpifort_path)
															
 
																+
															
 
																+			if test x$mpifort_path != xno ; then
															
 
																+				MPIPATH=$(dirname $mpifort_path):$PATH
															
 
																+			else
															
 
																+				MPIPATH=$PATH
															
 
																+			fi
															
 
																 		fi
															
 
																 	fi
															
 
																    fi
															
@@ -3413,34 +3419,51 @@ AC_ARG_ENABLE(build-doc, [AS_HELP_STRING([--disable-build-doc],
 
																 			[disable building of documentation])],
															
 
																 			enable_build_doc=$enableval, enable_build_doc=yes)
															
 
																-if test "$enable_build_doc" = "yes" ; then
															
 
																-   # Check whether doxygen needed tools are installed
															
 
																-   AC_PATH_PROG(doxygencommand, doxygen)
															
 
																-   if test "$doxygencommand" = "" ; then
															
 
																-      	enable_build_doc="no"
															
 
																-   fi
															
 
																-   AC_PATH_PROG(pdflatexcommand, pdflatex)
															
 
																-   if test "$pdflatexcommand" = "" ; then
															
 
																-	enable_build_doc="no"
															
 
																-   fi
															
 
																-   AC_PATH_PROG(epstopdfcommand, epstopdf)
															
 
																-   if test "$epstopdfcommand" = "" ; then
															
 
																-	enable_build_doc="no"
															
 
																-   fi
															
 
																+AC_ARG_ENABLE(build-doc-pdf, [AS_HELP_STRING([--enable-build-doc-pdf],
															
 
																+			[enable building of PDF documentation])],
															
 
																+			enable_build_doc_pdf=$enableval, enable_build_doc_pdf=no)
															
 
																+
															
 
																+# Check whether doxygen needed tools are installed
															
 
																+AC_PATH_PROG(doxygencommand, doxygen)
															
 
																+if test "$doxygencommand" = "" ; then
															
 
																+   enable_build_doc="no"
															
 
																+   enable_build_doc_pdf="no"
															
 
																+fi
															
 
																+AC_PATH_PROG(pdflatexcommand, pdflatex)
															
 
																+if test "$pdflatexcommand" = "" ; then
															
 
																+   enable_build_doc_pdf="no"
															
 
																 fi
															
 
																+AC_PATH_PROG(epstopdfcommand, epstopdf)
															
 
																+if test "$epstopdfcommand" = "" ; then
															
 
																+   enable_build_doc_pdf="no"
															
 
																+fi
															
 
																+
															
 
																 available_doc="no"
															
 
																-if test -f "$srcdir/doc/doxygen/starpu.pdf" ; then
															
 
																+if test -d "$srcdir/doc/doxygen/html" ; then
															
 
																    enable_build_doc="no"
															
 
																    available_doc="yes"
															
 
																 fi
															
 
																-AC_MSG_CHECKING(whether documentation should be compiled)
															
 
																+available_doc_pdf="no"
															
 
																+if test -f "$srcdir/doc/doxygen/starpu.pdf" ; then
															
 
																+   enable_build_doc="no"
															
 
																+   enable_build_doc_pdf="no"
															
 
																+   available_doc_pdf="yes"
															
 
																+fi
															
 
																+AC_MSG_CHECKING(whether HTML documentation should be compiled)
															
 
																 AC_MSG_RESULT($enable_build_doc)
															
 
																-AC_MSG_CHECKING(whether documentation is available)
															
 
																+AC_MSG_CHECKING(whether HTML documentation is available)
															
 
																 AC_MSG_RESULT($available_doc)
															
 
																+AC_MSG_CHECKING(whether PDF documentation should be compiled)
															
 
																+AC_MSG_RESULT($enable_build_doc_pdf)
															
 
																+AC_MSG_CHECKING(whether PDF documentation is available)
															
 
																+AC_MSG_RESULT($available_doc_pdf)
															
 
																 AM_CONDITIONAL(STARPU_BUILD_DOC, [test x$enable_build_doc != xno])
															
 
																 AM_CONDITIONAL(STARPU_AVAILABLE_DOC, [test x$available_doc != xno])
															
 
																+AM_CONDITIONAL(STARPU_BUILD_DOC_PDF, [test x$enable_build_doc_pdf != xno])
															
 
																+AM_CONDITIONAL(STARPU_AVAILABLE_DOC_PDF, [test x$available_doc_pdf != xno])
															
 
																+
															
 
																 ###############################################################################
															
 
																 #                                                                             #
															
 
																 #                                Julia                                        #
															
@@ -3520,6 +3543,9 @@ AC_CONFIG_COMMANDS([executable-scripts], [
 
																   test -e tests/microbenchs/parallel_independent_homogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks.sh tests/microbenchs/
															
 
																   test -e tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh tests/microbenchs/
															
 
																   test -e tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh tests/microbenchs/
															
 
																+  mkdir -p tests/energy
															
 
																+  test -e tests/energy/static.sh || ln -sf $ac_abs_top_srcdir/tests/energy/static.sh tests/energy/
															
 
																+  test -e tests/energy/dynamic.sh || ln -sf $ac_abs_top_srcdir/tests/energy/dynamic.sh tests/energy/
															
 
																   mkdir -p tests/datawizard
															
 
																   test -e tests/datawizard/locality.sh || ln -sf $ac_abs_top_srcdir/tests/datawizard/locality.sh tests/datawizard/
															
 
																   mkdir -p tests/overlap
															
@@ -3672,8 +3698,9 @@ AC_MSG_NOTICE([
 
																 	hwloc:             $have_valid_hwloc
															
 
																 	FxT trace enabled: $use_fxt
															
 
																-        Documentation:     $enable_build_doc
															
 
																-        Examples:          $enable_build_examples
															
 
																+        Documentation HTML:  $enable_build_doc
															
 
																+        Documentation PDF:   $enable_build_doc_pdf
															
 
																+        Examples:            $enable_build_examples
															
 
																 	StarPU Extensions:
															
 
																 	       StarPU MPI enabled:                            $build_mpi_lib
															
--- a/contrib/ci.inria.fr/job-0-tarball.sh
+++ b/contrib/ci.inria.fr/job-0-tarball.sh
@@ -21,7 +21,7 @@ export LD_LIBRARY_PATH=/home/ci/usr/local/lib:$LD_LIBRARY_PATH
 
																 ./autogen.sh
															
 
																 if test -d build ; then chmod -R 777 build && rm -rf build ; fi
															
 
																 mkdir build && cd build
															
 
																-../configure
															
 
																+../configure --enable-build-doc-pdf
															
 
																 make V=1
															
 
																 make dist
															
 
																 cp *gz ..
															
--- a/doc/doxygen/Makefile.am
+++ b/doc/doxygen/Makefile.am
@@ -30,9 +30,14 @@ txtdir   = $(docdir)/manual
 
																 EXTRA_DIST =
															
 
																 if STARPU_BUILD_DOC
															
 
																+if STARPU_BUILD_DOC_PDF
															
 
																 all: $(DOX_HTML_DIR) $(DOX_PDF)
															
 
																 EXTRA_DIST += $(DOX_HTML_DIR) $(DOX_PDF)
															
 
																 txt_DATA = $(DOX_PDF)
															
 
																+else
															
 
																+all: $(DOX_HTML_DIR)
															
 
																+EXTRA_DIST += $(DOX_HTML_DIR)
															
 
																+endif
															
 
																 DOX_HTML_SRCDIR=$(DOX_HTML_DIR)
															
 
																 install-exec-hook:
															
 
																 	$(MKDIR_P) $(DESTDIR)$(docdir)/manual/html
															
@@ -41,8 +46,7 @@ uninstall-hook:
 
																 	rm -rf $(DESTDIR)$(docdir)/manual/html
															
 
																 else
															
 
																 if STARPU_AVAILABLE_DOC
															
 
																-EXTRA_DIST += $(top_srcdir)/doc/doxygen/html $(top_srcdir)/doc/doxygen/starpu.pdf
															
 
																-txt_DATA = $(top_srcdir)/doc/doxygen/starpu.pdf
															
 
																+EXTRA_DIST += $(top_srcdir)/doc/doxygen/html
															
 
																 DOX_HTML_SRCDIR=$(top_srcdir)/doc/doxygen/html
															
 
																 install-exec-hook:
															
 
																 	$(MKDIR_P) $(DESTDIR)$(docdir)/manual/html
															
@@ -50,6 +54,10 @@ install-exec-hook:
 
																 uninstall-hook:
															
 
																 	rm -rf $(DESTDIR)$(docdir)/manual/html
															
 
																 endif
															
 
																+if STARPU_AVAILABLE_DOC_PDF
															
 
																+EXTRA_DIST += $(top_srcdir)/doc/doxygen/starpu.pdf
															
 
																+txt_DATA = $(top_srcdir)/doc/doxygen/starpu.pdf
															
 
																+endif
															
 
																 endif
															
 
																 chapters =	\
															
@@ -257,6 +265,8 @@ $(DOX_TAG): $(dox_inputs)
 
																 	@$(SED) -i '/\\begin{titlepage}/,$$d' $(DOX_LATEX_DIR)/refman.tex
															
 
																 	@cat $(top_srcdir)/doc/doxygen/refman.tex >> $(DOX_LATEX_DIR)/refman.tex
															
 
																+$(DOX_HTML_DIR): $(DOX_TAG)
															
 
																+
															
 
																 $(DOX_PDF): $(DOX_TAG) refman.tex
															
 
																 	@cp $(top_srcdir)/doc/doxygen/chapters/version.sty $(DOX_LATEX_DIR)
															
 
																 	@cp $(top_srcdir)/doc/doxygen/chapters/images/*pdf $(DOX_LATEX_DIR)
															
@@ -294,5 +304,5 @@ EXTRA_DIST += doxygen.cfg refman.tex \
 
																 # Rule to update documentation on web server. Should only be used locally.
															
 
																 PUBLISHHOST	?= gforge
															
 
																 update-web: $(DOX_PDF)
															
 
																-	scp -pr starpu.pdf html $(PUBLISHHOST):/home/groups/starpu/htdocs/doc
															
 
																+	scp -pr starpu.pdf html $(PUBLISHHOST):/home/groups/starpu/htdocs/files/doc
															
--- a/doc/doxygen/chapters/301_tasks.doxy
+++ b/doc/doxygen/chapters/301_tasks.doxy
@@ -118,7 +118,7 @@ to delay the termination of a task until the termination of other tasks.
 
																 \section SettingManyDataHandlesForATask Setting Many Data Handles For a Task
															
 
																-The maximum number of data a task can manage is fixed by the environment variable
															
 
																+The maximum number of data a task can manage is fixed by the macro
															
 
																 \ref STARPU_NMAXBUFS which has a default value which can be changed
															
 
																 through the \c configure option \ref enable-maxbuffers "--enable-maxbuffers".
															
--- a/doc/doxygen/chapters/320_scheduling.doxy
+++ b/doc/doxygen/chapters/320_scheduling.doxy
@@ -185,6 +185,11 @@ already gives the good results that a precise estimation would give.
 
																 \section Energy-basedScheduling Energy-based Scheduling
															
 
																+Note: by default StarPU does not let CPU workers sleep, to let them react to
															
 
																+task release as quickly as possible. For idle time to really let CPU cores save
															
 
																+energy, one needs to use the \ref enable-blocking-drivers
															
 
																+"--enable-blocking-drivers" configuration option.
															
 
																+
															
 
																 If the application can provide some energy consumption performance model (through
															
 
																 the field starpu_codelet::energy_model), StarPU will
															
 
																 take it into account when distributing tasks. The target function that
															
--- a/doc/doxygen/chapters/380_offline_performance_tools.doxy
+++ b/doc/doxygen/chapters/380_offline_performance_tools.doxy
@@ -586,19 +586,31 @@ $ starpu_paje_sort paje.trace
 
																 \section PapiCounters PAPI counters
															
 
																 Performance counter values could be obtained from the PAPI framework if
															
 
																-<c>./configure</c> detected the libpapi. One has to set the \ref STARPU_PROFILING
															
 
																-environment variable to 1 and then specify which events to record with the
															
 
																-\ref STARPU_PROF_PAPI_EVENTS environment variable. For instance:
															
 
																+<c>./configure</c> detected the libpapi.
															
 
																+
															
 
																+In Debian, packages <c>libpapi-dev</c> and <c>libpapi5.7</c> provide required
															
 
																+files.  Package <c>papi-tools</c> contains a set of useful tools, for example
															
 
																+<c>papi_avail</c> to see which counters are available.
															
 
																+
															
 
																+To be able to use Papi counters, one may need to reduce the level of the kernel
															
 
																+parameter <c>kernel.perf_event_paranoid</c> to at most 2. See
															
 
																+https://www.kernel.org/doc/html/latest/admin-guide/perf-security.html for the
															
 
																+security impact of this parameter.
															
 
																+
															
 
																+Then one has to set the \ref STARPU_PROFILING environment variable to 1 and
															
 
																+specify which events to record with the \ref STARPU_PROF_PAPI_EVENTS
															
 
																+environment variable. For instance:
															
 
																 \verbatim
															
 
																 export STARPU_PROFILING=1 STARPU_PROF_PAPI_EVENTS="PAPI_TOT_INS PAPI_TOT_CYC"
															
 
																 \endverbatim
															
 
																+The comma can also be used to separate events to monitor.
															
 
																+
															
 
																 In the current simple implementation, only CPU tasks have their events measured
															
 
																-and require CPUs that support the PAPI events. All events that PAPI support are
															
 
																-available from their documentation (https://icl.cs.utk.edu/projects/papi/wiki/PAPIC:Preset_Event_Definitions).
															
 
																-It is important to note that not all events are available on all systems, and
															
 
																-general PAPI recommendations should be followed.
															
 
																+and require CPUs that support the PAPI events. It is important to note that not
															
 
																+all events are available on all systems, and general PAPI recommendations
															
 
																+should be followed.
															
 
																 The counter values can be accessed using the profiling interface:
															
 
																 \code{.c}
															
--- a/doc/doxygen/chapters/501_environment_variables.doxy
+++ b/doc/doxygen/chapters/501_environment_variables.doxy
@@ -1366,6 +1366,15 @@ application has crashed. Setting this variable to a value other than 1
 
																 will disable this behaviour. This should be done on JVM systems which
															
 
																 may use these signals for their own needs.
															
 
																 The flag can also be set through the field starpu_conf::catch_signals.
															
 
																+</dd>
															
 
																+
															
 
																+<dt>STARPU_DISPLAY_BINDINGS</dt>
															
 
																+<dd>
															
 
																+\anchor STARPU_DISPLAY_BINDINGS
															
 
																+\addindex __env__STARPU_DISPLAY_BINDINGS
															
 
																+Display the binding of all processes and threads running on the machine. If MPI is enabled, display the binding of each node.<br>
															
 
																+Users can manually display the binding by calling starpu_display_bindings().
															
 
																+</dd>
															
 
																 </dl>
															
 
																 \section ConfiguringTheHypervisor Configuring The Hypervisor
															
--- a/doc/doxygen/chapters/510_configure_options.doxy
+++ b/doc/doxygen/chapters/510_configure_options.doxy
@@ -1,6 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
															
 
																+ * Copyright (C) 2020       Federal University of Rio Grande do Sul (UFRGS)
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -115,7 +116,19 @@ Specify <c>hwloc</c> should not be used by StarPU.
 
																 \addindex __configure__--disable-build-doc
															
 
																 Disable the creation of the documentation. This should be done on a
															
 
																 machine which does not have the tools <c>doxygen</c> and <c>latex</c>
															
 
																-(plus the packages <c>latex-xcolor</c> and <c>texlive-latex-extra</c>).
															
 
																+(plus the packages <c>latex-xcolor</c> and
															
 
																+<c>texlive-latex-extra</c>).
															
 
																+</dd>
															
 
																+
															
 
																+<dt>--enable-build-doc-pdf</dt>
															
 
																+<dd>
															
 
																+\anchor enable-build-doc-pdf
															
 
																+\addindex __configure__--enable-build-doc-pdf
															
 
																+By default, only the HTML documentation is generated. Use this option
															
 
																+to also enable the generation of the PDF documentation. This should be
															
 
																+done on a machine which does have the tools <c>doxygen</c> and <c>latex</c>
															
 
																+(plus the packages <c>latex-xcolor</c> and
															
 
																+<c>texlive-latex-extra</c>).
															
 
																 </dd>
															
 
																 <dt>--disable-icc</dt>
															
@@ -514,6 +527,15 @@ Define the maximum number of buffers that tasks will be able to take
 
																 as parameters, then available as the macro ::STARPU_NMAXBUFS.
															
 
																 </dd>
															
 
																+<dt>--enable-fxt-max-files=<c>count</c></dt>
															
 
																+<dd>
															
 
																+\anchor enable-fxt-max-files
															
 
																+\addindex __configure__--enable-fxt-max-files
															
 
																+Use at most <c>count</c> MPI node FxT files when generating traces.  This information is then available as
															
 
																+the macro ::STARPU_FXT_MAX_FILES.  This information is used by FxT tools when considering multi node traces.
															
 
																+Default value is 64.
															
 
																+</dd>
															
 
																+
															
 
																 <dt>--enable-allocation-cache</dt>
															
 
																 <dd>
															
 
																 \anchor enable-allocation-cache
															
--- a/doc/doxygen/chapters/code/vector_scal_opencl.c
+++ b/doc/doxygen/chapters/code/vector_scal_opencl.c
@@ -57,6 +57,7 @@ void scal_opencl_func(void *buffers[], void *_args)
 
																         err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
															
 
																         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																         if (local > global) local=global;
															
 
																+        else global = (global + local-1) / local * local;
															
 
																         err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
															
 
																         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
--- a/doc/doxygen_dev/Makefile.am
+++ b/doc/doxygen_dev/Makefile.am
@@ -30,9 +30,14 @@ txtdir   = $(docdir)/manual
 
																 EXTRA_DIST =
															
 
																 if STARPU_BUILD_DOC
															
 
																+if STARPU_BUILD_DOC_PDF
															
 
																 all: $(DOX_HTML_DIR) $(DOX_PDF)
															
 
																 EXTRA_DIST += $(DOX_HTML_DIR) $(DOX_PDF)
															
 
																 txt_DATA = $(DOX_PDF)
															
 
																+else
															
 
																+all: $(DOX_HTML_DIR)
															
 
																+EXTRA_DIST += $(DOX_HTML_DIR)
															
 
																+endif
															
 
																 DOX_HTML_SRCDIR=$(DOX_HTML_DIR)
															
 
																 install-exec-hook:
															
 
																 	$(MKDIR_P) $(DESTDIR)$(docdir)/manual/html_dev
															
@@ -41,8 +46,7 @@ uninstall-hook:
 
																 	rm -rf $(DESTDIR)$(docdir)/manual/html_dev
															
 
																 else
															
 
																 if STARPU_AVAILABLE_DOC
															
 
																-EXTRA_DIST += $(top_srcdir)/doc/doxygen_dev/html_dev $(top_srcdir)/doc/doxygen_dev/starpu_dev.pdf
															
 
																-txt_DATA = $(top_srcdir)/doc/doxygen_dev/starpu_dev.pdf
															
 
																+EXTRA_DIST += $(top_srcdir)/doc/doxygen_dev/html_dev
															
 
																 DOX_HTML_SRCDIR=$(top_srcdir)/doc/doxygen_dev/html_dev
															
 
																 install-exec-hook:
															
 
																 	$(MKDIR_P) $(DESTDIR)$(docdir)/manual/html_dev
															
@@ -50,6 +54,10 @@ install-exec-hook:
 
																 uninstall-hook:
															
 
																 	rm -rf $(DESTDIR)$(docdir)/manual/html_dev
															
 
																 endif
															
 
																+if STARPU_AVAILABLE_DOC_PDF
															
 
																+EXTRA_DIST += $(top_srcdir)/doc/doxygen_dev/starpu_dev.pdf
															
 
																+txt_DATA = $(top_srcdir)/doc/doxygen_dev/starpu_dev.pdf
															
 
																+endif
															
 
																 endif
															
 
																 chapters =	\
															
@@ -191,7 +199,7 @@ dox_inputs = $(DOX_CONFIG) 				\
 
																 	$(top_srcdir)/src/core/drivers.h	\
															
 
																 	$(top_srcdir)/src/core/workers.h
															
 
																-$(DOX_HTML_DIR): $(DOX_TAG) refman.tex
															
 
																+$(DOX_HTML_DIR): $(DOX_TAG)
															
 
																 	@$(MKDIR_P) $(DOX_HTML_DIR)
															
 
																 $(DOX_TAG): $(dox_inputs)
															
@@ -240,5 +248,5 @@ EXTRA_DIST += doxygen.cfg refman.tex \
 
																 # Rule to update documentation on web server. Should only be used locally.
															
 
																 PUBLISHHOST	?= gforge
															
 
																 update-web: $(DOX_PDF)
															
 
																-	scp -pr starpu_dev.pdf html_dev $(PUBLISHHOST):/home/groups/starpu/htdocs/doc
															
 
																+	scp -pr starpu_dev.pdf html_dev $(PUBLISHHOST):/home/groups/starpu/htdocs/files/doc
															
--- a/examples/axpy/axpy_opencl.c
+++ b/examples/axpy/axpy_opencl.c
@@ -60,6 +60,8 @@ void axpy_opencl(void *buffers[], void *_args)
 
																 			STARPU_OPENCL_REPORT_ERROR(err);
															
 
																                 if (local > global)
															
 
																 			local=global;
															
 
																+                else
															
 
																+                        global = (global + local-1) / local * local;
															
 
																 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
															
 
																 		if (err != CL_SUCCESS)
															
--- a/examples/basic_examples/multiformat_conversion_codelets_opencl.c
+++ b/examples/basic_examples/multiformat_conversion_codelets_opencl.c
@@ -74,6 +74,8 @@ void cpu_to_opencl_opencl_func(void *buffers[], void *args)
 
																                 if (local > global)
															
 
																 			local = global;
															
 
																+                else
															
 
																+                        global = (global + local-1) / local * local;
															
 
																 		err = clEnqueueNDRangeKernel(queue,
															
 
																 					kernel,
															
--- a/examples/basic_examples/multiformat_opencl.c
+++ b/examples/basic_examples/multiformat_opencl.c
@@ -68,6 +68,8 @@ void multiformat_scal_opencl_func(void *buffers[], void *args)
 
																                 if (local > global)
															
 
																 			local = global;
															
 
																+                else
															
 
																+                        global = (global + local-1) / local * local;
															
 
																 		err = clEnqueueNDRangeKernel(queue,
															
 
																 					kernel,
															
--- a/examples/basic_examples/vector_scal_opencl.c
+++ b/examples/basic_examples/vector_scal_opencl.c
@@ -57,6 +57,7 @@ void scal_opencl_func(void *buffers[], void *_args)
 
																                 err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
															
 
																                 if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
 
																                 if (local > global) local=global;
															
 
																+                else global = (global + local-1) / local * local;
															
 
																 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
															
 
																 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
															
--- a/examples/filters/custom_mf/conversion_opencl.c
+++ b/examples/filters/custom_mf/conversion_opencl.c
@@ -76,6 +76,8 @@ void cpu_to_opencl_opencl_func(void *buffers[], void *args)
 
																                 if (local > global)
															
 
																 			local = global;
															
 
																+                else
															
 
																+                        global = (global + local-1) / local * local;
															
 
																 		err = clEnqueueNDRangeKernel(
															
 
																 				queue,
															
--- a/examples/filters/custom_mf/custom_opencl.c
+++ b/examples/filters/custom_mf/custom_opencl.c
@@ -75,6 +75,8 @@ void custom_scal_opencl_func(void *buffers[], void *args)
 
																                 if (local > global)
															
 
																 			local = global;
															
 
																+                else
															
 
																+                        global = (global + local-1) / local * local;
															
 
																 		err = clEnqueueNDRangeKernel(
															
 
																 				queue,
															
--- a/examples/interface/complex_kernels_opencl.c
+++ b/examples/interface/complex_kernels_opencl.c
@@ -64,6 +64,8 @@ void copy_complex_codelet_opencl(void *buffers[], void *_args)
 
																 			STARPU_OPENCL_REPORT_ERROR(err);
															
 
																                 if (local > global)
															
 
																 			local=global;
															
 
																+                else
															
 
																+                        global = (global + local-1) / local * local;
															
 
																 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
															
 
																 		if (err != CL_SUCCESS)
															
--- a/examples/mult/double.h
+++ b/examples/mult/double.h
@@ -15,6 +15,7 @@
 
																  */
															
 
																 #define TYPE	double
															
 
																+#define EPSILON	0.000000000001
															
 
																 #define CUBLAS_GEMM cublasDgemm
															
 
																 #define CPU_GEMM	STARPU_DGEMM
															
--- a/examples/mult/simple.h
+++ b/examples/mult/simple.h
@@ -15,6 +15,7 @@
 
																  */
															
 
																 #define TYPE	float
															
 
																+#define EPSILON	0.000001
															
 
																 #define CUBLAS_GEMM cublasSgemm
															
 
																 #define CPU_GEMM	STARPU_SGEMM
															
--- a/examples/mult/xgemm.c
+++ b/examples/mult/xgemm.c
@@ -75,7 +75,7 @@ static int check_output(void)
 
																 	TYPE err;
															
 
																 	err = CPU_ASUM(xdim*ydim, C, 1);
															
 
																-	if (err < xdim*ydim*0.001)
															
 
																+	if (err < EPSILON*xdim*ydim*zdim)
															
 
																 	{
															
 
																 		FPRINTF(stderr, "Results are OK\n");
															
 
																 		return 0;
															
--- a/examples/reductions/dot_product.c
+++ b/examples/reductions/dot_product.c
@@ -185,18 +185,12 @@ void redux_opencl_func(void *buffers[], void *args)
 
																 	{
															
 
																 		size_t global=1;
															
 
																-		size_t local;
															
 
																+                size_t local=1;
															
 
																                 size_t s;
															
 
																                 cl_device_id device;
															
 
																                 starpu_opencl_get_device(devid, &device);
															
 
																-                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
															
 
																-                if (err != CL_SUCCESS)
															
 
																-			STARPU_OPENCL_REPORT_ERROR(err);
															
 
																-                if (local > global)
															
 
																-			local=global;
															
 
																-
															
 
																 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
															
 
																 		if (err != CL_SUCCESS)
															
 
																 			STARPU_OPENCL_REPORT_ERROR(err);
															
@@ -306,18 +300,12 @@ void dot_opencl_func(void *buffers[], void *cl_arg)
 
																 	{
															
 
																 		size_t global=1;
															
 
																-		size_t local;
															
 
																+                size_t local=1;
															
 
																                 size_t s;
															
 
																                 cl_device_id device;
															
 
																                 starpu_opencl_get_device(devid, &device);
															
 
																-                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
															
 
																-                if (err != CL_SUCCESS)
															
 
																-			STARPU_OPENCL_REPORT_ERROR(err);
															
 
																-                if (local > global)
															
 
																-			local=global;
															
 
																-
															
 
																 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
															
 
																 		if (err != CL_SUCCESS)
															
 
																 			STARPU_OPENCL_REPORT_ERROR(err);
															
--- a/examples/reductions/dot_product_opencl_kernels.cl
+++ b/examples/reductions/dot_product_opencl_kernels.cl
@@ -31,6 +31,7 @@ __kernel void _dot_opencl(__global float *x,
 
																 			  __global DOT_TYPE *dot,
															
 
																 			  unsigned n)
															
 
																 {
															
 
																+/* FIXME: real parallel implementation */
															
 
																 	unsigned i;
															
 
																 	__local double tmp;
															
 
																 	tmp = 0.0;
															
--- a/include/starpu.h
+++ b/include/starpu.h
@@ -111,6 +111,12 @@ struct starpu_conf
 
																 	int magic;
															
 
																 	/**
															
 
																+	   @private
															
 
																+	   Tell starpu_init() if MPI will be initialized later.
															
 
																+	*/
															
 
																+	int will_use_mpi;
															
 
																+
															
 
																+	/**
															
 
																 	   Name of the scheduling policy. This can also be specified
															
 
																 	   with the environment variable \ref STARPU_SCHED. (default =
															
 
																 	   <c>NULL</c>).
															
--- a/include/starpu_config.h.in
+++ b/include/starpu_config.h.in
@@ -187,6 +187,15 @@
 
																 #undef STARPU_NMAXBUFS
															
 
																 /**
															
 
																+   Define the maximum number of fxt mpi files that can be read when
															
 
																+   generating traces. The default value is 64, it can be changed by
															
 
																+   using the configure option \ref enable-fxt-max-files
															
 
																+   "--enable-fxt-max-files".
															
 
																+   @ingroup API_MPI_Support
															
 
																+*/
															
 
																+#undef STARPU_FXT_MAX_FILES
															
 
																+
															
 
																+/**
															
 
																    Define the maximum number of CPU workers managed by StarPU. The
															
 
																    default value can be modified at configure by using the option \ref
															
 
																    enable-maxcpus "--enable-maxcpus".
															
--- a/include/starpu_fxt.h
+++ b/include/starpu_fxt.h
@@ -20,6 +20,7 @@
 
																 #ifndef __STARPU_FXT_H__
															
 
																 #define __STARPU_FXT_H__
															
 
																+#include <starpu_config.h>
															
 
																 #include <starpu_perfmodel.h>
															
 
																 #ifdef __cplusplus
															
@@ -32,8 +33,6 @@ extern "C"
 
																    @{
															
 
																 */
															
 
																-#define STARPU_FXT_MAX_FILES	64
															
 
																-
															
 
																 struct starpu_fxt_codelet_event
															
 
																 {
															
 
																 	char symbol[256];
															
--- a/include/starpu_helper.h
+++ b/include/starpu_helper.h
@@ -182,6 +182,14 @@ double starpu_timing_now(void);
 
																 */
															
 
																 int starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_handle, int asynchronous, void (*callback_func)(void*), void *callback_arg);
															
 
																+/**
															
 
																+   Call hwloc-ps to display the binding of each process and thread running on
															
 
																+   the machine.<br>
															
 
																+   Use the environment variable \ref STARPU_DISPLAY_BINDINGS to automatically
															
 
																+   call this function at the beginning of the execution of StarPU.
															
 
																+*/
															
 
																+void starpu_display_bindings(void);
															
 
																+
															
 
																 /** @} */
															
 
																 #ifdef __cplusplus
															
--- a/include/starpu_stdlib.h
+++ b/include/starpu_stdlib.h
@@ -239,9 +239,32 @@ void starpu_memory_deallocate(unsigned node, size_t size);
 
																 */
															
 
																 void starpu_memory_wait_available(unsigned node, size_t size);
															
 
																+/**
															
 
																+   Sleep for the given \p nb_sec seconds.
															
 
																+   In simgrid mode, this only sleeps within virtual time.
															
 
																+  */
															
 
																 void starpu_sleep(float nb_sec);
															
 
																+
															
 
																+/**
															
 
																+   Sleep for the given \p nb_micro_sec micro-seconds.
															
 
																+   In simgrid mode, this only sleeps within virtual time.
															
 
																+  */
															
 
																 void starpu_usleep(float nb_micro_sec);
															
 
																+/**
															
 
																+   Account for \p joules J being used.
															
 
																+   This is supported in simgrid mode, to record how much energy was used, and will
															
 
																+   show up in further calls to starpu_energy_used().
															
 
																+  */
															
 
																+void starpu_energy_use(float joules);
															
 
																+
															
 
																+/**
															
 
																+   Return the amount of energy having been used in J.
															
 
																+   This accounts for the amounts passed to starpu_energy_use(), but also the static
															
 
																+   energy use set by the \ref STARPU_IDLE_POWER environment variable.
															
 
																+  */
															
 
																+double starpu_energy_used(void);
															
 
																+
															
 
																 /** @} */
															
 
																 #ifdef __cplusplus
															
--- a/include/starpu_task.h
+++ b/include/starpu_task.h
@@ -513,7 +513,7 @@ struct starpu_codelet
 
																 	/**
															
 
																 	   Optional pointer to the task energy consumption performance
															
 
																-	   model associated to this codelet. This optional field is
															
 
																+	   model associated to this codelet (in J). This optional field is
															
 
																 	   ignored when set to <c>NULL</c> or when its field
															
 
																 	   starpu_perfmodel::symbol is not set. In the case of
															
 
																 	   parallel codelets, this has to account for all processing
															
--- a/include/starpu_util.h
+++ b/include/starpu_util.h
@@ -598,6 +598,17 @@ STARPU_ATOMIC_SOMETHING64(or, old | value)
 
																 #define STARPU_WMB() STARPU_SYNCHRONIZE()
															
 
																 #endif
															
 
																+#if defined(__i386__) || defined(__x86_64__)
															
 
																+#define STARPU_CACHELINE_SIZE 64
															
 
																+#elif defined(__ppc__) || defined(__ppc64__) || defined(__ia64__)
															
 
																+#define STARPU_CACHELINE_SIZE 128
															
 
																+#elif defined(__s390__) || defined(__s390x__)
															
 
																+#define STARPU_CACHELINE_SIZE 256
															
 
																+#else
															
 
																+/* Conservative default */
															
 
																+#define STARPU_CACHELINE_SIZE 1024
															
 
																+#endif
															
 
																+
															
 
																 #ifdef _WIN32
															
 
																 /* Try to fetch the system definition of timespec */
															
 
																 #include <sys/types.h>
															
--- a/julia/examples/cholesky/cholesky_common.jl
+++ b/julia/examples/cholesky/cholesky_common.jl
@@ -1,3 +1,18 @@
 
																+# StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+#
															
 
																+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
															
 
																+#
															
 
																+# StarPU is free software; you can redistribute it and/or modify
															
 
																+# it under the terms of the GNU Lesser General Public License as published by
															
 
																+# the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+# your option) any later version.
															
 
																+#
															
 
																+# StarPU is distributed in the hope that it will be useful, but
															
 
																+# WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+#
															
 
																+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+#
															
 
																 # Standard kernels for the Cholesky factorization
															
 
																 # U22 is the gemm update
															
 
																 # U21 is the trsm update
															
--- a/julia/examples/cholesky/cholesky_native.jl
+++ b/julia/examples/cholesky/cholesky_native.jl
@@ -1,3 +1,18 @@
 
																+# StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+#
															
 
																+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
															
 
																+#
															
 
																+# StarPU is free software; you can redistribute it and/or modify
															
 
																+# it under the terms of the GNU Lesser General Public License as published by
															
 
																+# the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+# your option) any later version.
															
 
																+#
															
 
																+# StarPU is distributed in the hope that it will be useful, but
															
 
																+# WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+#
															
 
																+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+#
															
 
																 using LinearAlgebra
															
 
																 function check(mat::Matrix{Float32})
															
--- a/julia/src/openblas_ldflags.jl
+++ b/julia/src/openblas_ldflags.jl
@@ -1,3 +1,18 @@
 
																+# StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+#
															
 
																+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
															
 
																+#
															
 
																+# StarPU is free software; you can redistribute it and/or modify
															
 
																+# it under the terms of the GNU Lesser General Public License as published by
															
 
																+# the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+# your option) any later version.
															
 
																+#
															
 
																+# StarPU is distributed in the hope that it will be useful, but
															
 
																+# WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+#
															
 
																+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+#
															
 
																 import LinearAlgebra.BLAS
															
 
																 import Libdl
															
--- a/mpi/examples/Makefile.am
+++ b/mpi/examples/Makefile.am
@@ -83,6 +83,10 @@ EXTRA_DIST = 				\
 
																 	matrix_decomposition/mpi_decomposition_params.h	\
															
 
																 	matrix_decomposition/mpi_decomposition_matrix.h	\
															
 
																 	user_datatype/my_interface.h			\
															
 
																+	benchs/abstract_sendrecv_bench.h	\
															
 
																+	benchs/bench_helper.h			\
															
 
																+	benchs/gemm_helper.h			\
															
 
																+	benchs/burst_helper.h			\
															
 
																 	helper.h
															
 
																 examplebindir = $(libdir)/starpu/mpi
															
@@ -399,3 +403,68 @@ native_fortran/nf_mm_task_build.o: nf_mm_cl.mod fstarpu_mpi_mod.mod fstarpu_mod.
 
																 native_fortran/nf_basic_ring.o: fstarpu_mpi_mod.mod fstarpu_mod.mod
															
 
																 endif
															
 
																 endif
															
 
																+
															
 
																+
															
 
																+##########
															
 
																+# benchs #
															
 
																+##########
															
 
																+
															
 
																+examplebin_PROGRAMS +=		\
															
 
																+	benchs/sendrecv_bench	\
															
 
																+	benchs/burst
															
 
																+
															
 
																+if !STARPU_USE_MPI_MPI
															
 
																+examplebin_PROGRAMS +=		\
															
 
																+	benchs/sendrecv_parallel_tasks_bench
															
 
																+endif
															
 
																+
															
 
																+if !STARPU_NO_BLAS_LIB
															
 
																+examplebin_PROGRAMS +=		\
															
 
																+	benchs/sendrecv_gemm_bench			\
															
 
																+	benchs/burst_gemm
															
 
																+endif
															
 
																+
															
 
																+if !STARPU_SIMGRID
															
 
																+starpu_mpi_EXAMPLES	+=	\
															
 
																+	benchs/sendrecv_bench	\
															
 
																+	benchs/burst
															
 
																+
															
 
																+if !STARPU_USE_MPI_MPI
															
 
																+starpu_mpi_EXAMPLES	+=	\
															
 
																+	benchs/sendrecv_parallel_tasks_bench
															
 
																+endif
															
 
																+
															
 
																+if !STARPU_NO_BLAS_LIB
															
 
																+starpu_mpi_EXAMPLES	+=	\
															
 
																+	benchs/sendrecv_gemm_bench			\
															
 
																+	benchs/burst_gemm
															
 
																+endif
															
 
																+endif
															
 
																+
															
 
																+benchs_sendrecv_bench_SOURCES = benchs/sendrecv_bench.c
															
 
																+benchs_sendrecv_bench_SOURCES += benchs/bench_helper.c
															
 
																+benchs_sendrecv_bench_SOURCES += benchs/abstract_sendrecv_bench.c
															
 
																+
															
 
																+benchs_sendrecv_parallel_tasks_bench_SOURCES = benchs/sendrecv_parallel_tasks_bench.c
															
 
																+benchs_sendrecv_parallel_tasks_bench_SOURCES += benchs/bench_helper.c
															
 
																+benchs_sendrecv_parallel_tasks_bench_SOURCES += benchs/abstract_sendrecv_bench.c
															
 
																+
															
 
																+benchs_burst_SOURCES = benchs/burst.c
															
 
																+benchs_burst_SOURCES += benchs/burst_helper.c
															
 
																+
															
 
																+if !STARPU_NO_BLAS_LIB
															
 
																+benchs_sendrecv_gemm_bench_SOURCES = benchs/sendrecv_gemm_bench.c
															
 
																+benchs_sendrecv_gemm_bench_SOURCES += benchs/bench_helper.c
															
 
																+benchs_sendrecv_gemm_bench_SOURCES += benchs/gemm_helper.c
															
 
																+benchs_sendrecv_gemm_bench_SOURCES += benchs/abstract_sendrecv_bench.c
															
 
																+benchs_sendrecv_gemm_bench_SOURCES += ../../examples/common/blas.c
															
 
																+
															
 
																+benchs_sendrecv_gemm_bench_LDADD = $(STARPU_BLAS_LDFLAGS)
															
 
																+
															
 
																+benchs_burst_gemm_SOURCES = benchs/burst_gemm.c
															
 
																+benchs_burst_gemm_SOURCES += benchs/gemm_helper.c
															
 
																+benchs_burst_gemm_SOURCES += benchs/burst_helper.c
															
 
																+benchs_burst_gemm_SOURCES += ../../examples/common/blas.c
															
 
																+
															
 
																+benchs_burst_gemm_LDADD = $(STARPU_BLAS_LDFLAGS)
															
 
																+endif
															
--- a/mpi/examples/benchs/abstract_sendrecv_bench.c
+++ b/mpi/examples/benchs/abstract_sendrecv_bench.c
--- a/mpi/examples/benchs/abstract_sendrecv_bench.h
+++ b/mpi/examples/benchs/abstract_sendrecv_bench.h
--- a/mpi/examples/benchs/bench_helper.c
+++ b/mpi/examples/benchs/bench_helper.c
--- a/mpi/examples/benchs/bench_helper.h
+++ b/mpi/examples/benchs/bench_helper.h
--- a/mpi/tests/burst.c
+++ b/mpi/tests/burst.c
@@ -49,13 +49,11 @@ void parse_args(int argc, char **argv)
 
																 int main(int argc, char **argv)
															
 
																 {
															
 
																-	int ret, rank, mpi_init, other_rank;
															
 
																+	int ret, rank, other_rank;
															
 
																 	parse_args(argc, argv);
															
 
																-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
															
 
																-
															
 
																-	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
															
 
																+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
															
 
																 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
															
 
																 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
															
@@ -68,8 +66,6 @@ int main(int argc, char **argv)
 
																 	burst_free_data(rank);
															
 
																 	starpu_mpi_shutdown();
															
 
																-	if (!mpi_init)
															
 
																-		MPI_Finalize();
															
 
																 	return 0;
															
 
																 }
															
--- a/mpi/tests/burst_gemm.c
+++ b/mpi/tests/burst_gemm.c
@@ -90,12 +90,11 @@ void parse_args(int argc, char **argv)
 
																 int main(int argc, char **argv)
															
 
																 {
															
 
																-	int ret, mpi_init, worldsize, mpi_rank;
															
 
																+	int ret, worldsize, mpi_rank;
															
 
																 	parse_args(argc, argv);
															
 
																-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
															
 
																-	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
															
 
																+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
															
 
																 	if (ret == -ENODEV)
															
 
																 		return 77;
															
 
																 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
															
@@ -109,8 +108,7 @@ int main(int argc, char **argv)
 
																 			FPRINTF(stderr, "We need 2 processes.\n");
															
 
																 		starpu_mpi_shutdown();
															
 
																-		if (!mpi_init)
															
 
																-			MPI_Finalize();
															
 
																+
															
 
																 		return STARPU_TEST_SKIPPED;
															
 
																 	}
															
@@ -203,8 +201,6 @@ enodev:
 
																 	burst_free_data(mpi_rank);
															
 
																 	starpu_mpi_shutdown();
															
 
																-	if (!mpi_init)
															
 
																-		MPI_Finalize();
															
 
																 	return ret;
															
 
																 }
															
--- a/mpi/examples/benchs/burst_helper.c
+++ b/mpi/examples/benchs/burst_helper.c
--- a/mpi/examples/benchs/burst_helper.h
+++ b/mpi/examples/benchs/burst_helper.h
--- a/mpi/examples/benchs/gemm_helper.c
+++ b/mpi/examples/benchs/gemm_helper.c
--- a/mpi/examples/benchs/gemm_helper.h
+++ b/mpi/examples/benchs/gemm_helper.h
--- a/mpi/tests/sendrecv_bench.c
+++ b/mpi/tests/sendrecv_bench.c
@@ -26,7 +26,6 @@
 
																 int main(int argc, char **argv)
															
 
																 {
															
 
																 	int ret, rank, worldsize;
															
 
																-	int mpi_init;
															
 
																 	int pause_workers = 0;
															
@@ -52,8 +51,7 @@ int main(int argc, char **argv)
 
																 	}
															
 
																-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
															
 
																-	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
															
 
																+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
															
 
																 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
															
 
																 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
															
@@ -65,8 +63,7 @@ int main(int argc, char **argv)
 
																 			FPRINTF(stderr, "We need 2 processes.\n");
															
 
																 		starpu_mpi_shutdown();
															
 
																-		if (!mpi_init)
															
 
																-			MPI_Finalize();
															
 
																+
															
 
																 		return STARPU_TEST_SKIPPED;
															
 
																 	}
															
@@ -85,8 +82,6 @@ int main(int argc, char **argv)
 
																 	}
															
 
																 	starpu_mpi_shutdown();
															
 
																-	if (!mpi_init)
															
 
																-		MPI_Finalize();
															
 
																 	return 0;
															
 
																 }
															
--- a/mpi/tests/sendrecv_gemm_bench.c
+++ b/mpi/tests/sendrecv_gemm_bench.c
@@ -53,7 +53,7 @@ static void* comm_thread_func(void* arg)
 
																 	{
															
 
																 		char hostname[65];
															
 
																 		gethostname(hostname, sizeof(hostname));
															
 
																-		_STARPU_DISP("[%s] No core was available for the comm thread. You should increase STARPU_RESERVE_NCPU or decrease STARPU_NCPU\n", hostname);
															
 
																+		fprintf(stderr, "[%s] No core was available for the comm thread. You should increase STARPU_RESERVE_NCPU or decrease STARPU_NCPU\n", hostname);
															
 
																 	}
															
 
																 	sendrecv_bench(mpi_rank, &thread_barrier);
															
@@ -118,7 +118,7 @@ void parse_args(int argc, char **argv)
 
																 int main(int argc, char **argv)
															
 
																 {
															
 
																 	double start, end;
															
 
																-	int ret, mpi_init, worldsize;
															
 
																+	int ret, worldsize;
															
 
																 	starpu_pthread_t comm_thread;
															
 
																 	char hostname[255];
															
@@ -128,8 +128,7 @@ int main(int argc, char **argv)
 
																 	starpu_fxt_autostart_profiling(0);
															
 
																-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
															
 
																-	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
															
 
																+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
															
 
																 	if (ret == -ENODEV)
															
 
																 		return 77;
															
 
																 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
															
@@ -143,8 +142,7 @@ int main(int argc, char **argv)
 
																 			FPRINTF(stderr, "We need 2 processes.\n");
															
 
																 		starpu_mpi_shutdown();
															
 
																-		if (!mpi_init)
															
 
																-			MPI_Finalize();
															
 
																+
															
 
																 		return STARPU_TEST_SKIPPED;
															
 
																 	}
															
@@ -162,7 +160,7 @@ int main(int argc, char **argv)
 
																 	if (mpi_rank == 0)
															
 
																 	{
															
 
																-		PRINTF("# node\tx\ty\tz\tms\tGFlops\n");
															
 
																+		printf("# node\tx\ty\tz\tms\tGFlops\n");
															
 
																 	}
															
 
																 	starpu_pause();
															
@@ -185,7 +183,7 @@ int main(int argc, char **argv)
 
																 	double timing = end - start;
															
 
																 	double flops = 2.0*((unsigned long long)matrix_dim) * ((unsigned long long)matrix_dim)*((unsigned long long)matrix_dim);
															
 
																-	PRINTF("%s\t%u\t%u\t%u\t%.0f\t%.1f\n", hostname, matrix_dim, matrix_dim, matrix_dim, timing/1000.0, flops/timing/1000.0);
															
 
																+	printf("%s\t%u\t%u\t%u\t%.0f\t%.1f\n", hostname, matrix_dim, matrix_dim, matrix_dim, timing/1000.0, flops/timing/1000.0);
															
 
																 enodev:
															
@@ -200,8 +198,6 @@ enodev:
 
																 	starpu_resume();
															
 
																 	starpu_mpi_shutdown();
															
 
																-	if (!mpi_init)
															
 
																-		MPI_Finalize();
															
 
																 	return ret;
															
 
																 }
															
--- a/mpi/tests/sendrecv_parallel_tasks_bench.c
+++ b/mpi/tests/sendrecv_parallel_tasks_bench.c
@@ -134,10 +134,8 @@ static struct starpu_codelet cl =
 
																 int main(int argc, char **argv)
															
 
																 {
															
 
																 	int ret, rank, worldsize;
															
 
																-	int mpi_init;
															
 
																-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
															
 
																-	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
															
 
																+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
															
 
																 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
															
 
																 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
															
@@ -149,8 +147,7 @@ int main(int argc, char **argv)
 
																 			FPRINTF(stderr, "We need 2 processes.\n");
															
 
																 		starpu_mpi_shutdown();
															
 
																-		if (!mpi_init)
															
 
																-			MPI_Finalize();
															
 
																+
															
 
																 		return STARPU_TEST_SKIPPED;
															
 
																 	}
															
@@ -162,8 +159,7 @@ int main(int argc, char **argv)
 
																 	else if (rank >= 2)
															
 
																 	{
															
 
																 		starpu_mpi_shutdown();
															
 
																-		if (!mpi_init)
															
 
																-			MPI_Finalize();
															
 
																+
															
 
																 		return 0;
															
 
																 	}
															
@@ -222,8 +218,6 @@ int main(int argc, char **argv)
 
																 	free(mpi_tags);
															
 
																 	starpu_mpi_shutdown();
															
 
																-	if (!mpi_init)
															
 
																-		MPI_Finalize();
															
 
																 	return 0;
															
 
																 }
															
--- a/mpi/src/starpu_mpi.c
+++ b/mpi/src/starpu_mpi.c
@@ -431,6 +431,7 @@ void starpu_mpi_data_migrate(MPI_Comm comm, starpu_data_handle_t data, int new_r
 
																 	/* Flush cache in all other nodes */
															
 
																 	/* TODO: Ideally we'd transmit the knowledge of who owns it */
															
 
																+	/* TODO: or at least remember that the previous owner has the data, that's an easy case to support */
															
 
																 	starpu_mpi_cache_flush(comm, data);
															
 
																 	return;
															
 
																 }
															
--- a/mpi/src/starpu_mpi_datatype.c
+++ b/mpi/src/starpu_mpi_datatype.c
@@ -26,17 +26,16 @@ struct _starpu_mpi_datatype_funcs
 
																 	UT_hash_handle hh;
															
 
																 };
															
 
																-static starpu_pthread_mutex_t _starpu_mpi_datatype_funcs_table_mutex;
															
 
																+/* We want to allow applications calling starpu_mpi_interface_datatype_register/unregister as constructor/destructor */
															
 
																+static starpu_pthread_mutex_t _starpu_mpi_datatype_funcs_table_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
															
 
																 static struct _starpu_mpi_datatype_funcs *_starpu_mpi_datatype_funcs_table = NULL;
															
 
																 void _starpu_mpi_datatype_init(void)
															
 
																 {
															
 
																-	STARPU_PTHREAD_MUTEX_INIT(&_starpu_mpi_datatype_funcs_table_mutex, NULL);
															
 
																 }
															
 
																 void _starpu_mpi_datatype_shutdown(void)
															
 
																 {
															
 
																-	STARPU_PTHREAD_MUTEX_DESTROY(&_starpu_mpi_datatype_funcs_table_mutex);
															
 
																 }
															
 
																 /*
															
--- a/mpi/src/starpu_mpi_init.c
+++ b/mpi/src/starpu_mpi_init.c
@@ -138,7 +138,38 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi, MPI_Comm
 
																 	_starpu_mpi_do_initialize(argc_argv);
															
 
																 #endif
															
 
																-	return _mpi_backend._starpu_mpi_backend_progress_init(argc_argv);
															
 
																+	int ret = _mpi_backend._starpu_mpi_backend_progress_init(argc_argv);
															
 
																+
															
 
																+	if (starpu_get_env_number_default("STARPU_DISPLAY_BINDINGS", 0))
															
 
																+	{
															
 
																+		int rank, size, i;
															
 
																+		char hostname[65];
															
 
																+
															
 
																+		starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
															
 
																+		starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
															
 
																+		gethostname(hostname, sizeof(hostname));
															
 
																+
															
 
																+		/* We make a barrier between each node calling hwloc-ps, to avoid mixing
															
 
																+		 * outputs in stdout. */
															
 
																+		for (i = 0; i < size; i++)
															
 
																+		{
															
 
																+			starpu_mpi_barrier(MPI_COMM_WORLD);
															
 
																+			if (rank == i)
															
 
																+			{
															
 
																+				fprintf(stdout, "== Binding for rank %d on node %s ==\n", rank, hostname);
															
 
																+				starpu_display_bindings();
															
 
																+				fflush(stdout);
															
 
																+			}
															
 
																+		}
															
 
																+		starpu_mpi_barrier(MPI_COMM_WORLD);
															
 
																+		if (rank == 0)
															
 
																+		{
															
 
																+			fprintf(stdout, "== End of bindings ==\n");
															
 
																+			fflush(stdout);
															
 
																+		}
															
 
																+	}
															
 
																+
															
 
																+	return ret;
															
 
																 }
															
 
																 #ifdef STARPU_SIMGRID
															
@@ -219,6 +250,8 @@ int starpu_mpi_init_conf(int *argc, char ***argv, int initialize_mpi, MPI_Comm c
 
																 			conf->reserve_ncpus++;
															
 
																 	}
															
 
																+	conf->will_use_mpi = 1;
															
 
																+
															
 
																 	int ret = starpu_init(conf);
															
 
																 	if (ret < 0)
															
 
																 		return ret;
															
--- a/mpi/tests/Makefile.am
+++ b/mpi/tests/Makefile.am
@@ -62,11 +62,7 @@ BUILT_SOURCES =
 
																 CLEANFILES = *.gcno *.gcda *.linkinfo starpu_idle_microsec.log
															
 
																 EXTRA_DIST = 				\
															
 
																-	abstract_sendrecv_bench.h	\
															
 
																-	bench_helper.h			\
															
 
																 	helper.h			\
															
 
																-	gemm_helper.h			\
															
 
																-	burst_helper.h			\
															
 
																 	user_defined_datatype_value.h
															
 
																 examplebindir = $(libdir)/starpu/examples/mpi
															
@@ -142,19 +138,7 @@ starpu_mpi_TESTS +=				\
 
																 	temporary				\
															
 
																 	user_defined_datatype			\
															
 
																 	early_stuff				\
															
 
																-	sendrecv_bench				\
															
 
																-	burst
															
 
																-
															
 
																-if !STARPU_USE_MPI_MPI
															
 
																-starpu_mpi_TESTS +=				\
															
 
																-	sendrecv_parallel_tasks_bench
															
 
																-endif
															
 
																-
															
 
																-if !STARPU_NO_BLAS_LIB
															
 
																-starpu_mpi_TESTS +=				\
															
 
																-	sendrecv_gemm_bench			\
															
 
																-	burst_gemm
															
 
																-endif
															
 
																+	display_bindings
															
 
																 if !STARPU_SIMGRID
															
 
																 # missing support in simgrid
															
@@ -243,16 +227,8 @@ noinst_PROGRAMS +=				\
 
																 	starpu_redefine				\
															
 
																 	load_balancer				\
															
 
																 	driver					\
															
 
																-	sendrecv_bench				\
															
 
																-	sendrecv_parallel_tasks_bench		\
															
 
																-	burst					\
															
 
																-	nothing
															
 
																-
															
 
																-if !STARPU_NO_BLAS_LIB
															
 
																-noinst_PROGRAMS +=				\
															
 
																-	sendrecv_gemm_bench			\
															
 
																-	burst_gemm
															
 
																-endif
															
 
																+	nothing							\
															
 
																+	display_bindings
															
 
																 if STARPU_USE_MPI_FT
															
 
																 noinst_PROGRAMS +=  \
															
@@ -288,31 +264,3 @@ mpi_earlyrecv2_SOURCES = mpi_earlyrecv2.c
 
																 mpi_earlyrecv2_SOURCES += ../../examples/interface/complex_interface.c
															
 
																 mpi_earlyrecv2_sync_SOURCES = mpi_earlyrecv2_sync.c
															
 
																 mpi_earlyrecv2_sync_SOURCES += ../../examples/interface/complex_interface.c
															
 
																-
															
 
																-sendrecv_bench_SOURCES = sendrecv_bench.c
															
 
																-sendrecv_bench_SOURCES += bench_helper.c
															
 
																-sendrecv_bench_SOURCES += abstract_sendrecv_bench.c
															
 
																-
															
 
																-sendrecv_parallel_tasks_bench_SOURCES = sendrecv_parallel_tasks_bench.c
															
 
																-sendrecv_parallel_tasks_bench_SOURCES += bench_helper.c
															
 
																-sendrecv_parallel_tasks_bench_SOURCES += abstract_sendrecv_bench.c
															
 
																-
															
 
																-burst_SOURCES = burst.c
															
 
																-burst_SOURCES += burst_helper.c
															
 
																-
															
 
																-if !STARPU_NO_BLAS_LIB
															
 
																-sendrecv_gemm_bench_SOURCES = sendrecv_gemm_bench.c
															
 
																-sendrecv_gemm_bench_SOURCES += bench_helper.c
															
 
																-sendrecv_gemm_bench_SOURCES += gemm_helper.c
															
 
																-sendrecv_gemm_bench_SOURCES += abstract_sendrecv_bench.c
															
 
																-sendrecv_gemm_bench_SOURCES += ../../examples/common/blas.c
															
 
																-
															
 
																-sendrecv_gemm_bench_LDADD = $(STARPU_BLAS_LDFLAGS)
															
 
																-
															
 
																-burst_gemm_SOURCES = burst_gemm.c
															
 
																-burst_gemm_SOURCES += gemm_helper.c
															
 
																-burst_gemm_SOURCES += burst_helper.c
															
 
																-burst_gemm_SOURCES += ../../examples/common/blas.c
															
 
																-
															
 
																-burst_gemm_LDADD = $(STARPU_BLAS_LDFLAGS)
															
 
																-endif
															
--- a/mpi/tests/display_bindings.c
+++ b/mpi/tests/display_bindings.c
@@ -0,0 +1,44 @@
 
																+/* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+ *
															
 
																+ * Copyright (C) 2011-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
															
 
																+ *
															
 
																+ * StarPU is free software; you can redistribute it and/or modify
															
 
																+ * it under the terms of the GNU Lesser General Public License as published by
															
 
																+ * the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+ * your option) any later version.
															
 
																+ *
															
 
																+ * StarPU is distributed in the hope that it will be useful, but
															
 
																+ * WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+ *
															
 
																+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+ */
															
 
																+
															
 
																+#include <starpu.h>
															
 
																+#include <starpu_mpi.h>
															
 
																+#include <stdlib.h>
															
 
																+#include "helper.h"
															
 
																+
															
 
																+#ifdef STARPU_HAVE_SETENV
															
 
																+/*
															
 
																+ * Set STARPU_DISPLAY_BINDINGS to 1 in the environment, then start and stop
															
 
																+ * StarPU-MPI. starpu_mpi_init_conf() is given 0 as its initialize_mpi argument,
															
 
																+ * and this program calls MPI_Finalize() itself after starpu_mpi_shutdown().
															
 
																+ * NOTE(review): MPI_INIT_THREAD_real presumably comes from helper.h and
															
 
																+ * initializes MPI -- confirm.
															
 
																+ */
															
 
																+int main(int argc, char **argv)
															
 
																+{
															
 
																+	int ret;
															
 
																+
															
 
																+	setenv("STARPU_DISPLAY_BINDINGS", "1", 1);
															
 
																+	MPI_INIT_THREAD_real(&argc, &argv, MPI_THREAD_SERIALIZED);
															
 
																+
															
 
																+	ret = starpu_mpi_init_conf(NULL, NULL, 0, MPI_COMM_WORLD, NULL);
															
 
																+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
															
 
																+
															
 
																+	starpu_mpi_shutdown();
															
 
																+	MPI_Finalize();
															
 
																+
															
 
																+	return EXIT_SUCCESS;
															
 
																+}
															
 
																+#else
															
 
																+/* Without setenv() the variable cannot be set from here: report the test as skipped. */
															
 
																+#warning setenv is not defined. Skipping test
															
 
																+int main(void)
															
 
																+{
															
 
																+	return STARPU_TEST_SKIPPED;
															
 
																+}
															
 
																+#endif
															
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -406,9 +406,16 @@ endif
 
																 # static inline definition
															
 
																 dist-hook:
															
 
																 	failed=0 ; \
															
 
																-	for i in $$( $(GREP) "static inline" $$(find $(srcdir) -name \*.h) | $(SED) -e 's/.*static inline //g' | $(GREP) -v ENAME | $(SED) -e 's/[^(]* \(\|\*\)\([^ (]*\)(.*/\2/' | $(GREP) -v _starpu_spin_init) ; do \
															
 
																-		for j in $(shell find . -name \*.o) ; do \
															
 
																-			nm $$j | $(GREP) "U $$i$$" && { echo $$j ; failed=1 ; } ; \
															
 
																-		done ; \
															
 
																+	look=""; \
															
 
																+	for i in $$( $(GREP) "static inline" $$(find $(srcdir) -name \*.h) | $(SED) -e 's/.*static inline //g' | $(GREP) -v ENAME\#\# | $(SED) -n -e 's/[^(]* \(\|\*\)\([^ (]*\)(.*/\2/' -e 'p;s/^_*//;p' | $(GREP) -v _starpu_spin_init | $(GREP) -v starpu_sched_ctx_worker_is_master_for_child_ctx) ; do \
															
 
																+		if [ -z "$$look" ] ; then \
															
 
																+			look="$$i" ; \
															
 
																+		else \
															
 
																+			look="$$look\|$$i" ; \
															
 
																+		fi ; \
															
 
																+	done ; \
															
 
																+	echo "$$look" ; \
															
 
																+	for j in $(shell find . -name \*.o) ; do \
															
 
																+		nm $$j | $(GREP) -e "U \($$look\)$$" && { echo $$j ; failed=1 ; } ; \
															
 
																 	done ; \
															
 
																 	[ $$failed == 0 ]
															
--- a/src/common/utils.c
+++ b/src/common/utils.c
@@ -740,3 +740,18 @@ int starpu_get_env_size_default(const char *str, int defval)
 
																 	}
															
 
																 	return val;
															
 
																 }
															
 
																+
															
 
																+/*
															
 
																+ * Display the bindings on the machine by running the external command
															
 
																+ * "hwloc-ps -a -t -c" through system(). A non-zero value returned by system()
															
 
																+ * is reported with _STARPU_DISP. When StarPU is built without hwloc, or in
															
 
																+ * simgrid mode, only a message saying hwloc is not available is displayed.
															
 
																+ */
															
 
																+void starpu_display_bindings(void)
															
 
																+{
															
 
																+#if defined(STARPU_HAVE_HWLOC) && !defined(STARPU_SIMGRID)
															
 
																+	int hwloc_ret;
															
 
																+
															
 
																+	/* The command writes to the stdout it inherits, bypassing our stdio buffer:
															
 
																+	 * flush what the caller already printed so that it shows up before the
															
 
																+	 * command output rather than after it. */
															
 
																+	fflush(stdout);
															
 
																+	hwloc_ret = system("hwloc-ps -a -t -c");
															
 
																+	if (hwloc_ret)
															
 
																+	{
															
 
																+		_STARPU_DISP("hwloc-ps returned %d\n", hwloc_ret);
															
 
																+		fflush(stderr);
															
 
																+	}
															
 
																+	fflush(stdout);
															
 
																+#else
															
 
																+	_STARPU_DISP("hwloc not available to display bindings.\n");
															
 
																+#endif
															
 
																+}
															
--- a/src/core/jobs.c
+++ b/src/core/jobs.c
@@ -24,10 +24,12 @@
 
																 #include <common/config.h>
															
 
																 #include <common/utils.h>
															
 
																 #include <common/graph.h>
															
 
																+#include <datawizard/memory_nodes.h>
															
 
																 #include <profiling/profiling.h>
															
 
																 #include <profiling/bound.h>
															
 
																 #include <core/debug.h>
															
 
																 #include <limits.h>
															
 
																+#include <core/workers.h>
															
 
																 static int max_memory_use;
															
 
																 static unsigned long njobs, maxnjobs;
															
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -37,6 +37,7 @@
 
																 #include <core/topology.h>
															
 
																 #include <common/utils.h>
															
 
																 #include <drivers/mpi/driver_mpi_common.h>
															
 
																+#include <datawizard/memory_nodes.h>
															
 
																 #ifdef STARPU_USE_OPENCL
															
 
																 #include <starpu_opencl.h>
															
@@ -177,7 +178,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 
																 	/* Allocate a buffer on the device */
															
 
																 	unsigned char *d_buffer;
															
 
																 	cures = cudaMalloc((void **)&d_buffer, size);
															
 
																-	STARPU_ASSERT(cures == cudaSuccess);
															
 
																+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
															
 
																 	/* hack to avoid third party libs to rebind threads */
															
 
																 	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
															
@@ -206,7 +207,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 
																 		cudaHostRegister((void *)h_buffer, size, 0);
															
 
																 	}
															
 
																-	STARPU_ASSERT(cures == cudaSuccess);
															
 
																+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
															
 
																 	/* hack to avoid third party libs to rebind threads */
															
 
																 	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
															
@@ -331,7 +332,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
																 	/* Allocate a buffer on the device */
															
 
																 	unsigned char *s_buffer;
															
 
																 	cures = cudaMalloc((void **)&s_buffer, size);
															
 
																-	STARPU_ASSERT(cures == cudaSuccess);
															
 
																+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
															
 
																 	cudaMemset(s_buffer, 0, size);
															
 
																 	cudaDeviceSynchronize();
															
@@ -357,7 +358,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 
																 	/* Allocate a buffer on the device */
															
 
																 	unsigned char *d_buffer;
															
 
																 	cures = cudaMalloc((void **)&d_buffer, size);
															
 
																-	STARPU_ASSERT(cures == cudaSuccess);
															
 
																+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
															
 
																 	cudaMemset(d_buffer, 0, size);
															
 
																 	cudaDeviceSynchronize();
															
--- a/src/core/perfmodel/perfmodel_history.c
+++ b/src/core/perfmodel/perfmodel_history.c
@@ -344,7 +344,10 @@ static void dump_reg_model(FILE *f, struct starpu_perfmodel *model, int comb, in
 
																 	double a = nan(""), b = nan(""), c = nan("");
															
 
																 	if (model->type == STARPU_NL_REGRESSION_BASED)
															
 
																-		_starpu_regression_non_linear_power(per_arch_model->list, &a, &b, &c);
															
 
																+	{
															
 
																+		if (_starpu_regression_non_linear_power(per_arch_model->list, &a, &b, &c) != 0)
															
 
																+			_STARPU_DISP("Warning: could not compute a non-linear regression for model %s\n", model->symbol);
															
 
																+	}
															
 
																 	fprintf(f, "# a\t\tb\t\tc\n");
															
 
																 	_starpu_write_double(f, "%-15e", a);
															
@@ -1491,6 +1494,8 @@ int starpu_perfmodel_load_file(const char *filename, struct starpu_perfmodel *mo
 
																 	res = fclose(f);
															
 
																 	STARPU_ASSERT(res == 0);
															
 
																+	if (ret)
															
 
																+		starpu_perfmodel_unload_model(model);
															
 
																 	return ret;
															
 
																 }
															
--- a/src/core/perfmodel/perfmodel_print.c
+++ b/src/core/perfmodel/perfmodel_print.c
@@ -19,6 +19,7 @@
 
																 #include <starpu.h>
															
 
																 #include <starpu_perfmodel.h>
															
 
																 #include <common/config.h>
															
 
																+#include <core/workers.h>
															
 
																 #include "perfmodel.h"
															
 
																 static
															
--- a/src/core/perfmodel/regression.c
+++ b/src/core/perfmodel/regression.c
@@ -20,7 +20,32 @@
 
																 #define MAXREGITER	1000
															
 
																 #define EPS 1.0e-10
															
 
																-static double compute_b(double c, unsigned n, unsigned *x, double *y)
															
 
																+/* For measurements close to C, we do not want to try to fit, since we are
															
 
																+   fitting the distance to C, which won't actually really get smaller */
															
 
																+#define C_RADIUS 1
															
 
																+
															
 
																+/*
															
 
																+ * Smoothly ramp from 0 to 1 while x goes from 0 to 1, using two parabola
															
 
																+ * arcs that meet at x = 0.5 with the same value (0.5) and the same slope (2).
															
 
																+ * <= 0: stay 0
															
 
																+ * >= 1: stay 1 */
															
 
																+static double level(double x)
															
 
																+{
															
 
																+	if (x <= 0.)
															
 
																+		return 0.;
															
 
																+	if (x >= 1.)
															
 
																+		return 1.;
															
 
																+	/* Lower half: 2x^2 leaves 0 with a zero slope and reaches 0.5 at x = 0.5 */
															
 
																+	if (x < 0.5)
															
 
																+		return 2*x*x;
															
 
																+	/* Upper half: -2x^2+4x-1 continues from 0.5 and reaches 1 with a zero slope */
															
 
																+	return -2*x*x+4*x-1;
															
 
																+}
															
 
																+
															
 
																+/*
															
 
																+ * Scale the population pop of a sample according to how far its measurement y
															
 
																+ * lies from c, relative to c: the result is pop multiplied by level() applied
															
 
																+ * to (relative distance - C_RADIUS) / C_RADIUS.
															
 
																+ */
															
 
																+static double fixpop(unsigned pop, double c, double y)
															
 
																+{
															
 
																+	const double rel = (y-c)/c;
															
 
																+	const double ramp = level((rel - C_RADIUS) / C_RADIUS);
															
 
																+
															
 
																+	return pop * ramp;
															
 
																+}
															
 
																+
															
 
																+static double compute_b(double c, unsigned n, size_t *x, double *y, unsigned *pop)
															
 
																 {
															
 
																 	double b;
															
@@ -29,43 +54,55 @@ static double compute_b(double c, unsigned n, unsigned *x, double *y)
 
																 	double sumx = 0.0;
															
 
																 	double sumx2 = 0.0;
															
 
																 	double sumy = 0.0;
															
 
																+	double nn = 0;
															
 
																 	unsigned i;
															
 
																 	for (i = 0; i < n; i++)
															
 
																 	{
															
 
																 		double xi = log(x[i]);
															
 
																 		double yi = log(y[i]-c);
															
 
																+		double popi = fixpop(pop[i], c, y[i]);
															
 
																+		if (popi <= 0)
															
 
																+			continue;
															
 
																+
															
 
																+		sumxy += xi*yi*popi;
															
 
																+		sumx += xi*popi;
															
 
																+		sumx2 += xi*xi*popi;
															
 
																+		sumy += yi*popi;
															
 
																-		sumxy += xi*yi;
															
 
																-		sumx += xi;
															
 
																-		sumx2 += xi*xi;
															
 
																-		sumy += yi;
															
 
																+		nn += popi;
															
 
																 	}
															
 
																-	b = (n * sumxy - sumx * sumy) / (n*sumx2 - sumx*sumx);
															
 
																+	b = (nn * sumxy - sumx * sumy) / (nn*sumx2 - sumx*sumx);
															
 
																 	return b;
															
 
																 }
															
 
																-static double compute_a(double c, double b, unsigned n, unsigned *x, double *y)
															
 
																+static double compute_a(double c, double b, unsigned n, size_t *x, double *y, unsigned *pop)
															
 
																 {
															
 
																 	double a;
															
 
																 	/* X = log (x) , Y = log (y - c) */
															
 
																 	double sumx = 0.0;
															
 
																 	double sumy = 0.0;
															
 
																+	double nn = 0;
															
 
																 	unsigned i;
															
 
																 	for (i = 0; i < n; i++)
															
 
																 	{
															
 
																 		double xi = log(x[i]);
															
 
																 		double yi = log(y[i]-c);
															
 
																+		double popi = fixpop(pop[i], c, y[i]);
															
 
																+		if (popi <= 0)
															
 
																+			continue;
															
 
																-		sumx += xi;
															
 
																-		sumy += yi;
															
 
																+		sumx += xi*popi;
															
 
																+		sumy += yi*popi;
															
 
																+
															
 
																+		nn += popi;
															
 
																 	}
															
 
																-	a = (sumy - b*sumx) / n;
															
 
																+	a = (sumy - b*sumx) / nn;
															
 
																 	return a;
															
 
																 }
															
@@ -73,7 +110,7 @@ static double compute_a(double c, double b, unsigned n, unsigned *x, double *y)
 
																 /* returns r */
															
 
																-static double test_r(double c, unsigned n, unsigned *x, double *y)
															
 
																+static double test_r(double c, unsigned n, size_t *x, double *y, unsigned *pop) /* weighted correlation coefficient between X = log(x) and Y = log(y - c), each point weighted by fixpop(pop[i], c, y[i]) */
															
 
																 {
															
 
																 	double r;
															
@@ -85,20 +122,26 @@ static double test_r(double c, unsigned n, unsigned *x, double *y)
 
																 	double sumx2 = 0.0;
															
 
																 	double sumy = 0.0;
															
 
																 	double sumy2 = 0.0;
															
 
																+	double nn = 0; /* sum of the weights of the points actually used; replaces the plain count n */
															
 
																 	unsigned i;
															
 
																 	for (i = 0; i < n; i++)
															
 
																 	{
															
 
																 		double xi = log(x[i]);
															
 
																 		double yi = log(y[i]-c);
															
 
																+		double popi = fixpop(pop[i], c, y[i]); /* weight of point i as returned by fixpop() */
															
 
																+		if (popi <= 0) /* a non-positive weight excludes the point from every sum */
															
 
																+			continue;
															
 
																 	//	printf("Xi = %e, Yi = %e\n", xi, yi);
															
 
																-		sumxy += xi*yi;
															
 
																-		sumx += xi;
															
 
																-		sumx2 += xi*xi;
															
 
																-		sumy += yi;
															
 
																-		sumy2 += yi*yi;
															
 
																+		sumxy += xi*yi*popi; /* every accumulator below is weighted by popi */
															
 
																+		sumx += xi*popi;
															
 
																+		sumx2 += xi*xi*popi;
															
 
																+		sumy += yi*popi;
															
 
																+		sumy2 += yi*yi*popi;
															
 
																+
															
 
																+		nn += popi;
															
 
																 	}
															
 
																 	//printf("sumxy %e\n", sumxy);
															
@@ -107,7 +150,7 @@ static double test_r(double c, unsigned n, unsigned *x, double *y)
 
																 	//printf("sumy %e\n", sumy);
															
 
																 	//printf("sumy2 %e\n", sumy2);
															
 
																-	r = (n * sumxy - sumx * sumy) / sqrt( (n* sumx2 - sumx*sumx) * (n*sumy2 - sumy*sumy) );
															
 
																+	r = (nn * sumxy - sumx * sumy) / sqrt( (nn* sumx2 - sumx*sumx) * (nn*sumy2 - sumy*sumy) ); /* NOTE(review): the denominator is 0 when all points were skipped or all X (or all Y) are equal */
															
 
																 	return r;
															
 
																 }
															
@@ -127,20 +170,29 @@ static unsigned find_list_size(struct starpu_perfmodel_history_list *list_histor
 
																 	return cnt;
															
 
																 }
															
 
																-static double find_list_min(double *y, unsigned n)
															
 
																+static int compar(const void *_a, const void *_b) /* qsort() comparator: orders doubles ascending, returns -1, 0 or 1 */
															
 
																 {
															
 
																-	double min = DBL_MAX;
															
 
																+	double a = *(const double*) _a; /* read through a const pointer: the qsort() arguments are const-qualified */
															
 
																+	double b = *(const double*) _b;
															
 
																+	if (a < b)
															
 
																+		return -1;
															
 
																+	if (a > b)
															
 
																+		return 1;
															
 
																+	return 0; /* also reached when either value is NaN, since both comparisons are then false */
															
 
																+}
															
 
																-	unsigned i;
															
 
																-	for (i = 0; i < n; i++)
															
 
																-	{
															
 
																-		min = STARPU_MIN(min, y[i]);
															
 
																-	}
															
 
																+static double get_list_fourth(double *y, unsigned n) /* returns the element at index n/3 of an ascending-sorted copy of y[0..n-1]; y itself is not modified */
															
 
																+{
															
 
																+	double sorted[n]; /* NOTE(review): stack VLA sized by the caller; with n == 0 the read of sorted[n/3] below would be out of bounds */
															
 
																+
															
 
																+	memcpy(sorted, y, n * sizeof(*sorted)); /* work on a copy so the caller's ordering is preserved */
															
 
																+
															
 
																+	qsort(sorted, n, sizeof(*sorted), compar);
															
 
																-	return min;
															
 
																+	return sorted[n/3]; /* despite the name, this is the value one third of the way up the sorted list */
															
 
																 }
															
 
																-static void dump_list(unsigned *x, double *y, struct starpu_perfmodel_history_list *list_history)
															
 
																+static void dump_list(size_t *x, double *y, unsigned *pop, struct starpu_perfmodel_history_list *list_history)
															
 
																 {
															
 
																 	struct starpu_perfmodel_history_list *ptr = list_history;
															
 
																 	unsigned i = 0;
															
@@ -151,6 +203,7 @@ static void dump_list(unsigned *x, double *y, struct starpu_perfmodel_history_li
 
																 		{
															
 
																 			x[i] = ptr->entry->size;
															
 
																 			y[i] = ptr->entry->mean;
															
 
																+			pop[i] = ptr->entry->nsample;
															
 
																 			i++;
															
 
																 		}
															
@@ -163,52 +216,72 @@ static void dump_list(unsigned *x, double *y, struct starpu_perfmodel_history_li
 
																  * 	return 0 if success, -1 otherwise
															
 
																  * 	if success, a, b and c are modified
															
 
																  * */
															
 
																+
															
 
																+/* Fits y = a * x^b + c by linear regression on X = log(x), Y = log(y - c). For the rationale, see
															
 
																+ * Appendix B of Cedric Augonnet's PhD thesis, "Scheduling Tasks over Multicore machines enhanced with
															
 
																+ * Accelerators: a Runtime System's Perspective". */
															
 
																 int _starpu_regression_non_linear_power(struct starpu_perfmodel_history_list *ptr, double *a, double *b, double *c)
															
 
																 {
															
 
																 	unsigned n = find_list_size(ptr);
															
 
																-	STARPU_ASSERT(n);
															
 
																+	if (!n) /* empty history: nothing to fit, report failure instead of asserting */
															
 
																+		return -1;
															
 
																-	unsigned *x;
															
 
																-	_STARPU_MALLOC(x, n*sizeof(unsigned));
															
 
																+	size_t *x;
															
 
																+	_STARPU_MALLOC(x, n*sizeof(size_t));
															
 
																 	double *y;
															
 
																 	_STARPU_MALLOC(y, n*sizeof(double));
															
 
																 	STARPU_ASSERT(y);
															
 
																-	dump_list(x, y, ptr);
															
 
																+	unsigned *pop;
															
 
																+	_STARPU_MALLOC(pop, n*sizeof(unsigned));
															
 
																+	STARPU_ASSERT(pop); /* check the buffer that was just allocated (was a second check of y) */
															
 
																+
															
 
																+	dump_list(x, y, pop, ptr);
															
 
																 	double cmin = 0.0;
															
 
																-	double cmax = find_list_min(y, n);
															
 
																+	double cmax = get_list_fourth(y, n); /* upper bound of the search interval for c */
															
 
																 	unsigned iter;
															
 
																 	double err = 100000.0;
															
 
																+/*
															
 
																+	unsigned i;
															
 
																+	for (i = 0; i < 100; i++)
															
 
																+	{
															
 
																+		double ci = cmin + (cmax-cmin)*i/100.;
															
 
																+		fprintf(stderr,"%f: %f\n", ci, 1.0 - test_r(ci, n, x, y, pop));
															
 
																+	}
															
 
																+*/
															
 
																+
															
 
																+	/* Use dichotomy to find c that gives the best matching */
															
 
																 	for (iter = 0; iter < MAXREGITER; iter++)
															
 
																 	{
															
 
																 		double c1, c2;
															
 
																 		double r1, r2;
															
 
																-		double radius = 0.01;
															
 
																-
															
 
																-		c1 = cmin + (0.5-radius)*(cmax - cmin);
															
 
																-		c2 = cmin + (0.5+radius)*(cmax - cmin);
															
 
																+		c1 = cmin + (0.33)*(cmax - cmin); /* probe points at 33% and 67% of [cmin, cmax] */
															
 
																+		c2 = cmin + (0.67)*(cmax - cmin);
															
 
																-		r1 = test_r(c1, n, x, y);
															
 
																-		r2 = test_r(c2, n, x, y);
															
 
																+		r1 = test_r(c1, n, x, y, pop);
															
 
																+		r2 = test_r(c2, n, x, y, pop);
															
 
																 		double err1, err2;
															
 
																 		err1 = fabs(1.0 - r1);
															
 
																 		err2 = fabs(1.0 - r2);
															
 
																+		//fprintf(stderr,"%f - %f: %f - %f: %f - %f\n", cmin, c1, err1, c2, err2, cmax);
															
 
																+
															
 
																 		if (err1 < err2)
															
 
																 		{
															
 
																-			cmax = (cmin + cmax)/2;
															
 
																+			/* 1 is better */
															
 
																+			cmax = c2; /* keep [cmin, c2]: drops the top third of the interval */
															
 
																 		}
															
 
																 		else
															
 
																 		{
															
 
																 			/* 2 is better */
															
 
																-			cmin = (cmin + cmax)/2;
															
 
																+			cmin = c1; /* keep [c1, cmax]: drops the bottom third of the interval */
															
 
																 		}
															
 
																 		if (fabs(err - STARPU_MIN(err1, err2)) < EPS)
															
@@ -219,11 +292,12 @@ int _starpu_regression_non_linear_power(struct starpu_perfmodel_history_list *pt
 
																 	*c = (cmin + cmax)/2;
															
 
																-	*b = compute_b(*c, n, x, y);
															
 
																-	*a = exp(compute_a(*c, *b, n, x, y));
															
 
																+	*b = compute_b(*c, n, x, y, pop);
															
 
																+	*a = exp(compute_a(*c, *b, n, x, y, pop)); /* compute_a() returns the intercept in log space */
															
 
																 	free(x);
															
 
																 	free(y);
															
 
																+	free(pop);
															
 
																 	return 0;
															
 
																 }
															
--- a/src/core/sched_ctx.c
+++ b/src/core/sched_ctx.c
@@ -21,6 +21,7 @@
 
																 #include <common/utils.h>
															
 
																 #include <stdarg.h>
															
 
																 #include <core/task.h>
															
 
																+#include <core/workers.h>
															
 
																 enum _starpu_ctx_change_op
															
 
																 {
															
--- a/src/core/sched_policy.c
+++ b/src/core/sched_policy.c
@@ -22,6 +22,7 @@
 
																 #include <common/utils.h>
															
 
																 #include <core/sched_policy.h>
															
 
																 #include <profiling/profiling.h>
															
 
																+#include <datawizard/memory_nodes.h>
															
 
																 #include <common/barrier.h>
															
 
																 #include <core/debug.h>
															
 
																 #include <core/task.h>
															
--- a/src/core/simgrid.c
+++ b/src/core/simgrid.c
@@ -58,6 +58,8 @@ extern int _starpu_mpi_simgrid_init(int argc, char *argv[]);
 
																 extern void smpi_process_set_user_data(void *);
															
 
																 #endif
															
 
																+static double _starpu_simgrid_dynamic_energy = 0.0;
															
 
																+
															
 
																 /* 1 when MSG_init was done, 2 when initialized through redirected main, 3 when
															
 
																  * initialized through MSG_process_attach */
															
 
																 static int simgrid_started;
															
@@ -629,6 +631,7 @@ struct task
 
																 #else
															
 
																 	msg_task_t task;
															
 
																 #endif
															
 
																+	double energy;
															
 
																 	/* communication termination signalization */
															
 
																 	unsigned *finished;
															
@@ -666,6 +669,7 @@ static void *task_execute(void *arg)
 
																 		MSG_task_execute(task->task);
															
 
																 		MSG_task_destroy(task->task);
															
 
																 #endif
															
 
																+		starpu_energy_use(task->energy);
															
 
																 		_STARPU_DEBUG("task %p finished\n", task);
															
 
																 		*task->finished = 1;
															
@@ -702,7 +706,7 @@ void _starpu_simgrid_wait_tasks(int workerid)
 
																 }
															
 
																 /* Task execution submitted by StarPU */
															
 
																-void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length, unsigned *finished)
															
 
																+void _starpu_simgrid_submit_job(int workerid, int sched_ctx_id, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length, double energy, unsigned *finished)
															
 
																 {
															
 
																 	struct starpu_task *starpu_task = j->task;
															
 
																 	double flops;
															
@@ -717,13 +721,19 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 
																 	if (isnan(length))
															
 
																 	{
															
 
																-		length = starpu_task_expected_length(starpu_task, perf_arch, j->nimpl);
															
 
																+		length = starpu_task_worker_expected_length(starpu_task, workerid, sched_ctx_id, j->nimpl);
															
 
																 		STARPU_ASSERT_MSG(!_STARPU_IS_ZERO(length) && !isnan(length),
															
 
																 				  "Codelet %s does not have a perfmodel (in directory %s), or is not calibrated enough, please re-run in non-simgrid mode until it is calibrated",
															
 
																 				  _starpu_job_get_model_name(j), _starpu_get_perf_model_dir_codelet());
															
 
																                 /* TODO: option to add variance according to performance model,
															
 
																                  * to be able to easily check scheduling robustness */
															
 
																 	}
															
 
																+	if (isnan(energy))
															
 
																+	{
															
 
																+		energy = starpu_task_worker_expected_energy(starpu_task, workerid, sched_ctx_id, j->nimpl);
															
 
																+		/* TODO: option to add variance according to performance model,
															
 
																+		 * to be able to easily check scheduling robustness */
															
 
																+	}
															
 
																 #if defined(HAVE_SG_HOST_SPEED) || defined(sg_host_speed)
															
 
																 #  if defined(HAVE_SG_HOST_SELF) || defined(sg_host_self)
															
@@ -754,6 +764,7 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 
																 		MSG_task_execute(simgrid_task);
															
 
																 		MSG_task_destroy(simgrid_task);
															
 
																 #endif
															
 
																+		starpu_energy_use(energy);
															
 
																 	}
															
 
																 	else
															
 
																 	{
															
@@ -766,6 +777,7 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 
																 #else
															
 
																 		task->task = simgrid_task;
															
 
																 #endif
															
 
																+		task->energy = energy;
															
 
																 		task->finished = finished;
															
 
																 		*finished = 0;
															
 
																 		task->next = NULL;
															
@@ -1391,5 +1403,15 @@ void _starpu_simgrid_data_transfer(size_t size, unsigned src_node, unsigned dst_
 
																 }
															
 
																 #endif
															
 
																+void starpu_energy_use(float joules) /* adds joules to the file-scope accumulator _starpu_simgrid_dynamic_energy */
															
 
																+{
															
 
																+	_starpu_simgrid_dynamic_energy += joules; /* the accumulator is a double, so the float argument is widened before the addition */
															
 
																+}
															
 
																+
															
 
																+double starpu_energy_used(void) /* returns the accumulated dynamic energy plus an idle term proportional to starpu_timing_now() */
															
 
																+{
															
 
																+	float idle_power = starpu_get_env_float_default("STARPU_IDLE_POWER", 0.0); /* read from the STARPU_IDLE_POWER environment variable, with 0.0 passed as the default */
															
 
																+	return _starpu_simgrid_dynamic_energy + idle_power * starpu_timing_now() / 1000000; /* presumably starpu_timing_now() is in microseconds and the division converts to seconds - TODO confirm */
															
 
																+}
															
 
																 #endif
															
--- a/src/core/simgrid.h
+++ b/src/core/simgrid.h
@@ -66,7 +66,7 @@ void _starpu_simgrid_deinit_late(void);
 
																 void _starpu_simgrid_actor_setup(void);
															
 
																 void _starpu_simgrid_wait_tasks(int workerid);
															
 
																 struct _starpu_job;
															
 
																-void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *job, struct starpu_perfmodel_arch* perf_arch, double length, unsigned *finished);
															
 
																+void _starpu_simgrid_submit_job(int workerid, int sched_ctx_id, struct _starpu_job *job, struct starpu_perfmodel_arch* perf_arch, double length, double energy, unsigned *finished);
															
 
																 struct _starpu_data_request;
															
 
																 int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node, struct _starpu_data_request *req);
															
 
																 union _starpu_async_channel_event;
															
--- a/src/core/task.c
+++ b/src/core/task.c
@@ -30,6 +30,7 @@
 
																 #include <common/utils.h>
															
 
																 #include <common/fxt.h>
															
 
																 #include <common/knobs.h>
															
 
																+#include <datawizard/memory_nodes.h>
															
 
																 #include <profiling/profiling.h>
															
 
																 #include <profiling/bound.h>
															
 
																 #include <math.h>
															
--- a/src/core/topology.c
+++ b/src/core/topology.c
@@ -1983,7 +1983,11 @@ int _starpu_bind_thread_on_cpu(int cpuid STARPU_ATTRIBUTE_UNUSED, int workerid S
 
																 		{
															
 
																 			cpu_worker[cpuid] = workerid;
															
 
																 			if (name)
															
 
																+			{
															
 
																+				if (cpu_name[cpuid])
															
 
																+					free(cpu_name[cpuid]);
															
 
																 				cpu_name[cpuid] = strdup(name);
															
 
																+			}
															
 
																 		}
															
 
																 	}
															
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -1059,6 +1059,7 @@ int starpu_conf_init(struct starpu_conf *conf)
 
																 	memset(conf, 0, sizeof(*conf));
															
 
																 	conf->magic = 42;
															
 
																+	conf->will_use_mpi = 0;
															
 
																 	conf->sched_policy_name = starpu_getenv("STARPU_SCHED");
															
 
																 	conf->sched_policy = NULL;
															
 
																 	conf->global_sched_ctx_min_priority = starpu_get_env_number("STARPU_MIN_PRIO");
															
@@ -1666,6 +1667,15 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 
																 	_starpu_catch_signals();
															
 
																+	/* if MPI is enabled, binding display will be done later, after MPI initialization */
															
 
																+	if (!_starpu_config.conf.will_use_mpi && starpu_get_env_number_default("STARPU_DISPLAY_BINDINGS", 0))
															
 
																+	{
															
 
																+		fprintf(stdout, "== Binding ==\n");
															
 
																+		starpu_display_bindings();
															
 
																+		fprintf(stdout, "== End of binding ==\n");
															
 
																+		fflush(stdout);
															
 
																+	}
															
 
																+
															
 
																 	return 0;
															
 
																 }
															
@@ -2644,31 +2654,37 @@ int starpu_worker_get_relax_state(void)
 
																 	return _starpu_worker_get_relax_state();
															
 
																 }
															
 
																+#undef starpu_worker_lock /* workers.h now #defines this name to the inline _starpu_worker_lock; drop the macro so the real function can be defined here */
															
 
																 void starpu_worker_lock(int workerid)
															
 
																 {
															
 
																 	_starpu_worker_lock(workerid);
															
 
																 }
															
 
																+#undef starpu_worker_trylock /* workers.h now #defines this name to the inline _starpu_worker_trylock; drop the macro so the real function can be defined here */
															
 
																 int starpu_worker_trylock(int workerid)
															
 
																 {
															
 
																 	return _starpu_worker_trylock(workerid);
															
 
																 }
															
 
																+#undef starpu_worker_unlock /* workers.h now #defines this name to the inline _starpu_worker_unlock; drop the macro so the real function can be defined here */
															
 
																 void starpu_worker_unlock(int workerid)
															
 
																 {
															
 
																 	_starpu_worker_unlock(workerid);
															
 
																 }
															
 
																+#undef starpu_worker_lock_self /* workers.h now #defines this name to the inline _starpu_worker_lock_self; drop the macro so the real function can be defined here */
															
 
																 void starpu_worker_lock_self(void)
															
 
																 {
															
 
																 	_starpu_worker_lock_self();
															
 
																 }
															
 
																+#undef starpu_worker_unlock_self /* workers.h now #defines this name to the inline _starpu_worker_unlock_self; drop the macro so the real function can be defined here */
															
 
																 void starpu_worker_unlock_self(void)
															
 
																 {
															
 
																 	_starpu_worker_unlock_self();
															
 
																 }
															
 
																+#undef starpu_wake_worker_relax
															
 
																 int starpu_wake_worker_relax(int workerid)
															
 
																 {
															
 
																 	return _starpu_wake_worker_relax(workerid);
															
--- a/src/core/workers.h
+++ b/src/core/workers.h
@@ -203,6 +203,10 @@ LIST_TYPE(_starpu_worker,
 
																 	int enable_knob;
															
 
																 	int bindid_requested;
															
 
																+
															
 
																+	/* Keep this last, to make sure to separate worker data in separate
															
 
																+	  cache lines. */
															
 
																+	char padding[STARPU_CACHELINE_SIZE];
															
 
																 );
															
 
																 struct _starpu_combined_worker
															
@@ -223,6 +227,10 @@ struct _starpu_combined_worker
 
																 #ifdef STARPU_HAVE_HWLOC
															
 
																 	hwloc_bitmap_t hwloc_cpu_set;
															
 
																 #endif
															
 
																+
															
 
																+	/* Keep this last, to make sure to separate worker data in separate
															
 
																+	  cache lines. */
															
 
																+	char padding[STARPU_CACHELINE_SIZE];
															
 
																 };
															
 
																 /**
															
@@ -389,6 +397,9 @@ struct _starpu_machine_config
 
																 	/** Memory node for MPI, if only one */
															
 
																 	int mpi_nodeid;
															
 
																+	/* Separate out previous variables from per-worker data. */
															
 
																+	char padding1[STARPU_CACHELINE_SIZE];
															
 
																+
															
 
																 	/** Basic workers : each of this worker is running its own driver and
															
 
																 	 * can be combined with other basic workers. */
															
 
																 	struct _starpu_worker workers[STARPU_NMAXWORKERS];
															
@@ -397,6 +408,11 @@ struct _starpu_machine_config
 
																 	 * that can run parallel tasks together. */
															
 
																 	struct _starpu_combined_worker combined_workers[STARPU_NMAX_COMBINEDWORKERS];
															
 
																+	starpu_pthread_mutex_t submitted_mutex;
															
 
																+
															
 
																+	/* Separate out previous mutex from the rest of the data. */
															
 
																+	char padding2[STARPU_CACHELINE_SIZE];
															
 
																+
															
 
																 	/** Translation table from bindid to worker IDs */
															
 
																 	struct
															
 
																 	{
															
@@ -432,8 +448,6 @@ struct _starpu_machine_config
 
																 	/** When >0, StarPU should stop performance counters collection. */
															
 
																 	int perf_counter_pause_depth;
															
 
																-
															
 
																-	starpu_pthread_mutex_t submitted_mutex;
															
 
																 };
															
 
																 extern int _starpu_worker_parallel_blocks;
															
@@ -1103,6 +1117,7 @@ static inline void _starpu_worker_lock(int workerid)
 
																 		STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
															
 
																 	}
															
 
																 }
															
 
																+#define starpu_worker_lock _starpu_worker_lock
															
 
																 static inline int _starpu_worker_trylock(int workerid)
															
 
																 {
															
@@ -1133,6 +1148,7 @@ static inline int _starpu_worker_trylock(int workerid)
 
																 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&cur_worker->sched_mutex);
															
 
																 	return ret;
															
 
																 }
															
 
																+#define starpu_worker_trylock _starpu_worker_trylock
															
 
																 static inline void _starpu_worker_unlock(int workerid)
															
 
																 {
															
@@ -1145,6 +1161,7 @@ static inline void _starpu_worker_unlock(int workerid)
 
																 		starpu_worker_relax_off();
															
 
																 	}
															
 
																 }
															
 
																+#define starpu_worker_unlock _starpu_worker_unlock
															
 
																 static inline void _starpu_worker_lock_self(void)
															
 
																 {
															
@@ -1153,6 +1170,7 @@ static inline void _starpu_worker_lock_self(void)
 
																 	STARPU_ASSERT(worker != NULL);
															
 
																 	STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
															
 
																 }
															
 
																+#define starpu_worker_lock_self _starpu_worker_lock_self
															
 
																 static inline void _starpu_worker_unlock_self(void)
															
 
																 {
															
@@ -1161,6 +1179,7 @@ static inline void _starpu_worker_unlock_self(void)
 
																 	STARPU_ASSERT(worker != NULL);
															
 
																 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
															
 
																 }
															
 
																+#define starpu_worker_unlock_self _starpu_worker_unlock_self
															
 
																 static inline int _starpu_wake_worker_relax(int workerid)
															
 
																 {
															
@@ -1169,6 +1188,7 @@ static inline int _starpu_wake_worker_relax(int workerid)
 
																 	_starpu_worker_unlock(workerid);
															
 
																 	return ret;
															
 
																 }
															
 
																+#define starpu_wake_worker_relax _starpu_wake_worker_relax
															
 
																 int starpu_wake_worker_relax_light(int workerid);
															
--- a/src/datawizard/data_request.c
+++ b/src/datawizard/data_request.c
@@ -25,6 +25,9 @@
 
																 #include <core/simgrid.h>
															
 
																 /* requests that have not been treated at all */
															
 
																+#ifdef STARPU_DEVEL
															
 
																+#warning split into separate out/in queues for each node, so that MAX_PENDING_REQUESTS_PER_NODE is separate for them, since the links are bidirectional
															
 
																+#endif
															
 
																 static struct _starpu_data_request_prio_list data_requests[STARPU_MAXNODES];
															
 
																 static struct _starpu_data_request_prio_list prefetch_requests[STARPU_MAXNODES];
															
 
																 static struct _starpu_data_request_prio_list idle_requests[STARPU_MAXNODES];
															
--- a/src/datawizard/filters.c
+++ b/src/datawizard/filters.c
@@ -21,6 +21,7 @@
 
																 #include <datawizard/filters.h>
															
 
																 #include <datawizard/footprint.h>
															
 
																 #include <datawizard/interfaces/data_interface.h>
															
 
																+#include <datawizard/memory_nodes.h>
															
 
																 #include <core/task.h>
															
 
																 /*
															
--- a/src/datawizard/interfaces/bcsr_interface.c
+++ b/src/datawizard/interfaces/bcsr_interface.c
@@ -15,6 +15,9 @@
 
																  */
															
 
																 #include <starpu.h>
															
 
																+#ifdef BUILDING_STARPU
															
 
																+#include <datawizard/memory_nodes.h>
															
 
																+#endif
															
 
																 /*
															
 
																  * BCSR : blocked CSR, we use blocks of size (r x c)
															
--- a/src/datawizard/interfaces/block_interface.c
+++ b/src/datawizard/interfaces/block_interface.c
@@ -15,6 +15,9 @@
 
																  */
															
 
																 #include <starpu.h>
															
 
																+#ifdef BUILDING_STARPU
															
 
																+#include <datawizard/memory_nodes.h>
															
 
																+#endif
															
 
																 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
															
--- a/src/datawizard/interfaces/coo_interface.c
+++ b/src/datawizard/interfaces/coo_interface.c
@@ -15,6 +15,9 @@
 
																  */
															
 
																 #include <starpu.h>
															
 
																+#ifdef BUILDING_STARPU
															
 
																+#include <datawizard/memory_nodes.h>
															
 
																+#endif
															
 
																 static int
															
 
																 copy_any_to_any(void *src_interface, unsigned src_node,
															
--- a/src/datawizard/interfaces/csr_interface.c
+++ b/src/datawizard/interfaces/csr_interface.c
@@ -16,6 +16,9 @@
 
																  */
															
 
																 #include <starpu.h>
															
 
																+#ifdef BUILDING_STARPU
															
 
																+#include <datawizard/memory_nodes.h>
															
 
																+#endif
															
 
																 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
															
--- a/src/datawizard/interfaces/matrix_interface.c
+++ b/src/datawizard/interfaces/matrix_interface.c
@@ -15,6 +15,9 @@
 
																  */
															
 
																 #include <starpu.h>
															
 
																+#ifdef BUILDING_STARPU
															
 
																+#include <datawizard/memory_nodes.h>
															
 
																+#endif
															
 
																 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
															
--- a/src/datawizard/interfaces/multiformat_interface.c
+++ b/src/datawizard/interfaces/multiformat_interface.c
@@ -15,6 +15,9 @@
 
																  */
															
 
																 #include <starpu.h>
															
 
																+#ifdef BUILDING_STARPU
															
 
																+#include <datawizard/memory_nodes.h>
															
 
																+#endif
															
 
																 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
															
 
																 #ifdef STARPU_USE_CUDA
															
--- a/src/datawizard/interfaces/tensor_interface.c
+++ b/src/datawizard/interfaces/tensor_interface.c
@@ -15,6 +15,9 @@
 
																  */
															
 
																 #include <starpu.h>
															
 
																+#ifdef BUILDING_STARPU
															
 
																+#include <datawizard/memory_nodes.h>
															
 
																+#endif
															
 
																 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
															
--- a/src/datawizard/interfaces/variable_interface.c
+++ b/src/datawizard/interfaces/variable_interface.c
@@ -15,6 +15,9 @@
 
																  */
															
 
																 #include <starpu.h>
															
 
																+#ifdef BUILDING_STARPU
															
 
																+#include <datawizard/memory_nodes.h>
															
 
																+#endif
															
 
																 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
															
--- a/src/datawizard/interfaces/vector_interface.c
+++ b/src/datawizard/interfaces/vector_interface.c
@@ -15,6 +15,9 @@
 
																  */
															
 
																 #include <starpu.h>
															
 
																+#ifdef BUILDING_STARPU
															
 
																+#include <datawizard/memory_nodes.h>
															
 
																+#endif
															
 
																 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
															
--- a/src/datawizard/interfaces/void_interface.c
+++ b/src/datawizard/interfaces/void_interface.c
@@ -15,6 +15,9 @@
 
																  */
															
 
																 #include <starpu.h>
															
 
																+#ifdef BUILDING_STARPU
															
 
																+#include <datawizard/memory_nodes.h>
															
 
																+#endif
															
 
																 static int dummy_copy(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
															
--- a/src/datawizard/memory_manager.c
+++ b/src/datawizard/memory_manager.c
@@ -19,6 +19,7 @@
 
																 #include <common/thread.h>
															
 
																 #include <common/fxt.h>
															
 
																 #include <datawizard/memory_manager.h>
															
 
																+#include <datawizard/memory_nodes.h>
															
 
																 #include <core/workers.h>
															
 
																 #include <starpu_stdlib.h>
															
--- a/src/datawizard/reduction.c
+++ b/src/datawizard/reduction.c
@@ -22,6 +22,7 @@
 
																 #include <datawizard/datawizard.h>
															
 
																 #include <drivers/mic/driver_mic_source.h>
															
 
																 #include <drivers/mp_common/source_common.h>
															
 
																+#include <datawizard/memory_nodes.h>
															
 
																 void starpu_data_set_reduction_methods(starpu_data_handle_t handle,
															
 
																 				       struct starpu_codelet *redux_cl,
															
--- a/src/datawizard/user_interactions.c
+++ b/src/datawizard/user_interactions.c
@@ -22,6 +22,7 @@
 
																 #include <datawizard/write_back.h>
															
 
																 #include <core/dependencies/data_concurrency.h>
															
 
																 #include <core/sched_policy.h>
															
 
																+#include <datawizard/memory_nodes.h>
															
 
																 static void _starpu_data_check_initialized(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
															
 
																 {
															
--- a/src/datawizard/write_back.c
+++ b/src/datawizard/write_back.c
@@ -17,6 +17,7 @@
 
																 #include <datawizard/datawizard.h>
															
 
																 #include <datawizard/write_back.h>
															
 
																 #include <core/dependencies/data_concurrency.h>
															
 
																+#include <datawizard/memory_nodes.h>
															
 
																 static void wt_callback(void *arg)
															
 
																 {
															
@@ -63,7 +64,7 @@ void _starpu_write_through_data(starpu_data_handle_t handle, unsigned requesting
 
																 				struct _starpu_data_request *r;
															
 
																 				r = _starpu_create_request_to_fetch_data(handle, &handle->per_node[node],
															
 
																-									 STARPU_R, 1, 1, wt_callback, handle, 0, "_starpu_write_through_data");
															
 
																+									 STARPU_R, 2, 1, wt_callback, handle, 0, "_starpu_write_through_data");
															
 
																 			        /* If no request was created, the handle was already up-to-date on the
															
 
																 			         * node */
															
--- a/src/debug/traces/starpu_fxt.c
+++ b/src/debug/traces/starpu_fxt.c
@@ -1194,8 +1194,8 @@ static void handle_new_mem_node(struct fxt_ev_64 *ev, struct starpu_fxt_options
 
																  */
															
 
																 static int create_ordered_stream_id (int nodeid, int devid)
															
 
																 {
															
 
																-	static int stable[MAX_MPI_NODES][STARPU_MAXCUDADEVS];
															
 
																-	STARPU_ASSERT(nodeid < MAX_MPI_NODES);
															
 
																+	static int stable[STARPU_FXT_MAX_FILES][STARPU_MAXCUDADEVS];
															
 
																+	STARPU_ASSERT(nodeid < STARPU_FXT_MAX_FILES);
															
 
																 	STARPU_ASSERT(devid < STARPU_MAXCUDADEVS);
															
 
																 	return stable[nodeid][devid]++;
															
 
																 }
															
--- a/src/debug/traces/starpu_fxt.h
+++ b/src/debug/traces/starpu_fxt.h
@@ -41,8 +41,6 @@
 
																 #include <starpu.h>
															
 
																 #include "../../../include/starpu_fxt.h"
															
 
																-#define MAX_MPI_NODES 64
															
 
																-
															
 
																 extern char _starpu_last_codelet_symbol[STARPU_NMAXWORKERS][(FXT_MAX_PARAMS-5)*sizeof(unsigned long)];
															
 
																 void _starpu_fxt_dag_init(char *dag_filename);
															
--- a/src/debug/traces/starpu_fxt_mpi.c
+++ b/src/debug/traces/starpu_fxt_mpi.c
@@ -103,27 +103,27 @@ int _starpu_fxt_mpi_find_sync_point(char *filename_in, uint64_t *offset, int *ke
 
																  */
															
 
																 /* the list of MPI transfers found in the different traces */
															
 
																-static struct mpi_transfer *mpi_sends[MAX_MPI_NODES] = {NULL};
															
 
																-static struct mpi_transfer *mpi_recvs[MAX_MPI_NODES] = {NULL};
															
 
																+static struct mpi_transfer *mpi_sends[STARPU_FXT_MAX_FILES] = {NULL};
															
 
																+static struct mpi_transfer *mpi_recvs[STARPU_FXT_MAX_FILES] = {NULL};
															
 
																 /* number of available slots in the lists  */
															
 
																-unsigned mpi_sends_list_size[MAX_MPI_NODES] = {0};
															
 
																-unsigned mpi_recvs_list_size[MAX_MPI_NODES] = {0};
															
 
																+unsigned mpi_sends_list_size[STARPU_FXT_MAX_FILES] = {0};
															
 
																+unsigned mpi_recvs_list_size[STARPU_FXT_MAX_FILES] = {0};
															
 
																 /* number of slots actually used in the list  */
															
 
																-unsigned mpi_sends_used[MAX_MPI_NODES] = {0};
															
 
																-unsigned mpi_recvs_used[MAX_MPI_NODES] = {0};
															
 
																+unsigned mpi_sends_used[STARPU_FXT_MAX_FILES] = {0};
															
 
																+unsigned mpi_recvs_used[STARPU_FXT_MAX_FILES] = {0};
															
 
																 /* number of slots already matched at the beginning of the list. This permits
															
 
																  * going through the lists from the beginning to match each and every
															
 
																  * transfer, thus avoiding a quadratic complexity. */
															
 
																-unsigned mpi_recvs_matched[MAX_MPI_NODES][MAX_MPI_NODES] = { {0} };
															
 
																-unsigned mpi_sends_matched[MAX_MPI_NODES][MAX_MPI_NODES] = { {0} };
															
 
																+unsigned mpi_recvs_matched[STARPU_FXT_MAX_FILES][STARPU_FXT_MAX_FILES] = { {0} };
															
 
																+unsigned mpi_sends_matched[STARPU_FXT_MAX_FILES][STARPU_FXT_MAX_FILES] = { {0} };
															
 
																 void _starpu_fxt_mpi_add_send_transfer(int src, int dst STARPU_ATTRIBUTE_UNUSED, long mpi_tag, size_t size, float date, long jobid, unsigned long handle)
															
 
																 {
															
 
																 	STARPU_ASSERT(src >= 0);
															
 
																-	if (src >= MAX_MPI_NODES)
															
 
																+	if (src >= STARPU_FXT_MAX_FILES)
															
 
																 		return;
															
 
																 	unsigned slot = mpi_sends_used[src]++;
															
@@ -153,7 +153,7 @@ void _starpu_fxt_mpi_add_send_transfer(int src, int dst STARPU_ATTRIBUTE_UNUSED,
 
																 void _starpu_fxt_mpi_add_recv_transfer(int src STARPU_ATTRIBUTE_UNUSED, int dst, long mpi_tag, float date, long jobid, unsigned long handle)
															
 
																 {
															
 
																-	if (dst >= MAX_MPI_NODES)
															
 
																+	if (dst >= STARPU_FXT_MAX_FILES)
															
 
																 		return;
															
 
																 	unsigned slot = mpi_recvs_used[dst]++;
															
@@ -220,11 +220,11 @@ static unsigned long mpi_com_id = 0;
 
																 static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comms_file, unsigned n)
															
 
																 {
															
 
																-	unsigned slot[MAX_MPI_NODES] = { 0 }, node;
															
 
																+	unsigned slot[STARPU_FXT_MAX_FILES] = { 0 }, node;
															
 
																 	unsigned nb_wrong_comm_timing = 0;
															
 
																 	struct mpi_transfer_list pending_receives; /* Sorted list of matches which have not happened yet */
															
 
																-	double current_out_bandwidth[MAX_MPI_NODES] = { 0. };
															
 
																-	double current_in_bandwidth[MAX_MPI_NODES] = { 0. };
															
 
																+	double current_out_bandwidth[STARPU_FXT_MAX_FILES] = { 0. };
															
 
																+	double current_in_bandwidth[STARPU_FXT_MAX_FILES] = { 0. };
															
 
																 #ifdef STARPU_HAVE_POTI
															
 
																 	char mpi_container[STARPU_POTI_STR_LEN];
															
 
																 #endif
															
@@ -246,7 +246,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 
																 		else
															
 
																 			start_date = mpi_transfer_list_front(&pending_receives)->date;
															
 
																-		src = MAX_MPI_NODES;
															
 
																+		src = STARPU_FXT_MAX_FILES;
															
 
																 		for (node = 0; node < n; node++)
															
 
																 		{
															
 
																 			if (slot[node] < mpi_sends_used[node] && mpi_sends[node][slot[node]].date < start_date)
															
@@ -260,7 +260,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 
																 			/* No event any more, we're finished! */
															
 
																 			break;
															
 
																-		if (src == MAX_MPI_NODES)
															
 
																+		if (src == STARPU_FXT_MAX_FILES)
															
 
																 		{
															
 
																 			/* Pending match is earlier than all new sends, finish its communication */
															
 
																 			match = mpi_transfer_list_pop_front(&pending_receives);
															
@@ -284,7 +284,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 
																 		size_t size = cur->size;
															
 
																 		unsigned long send_handle = cur->handle;
															
 
																-		if (dst < MAX_MPI_NODES)
															
 
																+		if (dst < STARPU_FXT_MAX_FILES)
															
 
																 			match = try_to_match_send_transfer(src, dst, mpi_tag);
															
 
																 		else
															
 
																 			match = NULL;
															
@@ -377,10 +377,10 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 
																 void _starpu_fxt_display_mpi_transfers(struct starpu_fxt_options *options, int *ranks STARPU_ATTRIBUTE_UNUSED, FILE *out_paje_file, FILE* out_comms_file)
															
 
																 {
															
 
																-	if (options->ninputfiles > MAX_MPI_NODES)
															
 
																+	if (options->ninputfiles > STARPU_FXT_MAX_FILES)
															
 
																 	{
															
 
																-		_STARPU_DISP("Warning: %u files given, maximum %u supported, truncating to %u\n", options->ninputfiles, MAX_MPI_NODES, MAX_MPI_NODES);
															
 
																-		options->ninputfiles = MAX_MPI_NODES;
															
 
																+		_STARPU_DISP("Warning: %u files given, maximum %u supported, truncating to %u\n", options->ninputfiles, STARPU_FXT_MAX_FILES, STARPU_FXT_MAX_FILES);
															
 
																+		options->ninputfiles = STARPU_FXT_MAX_FILES;
															
 
																 	}
															
 
																 	/* display the MPI transfers if possible */
															
--- a/src/drivers/cpu/driver_cpu.c
+++ b/src/drivers/cpu/driver_cpu.c
@@ -108,7 +108,10 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 
																 				_SIMGRID_TIMER_END;
															
 
																 			}
															
 
																 			else
															
 
																-				_starpu_simgrid_submit_job(cpu_args->workerid, j, perf_arch, NAN, NULL);
															
 
																+			{
															
 
																+				struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(cpu_args, j);
															
 
																+				_starpu_simgrid_submit_job(cpu_args->workerid, sched_ctx->id, j, perf_arch, NAN, NAN, NULL);
															
 
																+			}
															
 
																 #else
															
 
																 #  ifdef STARPU_PAPI
															
 
																 			_starpu_profiling_papi_task_start_counters(task);
															
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -531,8 +531,11 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worke
 
																 				_SIMGRID_TIMER_END;
															
 
																 			}
															
 
																 		else
															
 
																-			_starpu_simgrid_submit_job(workerid, j, &worker->perf_arch, NAN,
															
 
																+		{
															
 
																+			struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
															
 
																+			_starpu_simgrid_submit_job(workerid, sched_ctx->id, j, &worker->perf_arch, NAN, NAN,
															
 
																 				async ? &task_finished[workerid][pipeline_idx] : NULL);
															
 
																+		}
															
 
																 #else
															
 
																 #ifdef HAVE_LIBNVIDIA_ML
															
 
																 		unsigned long long energy_start = 0;
															
--- a/src/drivers/disk/driver_disk.c
+++ b/src/drivers/disk/driver_disk.c
@@ -21,6 +21,7 @@
 
																 #include <drivers/disk/driver_disk.h>
															
 
																 #include <drivers/cpu/driver_cpu.h>
															
 
																 #include <datawizard/coherency.h>
															
 
																+#include <datawizard/memory_nodes.h>
															
 
																 int _starpu_disk_copy_src_to_disk(void * src, unsigned src_node, void * dst, size_t dst_offset, unsigned dst_node, size_t size, void * async_channel)
															
 
																 {
															
--- a/src/drivers/driver_common/driver_common.c
+++ b/src/drivers/driver_common/driver_common.c
@@ -27,6 +27,7 @@
 
																 #include <core/sched_policy.h>
															
 
																 #include <core/debug.h>
															
 
																 #include <core/task.h>
															
 
																+#include <datawizard/memory_nodes.h>
															
 
																 void _starpu_driver_start_job(struct _starpu_worker *worker, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, int rank, int profiling)
															
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -948,6 +948,7 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 
																 		_STARPU_TRACE_START_EXECUTING();
															
 
																 #ifdef STARPU_SIMGRID
															
 
																 		double length = NAN;
															
 
																+		double energy = NAN;
															
 
																 		int async = task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC;
															
 
																 		int simulate = 1;
															
 
																 		if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE && !async)
															
@@ -976,6 +977,7 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 
																 #else
															
 
																 			length = ((double) profiling_info->used_cycles)/MSG_get_host_speed(MSG_host_self());
															
 
																 #endif
															
 
																+			energy = info->energy_consumed;
															
 
																 			/* And give the simulated time to simgrid */
															
 
																 			simulate = 1;
															
 
																 #endif
															
@@ -989,8 +991,11 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 
																 			}
															
 
																 		if (simulate)
															
 
																-			_starpu_simgrid_submit_job(worker->workerid, j, &worker->perf_arch, length,
															
 
																+		{
															
 
																+			struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
															
 
																+			_starpu_simgrid_submit_job(worker->workerid, sched_ctx->id, j, &worker->perf_arch, length, energy,
															
 
																 						   async ? &task_finished[worker->devid][pipeline_idx] : NULL);
															
 
																+		}
															
 
																 #else
															
 
																 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
															
--- a/src/profiling/bound.c
+++ b/src/profiling/bound.c
@@ -26,6 +26,7 @@
 
																 #include <profiling/bound.h>
															
 
																 #include <core/jobs.h>
															
 
																 #include <core/workers.h>
															
 
																+#include <datawizard/memory_nodes.h>
															
 
																 #ifdef STARPU_HAVE_GLPK_H
															
 
																 #include <glpk.h>
															
--- a/src/profiling/profiling.c
+++ b/src/profiling/profiling.c
@@ -29,6 +29,8 @@
 
																 #include <papi.h>
															
 
																 #endif
															
 
																+/* TODO: move to worker structure */
															
 
																+
															
 
																 static struct starpu_profiling_worker_info worker_info[STARPU_NMAXWORKERS];
															
 
																 /* TODO: rather use rwlock */
															
 
																 static starpu_pthread_mutex_t worker_info_mutex[STARPU_NMAXWORKERS];
															
@@ -44,6 +46,7 @@ static struct timespec executing_start_date[STARPU_NMAXWORKERS];
 
																 #ifdef STARPU_PAPI
															
 
																 static int papi_events[PAPI_MAX_HWCTRS];
															
 
																 static int papi_nevents = 0;
															
 
																+static int warned_component_unavailable = 0;
															
 
																 #endif
															
 
																 /* Store the busid of the different (src, dst) pairs. busid_matrix[src][dst]
															
@@ -158,7 +161,7 @@ void _starpu_profiling_init(void)
 
																 		conf_papi_events = starpu_getenv("STARPU_PROF_PAPI_EVENTS");
															
 
																 		if (conf_papi_events != NULL)
															
 
																 		{
															
 
																-			while ((papi_event_name = strtok_r(conf_papi_events, " ", &conf_papi_events)))
															
 
																+			while ((papi_event_name = strtok_r(conf_papi_events, " ,", &conf_papi_events)))
															
 
																 			{
															
 
																 				_STARPU_DEBUG("Loading PAPI Event:%s\n", papi_event_name);
															
 
																 				retval = PAPI_event_name_to_code ((char*)papi_event_name, &papi_events[papi_nevents]);
															
@@ -186,7 +189,12 @@ void _starpu_profiling_papi_task_start_counters(struct starpu_task *task)
 
																 		PAPI_create_eventset(&profiling_info->papi_event_set);
															
 
																 		for(int i=0; i<papi_nevents; i++)
															
 
																 		{
															
 
																-			PAPI_add_event(profiling_info->papi_event_set, papi_events[i]);
															
 
																+			int ret = PAPI_add_event(profiling_info->papi_event_set, papi_events[i]);
															
 
																+			if (ret == PAPI_ECMP_DISABLED && !warned_component_unavailable)
															
 
																+			{
															
 
																+				_STARPU_MSG("Error while registering Papi event: Component containing event is disabled. Try running `papi_component_avail` to get more information.\n");
															
 
																+				warned_component_unavailable = 1;
															
 
																+			}
															
 
																 			profiling_info->papi_values[i]=0;
															
 
																 		}
															
 
																 		PAPI_reset(profiling_info->papi_event_set);
															
--- a/src/profiling/profiling_helpers.c
+++ b/src/profiling/profiling_helpers.c
@@ -99,8 +99,9 @@ void _starpu_profiling_worker_helper_display_summary(FILE *stream)
 
																 	for (workerid = 0; workerid < worker_cnt; workerid++)
															
 
																 	{
															
 
																 		struct starpu_profiling_worker_info info;
															
 
																-		starpu_profiling_worker_get_info(workerid, &info);
															
 
																+		int ret = starpu_profiling_worker_get_info(workerid, &info);
															
 
																 		char name[64];
															
 
																+		STARPU_ASSERT(!ret);
															
 
																 		starpu_worker_get_name(workerid, name, sizeof(name));
															
--- a/src/sched_policies/component_best_implementation.c
+++ b/src/sched_policies/component_best_implementation.c
@@ -19,7 +19,9 @@
 
																 #include <starpu_sched_component.h>
															
 
																 #include <starpu_scheduler.h>
															
 
																+#ifdef BUILDING_STARPU
															
 
																 #include <core/workers.h>
															
 
																+#endif
															
 
																 /* return true if workerid can execute task, and fill task->predicted and task->predicted_transfer
															
 
																  *  according to best implementation predictions
															
@@ -39,12 +41,11 @@ static int find_best_impl(unsigned sched_ctx_id, struct starpu_task * task, int
 
																 	}
															
 
																 	else
															
 
																 	{
															
 
																-		struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid, sched_ctx_id);
															
 
																 		for(impl = 0; impl < STARPU_MAXIMPLEMENTATIONS; impl++)
															
 
																 		{
															
 
																 			if(starpu_worker_can_execute_task(workerid, task, impl))
															
 
																 			{
															
 
																-				double d = starpu_task_expected_length(task, archtype, impl);
															
 
																+				double d = starpu_task_worker_expected_length(task, workerid, sched_ctx_id, impl);
															
 
																 				if(isnan(d))
															
 
																 				{
															
 
																 					best_impl = impl;
															
--- a/src/sched_policies/component_eager.c
+++ b/src/sched_policies/component_eager.c
@@ -16,6 +16,9 @@
 
																 #include <starpu_sched_component.h>
															
 
																 #include <starpu_scheduler.h>
															
 
																+#ifdef BUILDING_STARPU
															
 
																+#include <core/workers.h>
															
 
																+#endif
															
 
																 struct _starpu_eager_data
															
 
																 {
															
--- a/src/sched_policies/component_heft.c
+++ b/src/sched_policies/component_heft.c
@@ -2,6 +2,7 @@
 
																  *
															
 
																  * Copyright (C) 2013-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
															
 
																  * Copyright (C) 2013       Simon Archipoff
															
 
																+ * Copyright (C) 2020       Télécom-Sud Paris
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
--- a/src/sched_policies/component_heteroprio.c
+++ b/src/sched_policies/component_heteroprio.c
@@ -2,6 +2,7 @@
 
																  *
															
 
																  * Copyright (C) 2013-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
															
 
																  * Copyright (C) 2013       Simon Archipoff
															
 
																+ * Copyright (C) 2020       Télécom-Sud Paris
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
--- a/src/sched_policies/component_mct.c
+++ b/src/sched_policies/component_mct.c
@@ -2,6 +2,7 @@
 
																  *
															
 
																  * Copyright (C) 2013-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
															
 
																  * Copyright (C) 2013       Simon Archipoff
															
 
																+ * Copyright (C) 2020       Télécom-Sud Paris
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
--- a/src/sched_policies/component_sched.c
+++ b/src/sched_policies/component_sched.c