Browse Source

Merge branch 'master' of git+ssh://scm.gforge.inria.fr/gitroot/starpu/starpu into ft_checkpoint

Romain LION 5 years ago
parent
commit
bc23f1b516
100 changed files with 780 additions and 266 deletions
  1. 8 0
      ChangeLog
  2. 73 46
      configure.ac
  3. 1 1
      contrib/ci.inria.fr/job-0-tarball.sh
  4. 13 3
      doc/doxygen/Makefile.am
  5. 1 1
      doc/doxygen/chapters/301_tasks.doxy
  6. 5 0
      doc/doxygen/chapters/320_scheduling.doxy
  7. 19 7
      doc/doxygen/chapters/380_offline_performance_tools.doxy
  8. 9 0
      doc/doxygen/chapters/501_environment_variables.doxy
  9. 23 1
      doc/doxygen/chapters/510_configure_options.doxy
  10. 1 0
      doc/doxygen/chapters/code/vector_scal_opencl.c
  11. 12 4
      doc/doxygen_dev/Makefile.am
  12. 2 0
      examples/axpy/axpy_opencl.c
  13. 2 0
      examples/basic_examples/multiformat_conversion_codelets_opencl.c
  14. 2 0
      examples/basic_examples/multiformat_opencl.c
  15. 1 0
      examples/basic_examples/vector_scal_opencl.c
  16. 2 0
      examples/filters/custom_mf/conversion_opencl.c
  17. 2 0
      examples/filters/custom_mf/custom_opencl.c
  18. 2 0
      examples/interface/complex_kernels_opencl.c
  19. 1 0
      examples/mult/double.h
  20. 1 0
      examples/mult/simple.h
  21. 1 1
      examples/mult/xgemm.c
  22. 2 14
      examples/reductions/dot_product.c
  23. 1 0
      examples/reductions/dot_product_opencl_kernels.cl
  24. 6 0
      include/starpu.h
  25. 9 0
      include/starpu_config.h.in
  26. 1 2
      include/starpu_fxt.h
  27. 8 0
      include/starpu_helper.h
  28. 23 0
      include/starpu_stdlib.h
  29. 1 1
      include/starpu_task.h
  30. 11 0
      include/starpu_util.h
  31. 15 0
      julia/examples/cholesky/cholesky_common.jl
  32. 15 0
      julia/examples/cholesky/cholesky_native.jl
  33. 15 0
      julia/src/openblas_ldflags.jl
  34. 69 0
      mpi/examples/Makefile.am
  35. 0 0
      mpi/examples/benchs/abstract_sendrecv_bench.c
  36. 0 0
      mpi/examples/benchs/abstract_sendrecv_bench.h
  37. 0 0
      mpi/examples/benchs/bench_helper.c
  38. 0 0
      mpi/examples/benchs/bench_helper.h
  39. 2 6
      mpi/tests/burst.c
  40. 3 7
      mpi/tests/burst_gemm.c
  41. 0 0
      mpi/examples/benchs/burst_helper.c
  42. 0 0
      mpi/examples/benchs/burst_helper.h
  43. 0 0
      mpi/examples/benchs/gemm_helper.c
  44. 0 0
      mpi/examples/benchs/gemm_helper.h
  45. 2 7
      mpi/tests/sendrecv_bench.c
  46. 6 10
      mpi/tests/sendrecv_gemm_bench.c
  47. 3 9
      mpi/tests/sendrecv_parallel_tasks_bench.c
  48. 1 0
      mpi/src/starpu_mpi.c
  49. 2 3
      mpi/src/starpu_mpi_datatype.c
  50. 34 1
      mpi/src/starpu_mpi_init.c
  51. 3 55
      mpi/tests/Makefile.am
  52. 44 0
      mpi/tests/display_bindings.c
  53. 11 4
      src/Makefile.am
  54. 15 0
      src/common/utils.c
  55. 2 0
      src/core/jobs.c
  56. 5 4
      src/core/perfmodel/perfmodel_bus.c
  57. 6 1
      src/core/perfmodel/perfmodel_history.c
  58. 1 0
      src/core/perfmodel/perfmodel_print.c
  59. 115 41
      src/core/perfmodel/regression.c
  60. 1 0
      src/core/sched_ctx.c
  61. 1 0
      src/core/sched_policy.c
  62. 24 2
      src/core/simgrid.c
  63. 1 1
      src/core/simgrid.h
  64. 1 0
      src/core/task.c
  65. 4 0
      src/core/topology.c
  66. 16 0
      src/core/workers.c
  67. 22 2
      src/core/workers.h
  68. 3 0
      src/datawizard/data_request.c
  69. 1 0
      src/datawizard/filters.c
  70. 3 0
      src/datawizard/interfaces/bcsr_interface.c
  71. 3 0
      src/datawizard/interfaces/block_interface.c
  72. 3 0
      src/datawizard/interfaces/coo_interface.c
  73. 3 0
      src/datawizard/interfaces/csr_interface.c
  74. 3 0
      src/datawizard/interfaces/matrix_interface.c
  75. 3 0
      src/datawizard/interfaces/multiformat_interface.c
  76. 3 0
      src/datawizard/interfaces/tensor_interface.c
  77. 3 0
      src/datawizard/interfaces/variable_interface.c
  78. 3 0
      src/datawizard/interfaces/vector_interface.c
  79. 3 0
      src/datawizard/interfaces/void_interface.c
  80. 1 0
      src/datawizard/memory_manager.c
  81. 1 0
      src/datawizard/reduction.c
  82. 1 0
      src/datawizard/user_interactions.c
  83. 2 1
      src/datawizard/write_back.c
  84. 2 2
      src/debug/traces/starpu_fxt.c
  85. 0 2
      src/debug/traces/starpu_fxt.h
  86. 19 19
      src/debug/traces/starpu_fxt_mpi.c
  87. 4 1
      src/drivers/cpu/driver_cpu.c
  88. 4 1
      src/drivers/cuda/driver_cuda.c
  89. 1 0
      src/drivers/disk/driver_disk.c
  90. 1 0
      src/drivers/driver_common/driver_common.c
  91. 6 1
      src/drivers/opencl/driver_opencl.c
  92. 1 0
      src/profiling/bound.c
  93. 10 2
      src/profiling/profiling.c
  94. 2 1
      src/profiling/profiling_helpers.c
  95. 3 2
      src/sched_policies/component_best_implementation.c
  96. 3 0
      src/sched_policies/component_eager.c
  97. 1 0
      src/sched_policies/component_heft.c
  98. 1 0
      src/sched_policies/component_heteroprio.c
  99. 1 0
      src/sched_policies/component_mct.c
  100. 0 0
      src/sched_policies/component_sched.c

+ 8 - 0
ChangeLog

@@ -31,9 +31,12 @@ New features:
     files. This file can be parsed by the new script
     starpu_fxt_number_events_to_names.py to convert event keys to event names.
   * New STARPU_PER_WORKER perfmodel.
+  * Add energy accounting in the simgrid mode: starpu_energy_use() and
+    starpu_energy_used().
 
 Small changes:
   * Use the S4U interface of Simgrid instead of xbt and MSG.
+  * Add a synthetic energy efficiency testcase.
 
 StarPU 1.3.4 (git revision xxx)
 ==============================================
@@ -60,6 +63,11 @@ Small features:
   * New STARPU_BACKOFF_MIN and STARPU_BACKOFF_MAX environment variables to set the
     exponential backoff limits of the number of cycles to pause while drivers
     are spinning.
+  * Add STARPU_DISPLAY_BINDINGS environment variable and
+    starpu_display_bindings() function to display all bindings on the machine by
+    calling hwloc-ps
+Small changes:
+  * New configure option --disable-build-doc-pdf
 
 StarPU 1.3.3 (git revision 11afc5b007fe1ab1c729b55b47a5a98ef7f3cfad)
 ====================================================================

+ 73 - 46
configure.ac

@@ -2245,6 +2245,14 @@ AC_MSG_RESULT($nmaxbuffers)
 AC_DEFINE_UNQUOTED(STARPU_NMAXBUFS, [$nmaxbuffers],
 		[how many buffers can be manipulated per task])
 
+AC_MSG_CHECKING(how many MPI nodes fxt files can be manipulated when generating traces)
+AC_ARG_ENABLE(fxt-max-files, [AS_HELP_STRING([--enable-fxt-max-files=<nbuffers>],
+			[maximum number of mpi nodes for traces])],
+			nmaxfxtfiles=$enableval, nmaxfxtfiles=64)
+AC_MSG_RESULT($nmaxfxtfiles)
+AC_DEFINE_UNQUOTED(STARPU_FXT_MAX_FILES, [$nmaxfxtfiles],
+		[how many MPI nodes fxt files can be manipulated when generating traces])
+
 AC_MSG_CHECKING(maximum number of memory nodes to use per MPI rank)
 AC_ARG_ENABLE(maxnodes, [AS_HELP_STRING([--enable-maxnodes=<nnodes>],
 			[maximum number of memory nodes per MPI rank])],
@@ -2537,6 +2545,7 @@ if test "x$enable_build_fortran_requested" = "xyes" ; then
                  fi
 	else
 		if $FC -V 2>&1|grep -q 'Intel(R) Fortran'; then
+			enable_build_fortran="yes"
 			ifort_fc_version=`$FC -V 2>&1 |head -1|sed 's/.*Version //;s/ Build.*//'`
 			ifort_maj_version=`echo $ifort_fc_version|cut -d. -f1`
 
@@ -2553,38 +2562,28 @@ if test "x$enable_build_fortran_requested" = "xyes" ; then
 				enable_build_fortran="no"
 			else
 				AC_MSG_WARN(Fortran compiler has not been tested for StarPU native Fortran support)
+				 enable_build_fortran="yes"
 			fi
 		fi
 	fi
 	if test "x$enable_build_fortran" = "xyes" ; then
 		AC_DEFINE(STARPU_HAVE_FC, [1], [Define this if a Fortran compiler is available])
-		if test x$build_mpi_lib = xyes -o x$build_mpi_master_slave = xyes ; then
-			AC_ARG_WITH(mpifort, [AS_HELP_STRING([--with-mpifort[=<path to mpifort>]],
-				    [Path of the mpifort compiler])],
-				    [
-				     if test x$withval = xyes; then
-					     AC_MSG_ERROR(--with-mpifort must be given a pathname)
-					     else
-						     mpifort_path=$withval
-					     fi
-					     ],
-					     [
-					      if test x$enable_simgrid = xyes ; then
-						      DEFAULT_MPIFORT=smpifort
-					      else
-						      DEFAULT_MPIFORT=mpif90
-					      fi
-					      case $DEFAULT_MPIFORT in
-					      	/*) mpifort_path="$DEFAULT_MPIFORT" ;;
-					        *)  AC_PATH_PROG(mpifort_path, $DEFAULT_MPIFORT, [no], [$MPIPATH])
-					      esac
-					      ])
-
+		if test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes -o x$build_mpi_master_slave = xyes ; then
+			#Check MPIFORT
+			if test x$enable_simgrid = xyes ; then
+				DEFAULT_MPIFORT=smpifort
+			else
+				DEFAULT_MPIFORT=mpifort
+			fi
+			AC_ARG_WITH(mpifort, [AS_HELP_STRING([--with-mpifort=<mpifort name or path to mpifort>], [Name or path of the mpifort compiler])], [DEFAULT_MPIFORT=$withval])
+			case $DEFAULT_MPIFORT in
+				/*) mpifort_path="$DEFAULT_MPIFORT" ;;
+				*)  AC_PATH_PROG(mpifort_path, $DEFAULT_MPIFORT, [no], [$simgrid_dir/bin:$PATH]) ;;
+			esac
 			# We test if the MPIFORT compiler exists
 			if test ! -x $mpifort_path; then
-				#MPIFORT does not exists or is not executable
 				AC_MSG_RESULT(The mpifort compiler '$mpifort_path' does not have the execute permission)
-				use_mpi_fort=no
+				mpifort_path=no
 			else
 				OLD_CC=$CC
 				CC=$mpicc_path
@@ -2599,11 +2598,18 @@ if test "x$enable_build_fortran_requested" = "xyes" ; then
 				CC=$OLD_CC
 				if test "x$use_mpi_fort" = xyes; then
 					AC_DEFINE([HAVE_MPI_COMM_F2C], [1], [Function MPI_Comm_f2c is available])
-					AC_MSG_CHECKING(mpifort path)
-					AC_MSG_RESULT($mpifort_path)
-					AC_SUBST(MPIFORT, $mpifort_path)
 				fi
 			fi
+
+			AC_MSG_CHECKING(whether mpifort is available)
+			AC_MSG_RESULT($mpifort_path)
+			AC_SUBST(MPIFORT, $mpifort_path)
+
+			if test x$mpifort_path != xno ; then
+				MPIPATH=$(dirname $mpifort_path):$PATH
+			else
+				MPIPATH=$PATH
+			fi
 		fi
 	fi
    fi
@@ -3413,34 +3419,51 @@ AC_ARG_ENABLE(build-doc, [AS_HELP_STRING([--disable-build-doc],
 			[disable building of documentation])],
 			enable_build_doc=$enableval, enable_build_doc=yes)
 
-if test "$enable_build_doc" = "yes" ; then
-   # Check whether doxygen needed tools are installed
-   AC_PATH_PROG(doxygencommand, doxygen)
-   if test "$doxygencommand" = "" ; then
-      	enable_build_doc="no"
-   fi
-   AC_PATH_PROG(pdflatexcommand, pdflatex)
-   if test "$pdflatexcommand" = "" ; then
-	enable_build_doc="no"
-   fi
-   AC_PATH_PROG(epstopdfcommand, epstopdf)
-   if test "$epstopdfcommand" = "" ; then
-	enable_build_doc="no"
-   fi
+AC_ARG_ENABLE(build-doc-pdf, [AS_HELP_STRING([--enable-build-doc-pdf],
+			[enable building of PDF documentation])],
+			enable_build_doc_pdf=$enableval, enable_build_doc_pdf=no)
+
+# Check whether doxygen needed tools are installed
+AC_PATH_PROG(doxygencommand, doxygen)
+if test "$doxygencommand" = "" ; then
+   enable_build_doc="no"
+   enable_build_doc_pdf="no"
+fi
+AC_PATH_PROG(pdflatexcommand, pdflatex)
+if test "$pdflatexcommand" = "" ; then
+   enable_build_doc_pdf="no"
 fi
+AC_PATH_PROG(epstopdfcommand, epstopdf)
+if test "$epstopdfcommand" = "" ; then
+   enable_build_doc_pdf="no"
+fi
+
 available_doc="no"
-if test -f "$srcdir/doc/doxygen/starpu.pdf" ; then
+if test -d "$srcdir/doc/doxygen/html" ; then
    enable_build_doc="no"
    available_doc="yes"
 fi
-AC_MSG_CHECKING(whether documentation should be compiled)
+available_doc_pdf="no"
+if test -f "$srcdir/doc/doxygen/starpu.pdf" ; then
+   enable_build_doc="no"
+   enable_build_doc_pdf="no"
+   available_doc_pdf="yes"
+fi
+AC_MSG_CHECKING(whether HTML documentation should be compiled)
 AC_MSG_RESULT($enable_build_doc)
-AC_MSG_CHECKING(whether documentation is available)
+AC_MSG_CHECKING(whether HTML documentation is available)
 AC_MSG_RESULT($available_doc)
+AC_MSG_CHECKING(whether PDF documentation should be compiled)
+AC_MSG_RESULT($enable_build_doc_pdf)
+AC_MSG_CHECKING(whether PDF documentation is available)
+AC_MSG_RESULT($available_doc_pdf)
 
 AM_CONDITIONAL(STARPU_BUILD_DOC, [test x$enable_build_doc != xno])
 AM_CONDITIONAL(STARPU_AVAILABLE_DOC, [test x$available_doc != xno])
 
+AM_CONDITIONAL(STARPU_BUILD_DOC_PDF, [test x$enable_build_doc_pdf != xno])
+AM_CONDITIONAL(STARPU_AVAILABLE_DOC_PDF, [test x$available_doc_pdf != xno])
+
 ###############################################################################
 #                                                                             #
 #                                Julia                                        #
@@ -3520,6 +3543,9 @@ AC_CONFIG_COMMANDS([executable-scripts], [
   test -e tests/microbenchs/parallel_independent_homogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh tests/microbenchs/
+  mkdir -p tests/energy
+  test -e tests/energy/static.sh || ln -sf $ac_abs_top_srcdir/tests/energy/static.sh tests/energy/
+  test -e tests/energy/dynamic.sh || ln -sf $ac_abs_top_srcdir/tests/energy/dynamic.sh tests/energy/
   mkdir -p tests/datawizard
   test -e tests/datawizard/locality.sh || ln -sf $ac_abs_top_srcdir/tests/datawizard/locality.sh tests/datawizard/
   mkdir -p tests/overlap
@@ -3672,8 +3698,9 @@ AC_MSG_NOTICE([
 	hwloc:             $have_valid_hwloc
 	FxT trace enabled: $use_fxt
 
-        Documentation:     $enable_build_doc
-        Examples:          $enable_build_examples
+        Documentation HTML:  $enable_build_doc
+        Documentation PDF:   $enable_build_doc_pdf
+        Examples:            $enable_build_examples
 
 	StarPU Extensions:
 	       StarPU MPI enabled:                            $build_mpi_lib

+ 1 - 1
contrib/ci.inria.fr/job-0-tarball.sh

@@ -21,7 +21,7 @@ export LD_LIBRARY_PATH=/home/ci/usr/local/lib:$LD_LIBRARY_PATH
 ./autogen.sh
 if test -d build ; then chmod -R 777 build && rm -rf build ; fi
 mkdir build && cd build
-../configure
+../configure --enable-build-doc-pdf
 make V=1
 make dist
 cp *gz ..

+ 13 - 3
doc/doxygen/Makefile.am

@@ -30,9 +30,14 @@ txtdir   = $(docdir)/manual
 EXTRA_DIST =
 
 if STARPU_BUILD_DOC
+if STARPU_BUILD_DOC_PDF
 all: $(DOX_HTML_DIR) $(DOX_PDF)
 EXTRA_DIST += $(DOX_HTML_DIR) $(DOX_PDF)
 txt_DATA = $(DOX_PDF)
+else
+all: $(DOX_HTML_DIR)
+EXTRA_DIST += $(DOX_HTML_DIR)
+endif
 DOX_HTML_SRCDIR=$(DOX_HTML_DIR)
 install-exec-hook:
 	$(MKDIR_P) $(DESTDIR)$(docdir)/manual/html
@@ -41,8 +46,7 @@ uninstall-hook:
 	rm -rf $(DESTDIR)$(docdir)/manual/html
 else
 if STARPU_AVAILABLE_DOC
-EXTRA_DIST += $(top_srcdir)/doc/doxygen/html $(top_srcdir)/doc/doxygen/starpu.pdf
-txt_DATA = $(top_srcdir)/doc/doxygen/starpu.pdf
+EXTRA_DIST += $(top_srcdir)/doc/doxygen/html
 DOX_HTML_SRCDIR=$(top_srcdir)/doc/doxygen/html
 install-exec-hook:
 	$(MKDIR_P) $(DESTDIR)$(docdir)/manual/html
@@ -50,6 +54,10 @@ install-exec-hook:
 uninstall-hook:
 	rm -rf $(DESTDIR)$(docdir)/manual/html
 endif
+if STARPU_AVAILABLE_DOC_PDF
+EXTRA_DIST += $(top_srcdir)/doc/doxygen/starpu.pdf
+txt_DATA = $(top_srcdir)/doc/doxygen/starpu.pdf
+endif
 endif
 
 chapters =	\
@@ -257,6 +265,8 @@ $(DOX_TAG): $(dox_inputs)
 	@$(SED) -i '/\\begin{titlepage}/,$$d' $(DOX_LATEX_DIR)/refman.tex
 	@cat $(top_srcdir)/doc/doxygen/refman.tex >> $(DOX_LATEX_DIR)/refman.tex
 
+$(DOX_HTML_DIR): $(DOX_TAG)
+
 $(DOX_PDF): $(DOX_TAG) refman.tex
 	@cp $(top_srcdir)/doc/doxygen/chapters/version.sty $(DOX_LATEX_DIR)
 	@cp $(top_srcdir)/doc/doxygen/chapters/images/*pdf $(DOX_LATEX_DIR)
@@ -294,5 +304,5 @@ EXTRA_DIST += doxygen.cfg refman.tex \
 # Rule to update documentation on web server. Should only be used locally.
 PUBLISHHOST	?= gforge
 update-web: $(DOX_PDF)
-	scp -pr starpu.pdf html $(PUBLISHHOST):/home/groups/starpu/htdocs/doc
+	scp -pr starpu.pdf html $(PUBLISHHOST):/home/groups/starpu/htdocs/files/doc
 

+ 1 - 1
doc/doxygen/chapters/301_tasks.doxy

@@ -118,7 +118,7 @@ to delay the termination of a task until the termination of other tasks.
 
 \section SettingManyDataHandlesForATask Setting Many Data Handles For a Task
 
-The maximum number of data a task can manage is fixed by the environment variable
+The maximum number of data a task can manage is fixed by the macro
 \ref STARPU_NMAXBUFS which has a default value which can be changed
 through the \c configure option \ref enable-maxbuffers "--enable-maxbuffers".
 

+ 5 - 0
doc/doxygen/chapters/320_scheduling.doxy

@@ -185,6 +185,11 @@ already gives the good results that a precise estimation would give.
 
 \section Energy-basedScheduling Energy-based Scheduling
 
+Note: by default StarPU does not let CPU workers sleep, to let them react to
+task release as quickly as possible. For idle time to really let CPU cores save
+energy, one needs to use the \ref enable-blocking-drivers
+"--enable-blocking-drivers" configuration option.
+
 If the application can provide some energy consumption performance model (through
 the field starpu_codelet::energy_model), StarPU will
 take it into account when distributing tasks. The target function that

+ 19 - 7
doc/doxygen/chapters/380_offline_performance_tools.doxy

@@ -586,19 +586,31 @@ $ starpu_paje_sort paje.trace
 \section PapiCounters PAPI counters
 
 Performance counter values could be obtained from the PAPI framework if
-<c>./configure</c> detected the libpapi. One has to set the \ref STARPU_PROFILING
-environment variable to 1 and then specify which events to record with the
-\ref STARPU_PROF_PAPI_EVENTS environment variable. For instance:
+<c>./configure</c> detected the libpapi.
+
+In Debian, packages <c>libpapi-dev</c> and <c>libpapi5.7</c> provide required
+files.  Package <c>papi-tools</c> contains a set of useful tools, for example
+<c>papi_avail</c> to see which counters are available.
+
+To be able to use PAPI counters, one may need to reduce the level of the kernel
+parameter <c>kernel.perf_event_paranoid</c> to at most 2. See
+https://www.kernel.org/doc/html/latest/admin-guide/perf-security.html for the
+security impact of this parameter.
+
+Then one has to set the \ref STARPU_PROFILING environment variable to 1 and
+specify which events to record with the \ref STARPU_PROF_PAPI_EVENTS
+environment variable. For instance:
 
 \verbatim
 export STARPU_PROFILING=1 STARPU_PROF_PAPI_EVENTS="PAPI_TOT_INS PAPI_TOT_CYC"
 \endverbatim
 
+The comma can also be used to separate events to monitor.
+
 In the current simple implementation, only CPU tasks have their events measured
-and require CPUs that support the PAPI events. All events that PAPI support are
-available from their documentation (https://icl.cs.utk.edu/projects/papi/wiki/PAPIC:Preset_Event_Definitions).
-It is important to note that not all events are available on all systems, and
-general PAPI recommendations should be followed.
+and require CPUs that support the PAPI events. It is important to note that not
+all events are available on all systems, and general PAPI recommendations
+should be followed.
 
 The counter values can be accessed using the profiling interface:
 \code{.c}

+ 9 - 0
doc/doxygen/chapters/501_environment_variables.doxy

@@ -1366,6 +1366,15 @@ application has crashed. Setting this variable to a value other than 1
 will disable this behaviour. This should be done on JVM systems which
 may use these signals for their own needs.
 The flag can also be set through the field starpu_conf::catch_signals.
+</dd>
+
+<dt>STARPU_DISPLAY_BINDINGS</dt>
+<dd>
+\anchor STARPU_DISPLAY_BINDINGS
+\addindex __env__STARPU_DISPLAY_BINDINGS
+Display the binding of all processes and threads running on the machine. If MPI is enabled, display the binding of each node.<br>
+Users can manually display the binding by calling starpu_display_bindings().
+</dd>
 </dl>
 
 \section ConfiguringTheHypervisor Configuring The Hypervisor

+ 23 - 1
doc/doxygen/chapters/510_configure_options.doxy

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Federal University of Rio Grande do Sul (UFRGS)
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -115,7 +116,19 @@ Specify <c>hwloc</c> should not be used by StarPU.
 \addindex __configure__--disable-build-doc
 Disable the creation of the documentation. This should be done on a
 machine which does not have the tools <c>doxygen</c> and <c>latex</c>
-(plus the packages <c>latex-xcolor</c> and <c>texlive-latex-extra</c>).
+(plus the packages <c>latex-xcolor</c> and
+<c>texlive-latex-extra</c>).
+</dd>
+
+<dt>--enable-build-doc-pdf</dt>
+<dd>
+\anchor enable-build-doc-pdf
+\addindex __configure__--enable-build-doc-pdf
+By default, only the HTML documentation is generated. Use this option
+to also enable the generation of the PDF documentation. This should be
+done on a machine which does have the tools <c>doxygen</c> and <c>latex</c>
+(plus the packages <c>latex-xcolor</c> and
+<c>texlive-latex-extra</c>).
 </dd>
 
 <dt>--disable-icc</dt>
@@ -514,6 +527,15 @@ Define the maximum number of buffers that tasks will be able to take
 as parameters, then available as the macro ::STARPU_NMAXBUFS.
 </dd>
 
+<dt>--enable-fxt-max-files=<c>count</c></dt>
+<dd>
+\anchor enable-fxt-max-files
+\addindex __configure__--enable-fxt-max-files
+Use at most <c>count</c> MPI node FxT files when generating traces.  This value is then available as
+the macro ::STARPU_FXT_MAX_FILES.  It is used by the FxT tools when handling multi-node traces.
+The default value is 64.
+</dd>
+
 <dt>--enable-allocation-cache</dt>
 <dd>
 \anchor enable-allocation-cache

+ 1 - 0
doc/doxygen/chapters/code/vector_scal_opencl.c

@@ -57,6 +57,7 @@ void scal_opencl_func(void *buffers[], void *_args)
         err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
         if (local > global) local=global;
+        else global = (global + local-1) / local * local;
 
         err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

+ 12 - 4
doc/doxygen_dev/Makefile.am

@@ -30,9 +30,14 @@ txtdir   = $(docdir)/manual
 EXTRA_DIST =
 
 if STARPU_BUILD_DOC
+if STARPU_BUILD_DOC_PDF
 all: $(DOX_HTML_DIR) $(DOX_PDF)
 EXTRA_DIST += $(DOX_HTML_DIR) $(DOX_PDF)
 txt_DATA = $(DOX_PDF)
+else
+all: $(DOX_HTML_DIR)
+EXTRA_DIST += $(DOX_HTML_DIR)
+endif
 DOX_HTML_SRCDIR=$(DOX_HTML_DIR)
 install-exec-hook:
 	$(MKDIR_P) $(DESTDIR)$(docdir)/manual/html_dev
@@ -41,8 +46,7 @@ uninstall-hook:
 	rm -rf $(DESTDIR)$(docdir)/manual/html_dev
 else
 if STARPU_AVAILABLE_DOC
-EXTRA_DIST += $(top_srcdir)/doc/doxygen_dev/html_dev $(top_srcdir)/doc/doxygen_dev/starpu_dev.pdf
-txt_DATA = $(top_srcdir)/doc/doxygen_dev/starpu_dev.pdf
+EXTRA_DIST += $(top_srcdir)/doc/doxygen_dev/html_dev
 DOX_HTML_SRCDIR=$(top_srcdir)/doc/doxygen_dev/html_dev
 install-exec-hook:
 	$(MKDIR_P) $(DESTDIR)$(docdir)/manual/html_dev
@@ -50,6 +54,10 @@ install-exec-hook:
 uninstall-hook:
 	rm -rf $(DESTDIR)$(docdir)/manual/html_dev
 endif
+if STARPU_AVAILABLE_DOC_PDF
+EXTRA_DIST += $(top_srcdir)/doc/doxygen_dev/starpu_dev.pdf
+txt_DATA = $(top_srcdir)/doc/doxygen_dev/starpu_dev.pdf
+endif
 endif
 
 chapters =	\
@@ -191,7 +199,7 @@ dox_inputs = $(DOX_CONFIG) 				\
 	$(top_srcdir)/src/core/drivers.h	\
 	$(top_srcdir)/src/core/workers.h
 
-$(DOX_HTML_DIR): $(DOX_TAG) refman.tex
+$(DOX_HTML_DIR): $(DOX_TAG)
 	@$(MKDIR_P) $(DOX_HTML_DIR)
 
 $(DOX_TAG): $(dox_inputs)
@@ -240,5 +248,5 @@ EXTRA_DIST += doxygen.cfg refman.tex \
 # Rule to update documentation on web server. Should only be used locally.
 PUBLISHHOST	?= gforge
 update-web: $(DOX_PDF)
-	scp -pr starpu_dev.pdf html_dev $(PUBLISHHOST):/home/groups/starpu/htdocs/doc
+	scp -pr starpu_dev.pdf html_dev $(PUBLISHHOST):/home/groups/starpu/htdocs/files/doc
 

+ 2 - 0
examples/axpy/axpy_opencl.c

@@ -60,6 +60,8 @@ void axpy_opencl(void *buffers[], void *_args)
 			STARPU_OPENCL_REPORT_ERROR(err);
                 if (local > global)
 			local=global;
+                else
+                        global = (global + local-1) / local * local;
 
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		if (err != CL_SUCCESS)

+ 2 - 0
examples/basic_examples/multiformat_conversion_codelets_opencl.c

@@ -74,6 +74,8 @@ void cpu_to_opencl_opencl_func(void *buffers[], void *args)
 
                 if (local > global)
 			local = global;
+                else
+                        global = (global + local-1) / local * local;
 
 		err = clEnqueueNDRangeKernel(queue,
 					kernel,

+ 2 - 0
examples/basic_examples/multiformat_opencl.c

@@ -68,6 +68,8 @@ void multiformat_scal_opencl_func(void *buffers[], void *args)
 
                 if (local > global)
 			local = global;
+                else
+                        global = (global + local-1) / local * local;
 
 		err = clEnqueueNDRangeKernel(queue,
 					kernel,

+ 1 - 0
examples/basic_examples/vector_scal_opencl.c

@@ -57,6 +57,7 @@ void scal_opencl_func(void *buffers[], void *_args)
                 err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
                 if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
                 if (local > global) local=global;
+                else global = (global + local-1) / local * local;
 
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

+ 2 - 0
examples/filters/custom_mf/conversion_opencl.c

@@ -76,6 +76,8 @@ void cpu_to_opencl_opencl_func(void *buffers[], void *args)
 
                 if (local > global)
 			local = global;
+                else
+                        global = (global + local-1) / local * local;
 
 		err = clEnqueueNDRangeKernel(
 				queue,

+ 2 - 0
examples/filters/custom_mf/custom_opencl.c

@@ -75,6 +75,8 @@ void custom_scal_opencl_func(void *buffers[], void *args)
 
                 if (local > global)
 			local = global;
+                else
+                        global = (global + local-1) / local * local;
 
 		err = clEnqueueNDRangeKernel(
 				queue,

+ 2 - 0
examples/interface/complex_kernels_opencl.c

@@ -64,6 +64,8 @@ void copy_complex_codelet_opencl(void *buffers[], void *_args)
 			STARPU_OPENCL_REPORT_ERROR(err);
                 if (local > global)
 			local=global;
+                else
+                        global = (global + local-1) / local * local;
 
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		if (err != CL_SUCCESS)

+ 1 - 0
examples/mult/double.h

@@ -15,6 +15,7 @@
  */
 
 #define TYPE	double
+#define EPSILON	0.000000000001
 
 #define CUBLAS_GEMM cublasDgemm
 #define CPU_GEMM	STARPU_DGEMM

+ 1 - 0
examples/mult/simple.h

@@ -15,6 +15,7 @@
  */
 
 #define TYPE	float
+#define EPSILON	0.000001
 
 #define CUBLAS_GEMM cublasSgemm
 #define CPU_GEMM	STARPU_SGEMM

+ 1 - 1
examples/mult/xgemm.c

@@ -75,7 +75,7 @@ static int check_output(void)
 	TYPE err;
 	err = CPU_ASUM(xdim*ydim, C, 1);
 
-	if (err < xdim*ydim*0.001)
+	if (err < EPSILON*xdim*ydim*zdim)
 	{
 		FPRINTF(stderr, "Results are OK\n");
 		return 0;

+ 2 - 14
examples/reductions/dot_product.c

@@ -185,18 +185,12 @@ void redux_opencl_func(void *buffers[], void *args)
 
 	{
 		size_t global=1;
-		size_t local;
+                size_t local=1;
                 size_t s;
                 cl_device_id device;
 
                 starpu_opencl_get_device(devid, &device);
 
-                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
-                if (err != CL_SUCCESS)
-			STARPU_OPENCL_REPORT_ERROR(err);
-                if (local > global)
-			local=global;
-
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		if (err != CL_SUCCESS)
 			STARPU_OPENCL_REPORT_ERROR(err);
@@ -306,18 +300,12 @@ void dot_opencl_func(void *buffers[], void *cl_arg)
 
 	{
 		size_t global=1;
-		size_t local;
+                size_t local=1;
                 size_t s;
                 cl_device_id device;
 
                 starpu_opencl_get_device(devid, &device);
 
-                err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
-                if (err != CL_SUCCESS)
-			STARPU_OPENCL_REPORT_ERROR(err);
-                if (local > global)
-			local=global;
-
 		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
 		if (err != CL_SUCCESS)
 			STARPU_OPENCL_REPORT_ERROR(err);

+ 1 - 0
examples/reductions/dot_product_opencl_kernels.cl

@@ -31,6 +31,7 @@ __kernel void _dot_opencl(__global float *x,
 			  __global DOT_TYPE *dot,
 			  unsigned n)
 {
+/* FIXME: real parallel implementation */
 	unsigned i;
 	__local double tmp;
 	tmp = 0.0;

+ 6 - 0
include/starpu.h

@@ -111,6 +111,12 @@ struct starpu_conf
 	int magic;
 
 	/**
+	   @private
+	   Tell starpu_init() if MPI will be initialized later.
+	*/
+	int will_use_mpi;
+
+	/**
 	   Name of the scheduling policy. This can also be specified
 	   with the environment variable \ref STARPU_SCHED. (default =
 	   <c>NULL</c>).

+ 9 - 0
include/starpu_config.h.in

@@ -187,6 +187,15 @@
 #undef STARPU_NMAXBUFS
 
 /**
+   Define the maximum number of fxt mpi files that can be read when
+   generating traces. The default value is 64, it can be changed by
+   using the configure option \ref enable-fxt-max-files
+   "--enable-fxt-max-files".
+   @ingroup API_MPI_Support
+*/
+#undef STARPU_FXT_MAX_FILES
+
+/**
    Define the maximum number of CPU workers managed by StarPU. The
    default value can be modified at configure by using the option \ref
    enable-maxcpus "--enable-maxcpus".

+ 1 - 2
include/starpu_fxt.h

@@ -20,6 +20,7 @@
 #ifndef __STARPU_FXT_H__
 #define __STARPU_FXT_H__
 
+#include <starpu_config.h>
 #include <starpu_perfmodel.h>
 
 #ifdef __cplusplus
@@ -32,8 +33,6 @@ extern "C"
    @{
 */
 
-#define STARPU_FXT_MAX_FILES	64
-
 struct starpu_fxt_codelet_event
 {
 	char symbol[256];

+ 8 - 0
include/starpu_helper.h

@@ -182,6 +182,14 @@ double starpu_timing_now(void);
 */
 int starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_handle, int asynchronous, void (*callback_func)(void*), void *callback_arg);
 
+/**
+   Call hwloc-ps to display the binding of each process and thread running on
+   the machine.<br>
+   Use the environment variable \ref STARPU_DISPLAY_BINDINGS to automatically
+   call this function at the beginning of the execution of StarPU.
+*/
+void starpu_display_bindings(void);
+
 /** @} */
 
 #ifdef __cplusplus

+ 23 - 0
include/starpu_stdlib.h

@@ -239,9 +239,32 @@ void starpu_memory_deallocate(unsigned node, size_t size);
 */
 void starpu_memory_wait_available(unsigned node, size_t size);
 
+/**
+   Sleep for the given \p nb_sec seconds.
+   In simgrid mode, this only sleeps within virtual time.
+  */
 void starpu_sleep(float nb_sec);
+
+/**
+   Sleep for the given \p nb_micro_sec micro-seconds.
+   In simgrid mode, this only sleeps within virtual time.
+  */
 void starpu_usleep(float nb_micro_sec);
 
+/**
+   Account for \p joules J being used.
+   This is supported in simgrid mode, to record how much energy was used, and will
+   show up in further calls to starpu_energy_used().
+  */
+void starpu_energy_use(float joules);
+
+/**
+   Return the amount of energy having been used in J.
+   This accounts for the amounts passed to starpu_energy_use(), but also for the static
+   energy use set by the \ref STARPU_IDLE_POWER environment variable.
+  */
+double starpu_energy_used(void);
+
 /** @} */
 
 #ifdef __cplusplus

+ 1 - 1
include/starpu_task.h

@@ -513,7 +513,7 @@ struct starpu_codelet
 
 	/**
 	   Optional pointer to the task energy consumption performance
-	   model associated to this codelet. This optional field is
+	   model associated to this codelet (in J). This optional field is
 	   ignored when set to <c>NULL</c> or when its field
 	   starpu_perfmodel::symbol is not set. In the case of
 	   parallel codelets, this has to account for all processing

+ 11 - 0
include/starpu_util.h

@@ -598,6 +598,17 @@ STARPU_ATOMIC_SOMETHING64(or, old | value)
 #define STARPU_WMB() STARPU_SYNCHRONIZE()
 #endif
 
+#if defined(__i386__) || defined(__x86_64__)
+#define STARPU_CACHELINE_SIZE 64
+#elif defined(__ppc__) || defined(__ppc64__) || defined(__ia64__)
+#define STARPU_CACHELINE_SIZE 128
+#elif defined(__s390__) || defined(__s390x__)
+#define STARPU_CACHELINE_SIZE 256
+#else
+/* Conservative default */
+#define STARPU_CACHELINE_SIZE 1024
+#endif
+
 #ifdef _WIN32
 /* Try to fetch the system definition of timespec */
 #include <sys/types.h>

+ 15 - 0
julia/examples/cholesky/cholesky_common.jl

@@ -1,3 +1,18 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
 # Standard kernels for the Cholesky factorization
 # U22 is the gemm update
 # U21 is the trsm update

+ 15 - 0
julia/examples/cholesky/cholesky_native.jl

@@ -1,3 +1,18 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
 using LinearAlgebra
 
 function check(mat::Matrix{Float32})

+ 15 - 0
julia/src/openblas_ldflags.jl

@@ -1,3 +1,18 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
 import LinearAlgebra.BLAS
 import Libdl
 

+ 69 - 0
mpi/examples/Makefile.am

@@ -83,6 +83,10 @@ EXTRA_DIST = 				\
 	matrix_decomposition/mpi_decomposition_params.h	\
 	matrix_decomposition/mpi_decomposition_matrix.h	\
 	user_datatype/my_interface.h			\
+	benchs/abstract_sendrecv_bench.h	\
+	benchs/bench_helper.h			\
+	benchs/gemm_helper.h			\
+	benchs/burst_helper.h			\
 	helper.h
 
 examplebindir = $(libdir)/starpu/mpi
@@ -399,3 +403,68 @@ native_fortran/nf_mm_task_build.o: nf_mm_cl.mod fstarpu_mpi_mod.mod fstarpu_mod.
 native_fortran/nf_basic_ring.o: fstarpu_mpi_mod.mod fstarpu_mod.mod
 endif
 endif
+
+
+##########
+# benchs #
+##########
+
+examplebin_PROGRAMS +=		\
+	benchs/sendrecv_bench	\
+	benchs/burst
+
+if !STARPU_USE_MPI_MPI
+examplebin_PROGRAMS +=		\
+	benchs/sendrecv_parallel_tasks_bench
+endif
+
+if !STARPU_NO_BLAS_LIB
+examplebin_PROGRAMS +=		\
+	benchs/sendrecv_gemm_bench			\
+	benchs/burst_gemm
+endif
+
+if !STARPU_SIMGRID
+starpu_mpi_EXAMPLES	+=	\
+	benchs/sendrecv_bench	\
+	benchs/burst
+
+if !STARPU_USE_MPI_MPI
+starpu_mpi_EXAMPLES	+=	\
+	benchs/sendrecv_parallel_tasks_bench
+endif
+
+if !STARPU_NO_BLAS_LIB
+starpu_mpi_EXAMPLES	+=	\
+	benchs/sendrecv_gemm_bench			\
+	benchs/burst_gemm
+endif
+endif
+
+benchs_sendrecv_bench_SOURCES = benchs/sendrecv_bench.c
+benchs_sendrecv_bench_SOURCES += benchs/bench_helper.c
+benchs_sendrecv_bench_SOURCES += benchs/abstract_sendrecv_bench.c
+
+benchs_sendrecv_parallel_tasks_bench_SOURCES = benchs/sendrecv_parallel_tasks_bench.c
+benchs_sendrecv_parallel_tasks_bench_SOURCES += benchs/bench_helper.c
+benchs_sendrecv_parallel_tasks_bench_SOURCES += benchs/abstract_sendrecv_bench.c
+
+benchs_burst_SOURCES = benchs/burst.c
+benchs_burst_SOURCES += benchs/burst_helper.c
+
+if !STARPU_NO_BLAS_LIB
+benchs_sendrecv_gemm_bench_SOURCES = benchs/sendrecv_gemm_bench.c
+benchs_sendrecv_gemm_bench_SOURCES += benchs/bench_helper.c
+benchs_sendrecv_gemm_bench_SOURCES += benchs/gemm_helper.c
+benchs_sendrecv_gemm_bench_SOURCES += benchs/abstract_sendrecv_bench.c
+benchs_sendrecv_gemm_bench_SOURCES += ../../examples/common/blas.c
+
+benchs_sendrecv_gemm_bench_LDADD = $(STARPU_BLAS_LDFLAGS)
+
+benchs_burst_gemm_SOURCES = benchs/burst_gemm.c
+benchs_burst_gemm_SOURCES += benchs/gemm_helper.c
+benchs_burst_gemm_SOURCES += benchs/burst_helper.c
+benchs_burst_gemm_SOURCES += ../../examples/common/blas.c
+
+benchs_burst_gemm_LDADD = $(STARPU_BLAS_LDFLAGS)
+endif

mpi/tests/abstract_sendrecv_bench.c → mpi/examples/benchs/abstract_sendrecv_bench.c


mpi/tests/abstract_sendrecv_bench.h → mpi/examples/benchs/abstract_sendrecv_bench.h


mpi/tests/bench_helper.c → mpi/examples/benchs/bench_helper.c


mpi/tests/bench_helper.h → mpi/examples/benchs/bench_helper.h


+ 2 - 6
mpi/tests/burst.c

@@ -49,13 +49,11 @@ void parse_args(int argc, char **argv)
 
 int main(int argc, char **argv)
 {
-	int ret, rank, mpi_init, other_rank;
+	int ret, rank, other_rank;
 
 	parse_args(argc, argv);
 
-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
-
-	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
 
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
@@ -68,8 +66,6 @@ int main(int argc, char **argv)
 	burst_free_data(rank);
 
 	starpu_mpi_shutdown();
-	if (!mpi_init)
-		MPI_Finalize();
 
 	return 0;
 }

+ 3 - 7
mpi/tests/burst_gemm.c

@@ -90,12 +90,11 @@ void parse_args(int argc, char **argv)
 
 int main(int argc, char **argv)
 {
-	int ret, mpi_init, worldsize, mpi_rank;
+	int ret, worldsize, mpi_rank;
 
 	parse_args(argc, argv);
 
-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
-	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
 	if (ret == -ENODEV)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
@@ -109,8 +108,7 @@ int main(int argc, char **argv)
 			FPRINTF(stderr, "We need 2 processes.\n");
 
 		starpu_mpi_shutdown();
-		if (!mpi_init)
-			MPI_Finalize();
+
 		return STARPU_TEST_SKIPPED;
 	}
 
@@ -203,8 +201,6 @@ enodev:
 	burst_free_data(mpi_rank);
 
 	starpu_mpi_shutdown();
-	if (!mpi_init)
-		MPI_Finalize();
 
 	return ret;
 }

mpi/tests/burst_helper.c → mpi/examples/benchs/burst_helper.c


mpi/tests/burst_helper.h → mpi/examples/benchs/burst_helper.h


mpi/tests/gemm_helper.c → mpi/examples/benchs/gemm_helper.c


mpi/tests/gemm_helper.h → mpi/examples/benchs/gemm_helper.h


+ 2 - 7
mpi/tests/sendrecv_bench.c

@@ -26,7 +26,6 @@
 int main(int argc, char **argv)
 {
 	int ret, rank, worldsize;
-	int mpi_init;
 	int pause_workers = 0;
 
 
@@ -52,8 +51,7 @@ int main(int argc, char **argv)
 	}
 
 
-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
-	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
 
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
@@ -65,8 +63,7 @@ int main(int argc, char **argv)
 			FPRINTF(stderr, "We need 2 processes.\n");
 
 		starpu_mpi_shutdown();
-		if (!mpi_init)
-			MPI_Finalize();
+
 		return STARPU_TEST_SKIPPED;
 	}
 
@@ -85,8 +82,6 @@ int main(int argc, char **argv)
 	}
 
 	starpu_mpi_shutdown();
-	if (!mpi_init)
-		MPI_Finalize();
 
 	return 0;
 }

+ 6 - 10
mpi/tests/sendrecv_gemm_bench.c

@@ -53,7 +53,7 @@ static void* comm_thread_func(void* arg)
 	{
 		char hostname[65];
 		gethostname(hostname, sizeof(hostname));
-		_STARPU_DISP("[%s] No core was available for the comm thread. You should increase STARPU_RESERVE_NCPU or decrease STARPU_NCPU\n", hostname);
+		fprintf(stderr, "[%s] No core was available for the comm thread. You should increase STARPU_RESERVE_NCPU or decrease STARPU_NCPU\n", hostname);
 	}
 
 	sendrecv_bench(mpi_rank, &thread_barrier);
@@ -118,7 +118,7 @@ void parse_args(int argc, char **argv)
 int main(int argc, char **argv)
 {
 	double start, end;
-	int ret, mpi_init, worldsize;
+	int ret, worldsize;
 	starpu_pthread_t comm_thread;
 
 	char hostname[255];
@@ -128,8 +128,7 @@ int main(int argc, char **argv)
 
 	starpu_fxt_autostart_profiling(0);
 
-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
-	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
 	if (ret == -ENODEV)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
@@ -143,8 +142,7 @@ int main(int argc, char **argv)
 			FPRINTF(stderr, "We need 2 processes.\n");
 
 		starpu_mpi_shutdown();
-		if (!mpi_init)
-			MPI_Finalize();
+
 		return STARPU_TEST_SKIPPED;
 	}
 
@@ -162,7 +160,7 @@ int main(int argc, char **argv)
 
 	if (mpi_rank == 0)
 	{
-		PRINTF("# node\tx\ty\tz\tms\tGFlops\n");
+		printf("# node\tx\ty\tz\tms\tGFlops\n");
 	}
 
 	starpu_pause();
@@ -185,7 +183,7 @@ int main(int argc, char **argv)
 	double timing = end - start;
 	double flops = 2.0*((unsigned long long)matrix_dim) * ((unsigned long long)matrix_dim)*((unsigned long long)matrix_dim);
 
-	PRINTF("%s\t%u\t%u\t%u\t%.0f\t%.1f\n", hostname, matrix_dim, matrix_dim, matrix_dim, timing/1000.0, flops/timing/1000.0);
+	printf("%s\t%u\t%u\t%u\t%.0f\t%.1f\n", hostname, matrix_dim, matrix_dim, matrix_dim, timing/1000.0, flops/timing/1000.0);
 
 
 enodev:
@@ -200,8 +198,6 @@ enodev:
 
 	starpu_resume();
 	starpu_mpi_shutdown();
-	if (!mpi_init)
-		MPI_Finalize();
 
 	return ret;
 }

+ 3 - 9
mpi/tests/sendrecv_parallel_tasks_bench.c

@@ -134,10 +134,8 @@ static struct starpu_codelet cl =
 int main(int argc, char **argv)
 {
 	int ret, rank, worldsize;
-	int mpi_init;
 
-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
-	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
 
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
@@ -149,8 +147,7 @@ int main(int argc, char **argv)
 			FPRINTF(stderr, "We need 2 processes.\n");
 
 		starpu_mpi_shutdown();
-		if (!mpi_init)
-			MPI_Finalize();
+
 		return STARPU_TEST_SKIPPED;
 	}
 
@@ -162,8 +159,7 @@ int main(int argc, char **argv)
 	else if (rank >= 2)
 	{
 		starpu_mpi_shutdown();
-		if (!mpi_init)
-			MPI_Finalize();
+
 		return 0;
 	}
 
@@ -222,8 +218,6 @@ int main(int argc, char **argv)
 	free(mpi_tags);
 
 	starpu_mpi_shutdown();
-	if (!mpi_init)
-		MPI_Finalize();
 
 	return 0;
 }

+ 1 - 0
mpi/src/starpu_mpi.c

@@ -431,6 +431,7 @@ void starpu_mpi_data_migrate(MPI_Comm comm, starpu_data_handle_t data, int new_r
 
 	/* Flush cache in all other nodes */
 	/* TODO: Ideally we'd transmit the knowledge of who owns it */
+	/* TODO: or at least remember that the previous owner has the data, that's an easy case to support */
 	starpu_mpi_cache_flush(comm, data);
 	return;
 }

+ 2 - 3
mpi/src/starpu_mpi_datatype.c

@@ -26,17 +26,16 @@ struct _starpu_mpi_datatype_funcs
 	UT_hash_handle hh;
 };
 
-static starpu_pthread_mutex_t _starpu_mpi_datatype_funcs_table_mutex;
+/* We want to allow applications to call starpu_mpi_interface_datatype_register/unregister from constructors/destructors */
+static starpu_pthread_mutex_t _starpu_mpi_datatype_funcs_table_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
 static struct _starpu_mpi_datatype_funcs *_starpu_mpi_datatype_funcs_table = NULL;
 
 void _starpu_mpi_datatype_init(void)
 {
-	STARPU_PTHREAD_MUTEX_INIT(&_starpu_mpi_datatype_funcs_table_mutex, NULL);
 }
 
 void _starpu_mpi_datatype_shutdown(void)
 {
-	STARPU_PTHREAD_MUTEX_DESTROY(&_starpu_mpi_datatype_funcs_table_mutex);
 }
 
 /*

+ 34 - 1
mpi/src/starpu_mpi_init.c

@@ -138,7 +138,38 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi, MPI_Comm
 	_starpu_mpi_do_initialize(argc_argv);
 #endif
 
-	return _mpi_backend._starpu_mpi_backend_progress_init(argc_argv);
+	int ret = _mpi_backend._starpu_mpi_backend_progress_init(argc_argv);
+
+	if (starpu_get_env_number_default("STARPU_DISPLAY_BINDINGS", 0))
+	{
+		int rank, size, i;
+		char hostname[65];
+
+		starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+		starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
+		gethostname(hostname, sizeof(hostname));
+
+		/* We make a barrier between each node calling hwloc-ps, to avoid mixing
+		 * outputs in stdout. */
+		for (i = 0; i < size; i++)
+		{
+			starpu_mpi_barrier(MPI_COMM_WORLD);
+			if (rank == i)
+			{
+				fprintf(stdout, "== Binding for rank %d on node %s ==\n", rank, hostname);
+				starpu_display_bindings();
+				fflush(stdout);
+			}
+		}
+		starpu_mpi_barrier(MPI_COMM_WORLD);
+		if (rank == 0)
+		{
+			fprintf(stdout, "== End of bindings ==\n");
+			fflush(stdout);
+		}
+	}
+
+	return ret;
 }
 
 #ifdef STARPU_SIMGRID
@@ -219,6 +250,8 @@ int starpu_mpi_init_conf(int *argc, char ***argv, int initialize_mpi, MPI_Comm c
 			conf->reserve_ncpus++;
 	}
 
+	conf->will_use_mpi = 1;
+
 	int ret = starpu_init(conf);
 	if (ret < 0)
 		return ret;

+ 3 - 55
mpi/tests/Makefile.am

@@ -62,11 +62,7 @@ BUILT_SOURCES =
 CLEANFILES = *.gcno *.gcda *.linkinfo starpu_idle_microsec.log
 
 EXTRA_DIST = 				\
-	abstract_sendrecv_bench.h	\
-	bench_helper.h			\
 	helper.h			\
-	gemm_helper.h			\
-	burst_helper.h			\
 	user_defined_datatype_value.h
 
 examplebindir = $(libdir)/starpu/examples/mpi
@@ -142,19 +138,7 @@ starpu_mpi_TESTS +=				\
 	temporary				\
 	user_defined_datatype			\
 	early_stuff				\
-	sendrecv_bench				\
-	burst
-
-if !STARPU_USE_MPI_MPI
-starpu_mpi_TESTS +=				\
-	sendrecv_parallel_tasks_bench
-endif
-
-if !STARPU_NO_BLAS_LIB
-starpu_mpi_TESTS +=				\
-	sendrecv_gemm_bench			\
-	burst_gemm
-endif
+	display_bindings
 
 if !STARPU_SIMGRID
 # missing support in simgrid
@@ -243,16 +227,8 @@ noinst_PROGRAMS +=				\
 	starpu_redefine				\
 	load_balancer				\
 	driver					\
-	sendrecv_bench				\
-	sendrecv_parallel_tasks_bench		\
-	burst					\
-	nothing
-
-if !STARPU_NO_BLAS_LIB
-noinst_PROGRAMS +=				\
-	sendrecv_gemm_bench			\
-	burst_gemm
-endif
+	nothing							\
+	display_bindings
 
 if STARPU_USE_MPI_FT
 noinst_PROGRAMS +=  \
@@ -288,31 +264,3 @@ mpi_earlyrecv2_SOURCES = mpi_earlyrecv2.c
 mpi_earlyrecv2_SOURCES += ../../examples/interface/complex_interface.c
 mpi_earlyrecv2_sync_SOURCES = mpi_earlyrecv2_sync.c
 mpi_earlyrecv2_sync_SOURCES += ../../examples/interface/complex_interface.c
-
-sendrecv_bench_SOURCES = sendrecv_bench.c
-sendrecv_bench_SOURCES += bench_helper.c
-sendrecv_bench_SOURCES += abstract_sendrecv_bench.c
-
-sendrecv_parallel_tasks_bench_SOURCES = sendrecv_parallel_tasks_bench.c
-sendrecv_parallel_tasks_bench_SOURCES += bench_helper.c
-sendrecv_parallel_tasks_bench_SOURCES += abstract_sendrecv_bench.c
-
-burst_SOURCES = burst.c
-burst_SOURCES += burst_helper.c
-
-if !STARPU_NO_BLAS_LIB
-sendrecv_gemm_bench_SOURCES = sendrecv_gemm_bench.c
-sendrecv_gemm_bench_SOURCES += bench_helper.c
-sendrecv_gemm_bench_SOURCES += gemm_helper.c
-sendrecv_gemm_bench_SOURCES += abstract_sendrecv_bench.c
-sendrecv_gemm_bench_SOURCES += ../../examples/common/blas.c
-
-sendrecv_gemm_bench_LDADD = $(STARPU_BLAS_LDFLAGS)
-
-burst_gemm_SOURCES = burst_gemm.c
-burst_gemm_SOURCES += gemm_helper.c
-burst_gemm_SOURCES += burst_helper.c
-burst_gemm_SOURCES += ../../examples/common/blas.c
-
-burst_gemm_LDADD = $(STARPU_BLAS_LDFLAGS)
-endif

+ 44 - 0
mpi/tests/display_bindings.c

@@ -0,0 +1,44 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2011-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_mpi.h>
+#include <stdlib.h>
+#include "helper.h"
+
+#if !defined(STARPU_HAVE_SETENV)
+#warning setenv is not defined. Skipping test
+int main(void)
+{
+	return STARPU_TEST_SKIPPED;
+}
+#else
+int main(int argc, char **argv)
+{
+	int ret;
+	setenv("STARPU_DISPLAY_BINDINGS", "1", 1);
+
+	MPI_INIT_THREAD_real(&argc, &argv, MPI_THREAD_SERIALIZED);
+
+	ret = starpu_mpi_init_conf(NULL, NULL, 0, MPI_COMM_WORLD, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
+
+	starpu_mpi_shutdown();
+	MPI_Finalize();
+
+	return EXIT_SUCCESS;
+}
+#endif

+ 11 - 4
src/Makefile.am

@@ -406,9 +406,16 @@ endif
 # static inline definition
 dist-hook:
 	failed=0 ; \
-	for i in $$( $(GREP) "static inline" $$(find $(srcdir) -name \*.h) | $(SED) -e 's/.*static inline //g' | $(GREP) -v ENAME | $(SED) -e 's/[^(]* \(\|\*\)\([^ (]*\)(.*/\2/' | $(GREP) -v _starpu_spin_init) ; do \
-		for j in $(shell find . -name \*.o) ; do \
-			nm $$j | $(GREP) "U $$i$$" && { echo $$j ; failed=1 ; } ; \
-		done ; \
+	look=""; \
+	for i in $$( $(GREP) "static inline" $$(find $(srcdir) -name \*.h) | $(SED) -e 's/.*static inline //g' | $(GREP) -v ENAME\#\# | $(SED) -n -e 's/[^(]* \(\|\*\)\([^ (]*\)(.*/\2/' -e 'p;s/^_*//;p' | $(GREP) -v _starpu_spin_init | $(GREP) -v starpu_sched_ctx_worker_is_master_for_child_ctx) ; do \
+		if [ -z "$$look" ] ; then \
+			look="$$i" ; \
+		else \
+			look="$$look\|$$i" ; \
+		fi ; \
+	done ; \
+	echo "$$look" ; \
+	for j in $(shell find . -name \*.o) ; do \
+		nm $$j | $(GREP) -e "U \($$look\)$$" && { echo $$j ; failed=1 ; } ; \
 	done ; \
 	[ $$failed == 0 ]

+ 15 - 0
src/common/utils.c

@@ -740,3 +740,18 @@ int starpu_get_env_size_default(const char *str, int defval)
 	}
 	return val;
 }
+
+void starpu_display_bindings(void)
+{
+#if defined(STARPU_HAVE_HWLOC) && !defined(STARPU_SIMGRID)
+	int hwloc_ret = system("hwloc-ps -a -t -c");
+	if (hwloc_ret)
+	{
+		_STARPU_DISP("hwloc-ps returned %d\n", hwloc_ret);
+		fflush(stderr);
+	}
+	fflush(stdout);
+#else
+	_STARPU_DISP("hwloc not available to display bindings.\n");
+#endif
+}

+ 2 - 0
src/core/jobs.c

@@ -24,10 +24,12 @@
 #include <common/config.h>
 #include <common/utils.h>
 #include <common/graph.h>
+#include <datawizard/memory_nodes.h>
 #include <profiling/profiling.h>
 #include <profiling/bound.h>
 #include <core/debug.h>
 #include <limits.h>
+#include <core/workers.h>
 
 static int max_memory_use;
 static unsigned long njobs, maxnjobs;

+ 5 - 4
src/core/perfmodel/perfmodel_bus.c

@@ -37,6 +37,7 @@
 #include <core/topology.h>
 #include <common/utils.h>
 #include <drivers/mpi/driver_mpi_common.h>
+#include <datawizard/memory_nodes.h>
 
 #ifdef STARPU_USE_OPENCL
 #include <starpu_opencl.h>
@@ -177,7 +178,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 	/* Allocate a buffer on the device */
 	unsigned char *d_buffer;
 	cures = cudaMalloc((void **)&d_buffer, size);
-	STARPU_ASSERT(cures == cudaSuccess);
+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
 
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
@@ -206,7 +207,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 		cudaHostRegister((void *)h_buffer, size, 0);
 	}
 
-	STARPU_ASSERT(cures == cudaSuccess);
+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
 
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
@@ -331,7 +332,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	/* Allocate a buffer on the device */
 	unsigned char *s_buffer;
 	cures = cudaMalloc((void **)&s_buffer, size);
-	STARPU_ASSERT(cures == cudaSuccess);
+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
 	cudaMemset(s_buffer, 0, size);
 	cudaDeviceSynchronize();
 
@@ -357,7 +358,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	/* Allocate a buffer on the device */
 	unsigned char *d_buffer;
 	cures = cudaMalloc((void **)&d_buffer, size);
-	STARPU_ASSERT(cures == cudaSuccess);
+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
 	cudaMemset(d_buffer, 0, size);
 	cudaDeviceSynchronize();
 

+ 6 - 1
src/core/perfmodel/perfmodel_history.c

@@ -344,7 +344,10 @@ static void dump_reg_model(FILE *f, struct starpu_perfmodel *model, int comb, in
 	double a = nan(""), b = nan(""), c = nan("");
 
 	if (model->type == STARPU_NL_REGRESSION_BASED)
-		_starpu_regression_non_linear_power(per_arch_model->list, &a, &b, &c);
+	{
+		if (_starpu_regression_non_linear_power(per_arch_model->list, &a, &b, &c) != 0)
+			_STARPU_DISP("Warning: could not compute a non-linear regression for model %s\n", model->symbol);
+	}
 
 	fprintf(f, "# a\t\tb\t\tc\n");
 	_starpu_write_double(f, "%-15e", a);
@@ -1491,6 +1494,8 @@ int starpu_perfmodel_load_file(const char *filename, struct starpu_perfmodel *mo
 	res = fclose(f);
 	STARPU_ASSERT(res == 0);
 
+	if (ret)
+		starpu_perfmodel_unload_model(model);
 	return ret;
 }
 

+ 1 - 0
src/core/perfmodel/perfmodel_print.c

@@ -19,6 +19,7 @@
 #include <starpu.h>
 #include <starpu_perfmodel.h>
 #include <common/config.h>
+#include <core/workers.h>
 #include "perfmodel.h"
 
 static

+ 115 - 41
src/core/perfmodel/regression.c

@@ -20,7 +20,32 @@
 #define MAXREGITER	1000
 #define EPS 1.0e-10
 
-static double compute_b(double c, unsigned n, unsigned *x, double *y)
+/* For measurements close to C, we do not want to try to fit, since we are
+   fitting the distance to C, which won't actually really get smaller */
+#define C_RADIUS 1
+
+/*
+ * smoothly ramp from 0 to 1 between 0 and 1
+ * <= 0: stay 0
+ * >= 1: stay 1 */
+static double level(double x)
+{
+	if (x <= 0.)
+		return 0.;
+	if (x >= 1.)
+		return 1.;
+	if (x < 0.5)
+		return -2*x*x+4*x-1;
+	return 2*x*x;
+}
+
+static double fixpop(unsigned pop, double c, double y)
+{
+	double distance = (y-c)/c;
+	return pop * level((distance - C_RADIUS) / C_RADIUS);
+}
+
+static double compute_b(double c, unsigned n, size_t *x, double *y, unsigned *pop)
 {
 	double b;
 
@@ -29,43 +54,55 @@ static double compute_b(double c, unsigned n, unsigned *x, double *y)
 	double sumx = 0.0;
 	double sumx2 = 0.0;
 	double sumy = 0.0;
+	double nn = 0;
 
 	unsigned i;
 	for (i = 0; i < n; i++)
 	{
 		double xi = log(x[i]);
 		double yi = log(y[i]-c);
+		double popi = fixpop(pop[i], c, y[i]);
+		if (popi <= 0)
+			continue;
+
+		sumxy += xi*yi*popi;
+		sumx += xi*popi;
+		sumx2 += xi*xi*popi;
+		sumy += yi*popi;
 
-		sumxy += xi*yi;
-		sumx += xi;
-		sumx2 += xi*xi;
-		sumy += yi;
+		nn += popi;
 	}
 
-	b = (n * sumxy - sumx * sumy) / (n*sumx2 - sumx*sumx);
+	b = (nn * sumxy - sumx * sumy) / (nn*sumx2 - sumx*sumx);
 
 	return b;
 }
 
-static double compute_a(double c, double b, unsigned n, unsigned *x, double *y)
+static double compute_a(double c, double b, unsigned n, size_t *x, double *y, unsigned *pop)
 {
 	double a;
 
 	/* X = log (x) , Y = log (y - c) */
 	double sumx = 0.0;
 	double sumy = 0.0;
+	double nn = 0;
 
 	unsigned i;
 	for (i = 0; i < n; i++)
 	{
 		double xi = log(x[i]);
 		double yi = log(y[i]-c);
+		double popi = fixpop(pop[i], c, y[i]);
+		if (popi <= 0)
+			continue;
 
-		sumx += xi;
-		sumy += yi;
+		sumx += xi*popi;
+		sumy += yi*popi;
+
+		nn += popi;
 	}
 
-	a = (sumy - b*sumx) / n;
+	a = (sumy - b*sumx) / nn;
 
 	return a;
 }
@@ -73,7 +110,7 @@ static double compute_a(double c, double b, unsigned n, unsigned *x, double *y)
 
 
 /* returns r */
-static double test_r(double c, unsigned n, unsigned *x, double *y)
+static double test_r(double c, unsigned n, size_t *x, double *y, unsigned *pop)
 {
 	double r;
 
@@ -85,20 +122,26 @@ static double test_r(double c, unsigned n, unsigned *x, double *y)
 	double sumx2 = 0.0;
 	double sumy = 0.0;
 	double sumy2 = 0.0;
+	double nn = 0;
 
 	unsigned i;
 	for (i = 0; i < n; i++)
 	{
 		double xi = log(x[i]);
 		double yi = log(y[i]-c);
+		double popi = fixpop(pop[i], c, y[i]);
+		if (popi <= 0)
+			continue;
 
 	//	printf("Xi = %e, Yi = %e\n", xi, yi);
 
-		sumxy += xi*yi;
-		sumx += xi;
-		sumx2 += xi*xi;
-		sumy += yi;
-		sumy2 += yi*yi;
+		sumxy += xi*yi*popi;
+		sumx += xi*popi;
+		sumx2 += xi*xi*popi;
+		sumy += yi*popi;
+		sumy2 += yi*yi*popi;
+
+		nn += popi;
 	}
 
 	//printf("sumxy %e\n", sumxy);
@@ -107,7 +150,7 @@ static double test_r(double c, unsigned n, unsigned *x, double *y)
 	//printf("sumy %e\n", sumy);
 	//printf("sumy2 %e\n", sumy2);
 
-	r = (n * sumxy - sumx * sumy) / sqrt( (n* sumx2 - sumx*sumx) * (n*sumy2 - sumy*sumy) );
+	r = (nn * sumxy - sumx * sumy) / sqrt( (nn* sumx2 - sumx*sumx) * (nn*sumy2 - sumy*sumy) );
 
 	return r;
 }
@@ -127,20 +170,29 @@ static unsigned find_list_size(struct starpu_perfmodel_history_list *list_histor
 	return cnt;
 }
 
-static double find_list_min(double *y, unsigned n)
+static int compar(const void *_a, const void *_b)
 {
-	double min = DBL_MAX;
+	double a = *(double*) _a;
+	double b = *(double*) _b;
+	if (a < b)
+		return -1;
+	if (a > b)
+		return 1;
+	return 0;
+}
 
-	unsigned i;
-	for (i = 0; i < n; i++)
-	{
-		min = STARPU_MIN(min, y[i]);
-	}
+static double get_list_fourth(double *y, unsigned n)
+{
+	double sorted[n];
+
+	memcpy(sorted, y, n * sizeof(*sorted));
+
+	qsort(sorted, n, sizeof(*sorted), compar);
 
-	return min;
+	return sorted[n/3];
 }
 
-static void dump_list(unsigned *x, double *y, struct starpu_perfmodel_history_list *list_history)
+static void dump_list(size_t *x, double *y, unsigned *pop, struct starpu_perfmodel_history_list *list_history)
 {
 	struct starpu_perfmodel_history_list *ptr = list_history;
 	unsigned i = 0;
@@ -151,6 +203,7 @@ static void dump_list(unsigned *x, double *y, struct starpu_perfmodel_history_li
 		{
 			x[i] = ptr->entry->size;
 			y[i] = ptr->entry->mean;
+			pop[i] = ptr->entry->nsample;
 			i++;
 		}
 
@@ -163,52 +216,72 @@ static void dump_list(unsigned *x, double *y, struct starpu_perfmodel_history_li
  * 	return 0 if success, -1 otherwise
  * 	if success, a, b and c are modified
  * */
+
+/* For the rationale, see Appendix B of Cedric Augonnet's PhD thesis,
+ * "Scheduling Tasks over Multicore machines enhanced with Accelerators: a
+ * Runtime System’s Perspective" */
 int _starpu_regression_non_linear_power(struct starpu_perfmodel_history_list *ptr, double *a, double *b, double *c)
 {
 	unsigned n = find_list_size(ptr);
-	STARPU_ASSERT(n);
+	if (!n)
+		return -1;
 
-	unsigned *x;
-	_STARPU_MALLOC(x, n*sizeof(unsigned));
+	size_t *x;
+	_STARPU_MALLOC(x, n*sizeof(size_t));
 
 	double *y;
 	_STARPU_MALLOC(y, n*sizeof(double));
 	STARPU_ASSERT(y);
 
-	dump_list(x, y, ptr);
+	unsigned *pop;
+	_STARPU_MALLOC(pop, n*sizeof(unsigned));
+	STARPU_ASSERT(y);
+
+	dump_list(x, y, pop, ptr);
 
 	double cmin = 0.0;
-	double cmax = find_list_min(y, n);
+	double cmax = get_list_fourth(y, n);
 
 	unsigned iter;
 
 	double err = 100000.0;
 
+/*
+	unsigned i;
+	for (i = 0; i < 100; i++)
+	{
+		double ci = cmin + (cmax-cmin)*i/100.;
+		fprintf(stderr,"%f: %f\n", ci, 1.0 - test_r(ci, n, x, y, pop));
+	}
+*/
+
+	/* Use dichotomy to find c that gives the best matching */
 	for (iter = 0; iter < MAXREGITER; iter++)
 	{
 		double c1, c2;
 		double r1, r2;
 
-		double radius = 0.01;
-
-		c1 = cmin + (0.5-radius)*(cmax - cmin);
-		c2 = cmin + (0.5+radius)*(cmax - cmin);
+		c1 = cmin + (0.33)*(cmax - cmin);
+		c2 = cmin + (0.67)*(cmax - cmin);
 
-		r1 = test_r(c1, n, x, y);
-		r2 = test_r(c2, n, x, y);
+		r1 = test_r(c1, n, x, y, pop);
+		r2 = test_r(c2, n, x, y, pop);
 
 		double err1, err2;
 		err1 = fabs(1.0 - r1);
 		err2 = fabs(1.0 - r2);
 
+		//fprintf(stderr,"%f - %f: %f - %f: %f - %f\n", cmin, c1, err1, c2, err2, cmax);
+
 		if (err1 < err2)
 		{
-			cmax = (cmin + cmax)/2;
+			/* 1 is better */
+			cmax = c2;
 		}
 		else
 		{
 			/* 2 is better */
-			cmin = (cmin + cmax)/2;
+			cmin = c1;
 		}
 
 		if (fabs(err - STARPU_MIN(err1, err2)) < EPS)
@@ -219,11 +292,12 @@ int _starpu_regression_non_linear_power(struct starpu_perfmodel_history_list *pt
 
 	*c = (cmin + cmax)/2;
 
-	*b = compute_b(*c, n, x, y);
-	*a = exp(compute_a(*c, *b, n, x, y));
+	*b = compute_b(*c, n, x, y, pop);
+	*a = exp(compute_a(*c, *b, n, x, y, pop));
 
 	free(x);
 	free(y);
+	free(pop);
 
 	return 0;
 }

+ 1 - 0
src/core/sched_ctx.c

@@ -21,6 +21,7 @@
 #include <common/utils.h>
 #include <stdarg.h>
 #include <core/task.h>
+#include <core/workers.h>
 
 enum _starpu_ctx_change_op
 {

+ 1 - 0
src/core/sched_policy.c

@@ -22,6 +22,7 @@
 #include <common/utils.h>
 #include <core/sched_policy.h>
 #include <profiling/profiling.h>
+#include <datawizard/memory_nodes.h>
 #include <common/barrier.h>
 #include <core/debug.h>
 #include <core/task.h>

+ 24 - 2
src/core/simgrid.c

@@ -58,6 +58,8 @@ extern int _starpu_mpi_simgrid_init(int argc, char *argv[]);
 extern void smpi_process_set_user_data(void *);
 #endif
 
+static double _starpu_simgrid_dynamic_energy = 0.0;
+
 /* 1 when MSG_init was done, 2 when initialized through redirected main, 3 when
  * initialized through MSG_process_attach */
 static int simgrid_started;
@@ -629,6 +631,7 @@ struct task
 #else
 	msg_task_t task;
 #endif
+	double energy;
 
 	/* communication termination signalization */
 	unsigned *finished;
@@ -666,6 +669,7 @@ static void *task_execute(void *arg)
 		MSG_task_execute(task->task);
 		MSG_task_destroy(task->task);
 #endif
+		starpu_energy_use(task->energy);
 		_STARPU_DEBUG("task %p finished\n", task);
 
 		*task->finished = 1;
@@ -702,7 +706,7 @@ void _starpu_simgrid_wait_tasks(int workerid)
 }
 
 /* Task execution submitted by StarPU */
-void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length, unsigned *finished)
+void _starpu_simgrid_submit_job(int workerid, int sched_ctx_id, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length, double energy, unsigned *finished)
 {
 	struct starpu_task *starpu_task = j->task;
 	double flops;
@@ -717,13 +721,19 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 
 	if (isnan(length))
 	{
-		length = starpu_task_expected_length(starpu_task, perf_arch, j->nimpl);
+		length = starpu_task_worker_expected_length(starpu_task, workerid, sched_ctx_id, j->nimpl);
 		STARPU_ASSERT_MSG(!_STARPU_IS_ZERO(length) && !isnan(length),
 				  "Codelet %s does not have a perfmodel (in directory %s), or is not calibrated enough, please re-run in non-simgrid mode until it is calibrated",
 				  _starpu_job_get_model_name(j), _starpu_get_perf_model_dir_codelet());
                 /* TODO: option to add variance according to performance model,
                  * to be able to easily check scheduling robustness */
 	}
+	if (isnan(energy))
+	{
+		energy = starpu_task_worker_expected_energy(starpu_task, workerid, sched_ctx_id, j->nimpl);
+		/* TODO: option to add variance according to performance model,
+		 * to be able to easily check scheduling robustness */
+	}
 
 #if defined(HAVE_SG_HOST_SPEED) || defined(sg_host_speed)
 #  if defined(HAVE_SG_HOST_SELF) || defined(sg_host_self)
@@ -754,6 +764,7 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 		MSG_task_execute(simgrid_task);
 		MSG_task_destroy(simgrid_task);
 #endif
+		starpu_energy_use(energy);
 	}
 	else
 	{
@@ -766,6 +777,7 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 #else
 		task->task = simgrid_task;
 #endif
+		task->energy = energy;
 		task->finished = finished;
 		*finished = 0;
 		task->next = NULL;
@@ -1391,5 +1403,15 @@ void _starpu_simgrid_data_transfer(size_t size, unsigned src_node, unsigned dst_
 }
 #endif
 
+void starpu_energy_use(float joules)
+{
+	_starpu_simgrid_dynamic_energy += joules;
+}
+
+double starpu_energy_used(void)
+{
+	float idle_power = starpu_get_env_float_default("STARPU_IDLE_POWER", 0.0);
+	return _starpu_simgrid_dynamic_energy + idle_power * starpu_timing_now() / 1000000;
+}
 
 #endif

+ 1 - 1
src/core/simgrid.h

@@ -66,7 +66,7 @@ void _starpu_simgrid_deinit_late(void);
 void _starpu_simgrid_actor_setup(void);
 void _starpu_simgrid_wait_tasks(int workerid);
 struct _starpu_job;
-void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *job, struct starpu_perfmodel_arch* perf_arch, double length, unsigned *finished);
+void _starpu_simgrid_submit_job(int workerid, int sched_ctx_id, struct _starpu_job *job, struct starpu_perfmodel_arch* perf_arch, double length, double energy, unsigned *finished);
 struct _starpu_data_request;
 int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node, struct _starpu_data_request *req);
 union _starpu_async_channel_event;

+ 1 - 0
src/core/task.c

@@ -30,6 +30,7 @@
 #include <common/utils.h>
 #include <common/fxt.h>
 #include <common/knobs.h>
+#include <datawizard/memory_nodes.h>
 #include <profiling/profiling.h>
 #include <profiling/bound.h>
 #include <math.h>

+ 4 - 0
src/core/topology.c

@@ -1983,7 +1983,11 @@ int _starpu_bind_thread_on_cpu(int cpuid STARPU_ATTRIBUTE_UNUSED, int workerid S
 		{
 			cpu_worker[cpuid] = workerid;
 			if (name)
+			{
+				if (cpu_name[cpuid])
+					free(cpu_name[cpuid]);
 				cpu_name[cpuid] = strdup(name);
+			}
 		}
 	}
 

+ 16 - 0
src/core/workers.c

@@ -1059,6 +1059,7 @@ int starpu_conf_init(struct starpu_conf *conf)
 
 	memset(conf, 0, sizeof(*conf));
 	conf->magic = 42;
+	conf->will_use_mpi = 0;
 	conf->sched_policy_name = starpu_getenv("STARPU_SCHED");
 	conf->sched_policy = NULL;
 	conf->global_sched_ctx_min_priority = starpu_get_env_number("STARPU_MIN_PRIO");
@@ -1666,6 +1667,15 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 
 	_starpu_catch_signals();
 
+	/* if MPI is enabled, binding display will be done later, after MPI initialization */
+	if (!_starpu_config.conf.will_use_mpi && starpu_get_env_number_default("STARPU_DISPLAY_BINDINGS", 0))
+	{
+		fprintf(stdout, "== Binding ==\n");
+		starpu_display_bindings();
+		fprintf(stdout, "== End of binding ==\n");
+		fflush(stdout);
+	}
+
 	return 0;
 }
 
@@ -2644,31 +2654,37 @@ int starpu_worker_get_relax_state(void)
 	return _starpu_worker_get_relax_state();
 }
 
+#undef starpu_worker_lock
 void starpu_worker_lock(int workerid)
 {
 	_starpu_worker_lock(workerid);
 }
 
+#undef starpu_worker_trylock
 int starpu_worker_trylock(int workerid)
 {
 	return _starpu_worker_trylock(workerid);
 }
 
+#undef starpu_worker_unlock
 void starpu_worker_unlock(int workerid)
 {
 	_starpu_worker_unlock(workerid);
 }
 
+#undef starpu_worker_lock_self
 void starpu_worker_lock_self(void)
 {
 	_starpu_worker_lock_self();
 }
 
+#undef starpu_worker_unlock_self
 void starpu_worker_unlock_self(void)
 {
 	_starpu_worker_unlock_self();
 }
 
+#undef starpu_wake_worker_relax
 int starpu_wake_worker_relax(int workerid)
 {
 	return _starpu_wake_worker_relax(workerid);

+ 22 - 2
src/core/workers.h

@@ -203,6 +203,10 @@ LIST_TYPE(_starpu_worker,
 
 	int enable_knob;
 	int bindid_requested;
+
+	/* Keep this last, to make sure to separate worker data in separate
+	  cache lines. */
+	char padding[STARPU_CACHELINE_SIZE];
 );
 
 struct _starpu_combined_worker
@@ -223,6 +227,10 @@ struct _starpu_combined_worker
 #ifdef STARPU_HAVE_HWLOC
 	hwloc_bitmap_t hwloc_cpu_set;
 #endif
+
+	/* Keep this last, to make sure to separate worker data in separate
+	  cache lines. */
+	char padding[STARPU_CACHELINE_SIZE];
 };
 
 /**
@@ -389,6 +397,9 @@ struct _starpu_machine_config
 	/** Memory node for MPI, if only one */
 	int mpi_nodeid;
 
+	/* Separate out previous variables from per-worker data. */
+	char padding1[STARPU_CACHELINE_SIZE];
+
 	/** Basic workers : each of this worker is running its own driver and
 	 * can be combined with other basic workers. */
 	struct _starpu_worker workers[STARPU_NMAXWORKERS];
@@ -397,6 +408,11 @@ struct _starpu_machine_config
 	 * that can run parallel tasks together. */
 	struct _starpu_combined_worker combined_workers[STARPU_NMAX_COMBINEDWORKERS];
 
+	starpu_pthread_mutex_t submitted_mutex;
+
+	/* Separate out previous mutex from the rest of the data. */
+	char padding2[STARPU_CACHELINE_SIZE];
+
 	/** Translation table from bindid to worker IDs */
 	struct
 	{
@@ -432,8 +448,6 @@ struct _starpu_machine_config
 
 	/** When >0, StarPU should stop performance counters collection. */
 	int perf_counter_pause_depth;
-
-	starpu_pthread_mutex_t submitted_mutex;
 };
 
 extern int _starpu_worker_parallel_blocks;
@@ -1103,6 +1117,7 @@ static inline void _starpu_worker_lock(int workerid)
 		STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
 	}
 }
+#define starpu_worker_lock _starpu_worker_lock
 
 static inline int _starpu_worker_trylock(int workerid)
 {
@@ -1133,6 +1148,7 @@ static inline int _starpu_worker_trylock(int workerid)
 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&cur_worker->sched_mutex);
 	return ret;
 }
+#define starpu_worker_trylock _starpu_worker_trylock
 
 static inline void _starpu_worker_unlock(int workerid)
 {
@@ -1145,6 +1161,7 @@ static inline void _starpu_worker_unlock(int workerid)
 		starpu_worker_relax_off();
 	}
 }
+#define starpu_worker_unlock _starpu_worker_unlock
 
 static inline void _starpu_worker_lock_self(void)
 {
@@ -1153,6 +1170,7 @@ static inline void _starpu_worker_lock_self(void)
 	STARPU_ASSERT(worker != NULL);
 	STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
 }
+#define starpu_worker_lock_self _starpu_worker_lock_self
 
 static inline void _starpu_worker_unlock_self(void)
 {
@@ -1161,6 +1179,7 @@ static inline void _starpu_worker_unlock_self(void)
 	STARPU_ASSERT(worker != NULL);
 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
 }
+#define starpu_worker_unlock_self _starpu_worker_unlock_self
 
 static inline int _starpu_wake_worker_relax(int workerid)
 {
@@ -1169,6 +1188,7 @@ static inline int _starpu_wake_worker_relax(int workerid)
 	_starpu_worker_unlock(workerid);
 	return ret;
 }
+#define starpu_wake_worker_relax _starpu_wake_worker_relax
 
 int starpu_wake_worker_relax_light(int workerid);
 

+ 3 - 0
src/datawizard/data_request.c

@@ -25,6 +25,9 @@
 #include <core/simgrid.h>
 
 /* requests that have not been treated at all */
+#ifdef STARPU_DEVEL
+#warning split into separate out/in queues for each node, so that MAX_PENDING_REQUESTS_PER_NODE is separate for them, since the links are bidirectional
+#endif
 static struct _starpu_data_request_prio_list data_requests[STARPU_MAXNODES];
 static struct _starpu_data_request_prio_list prefetch_requests[STARPU_MAXNODES];
 static struct _starpu_data_request_prio_list idle_requests[STARPU_MAXNODES];

+ 1 - 0
src/datawizard/filters.c

@@ -21,6 +21,7 @@
 #include <datawizard/filters.h>
 #include <datawizard/footprint.h>
 #include <datawizard/interfaces/data_interface.h>
+#include <datawizard/memory_nodes.h>
 #include <core/task.h>
 
 /*

+ 3 - 0
src/datawizard/interfaces/bcsr_interface.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 /*
  * BCSR : blocked CSR, we use blocks of size (r x c)

+ 3 - 0
src/datawizard/interfaces/block_interface.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 

+ 3 - 0
src/datawizard/interfaces/coo_interface.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 static int
 copy_any_to_any(void *src_interface, unsigned src_node,

+ 3 - 0
src/datawizard/interfaces/csr_interface.c

@@ -16,6 +16,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 

+ 3 - 0
src/datawizard/interfaces/matrix_interface.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 

+ 3 - 0
src/datawizard/interfaces/multiformat_interface.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
 #ifdef STARPU_USE_CUDA

+ 3 - 0
src/datawizard/interfaces/tensor_interface.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 

+ 3 - 0
src/datawizard/interfaces/variable_interface.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 

+ 3 - 0
src/datawizard/interfaces/vector_interface.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 

+ 3 - 0
src/datawizard/interfaces/void_interface.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 static int dummy_copy(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 

+ 1 - 0
src/datawizard/memory_manager.c

@@ -19,6 +19,7 @@
 #include <common/thread.h>
 #include <common/fxt.h>
 #include <datawizard/memory_manager.h>
+#include <datawizard/memory_nodes.h>
 #include <core/workers.h>
 #include <starpu_stdlib.h>
 

+ 1 - 0
src/datawizard/reduction.c

@@ -22,6 +22,7 @@
 #include <datawizard/datawizard.h>
 #include <drivers/mic/driver_mic_source.h>
 #include <drivers/mp_common/source_common.h>
+#include <datawizard/memory_nodes.h>
 
 void starpu_data_set_reduction_methods(starpu_data_handle_t handle,
 				       struct starpu_codelet *redux_cl,

+ 1 - 0
src/datawizard/user_interactions.c

@@ -22,6 +22,7 @@
 #include <datawizard/write_back.h>
 #include <core/dependencies/data_concurrency.h>
 #include <core/sched_policy.h>
+#include <datawizard/memory_nodes.h>
 
 static void _starpu_data_check_initialized(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
 {

+ 2 - 1
src/datawizard/write_back.c

@@ -17,6 +17,7 @@
 #include <datawizard/datawizard.h>
 #include <datawizard/write_back.h>
 #include <core/dependencies/data_concurrency.h>
+#include <datawizard/memory_nodes.h>
 
 static void wt_callback(void *arg)
 {
@@ -63,7 +64,7 @@ void _starpu_write_through_data(starpu_data_handle_t handle, unsigned requesting
 
 				struct _starpu_data_request *r;
 				r = _starpu_create_request_to_fetch_data(handle, &handle->per_node[node],
-									 STARPU_R, 1, 1, wt_callback, handle, 0, "_starpu_write_through_data");
+									 STARPU_R, 2, 1, wt_callback, handle, 0, "_starpu_write_through_data");
 
 			        /* If no request was created, the handle was already up-to-date on the
 			         * node */

+ 2 - 2
src/debug/traces/starpu_fxt.c

@@ -1194,8 +1194,8 @@ static void handle_new_mem_node(struct fxt_ev_64 *ev, struct starpu_fxt_options
  */
 static int create_ordered_stream_id (int nodeid, int devid)
 {
-	static int stable[MAX_MPI_NODES][STARPU_MAXCUDADEVS];
-	STARPU_ASSERT(nodeid < MAX_MPI_NODES);
+	static int stable[STARPU_FXT_MAX_FILES][STARPU_MAXCUDADEVS];
+	STARPU_ASSERT(nodeid < STARPU_FXT_MAX_FILES);
 	STARPU_ASSERT(devid < STARPU_MAXCUDADEVS);
 	return stable[nodeid][devid]++;
 }

+ 0 - 2
src/debug/traces/starpu_fxt.h

@@ -41,8 +41,6 @@
 #include <starpu.h>
 #include "../../../include/starpu_fxt.h"
 
-#define MAX_MPI_NODES 64
-
 extern char _starpu_last_codelet_symbol[STARPU_NMAXWORKERS][(FXT_MAX_PARAMS-5)*sizeof(unsigned long)];
 
 void _starpu_fxt_dag_init(char *dag_filename);

+ 19 - 19
src/debug/traces/starpu_fxt_mpi.c

@@ -103,27 +103,27 @@ int _starpu_fxt_mpi_find_sync_point(char *filename_in, uint64_t *offset, int *ke
  */
 
 /* the list of MPI transfers found in the different traces */
-static struct mpi_transfer *mpi_sends[MAX_MPI_NODES] = {NULL};
-static struct mpi_transfer *mpi_recvs[MAX_MPI_NODES] = {NULL};
+static struct mpi_transfer *mpi_sends[STARPU_FXT_MAX_FILES] = {NULL};
+static struct mpi_transfer *mpi_recvs[STARPU_FXT_MAX_FILES] = {NULL};
 
 /* number of available slots in the lists  */
-unsigned mpi_sends_list_size[MAX_MPI_NODES] = {0};
-unsigned mpi_recvs_list_size[MAX_MPI_NODES] = {0};
+unsigned mpi_sends_list_size[STARPU_FXT_MAX_FILES] = {0};
+unsigned mpi_recvs_list_size[STARPU_FXT_MAX_FILES] = {0};
 
 /* number of slots actually used in the list  */
-unsigned mpi_sends_used[MAX_MPI_NODES] = {0};
-unsigned mpi_recvs_used[MAX_MPI_NODES] = {0};
+unsigned mpi_sends_used[STARPU_FXT_MAX_FILES] = {0};
+unsigned mpi_recvs_used[STARPU_FXT_MAX_FILES] = {0};
 
 /* number of slots already matched at the beginning of the list. This permits
  * going through the lists from the beginning to match each and every
  * transfer, thus avoiding a quadratic complexity. */
-unsigned mpi_recvs_matched[MAX_MPI_NODES][MAX_MPI_NODES] = { {0} };
-unsigned mpi_sends_matched[MAX_MPI_NODES][MAX_MPI_NODES] = { {0} };
+unsigned mpi_recvs_matched[STARPU_FXT_MAX_FILES][STARPU_FXT_MAX_FILES] = { {0} };
+unsigned mpi_sends_matched[STARPU_FXT_MAX_FILES][STARPU_FXT_MAX_FILES] = { {0} };
 
 void _starpu_fxt_mpi_add_send_transfer(int src, int dst STARPU_ATTRIBUTE_UNUSED, long mpi_tag, size_t size, float date, long jobid, unsigned long handle)
 {
 	STARPU_ASSERT(src >= 0);
-	if (src >= MAX_MPI_NODES)
+	if (src >= STARPU_FXT_MAX_FILES)
 		return;
 	unsigned slot = mpi_sends_used[src]++;
 
@@ -153,7 +153,7 @@ void _starpu_fxt_mpi_add_send_transfer(int src, int dst STARPU_ATTRIBUTE_UNUSED,
 
 void _starpu_fxt_mpi_add_recv_transfer(int src STARPU_ATTRIBUTE_UNUSED, int dst, long mpi_tag, float date, long jobid, unsigned long handle)
 {
-	if (dst >= MAX_MPI_NODES)
+	if (dst >= STARPU_FXT_MAX_FILES)
 		return;
 	unsigned slot = mpi_recvs_used[dst]++;
 
@@ -220,11 +220,11 @@ static unsigned long mpi_com_id = 0;
 
 static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comms_file, unsigned n)
 {
-	unsigned slot[MAX_MPI_NODES] = { 0 }, node;
+	unsigned slot[STARPU_FXT_MAX_FILES] = { 0 }, node;
 	unsigned nb_wrong_comm_timing = 0;
 	struct mpi_transfer_list pending_receives; /* Sorted list of matches which have not happened yet */
-	double current_out_bandwidth[MAX_MPI_NODES] = { 0. };
-	double current_in_bandwidth[MAX_MPI_NODES] = { 0. };
+	double current_out_bandwidth[STARPU_FXT_MAX_FILES] = { 0. };
+	double current_in_bandwidth[STARPU_FXT_MAX_FILES] = { 0. };
 #ifdef STARPU_HAVE_POTI
 	char mpi_container[STARPU_POTI_STR_LEN];
 #endif
@@ -246,7 +246,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 		else
 			start_date = mpi_transfer_list_front(&pending_receives)->date;
 
-		src = MAX_MPI_NODES;
+		src = STARPU_FXT_MAX_FILES;
 		for (node = 0; node < n; node++)
 		{
 			if (slot[node] < mpi_sends_used[node] && mpi_sends[node][slot[node]].date < start_date)
@@ -260,7 +260,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 			/* No event any more, we're finished! */
 			break;
 
-		if (src == MAX_MPI_NODES)
+		if (src == STARPU_FXT_MAX_FILES)
 		{
 			/* Pending match is earlier than all new sends, finish its communication */
 			match = mpi_transfer_list_pop_front(&pending_receives);
@@ -284,7 +284,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 		size_t size = cur->size;
 		unsigned long send_handle = cur->handle;
 
-		if (dst < MAX_MPI_NODES)
+		if (dst < STARPU_FXT_MAX_FILES)
 			match = try_to_match_send_transfer(src, dst, mpi_tag);
 		else
 			match = NULL;
@@ -377,10 +377,10 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 
 void _starpu_fxt_display_mpi_transfers(struct starpu_fxt_options *options, int *ranks STARPU_ATTRIBUTE_UNUSED, FILE *out_paje_file, FILE* out_comms_file)
 {
-	if (options->ninputfiles > MAX_MPI_NODES)
+	if (options->ninputfiles > STARPU_FXT_MAX_FILES)
 	{
-		_STARPU_DISP("Warning: %u files given, maximum %u supported, truncating to %u\n", options->ninputfiles, MAX_MPI_NODES, MAX_MPI_NODES);
-		options->ninputfiles = MAX_MPI_NODES;
+		_STARPU_DISP("Warning: %u files given, maximum %u supported, truncating to %u\n", options->ninputfiles, STARPU_FXT_MAX_FILES, STARPU_FXT_MAX_FILES);
+		options->ninputfiles = STARPU_FXT_MAX_FILES;
 	}
 
 	/* display the MPI transfers if possible */

+ 4 - 1
src/drivers/cpu/driver_cpu.c

@@ -108,7 +108,10 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 				_SIMGRID_TIMER_END;
 			}
 			else
-				_starpu_simgrid_submit_job(cpu_args->workerid, j, perf_arch, NAN, NULL);
+			{
+				struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(cpu_args, j);
+				_starpu_simgrid_submit_job(cpu_args->workerid, sched_ctx->id, j, perf_arch, NAN, NAN, NULL);
+			}
 #else
 #  ifdef STARPU_PAPI
 			_starpu_profiling_papi_task_start_counters(task);

+ 4 - 1
src/drivers/cuda/driver_cuda.c

@@ -531,8 +531,11 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worke
 				_SIMGRID_TIMER_END;
 			}
 		else
-			_starpu_simgrid_submit_job(workerid, j, &worker->perf_arch, NAN,
+		{
+			struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
+			_starpu_simgrid_submit_job(workerid, sched_ctx->id, j, &worker->perf_arch, NAN, NAN,
 				async ? &task_finished[workerid][pipeline_idx] : NULL);
+		}
 #else
 #ifdef HAVE_LIBNVIDIA_ML
 		unsigned long long energy_start = 0;

+ 1 - 0
src/drivers/disk/driver_disk.c

@@ -21,6 +21,7 @@
 #include <drivers/disk/driver_disk.h>
 #include <drivers/cpu/driver_cpu.h>
 #include <datawizard/coherency.h>
+#include <datawizard/memory_nodes.h>
 
 int _starpu_disk_copy_src_to_disk(void * src, unsigned src_node, void * dst, size_t dst_offset, unsigned dst_node, size_t size, void * async_channel)
 {

+ 1 - 0
src/drivers/driver_common/driver_common.c

@@ -27,6 +27,7 @@
 #include <core/sched_policy.h>
 #include <core/debug.h>
 #include <core/task.h>
+#include <datawizard/memory_nodes.h>
 
 
 void _starpu_driver_start_job(struct _starpu_worker *worker, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, int rank, int profiling)

+ 6 - 1
src/drivers/opencl/driver_opencl.c

@@ -948,6 +948,7 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 		_STARPU_TRACE_START_EXECUTING();
 #ifdef STARPU_SIMGRID
 		double length = NAN;
+		double energy = NAN;
 		int async = task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC;
 		int simulate = 1;
 		if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE && !async)
@@ -976,6 +977,7 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 #else
 			length = ((double) profiling_info->used_cycles)/MSG_get_host_speed(MSG_host_self());
 #endif
+			energy = info->energy_consumed;
 			/* And give the simulated time to simgrid */
 			simulate = 1;
 #endif
@@ -989,8 +991,11 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 			}
 
 		if (simulate)
-			_starpu_simgrid_submit_job(worker->workerid, j, &worker->perf_arch, length,
+		{ /* _starpu_simgrid_submit_job() takes (workerid, sched_ctx_id, ...), as in the CPU and CUDA drivers */
+			struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
+			_starpu_simgrid_submit_job(worker->workerid, sched_ctx->id, j, &worker->perf_arch, length, energy,
 						   async ? &task_finished[worker->devid][pipeline_idx] : NULL);
+		}
 #else
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
 

+ 1 - 0
src/profiling/bound.c

@@ -26,6 +26,7 @@
 #include <profiling/bound.h>
 #include <core/jobs.h>
 #include <core/workers.h>
+#include <datawizard/memory_nodes.h>
 
 #ifdef STARPU_HAVE_GLPK_H
 #include <glpk.h>

+ 10 - 2
src/profiling/profiling.c

@@ -29,6 +29,8 @@
 #include <papi.h>
 #endif
 
+/* TODO: move to worker structure */
+
 static struct starpu_profiling_worker_info worker_info[STARPU_NMAXWORKERS];
 /* TODO: rather use rwlock */
 static starpu_pthread_mutex_t worker_info_mutex[STARPU_NMAXWORKERS];
@@ -44,6 +46,7 @@ static struct timespec executing_start_date[STARPU_NMAXWORKERS];
 #ifdef STARPU_PAPI
 static int papi_events[PAPI_MAX_HWCTRS];
 static int papi_nevents = 0;
+static int warned_component_unavailable = 0;
 #endif
 
 /* Store the busid of the different (src, dst) pairs. busid_matrix[src][dst]
@@ -158,7 +161,7 @@ void _starpu_profiling_init(void)
 		conf_papi_events = starpu_getenv("STARPU_PROF_PAPI_EVENTS");
 		if (conf_papi_events != NULL)
 		{
-			while ((papi_event_name = strtok_r(conf_papi_events, " ", &conf_papi_events)))
+			while ((papi_event_name = strtok_r(conf_papi_events, " ,", &conf_papi_events)))
 			{
 				_STARPU_DEBUG("Loading PAPI Event:%s\n", papi_event_name);
 				retval = PAPI_event_name_to_code ((char*)papi_event_name, &papi_events[papi_nevents]);
@@ -186,7 +189,12 @@ void _starpu_profiling_papi_task_start_counters(struct starpu_task *task)
 		PAPI_create_eventset(&profiling_info->papi_event_set);
 		for(int i=0; i<papi_nevents; i++)
 		{
-			PAPI_add_event(profiling_info->papi_event_set, papi_events[i]);
+			int ret = PAPI_add_event(profiling_info->papi_event_set, papi_events[i]);
+			if (ret == PAPI_ECMP_DISABLED && !warned_component_unavailable)
+			{
+				_STARPU_MSG("Error while registering Papi event: Component containing event is disabled. Try running `papi_component_avail` to get more information.\n");
+				warned_component_unavailable = 1;
+			}
 			profiling_info->papi_values[i]=0;
 		}
 		PAPI_reset(profiling_info->papi_event_set);

+ 2 - 1
src/profiling/profiling_helpers.c

@@ -99,8 +99,9 @@ void _starpu_profiling_worker_helper_display_summary(FILE *stream)
 	for (workerid = 0; workerid < worker_cnt; workerid++)
 	{
 		struct starpu_profiling_worker_info info;
-		starpu_profiling_worker_get_info(workerid, &info);
+		int ret = starpu_profiling_worker_get_info(workerid, &info);
 		char name[64];
+		STARPU_ASSERT(!ret);
 
 		starpu_worker_get_name(workerid, name, sizeof(name));
 

+ 3 - 2
src/sched_policies/component_best_implementation.c

@@ -19,7 +19,9 @@
 
 #include <starpu_sched_component.h>
 #include <starpu_scheduler.h>
+#ifdef BUILDING_STARPU
 #include <core/workers.h>
+#endif
 
 /* return true if workerid can execute task, and fill task->predicted and task->predicted_transfer
  *  according to best implementation predictions
@@ -39,12 +41,11 @@ static int find_best_impl(unsigned sched_ctx_id, struct starpu_task * task, int
 	}
 	else
 	{
-		struct starpu_perfmodel_arch* archtype = starpu_worker_get_perf_archtype(workerid, sched_ctx_id);
 		for(impl = 0; impl < STARPU_MAXIMPLEMENTATIONS; impl++)
 		{
 			if(starpu_worker_can_execute_task(workerid, task, impl))
 			{
-				double d = starpu_task_expected_length(task, archtype, impl);
+				double d = starpu_task_worker_expected_length(task, workerid, sched_ctx_id, impl);
 				if(isnan(d))
 				{
 					best_impl = impl;

+ 3 - 0
src/sched_policies/component_eager.c

@@ -16,6 +16,9 @@
 
 #include <starpu_sched_component.h>
 #include <starpu_scheduler.h>
+#ifdef BUILDING_STARPU
+#include <core/workers.h>
+#endif
 
 struct _starpu_eager_data
 {

+ 1 - 0
src/sched_policies/component_heft.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2013-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  * Copyright (C) 2013       Simon Archipoff
+ * Copyright (C) 2020       Télécom-Sud Paris
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 0
src/sched_policies/component_heteroprio.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2013-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  * Copyright (C) 2013       Simon Archipoff
+ * Copyright (C) 2020       Télécom-Sud Paris
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 0
src/sched_policies/component_mct.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2013-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  * Copyright (C) 2013       Simon Archipoff
+ * Copyright (C) 2020       Télécom-Sud Paris
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 0 - 0
src/sched_policies/component_sched.c


Some files were not shown because too many files changed in this diff