소스 검색

Merge branch 'master' into fpga

# Conflicts:
#	src/Makefile.am
#	tools/Makefile.am
Nathalie Furmento 4 년 전
부모
커밋
cf770d8e88
100개의 변경된 파일7553개의 추가작업 그리고 367개의 파일을 삭제
  1. 1 0
      AUTHORS
  2. 36 7
      ChangeLog
  3. 12 0
      README.dev
  4. 29 17
      configure.ac
  5. 5 4
      doc/doxygen/Makefile.am
  6. 55 2
      doc/doxygen/chapters/380_offline_performance_tools.doxy
  7. 256 0
      doc/doxygen/chapters/400_python.doxy
  8. 27 3
      doc/doxygen/chapters/501_environment_variables.doxy
  9. 1648 0
      doc/doxygen/chapters/images/starpu_log.eps
  10. BIN
      doc/doxygen/chapters/images/starpu_log.png
  11. 1754 0
      doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy.eps
  12. BIN
      doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy.pdf
  13. BIN
      doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy.png
  14. 1388 0
      doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy_flops.eps
  15. BIN
      doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy_flops.pdf
  16. BIN
      doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy_flops.png
  17. 1416 0
      doc/doxygen/chapters/images/starpu_power_non_linear_memset_regression_based.eps
  18. BIN
      doc/doxygen/chapters/images/starpu_power_non_linear_memset_regression_based.pdf
  19. BIN
      doc/doxygen/chapters/images/starpu_power_non_linear_memset_regression_based.png
  20. 3 3
      doc/doxygen/doxygen.cfg
  21. 5 0
      doc/doxygen/refman.tex
  22. 4 5
      doc/doxygen_dev/Makefile.am
  23. 7 7
      doc/doxygen_dev/chapters/010_core.doxy
  24. 3 0
      doc/doxygen_dev/doxygen-config.cfg.in
  25. 3 3
      doc/doxygen_dev/doxygen.cfg
  26. 6 5
      examples/Makefile.am
  27. 7 3
      examples/axpy/axpy_opencl.c
  28. 4 0
      examples/axpy/axpy_opencl_kernel.cl
  29. 6 1
      examples/cpp/add_vectors_interface.cpp
  30. 42 0
      examples/interface/complex.c
  31. 3 1
      examples/interface/complex_codelet.h
  32. 8 4
      examples/lu/lu.sh
  33. 19 7
      examples/lu/lu_example.c
  34. 7 1
      examples/mlr/mlr.c
  35. 31 27
      examples/mult/sgemm.sh
  36. 16 0
      examples/native_fortran/nf_vector.f90
  37. 29 5
      examples/scheduler/schedulers.sh
  38. 4 3
      examples/stencil/Makefile.am
  39. 57 0
      include/fstarpu_mod.f90
  40. 2 0
      include/starpu_fxt.h
  41. 12 0
      include/starpu_perfmodel.h
  42. 15 1
      include/starpu_task.h
  43. 3 0
      include/starpu_worker.h
  44. 3 3
      julia/examples/Makefile.am
  45. 2 2
      julia/src/Makefile.am
  46. 2 2
      julia/src/dynamic_compiler/Makefile.am
  47. 45 0
      mpi/GNUmakefile.in
  48. 0 18
      mpi/Makefile.am
  49. 3 3
      mpi/examples/Makefile.am
  50. 1 1
      mpi/examples/matrix_mult/mm.c
  51. 4 1
      mpi/examples/mpi_lu/plu_example.c
  52. 4 1
      mpi/examples/mpi_lu/plu_implicit_example.c
  53. 4 1
      mpi/examples/mpi_lu/plu_outofcore_example.c
  54. 9 9
      mpi/examples/mpi_lu/pxlu.c
  55. 4 5
      mpi/src/Makefile.am
  56. 22 20
      mpi/src/load_balancer/policy/data_movements_interface.c
  57. 4 4
      mpi/src/load_balancer/policy/data_movements_interface.h
  58. 6 6
      mpi/src/load_balancer/policy/load_heat_propagation.c
  59. 1 0
      mpi/src/mpi/starpu_mpi_mpi.c
  60. 9 0
      mpi/src/nmad/starpu_mpi_nmad.c
  61. 14 12
      mpi/src/starpu_mpi.c
  62. 11 0
      mpi/src/starpu_mpi_private.c
  63. 5 5
      mpi/src/starpu_mpi_task_insert.c
  64. 5 3
      mpi/tests/Makefile.am
  65. 84 0
      mpi/tests/insert_task_tags.c
  66. 1 1
      mpi/tests/ring.c
  67. 1 1
      mpi/tests/ring_async.c
  68. 1 1
      mpi/tests/ring_async_implicit.c
  69. 1 1
      mpi/tests/ring_sync.c
  70. 1 1
      mpi/tests/ring_sync_detached.c
  71. 3 3
      mpi/tests/user_defined_datatype.c
  72. 4 3
      mpi/tools/Makefile.am
  73. 2 2
      sc_hypervisor/examples/Makefile.am
  74. 2 4
      sc_hypervisor/src/Makefile.am
  75. 4 4
      socl/examples/Makefile.am
  76. 1 1
      socl/examples/basic/basic.c
  77. 6 6
      socl/examples/clinfo/clinfo.c
  78. 2 2
      socl/src/Makefile.am
  79. 5 4
      src/Makefile.am
  80. 1 1
      src/common/fxt.c
  81. 17 11
      src/common/fxt.h
  82. 11 5
      src/core/jobs.c
  83. 2 1
      src/core/perfmodel/perfmodel_bus.c
  84. 16 3
      src/core/perfmodel/perfmodel_history.c
  85. 8 3
      src/core/task.c
  86. 28 4
      src/core/topology.c
  87. 3 0
      src/core/topology.h
  88. 12 3
      src/core/workers.c
  89. 17 28
      src/core/workers.h
  90. 3 0
      src/datawizard/coherency.h
  91. 10 0
      src/datawizard/datawizard.h
  92. 1 1
      src/datawizard/filters.c
  93. 29 2
      src/datawizard/interfaces/data_interface.c
  94. 114 60
      src/debug/traces/starpu_fxt.c
  95. 4 2
      src/drivers/cpu/driver_cpu.c
  96. 2 0
      src/drivers/driver_common/driver_common.h
  97. 24 3
      src/profiling/profiling.c
  98. 2 1
      src/sched_policies/component_sched.c
  99. 50 0
      src/util/fstarpu.c
  100. 0 0
      starpufft/src/Makefile.am

+ 1 - 0
AUTHORS

@@ -14,6 +14,7 @@ Eyraud-Dubois Lionel, Inria, <lionel.eyraud-dubois@inria.fr>
 Furmento Nathalie, CNRS, <nathalie.furmento@labri.fr>
 Guermouche Amina, Télécom SudParis, <amina.guermouche@inria.fr>
 Guilbaud Adrien, Inria, <adrien.guilbaud@inria.fr>
+He Kun, Inria, <kun.he@inria.fr>
 Henry Sylvain, Université de Bordeaux, <sylvain.henry@inria.fr>
 Hugo Andra, Université de Bordeaux/Inria, <andra.hugo@inria.fr>
 Juhoor Mehdi, Université de Bordeaux, <mjuhoor@gmail.com>

+ 36 - 7
ChangeLog

@@ -43,11 +43,46 @@ New features:
     can make prefetch more aggressive.
   * Add starpu_data_dup_ro().
   * Add starpu_data_release_to() and starpu_data_release_to_on_node().
+  * Add profiling based on papi performance counters.
+  * Add an experimental python interface (not actually parallel yet)
 
 Small changes:
   * Add a synthetic energy efficiency testcase.
 
-StarPU 1.3.5 (git revision xxx)
+StarPU 1.3.8
+====================================================================
+
+Small features:
+  * A codelet can now define a callback function pointer, which is
+    automatically called when the task does not itself define a
+    callback function. When the task does define one, the codelet
+    callback can still be called from the task callback function.
+  * New STARPU_WORKERS_COREID, STARPU_MAIN_THREAD_COREID and
+    STARPU_MPI_THREAD_COREID environment variables to bind threads to cores
+    instead of hyperthreads.
+
+StarPU 1.3.7
+====================================================================
+
+Small changes:
+  * Simgrid: bug fix for setting network/weight-S to 0.0
+
+StarPU 1.3.6 (git revision fb9fbed81410d9f0ebbff5bdad1352df4705efe8)
+====================================================================
+
+Small features:
+  * New STARPU_BACKOFF_MIN and STARPU_BACKOFF_MAX environment variables to set
+    the exponential backoff limits of the number of cycles to pause while
+    drivers are spinning.
+  * Add STARPU_DISPLAY_BINDINGS environment variable and
+    starpu_display_bindings() function to display all bindings on the machine by
+    calling hwloc-ps
+  * New function starpu_get_pu_os_index() to convert logical index of a PU to
+    its OS index.
+  * New function starpu_get_hwloc_topology() to get the hwloc topology used by
+    StarPU.
+
+StarPU 1.3.5 (git revision 5f7458799f548026fab357b18541bb462dde2b53)
 ====================================================================
 
 Small features:
@@ -86,12 +121,6 @@ Small features:
     starpu_mpi_interface_datatype_unregister() which take a enum
     starpu_data_interface_id instead of a starpu_data_handle_t
   * New script starpu_env to set up StarPU environment variables
-  * New STARPU_BACKOFF_MIN and STARPU_BACKOFF_MAX environment variables to the
-    exponential backoff limits of the number of cycles to pause while drivers
-    are spinning.
-  * Add STARPU_DISPLAY_BINDINGS environment variable and
-    starpu_display_bindings() function to display all bindings on the machine by
-    calling hwloc-ps
 
 Small changes:
   * New configure option --disable-build-doc-pdf

+ 12 - 0
README.dev

@@ -101,3 +101,15 @@ Error handling
   --enable-fast.
 
 	STARPU_ASSERT(j->terminated != 0)
+
+
+
+Makefile.am
+-----------
+
+Dependency libraries are appended to LIBS.
+Only real LDFLAGS such as -no-undefined go to LDFLAGS.
+
+If a program foo needs more libraries, it can put them in foo_LDADD.
+
+(No, AM_LDADD does not exist)

+ 29 - 17
configure.ac

@@ -250,14 +250,14 @@ if test x$enable_simgrid = xyes ; then
 	PKG_CHECK_MODULES([SIMGRID], [simgrid], [], [:])
 
 	if test "$simgrid_include_dir" != "no" ; then
-	   	SIMGRID_CFLAGS="$SIMGRID_CFLAGS -I$simgrid_include_dir"
+		SIMGRID_CFLAGS="-I$simgrid_include_dir $SIMGRID_CFLAGS"
 	fi
 	if test "$simgrid_lib_dir" != "no" ; then
-	   	SIMGRID_LIBS="$SIMGRID_LIBS -L$simgrid_lib_dir"
+		SIMGRID_LIBS="-L$simgrid_lib_dir $SIMGRID_LIBS"
 	fi
 	if test "$simgrid_dir" != "no" ; then
-	   	SIMGRID_CFLAGS="$SIMGRID_CFLAGS -I$simgrid_dir/include"
-	   	SIMGRID_LIBS="$SIMGRID_LIBS -L$simgrid_dir/lib"
+		SIMGRID_CFLAGS="-I$simgrid_dir/include $SIMGRID_CFLAGS"
+		SIMGRID_LIBS="-L$simgrid_dir/lib $SIMGRID_LIBS"
 	fi
    	if test -n "$SIMGRID_CFLAGS" ; then
 	   	CFLAGS="$SIMGRID_CFLAGS $CFLAGS"
@@ -346,6 +346,8 @@ if test x$enable_simgrid = xyes ; then
 		AC_DEFINE(STARPU_SIMGRID_MC, [1], [Define this to enable Model Checker in simgrid execution])
 		AC_PATH_PROG([SIMGRID_MC], [simgrid-mc], [no], [$simgrid_dir/bin:$PATH])
 		LDFLAGS="$LDFLAGS -Wl,-znorelro -Wl,-znoseparate-code"
+		# libsimgrid needs to be linked from binaries themselves for MC to work
+		STARPU_EXPORTED_LIBS="$STARPU_EXPORTED_LIBS -lsimgrid"
 	fi
 fi
 AM_CONDITIONAL(STARPU_SIMGRID_MC, test x$enable_simgrid_mc = xyes)
@@ -546,6 +548,10 @@ if test x$enable_mpi = xyes -a x$enable_nmad = xyes ; then
     build_nmad_lib=yes
     build_mpi_lib=no
     PKG_CHECK_MODULES([NMAD],[nmad])
+    AC_CHECK_DECL([piom_ltask_set_bound_thread_os_indexes], have_piom_ltask_set_bound_thread_os_indexes=yes, have_piom_ltask_set_bound_thread_os_indexes=no, [[#include <pioman.h>]])
+    if test x$have_piom_ltask_set_bound_thread_os_indexes = xyes; then
+      AC_DEFINE(HAVE_PIOM_LTASK_SET_BOUND_THREAD_OS_INDEXES, [1], [piom_ltask_set_bound_thread_os_indexes is availabe])
+    fi
 else
     build_nmad_lib=no
 fi
@@ -2191,6 +2197,7 @@ if test x$use_fxt = xyes; then
 	save_LDFLAGS="$LDFLAGS"
 	LDFLAGS="$LDFLAGS $FXT_LDFLAGS"
 	AC_CHECK_FUNCS([fxt_close])
+	AC_CHECK_FUNCS([fxt_blockev_leave])
 	AC_CHECK_FUNCS([enable_fut_flush])
 	AC_CHECK_FUNCS([fut_set_filename])
 	AC_CHECK_FUNCS([fut_setup_flush_callback])
@@ -3511,14 +3518,27 @@ AM_CONDITIONAL(STARPU_BUILD_STARPURM_EXAMPLES, [test x$enable_starpurm_examples
 # Documentation                          #
 ##########################################
 
+def_enable_build_doc="yes"
+available_doc="no"
+if test -d "$srcdir/doc/doxygen/html" ; then
+   def_enable_build_doc="no"
+   available_doc="yes"
+fi
+
 AC_ARG_ENABLE(build-doc, [AS_HELP_STRING([--disable-build-doc],
 			[disable building of documentation])],
-			enable_build_doc=$enableval, enable_build_doc=yes)
+			enable_build_doc=$enableval, enable_build_doc=$def_enable_build_doc)
 
 AC_ARG_ENABLE(build-doc-pdf, [AS_HELP_STRING([--enable-build-doc-pdf],
 			[enable building of PDF documentation])],
 			enable_build_doc_pdf=$enableval, enable_build_doc_pdf=no)
 
+available_doc_pdf="no"
+if test -f "$srcdir/doc/doxygen/starpu.pdf" ; then
+   enable_build_doc_pdf="no"
+   available_doc_pdf="yes"
+fi
+
 # Check whether doxygen needed tools are installed
 AC_PATH_PROG(doxygencommand, doxygen)
 if test "$doxygencommand" = "" ; then
@@ -3534,17 +3554,6 @@ if test "$epstopdfcommand" = "" ; then
    enable_build_doc_pdf="no"
 fi
 
-available_doc="no"
-if test -d "$srcdir/doc/doxygen/html" ; then
-   enable_build_doc="no"
-   available_doc="yes"
-fi
-available_doc_pdf="no"
-if test -f "$srcdir/doc/doxygen/starpu.pdf" ; then
-   enable_build_doc="no"
-   enable_build_doc_pdf="no"
-   available_doc_pdf="yes"
-fi
 AC_MSG_CHECKING(whether HTML documentation should be compiled)
 AC_MSG_RESULT($enable_build_doc)
 AC_MSG_CHECKING(whether HTML documentation is available)
@@ -3592,7 +3601,6 @@ LIBSTARPU_LDFLAGS="$STARPU_OPENCL_LDFLAGS $STARPU_CUDA_LDFLAGS $HWLOC_LIBS $FXT_
 AC_SUBST([LIBSTARPU_LDFLAGS])
 
 LIBSTARPU_LINK=libstarpu-$STARPU_EFFECTIVE_VERSION.la
-AC_SUBST([LIBSTARPU_LINK])
 
 if test "x$enable_shared" = xno; then
         # No .so, so application will unexpectedly have to know which -l to
@@ -3600,6 +3608,8 @@ if test "x$enable_shared" = xno; then
 	AC_DEFINE(STARPU_STATIC_ONLY, [1], [Only static compilation was made])
 	STARPU_EXPORTED_LIBS="$STARPU_EXPORTED_LIBS $LDFLAGS $LIBS $LIBSTARPU_LDFLAGS"
 fi
+LIBSTARPU_LINK="$LIBSTARPU_LINK $STARPU_EXPORTED_LIBS"
+AC_SUBST([LIBSTARPU_LINK])
 AC_SUBST(STARPU_EXPORTED_LIBS)
 
 # File configuration
@@ -3738,6 +3748,7 @@ AC_OUTPUT([
 	examples/stencil/Makefile
 	tests/Makefile
 	tests/loader-cross.sh
+	tests/model-checking/Makefile
 	tests/model-checking/starpu-mc.sh
 	examples/loader-cross.sh
 	examples/stencil/loader-cross.sh
@@ -3746,6 +3757,7 @@ AC_OUTPUT([
 	mpi/tests/Makefile
 	mpi/examples/Makefile
 	mpi/tools/Makefile
+	mpi/GNUmakefile
 	sc_hypervisor/Makefile
 	sc_hypervisor/src/Makefile
 	sc_hypervisor/examples/Makefile

+ 5 - 4
doc/doxygen/Makefile.am

@@ -76,6 +76,7 @@ chapters =	\
 	chapters/370_online_performance_tools.doxy		\
 	chapters/380_offline_performance_tools.doxy		\
 	chapters/390_faq.doxy		\
+	chapters/400_python.doxy		\
 	chapters/401_out_of_core.doxy		\
 	chapters/410_mpi_support.doxy		\
 	chapters/415_fault_tolerance.doxy	\
@@ -146,6 +147,10 @@ images = 	\
 	chapters/images/temanejo.png
 
 if STARPU_BUILD_DOC
+EXTRA_DIST += \
+	      $(top_srcdir)/doc/doxygen/chapters/version.sty \
+	      $(top_srcdir)/doc/doxygen/chapters/version.html
+
 starpu_config.h: $(top_srcdir)/include/starpu_config.h.in
 	@$(SED) 's/#undef \(.*\)/#define \1 1/' $< > $@
 
@@ -303,8 +308,4 @@ endif
 EXTRA_DIST += doxygen.cfg refman.tex \
 	      $(chapters) $(images)
 
-# Rule to update documentation on web server. Should only be called from benchmarks dalton directory
-PUBLISHDIR	?= /home/benchmarks/softs/starpu/starpu-scripts/mirror/files/doc
-update-web: $(DOX_PDF)
-	cp -pr starpu.pdf html $(PUBLISHDIR)
 

+ 55 - 2
doc/doxygen/chapters/380_offline_performance_tools.doxy

@@ -400,7 +400,7 @@ starpu_perfmodel_load_symbol(). The source code of the tool
 
 An XML output can also be printed by using the <c>-x</c> option:
 \verbatim
-tools/starpu_perfmodel_display -x -s non_linear_memset_regression_based 
+$ tools/starpu_perfmodel_display -x -s non_linear_memset_regression_based 
 <?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE StarPUPerfmodel SYSTEM "starpu-perfmodel.dtd">
 <!-- symbol non_linear_memset_regression_based -->
@@ -425,6 +425,12 @@ The tool <c>starpu_perfmodel_plot</c> can be used to draw performance
 models. It writes a <c>.gp</c> file in the current directory, to be
 run with the tool <c>gnuplot</c>, which shows the corresponding curve.
 
+\verbatim
+$ tools/starpu_perfmodel_plot -s non_linear_memset_regression_based 
+$ gnuplot starpu_non_linear_memset_regression_based.gp
+$ gv starpu_non_linear_memset_regression_based.eps
+\endverbatim
+
 \image html starpu_non_linear_memset_regression_based.png
 \image latex starpu_non_linear_memset_regression_based.eps "" width=\textwidth
 
@@ -482,6 +488,53 @@ histogram of the codelet execution time distribution.
 \image html distrib_data_histo.png
 \image latex distrib_data_histo.eps "" width=\textwidth
 
+\section EnergyOfCodelets Energy Of Codelets
+
+A performance model of the energy of codelets can also be recorded thanks to
+the starpu_codelet::energy_model field of the starpu_codelet structure. StarPU usually cannot
+record this automatically since the energy measurement probes are usually not
+fine-grain enough.  It is however possible to measure it by writing a program
+that submits batches of tasks, lets StarPU measure the energy requirement of
+the batch, and computes an average, see \ref MeasuringEnergyandPower .
+
+The energy performance model can then be displayed in Joules with
+<c>starpu_perfmodel_display</c> just like the time performance model.  The
+<c>starpu_perfmodel_plot</c> needs an extra <c>-e</c> option to display the
+proper unit in the graph:
+
+\verbatim
+$ tools/starpu_perfmodel_plot -e -s non_linear_memset_regression_based_energy
+$ gnuplot starpu_non_linear_memset_regression_based_energy.gp
+$ gv starpu_non_linear_memset_regression_based_energy.eps
+\endverbatim
+
+\image html starpu_non_linear_memset_regression_based_energy.png
+\image latex starpu_non_linear_memset_regression_based_energy.eps "" width=\textwidth
+
+The <c>-f</c> option can also be used to display the performance in terms of GFlop/s/W, i.e. the efficiency:
+
+\verbatim
+$ tools/starpu_perfmodel_plot -f -e -s non_linear_memset_regression_based_energy
+$ gnuplot starpu_non_linear_memset_regression_based_energy.gp
+$ gv starpu_non_linear_memset_regression_based_energy.eps
+\endverbatim
+
+\image html starpu_non_linear_memset_regression_based_energy_flops.png
+\image latex starpu_non_linear_memset_regression_based_energy_flops.eps "" width=\textwidth
+
+We clearly see here that it is much more energy-efficient to stay in the L3 cache.
+
+One can combine the two time and energy performance models to draw Watts:
+
+\verbatim
+$ tools/starpu_perfmodel_plot -se non_linear_memset_regression_based non_linear_memset_regression_based_energy
+$ gnuplot starpu_power_non_linear_memset_regression_based.gp
+$ gv starpu_power_non_linear_memset_regression_based.eps
+\endverbatim
+
+\image html starpu_power_non_linear_memset_regression_based.png
+\image latex starpu_power_non_linear_memset_regression_based.eps "" width=\textwidth
+
 \section DataTrace Data trace and tasks length
 
 It is possible to get statistics about tasks length and data size by using :
@@ -551,7 +604,7 @@ S: Start time
 Here's an example on how to use it:
 
 \verbatim
-$ python starpu_trace_state_stats.py trace.rec | column -t -s ","
+$ starpu_trace_state_stats.py trace.rec | column -t -s ","
 "Name"		"Count" "Type"	"Duration"
 "Callback"       220	Runtime	0.075978
 "chol_model_11"  10	Task	565.176

파일 크기가 너무 크기때문에 변경 상태를 표시하지 않습니다.
+ 256 - 0
doc/doxygen/chapters/400_python.doxy


+ 27 - 3
doc/doxygen/chapters/501_environment_variables.doxy

@@ -69,7 +69,8 @@ create as many CUDA workers as there are CUDA devices.
 \anchor STARPU_NWORKER_PER_CUDA
 \addindex __env__STARPU_NWORKER_PER_CUDA
 Specify the number of workers per CUDA device, and thus the number of kernels
-which will be concurrently running on the devices. The default value is 1.
+which will be concurrently running on the devices, i.e. the number of CUDA
+streams. The default value is 1.
 </dd>
 
 <dt>STARPU_CUDA_THREAD_PER_WORKER</dt>
@@ -227,7 +228,14 @@ and the third (resp. second and fourth) workers will be put on CPU #0
 This variable is ignored if the field
 starpu_conf::use_explicit_workers_bindid passed to starpu_init() is
 set.
+</dd>
 
+<dt>STARPU_WORKERS_COREID</dt>
+<dd>
+\anchor STARPU_WORKERS_COREID
+\addindex __env__STARPU_WORKERS_COREID
+Same as \ref STARPU_WORKERS_CPUID, but bind the workers to cores instead of PUs
+(hyperthreads).
 </dd>
 
 <dt>STARPU_MAIN_THREAD_BIND</dt>
@@ -246,6 +254,14 @@ When defined, this make StarPU bind the thread that calls starpu_initialize() to
 the given CPU ID.
 </dd>
 
+<dt>STARPU_MAIN_THREAD_COREID</dt>
+<dd>
+\anchor STARPU_MAIN_THREAD_COREID
+\addindex __env__STARPU_MAIN_THREAD_COREID
+Same as \ref STARPU_MAIN_THREAD_CPUID, but bind the thread that calls
+starpu_initialize() to the given core, instead of the PU (hyperthread).
+</dd>
+
 <dt>STARPU_MPI_THREAD_CPUID</dt>
 <dd>
 \anchor STARPU_MPI_THREAD_CPUID
@@ -255,6 +271,14 @@ it to -1 (the default value) will use a reserved CPU, subtracted from the CPU
 workers.
 </dd>
 
+<dt>STARPU_MPI_THREAD_COREID</dt>
+<dd>
+\anchor STARPU_MPI_THREAD_COREID
+\addindex __env__STARPU_MPI_THREAD_COREID
+Same as \ref STARPU_MPI_THREAD_CPUID, but bind the MPI thread to the given core
+ID, instead of the PU (hyperthread).
+</dd>
+
 <dt>STARPU_MPI_NOBIND</dt>
 <dd>
 \anchor STARPU_MPI_NOBIND
@@ -1212,7 +1236,7 @@ StarPU for internal data structures during execution.
 \addindex __env__STARPU_BUS_STATS
 When defined, statistics about data transfers will be displayed when calling
 starpu_shutdown() (\ref Profiling). By default, statistics are printed
-on the standard error stream, use the environement variable \ref
+on the standard error stream, use the environment variable \ref
 STARPU_BUS_STATS_FILE to define another filename.
 </dd>
 
@@ -1232,7 +1256,7 @@ When defined, statistics about the workers will be displayed when calling
 starpu_shutdown() (\ref Profiling). When combined with the
 environment variable \ref STARPU_PROFILING, it displays the energy
 consumption (\ref Energy-basedScheduling).  By default, statistics are
-printed on the standard error stream, use the environement variable
+printed on the standard error stream, use the environment variable
 \ref STARPU_WORKER_STATS_FILE to define another filename.
 </dd>
 

파일 크기가 너무 크기때문에 변경 상태를 표시하지 않습니다.
+ 1648 - 0
doc/doxygen/chapters/images/starpu_log.eps


BIN
doc/doxygen/chapters/images/starpu_log.png


파일 크기가 너무 크기때문에 변경 상태를 표시하지 않습니다.
+ 1754 - 0
doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy.eps


BIN
doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy.pdf


BIN
doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy.png


파일 크기가 너무 크기때문에 변경 상태를 표시하지 않습니다.
+ 1388 - 0
doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy_flops.eps


BIN
doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy_flops.pdf


BIN
doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy_flops.png


파일 크기가 너무 크기때문에 변경 상태를 표시하지 않습니다.
+ 1416 - 0
doc/doxygen/chapters/images/starpu_power_non_linear_memset_regression_based.eps


BIN
doc/doxygen/chapters/images/starpu_power_non_linear_memset_regression_based.pdf


BIN
doc/doxygen/chapters/images/starpu_power_non_linear_memset_regression_based.png


+ 3 - 3
doc/doxygen/doxygen.cfg

@@ -365,7 +365,7 @@ TYPEDEF_HIDES_STRUCT   = NO
 # 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
 # corresponding to a cache size of 2^16 = 65536 symbols.
 
-SYMBOL_CACHE_SIZE      = 0
+#SYMBOL_CACHE_SIZE      = 0
 
 # Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be
 # set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given
@@ -1502,13 +1502,13 @@ XML_OUTPUT             = xml
 # which can be used by a validating XML parser to check the
 # syntax of the XML files.
 
-XML_SCHEMA             =
+#XML_SCHEMA             =
 
 # The XML_DTD tag can be used to specify an XML DTD,
 # which can be used by a validating XML parser to check the
 # syntax of the XML files.
 
-XML_DTD                =
+#XML_DTD                =
 
 # If the XML_PROGRAMLISTING tag is set to YES Doxygen will
 # dump the program listings (including syntax highlighting

+ 5 - 0
doc/doxygen/refman.tex

@@ -138,6 +138,11 @@ Documentation License”.
 
 \part{StarPU Extensions}
 
+\chapter{PythonInterface}
+\label{PythonInterface}
+\hypertarget{PythonInterface}{}
+\input{PythonInterface}
+
 \chapter{Out Of Core}
 \label{OutOfCore}
 \hypertarget{OutOfCore}{}

+ 4 - 5
doc/doxygen_dev/Makefile.am

@@ -67,6 +67,10 @@ chapters =	\
 images =
 
 if STARPU_BUILD_DOC
+EXTRA_DIST += \
+	      $(top_srcdir)/doc/doxygen_dev/chapters/version.sty \
+	      $(top_srcdir)/doc/doxygen_dev/chapters/version.html
+
 config.h: $(top_srcdir)/src/common/config.h.in
 	@$(SED) 's/#undef \(.*\)/#define \1 1/' $< > $@
 	@$(SED) -i '1s/^/\/\*\* \@file \*\/\n/' $@
@@ -245,8 +249,3 @@ endif
 EXTRA_DIST += doxygen.cfg refman.tex \
 	      $(chapters) $(images)
 
-# Rule to update documentation on web server. Should only be called from benchmarks dalton directory
-PUBLISHDIR	?= /home/benchmarks/softs/starpu/starpu-scripts/mirror/files/doc
-update-web: $(DOX_PDF)
-	cp -pr starpu_dev.pdf html_dev $(PUBLISHDIR)
-

+ 7 - 7
doc/doxygen_dev/chapters/010_core.doxy

@@ -211,7 +211,7 @@ the application code on a thread launched by the application, or automatically
 by StarPU on a device-dependent CPU thread launched by StarPU. Whether a
 worker's operation cycle is managed automatically or
 not is controlled per session by the field \c not_launched_drivers of the \c
-starpu_conf struct, and is decided in \ref _starpu_launch_drivers() function.
+starpu_conf struct, and is decided in \c _starpu_launch_drivers function.
 
 When managed automatically, cycles of operations for a worker are handled by the corresponding
 driver specific <code>_starpu_<DRV>_worker()</code> function, where \c DRV is a driver name such as
@@ -220,7 +220,7 @@ Otherwise, the application must supply a thread which will repeatedly call \ref
 starpu_driver_run_once() for the corresponding worker.
 
 In both cases, control is then transferred to 
-\ref _starpu_cpu_driver_run_once() (or the corresponding driver specific func).
+\c _starpu_cpu_driver_run_once (or the corresponding driver specific func).
 The cycle of operations typically includes, at least, the following operations:
 
 - <b>task scheduling</b>
@@ -270,7 +270,7 @@ driving) progress, with a call to \ref __starpu_datawizard_progress(),
 
 Once the worker has a pending task assigned and the input data for that task are
 available in the memory node reachable by the worker's computing unit, the
-worker calls \ref _starpu_cpu_driver_execute_task() (or the corresponding driver
+worker calls \c _starpu_cpu_driver_execute_task (or the corresponding driver
 specific function) to proceed to the execution of the task.
 
 
@@ -312,12 +312,12 @@ writing.
 When the set of workers assigned to a scheduling context is about to be
 modified, all the workers in the union between the workers belonging to the
 scheduling context before the change and the workers expected to belong to the
-scheduling context after the change must be notified using the \ref
-notify_workers_about_changing_ctx_pending() function prior to the update. After
+scheduling context after the change must be notified using the
+\c notify_workers_about_changing_ctx_pending function prior to the update. After
 the update, all the workers in that same union must be notified for the update
-completion with a call to \ref notify_workers_about_changing_ctx_done().
+completion with a call to \c notify_workers_about_changing_ctx_done.
 
-The function \ref notify_workers_about_changing_ctx_pending() places every
+The function \c notify_workers_about_changing_ctx_pending places every
 worker passed in argument in a state compatible with changing the scheduling
 context assignment of that worker, possibly blocking until that worker leaves
 incompatible states such as a pending scheduling operation. If the caller of

+ 3 - 0
doc/doxygen_dev/doxygen-config.cfg.in

@@ -16,6 +16,9 @@
 #
 INPUT                  = @top_srcdir@/doc/doxygen_dev/chapters         \
                          @top_builddir@/doc/doxygen_dev/config.h \
+			 @top_srcdir@/include/starpu_driver.h \
+			 @top_srcdir@/include/starpu_worker.h \
+			 @top_builddir@/doc/doxygen/starpu_config.h \
 			 @top_srcdir@/src/datawizard/data_request.h \
 			 @top_srcdir@/src/datawizard/coherency.h \
 			 @top_srcdir@/src/datawizard/sort_data_handles.h \

+ 3 - 3
doc/doxygen_dev/doxygen.cfg

@@ -365,7 +365,7 @@ TYPEDEF_HIDES_STRUCT   = NO
 # 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
 # corresponding to a cache size of 2^16 = 65536 symbols.
 
-SYMBOL_CACHE_SIZE      = 0
+#SYMBOL_CACHE_SIZE      = 0
 
 # Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be
 # set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given
@@ -1502,13 +1502,13 @@ XML_OUTPUT             = xml
 # which can be used by a validating XML parser to check the
 # syntax of the XML files.
 
-XML_SCHEMA             =
+#XML_SCHEMA             =
 
 # The XML_DTD tag can be used to specify an XML DTD,
 # which can be used by a validating XML parser to check the
 # syntax of the XML files.
 
-XML_DTD                =
+#XML_DTD                =
 
 # If the XML_PROGRAMLISTING tag is set to YES Doxygen will
 # dump the program listings (including syntax highlighting

+ 6 - 5
examples/Makefile.am

@@ -20,9 +20,10 @@ include $(top_srcdir)/starpu.mk
 
 AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
 AM_CXXFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CXXFLAGS) -Wno-unused
-LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(MAGMA_LIBS) $(HWLOC_LIBS) @LIBS@ $(FXT_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
-AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@ $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
+AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@
+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ $(MAGMA_LIBS) $(HWLOC_LIBS) $(FXT_LIBS)
+LIBS += $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
 
 SUBDIRS = stencil
 
@@ -75,7 +76,7 @@ EXTRA_DIST = 					\
 	lu/lu.sh
 
 
-CLEANFILES = *.gcno *.gcda *.linkinfo *.mod starpu_idle_microsec.log *.mps *.dot *.pl *.png *.output tasks.rec perfs.rec perfs2.rec fortran90/starpu_mod.f90 native_fortran/fstarpu_mod.f90
+CLEANFILES = *.gcno *.gcda *.linkinfo *.mod starpu_idle_microsec.log *.mps */*.mps */*/*.mps *.dot */*.dot */*/*.dot *.pl */*.pl */*/*.pl *.png *.output tasks.rec perfs.rec */perfs.rec */*/perfs.rec perfs2.rec fortran90/starpu_mod.f90 native_fortran/fstarpu_mod.f90
 
 if STARPU_USE_CUDA
 
@@ -165,7 +166,7 @@ if !STARPU_HAVE_WINDOWS
 ## test loader program
 if !STARPU_CROSS_COMPILING
 LOADER			=	loader
-loader_CPPFLAGS 	=	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
+loader_CPPFLAGS 	=	$(AM_CPPFLAGS) -I$(top_builddir)/src/
 LOADER_BIN		=	$(abs_top_builddir)/examples/$(LOADER)
 loader_SOURCES		=	../tests/loader.c
 noinst_PROGRAMS		+=	loader
@@ -1012,7 +1013,7 @@ endif
 examplebin_PROGRAMS +=				\
 	mandelbrot/mandelbrot
 
-mandelbrot_mandelbrot_CPPFLAGS = $(AM_CFLAGS) $(AM_CPPFLAGS)
+mandelbrot_mandelbrot_CPPFLAGS = $(AM_CPPFLAGS)
 if STARPU_HAVE_X11
 mandelbrot_mandelbrot_CPPFLAGS += $(X_CFLAGS)
 mandelbrot_mandelbrot_LDADD = $(X_PRE_LIBS) $(X_LIBS) -lX11 $(X_EXTRA_LIBS)

+ 7 - 3
examples/axpy/axpy_opencl.c

@@ -31,7 +31,9 @@ void axpy_opencl(void *buffers[], void *_args)
 
 	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
 	cl_mem x = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
+	unsigned x_offset = STARPU_VECTOR_GET_OFFSET(buffers[0]);
 	cl_mem y = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[1]);
+	unsigned y_offset = STARPU_VECTOR_GET_OFFSET(buffers[1]);
 
 	id = starpu_worker_get_id_check();
 	devid = starpu_worker_get_devid(id);
@@ -41,9 +43,11 @@ void axpy_opencl(void *buffers[], void *_args)
 		STARPU_OPENCL_REPORT_ERROR(err);
 
 	err = clSetKernelArg(kernel, 0, sizeof(x), &x);
-	err|= clSetKernelArg(kernel, 1, sizeof(y), &y);
-	err|= clSetKernelArg(kernel, 2, sizeof(n), &n);
-	err|= clSetKernelArg(kernel, 3, sizeof(*alpha), alpha);
+	err|= clSetKernelArg(kernel, 1, sizeof(x_offset), &x_offset);
+	err|= clSetKernelArg(kernel, 2, sizeof(y), &y);
+	err|= clSetKernelArg(kernel, 3, sizeof(y_offset), &y_offset);
+	err|= clSetKernelArg(kernel, 4, sizeof(n), &n);
+	err|= clSetKernelArg(kernel, 5, sizeof(*alpha), alpha);
 	if (err)
 		STARPU_OPENCL_REPORT_ERROR(err);
 

+ 4 - 0
examples/axpy/axpy_opencl_kernel.cl

@@ -19,11 +19,15 @@
 #include "axpy.h"
 
 __kernel void _axpy_opencl(__global TYPE *x,	/* input vector buffer */
+			   unsigned x_offset,	/* displacement of x inside its buffer, in bytes */
 			   __global TYPE *y,	/* vector buffer, read and updated in place */
+			   unsigned y_offset,	/* displacement of y inside its buffer, in bytes */
 			   unsigned nx,	/* number of elements to process */
 			   TYPE alpha)	/* factor applied to each element of x */
 {
         const int i = get_global_id(0);	/* the work-item id is used as the element index */
+        x = (__global TYPE *) ((__global char *) x + x_offset);	/* byte-wise offset; cast back explicitly since char* does not convert to TYPE* implicitly */
+        y = (__global TYPE *) ((__global char *) y + y_offset);	/* same adjustment for the output vector */
         if (i < nx)	/* work-items past the end of the vector do nothing */
                 y[i] = alpha * x[i] + y[i];
 }

+ 6 - 1
examples/cpp/add_vectors_interface.cpp

@@ -375,7 +375,7 @@ static void register_vector_cpp_handle(starpu_data_handle_t handle, unsigned hom
 
 /* declare a new data with the vector interface */
 void vector_cpp_data_register(starpu_data_handle_t *handleptr, int home_node,
-                        std::vector<MY_TYPE>* vec, uint32_t nx, size_t elemsize)
+			      std::vector<MY_TYPE>* vec, uint32_t nx, size_t elemsize)
 {
 #if __cplusplus >= 201103L
 	struct vector_cpp_interface vector =
@@ -403,6 +403,11 @@ void vector_cpp_data_register(starpu_data_handle_t *handleptr, int home_node,
 	};
 #endif
 
+	if (interface_vector_cpp_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
+	{
+		interface_vector_cpp_ops.interfaceid = (enum starpu_data_interface_id )starpu_data_interface_get_next_id();
+	}
+
 	starpu_data_register(handleptr, home_node, &vector, &interface_vector_cpp_ops);
 }
 

+ 42 - 0
examples/interface/complex.c

@@ -18,6 +18,25 @@
 #include "complex_interface.h"
 #include "complex_codelet.h"
 
+void copy_complex_codelet_cpu(void *descr[], void *_args)	/* CPU implementation of the copy codelet: descr[0] is the source, descr[1] the destination; _args is not used */
+{
+	int i;
+	int nx = STARPU_COMPLEX_GET_NX(descr[0]);	/* element count is read from the source only */
+
+	double *i_real = STARPU_COMPLEX_GET_REAL(descr[0]);	/* source real parts */
+	double *i_imaginary = STARPU_COMPLEX_GET_IMAGINARY(descr[0]);	/* source imaginary parts */
+
+	double *o_real = STARPU_COMPLEX_GET_REAL(descr[1]);	/* destination real parts */
+	double *o_imaginary = STARPU_COMPLEX_GET_IMAGINARY(descr[1]);	/* destination imaginary parts */
+
+	for(i=0 ; i<nx ; i++)	/* element-wise copy of both components */
+	{
+		o_real[i] = i_real[i];
+		o_imaginary[i] = i_imaginary[i];
+	}
+
+}
+
 static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
 {
 	(void) task;
@@ -58,6 +77,7 @@ extern void copy_complex_codelet_opencl(void *buffers[], void *args);
 
 struct starpu_codelet cl_copy =
 {
+	.cpu_funcs = {copy_complex_codelet_cpu},
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {copy_complex_codelet_cuda},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
@@ -82,6 +102,7 @@ int main(void)
 	starpu_data_handle_t handle1;
 	starpu_data_handle_t handle2;
 	starpu_data_handle_t handle3;
+	starpu_data_handle_t handle4;
 
 	double real = 45.0;
 	double imaginary = 12.0;
@@ -227,6 +248,27 @@ int main(void)
 
 	starpu_data_unpartition(handle3, STARPU_MAIN_RAM);
 
+	/* Use helper starpu_data_cpy */
+	starpu_complex_data_register(&handle4, -1, 0, 0, 1);
+	starpu_data_cpy(handle4, handle1, 0, NULL, NULL);
+	ret = starpu_task_insert(&cl_display, STARPU_VALUE, "handle4", strlen("handle4")+1, STARPU_R, handle4, 0);
+	if (ret == -ENODEV) goto end;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+	/* Compare the original complex with its copy: handle4 was filled from handle1, so both must match. */
+	ret = starpu_task_insert(&cl_compare,
+				 STARPU_R, handle1,
+				 STARPU_R, handle4,
+				 STARPU_VALUE, &compare_ptr, sizeof(compare_ptr),
+				 0);
+	if (ret == -ENODEV) goto end;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+	starpu_task_wait_for_all();
+	if (compare != 1)
+	{
+	     FPRINTF(stderr, "Complex numbers should be similar\n");
+	     goto end;
+	}
+
 end:
 #ifdef STARPU_USE_OPENCL
 	{

+ 3 - 1
examples/interface/complex_codelet.h

@@ -87,10 +87,12 @@ void display_complex_codelet(void *descr[], void *_args)
 	if (_args)
 		starpu_codelet_unpack_args(_args, &msg);
 
+	FPRINTF(stderr, "[%s]\n", _args?msg:NULL);
 	for(i=0 ; i<nx ; i++)
 	{
-		FPRINTF(stderr, "[%s] Complex[%d] = %3.2f + %3.2f i\n", _args?msg:NULL, i, real[i], imaginary[i]);
+		FPRINTF(stderr, "\tComplex[%d] = %3.2f + %3.2f i\n", i, real[i], imaginary[i]);
 	}
+	fflush(stderr);
 }
 
 struct starpu_codelet cl_display =

+ 8 - 4
examples/lu/lu.sh

@@ -19,6 +19,8 @@
 set -e
 
 PREFIX=$(dirname $0)
+rm -rf $PREFIX/lu.traces
+mkdir -p $PREFIX/lu.traces
 
 if [ -n "$STARPU_MIC_SINK_PROGRAM_PATH" ] ; then
 	STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/lu_implicit_example_float
@@ -26,11 +28,13 @@ if [ -n "$STARPU_MIC_SINK_PROGRAM_PATH" ] ; then
 	[ -x "$STARPU_MIC_SINK_PROGRAM_PATH/.libs/lu_implicit_example_float" ] && STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/.libs/lu_implicit_example_float
 fi
 
+export STARPU_FXT_PREFIX=$PREFIX/lu.traces
+
 $STARPU_LAUNCH $PREFIX/lu_implicit_example_float -size $((160 * 4)) -nblocks 4 -piv
 $STARPU_LAUNCH $PREFIX/lu_implicit_example_float -size $((160 * 4)) -nblocks 4 -no-stride
 $STARPU_LAUNCH $PREFIX/lu_implicit_example_float -size $((160 * 4)) -nblocks 4 -bound
-$STARPU_LAUNCH $PREFIX/lu_implicit_example_float -size $((160 * 2)) -nblocks 2 -bounddeps
-$STARPU_LAUNCH $PREFIX/lu_implicit_example_float -size $((160 * 2)) -nblocks 2 -bound -bounddeps -bounddepsprio
+$STARPU_LAUNCH $PREFIX/lu_implicit_example_float -size $((160 * 2)) -nblocks 2 -bounddeps -directory $STARPU_FXT_PREFIX
+$STARPU_LAUNCH $PREFIX/lu_implicit_example_float -size $((160 * 2)) -nblocks 2 -bound -bounddeps -bounddepsprio -directory $STARPU_FXT_PREFIX
 
 if [ -n "$STARPU_MIC_SINK_PROGRAM_PATH" ] ; then
 	STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/lu_example_float
@@ -41,5 +45,5 @@ fi
 $STARPU_LAUNCH $PREFIX/lu_example_float -size $((160 * 4)) -nblocks 4 -piv
 $STARPU_LAUNCH $PREFIX/lu_example_float -size $((160 * 4)) -nblocks 4 -no-stride
 $STARPU_LAUNCH $PREFIX/lu_example_float -size $((160 * 4)) -nblocks 4 -bound
-$STARPU_LAUNCH $PREFIX/lu_example_float -size $((160 * 2)) -nblocks 2 -bounddeps
-$STARPU_LAUNCH $PREFIX/lu_example_float -size $((160 * 2)) -nblocks 2 -bound -bounddeps -bounddepsprio
+$STARPU_LAUNCH $PREFIX/lu_example_float -size $((160 * 2)) -nblocks 2 -bounddeps -directory $STARPU_FXT_PREFIX	# same trace directory variable as the sibling runs
+$STARPU_LAUNCH $PREFIX/lu_example_float -size $((160 * 2)) -nblocks 2 -bound -bounddeps -bounddepsprio -directory $STARPU_FXT_PREFIX

+ 19 - 7
examples/lu/lu_example.c

@@ -40,6 +40,7 @@ static unsigned no_prio=0;
 unsigned bound = 0;
 unsigned bounddeps = 0;
 unsigned boundprio = 0;
+char *directory =  NULL;
 
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
 
@@ -102,9 +103,13 @@ static void parse_args(int argc, char **argv)
 			bounddeps = 1;
 			boundprio = 1;
 		}
+		else if (strcmp(argv[i], "-directory") == 0)
+		{
+			directory = strdup(argv[++i]);
+		}
 		else if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
 		{
-			fprintf(stderr,"usage: lu [-size n] [-nblocks b] [-piv] [-no-stride] [-profile] [-bound] [-bounddeps] [-bounddepsprio]\n");
+			fprintf(stderr,"usage: lu [-size n] [-nblocks b] [-piv] [-no-stride] [-profile] [-bound] [-bounddeps] [-bounddepsprio] [-directory d]\n");
 			fprintf(stderr,"Default is size %lu and nblocks %u\n", size, nblocks);
 			exit(0);
 		}
@@ -420,17 +425,23 @@ int main(int argc, char **argv)
 	{
 		if (bounddeps)
 		{
-			FILE *f = fopen("lu.pl", "w");
+			if (!directory)
+				directory = strdup(".");
+			char filename[256];
+			snprintf(filename, sizeof(filename), "%s/%s", directory, "lu.pl");
+			FILE *f = fopen(filename, "w");
 			starpu_bound_print_lp(f);
-			FPRINTF(stderr,"system printed to lu.pl\n");
+			FPRINTF(stderr,"system printed to %s\n", filename);
 			fclose(f);
-			f = fopen("lu.mps", "w");
+			snprintf(filename, sizeof(filename), "%s/%s", directory, "lu.mps");
+			f = fopen(filename, "w");
 			starpu_bound_print_mps(f);
-			FPRINTF(stderr,"system printed to lu.mps\n");
+			FPRINTF(stderr,"system printed to %s\n", filename);
 			fclose(f);
-			f = fopen("lu.dot", "w");
+			snprintf(filename, sizeof(filename), "%s/%s", directory, "lu.dot");
+			f = fopen(filename, "w");
 			starpu_bound_print_dot(f);
-			FPRINTF(stderr,"system printed to lu.mps\n");
+			FPRINTF(stderr,"system printed to %s\n", filename);
 			fclose(f);
 		}
 	}
@@ -458,6 +469,7 @@ int main(int argc, char **argv)
 	starpu_cublas_shutdown();
 
 	starpu_shutdown();
+	free(directory);
 
 	if (ret == -ENODEV) return 77; else return 0;
 }

+ 7 - 1
examples/mlr/mlr.c

@@ -44,6 +44,12 @@
 #include <stdint.h>
 #include <starpu.h>
 
+#ifdef STARPU_QUICK_CHECK
+#define NTASKS 10
+#else
+#define NTASKS 1000
+#endif
+
 static long sum;
 
 /* Performance function of the task, which is in this case very simple, as the parameter values just need to be written in the array "parameters" */
@@ -185,7 +191,7 @@ int main(void)
 		vector_mn[1] = n;
 		starpu_data_release(vector_mn_handle);
 
-		for (j = 0; j < 1000; j++)
+		for (j = 0; j < NTASKS; j++)
 		{
 			starpu_insert_task(&cl_init,
 					   STARPU_R, vector_mn_handle,

+ 31 - 27
examples/mult/sgemm.sh

@@ -25,6 +25,8 @@
 set -e
 
 PREFIX=$(dirname $0)
+rm -rf $PREFIX/sgemm.traces
+mkdir -p $PREFIX/sgemm.traces
 
 if [ -n "$STARPU_MIC_SINK_PROGRAM_PATH" ] ; then
 	STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/sgemm
@@ -32,46 +34,48 @@ if [ -n "$STARPU_MIC_SINK_PROGRAM_PATH" ] ; then
 	[ -x "$STARPU_MIC_SINK_PROGRAM_PATH/.libs/sgemm" ] && STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/.libs/sgemm
 fi
 
-STARPU_SCHED=dmdas STARPU_FXT_PREFIX=$PREFIX/ $PREFIX/sgemm -check
-[ ! -x $PREFIX/../../tools/starpu_perfmodel_display ] || $STARPU_LAUNCH $PREFIX/../../tools/starpu_perfmodel_display -s starpu_sgemm_gemm
-[ ! -x $PREFIX/../../tools/starpu_perfmodel_display ] || $STARPU_LAUNCH $PREFIX/../../tools/starpu_perfmodel_display -x -s starpu_sgemm_gemm
-[ ! -x $PREFIX/../../tools/starpu_perfmodel_recdump ] || $STARPU_LAUNCH $PREFIX/../../tools/starpu_perfmodel_recdump -o perfs.rec
-[ -f perfs.rec ]
+export STARPU_FXT_PREFIX=$PREFIX/sgemm.traces
+
+STARPU_SCHED=dmdas $PREFIX/sgemm -check
 if [ -x $PREFIX/../../tools/starpu_fxt_tool ];
 then
-	$STARPU_LAUNCH $PREFIX/../../tools/starpu_perfmodel_plot -s starpu_sgemm_gemm -i $PREFIX/prof_file_${USER}_0
-	[ -f starpu_starpu_sgemm_gemm.gp -a -f starpu_starpu_sgemm_gemm.data -a -f starpu_starpu_sgemm_gemm.data ]
+	$STARPU_LAUNCH $PREFIX/../../tools/starpu_perfmodel_plot -o $STARPU_FXT_PREFIX -s starpu_sgemm_gemm -i $STARPU_FXT_PREFIX/prof_file_${USER}_0
+	[ -f $STARPU_FXT_PREFIX/starpu_starpu_sgemm_gemm.gp -a -f $STARPU_FXT_PREFIX/starpu_starpu_sgemm_gemm.data ]	# the plot tool must have produced both the gnuplot script and its data file
 
 	# Generate paje, dag, data, etc.
-	$STARPU_LAUNCH $PREFIX/../../tools/starpu_fxt_tool -memory-states -label-deps -i $PREFIX/prof_file_${USER}_0
+	$STARPU_LAUNCH $PREFIX/../../tools/starpu_fxt_tool -d $STARPU_FXT_PREFIX -memory-states -label-deps -i $STARPU_FXT_PREFIX/prof_file_${USER}_0
 
-	$PREFIX/../../tools/starpu_paje_sort paje.trace
-	! type pj_dump || pj_dump -e 0 < paje.trace
+	$PREFIX/../../tools/starpu_paje_sort $STARPU_FXT_PREFIX/paje.trace
+	! type pj_dump || pj_dump -e 0 < $STARPU_FXT_PREFIX/paje.trace
 
-	$PREFIX/../../tools/starpu_codelet_profile distrib.data starpu_sgemm_gemm
-	[ -f distrib.data.gp -a \( -f distrib.data.0 -o -f distrib.data.1 -o -f distrib.data.2 -o -f distrib.data.3 -o -f distrib.data.4 \) ]
+	$PREFIX/../../tools/starpu_codelet_profile $STARPU_FXT_PREFIX/distrib.data starpu_sgemm_gemm
+	[ -f $STARPU_FXT_PREFIX/distrib.data.gp -a \( -f $STARPU_FXT_PREFIX/distrib.data.0 -o -f $STARPU_FXT_PREFIX/distrib.data.1 -o -f $STARPU_FXT_PREFIX/distrib.data.2 -o -f $STARPU_FXT_PREFIX/distrib.data.3 -o -f $STARPU_FXT_PREFIX/distrib.data.4 -o -f $STARPU_FXT_PREFIX/distrib.data.5 -o -f $STARPU_FXT_PREFIX/distrib.data.6 \) ]
 
-	$STARPU_LAUNCH $PREFIX/../../tools/starpu_fxt_data_trace $PREFIX/prof_file_${USER}_0 starpu_sgemm_gemm
-	[ -f data_trace.gp ]
+	$STARPU_LAUNCH $PREFIX/../../tools/starpu_fxt_data_trace -d $STARPU_FXT_PREFIX $STARPU_FXT_PREFIX/prof_file_${USER}_0 starpu_sgemm_gemm
+	[ -f $STARPU_FXT_PREFIX/data_trace.gp ]
 
-	$STARPU_LAUNCH $PREFIX/../../tools/starpu_fxt_stats -i $PREFIX/prof_file_${USER}_0
-	$STARPU_LAUNCH $PREFIX/../../tools/starpu_tasks_rec_complete tasks.rec tasks2.rec
-	python $PREFIX/../../tools/starpu_trace_state_stats.py trace.rec
-	$PREFIX/../../tools/starpu_workers_activity activity.data
-	[ -f activity.eps ]
+	$STARPU_LAUNCH $PREFIX/../../tools/starpu_fxt_stats -i $STARPU_FXT_PREFIX/prof_file_${USER}_0
+	$STARPU_LAUNCH $PREFIX/../../tools/starpu_tasks_rec_complete $STARPU_FXT_PREFIX/tasks.rec $STARPU_FXT_PREFIX/tasks2.rec
+	python3 $PREFIX/../../tools/starpu_trace_state_stats.py $STARPU_FXT_PREFIX/trace.rec
+	$PREFIX/../../tools/starpu_workers_activity -d $STARPU_FXT_PREFIX $STARPU_FXT_PREFIX/activity.data
+	[ -f $STARPU_FXT_PREFIX/activity.eps ]
 
 	# needs some R packages
-	$PREFIX/../../tools/starpu_paje_draw_histogram paje.trace || true
-	$PREFIX/../../tools/starpu_paje_state_stats paje.trace || true
-	$PREFIX/../../tools/starpu_paje_summary paje.trace || true
-	$PREFIX/../../tools/starpu_codelet_histo_profile distrib.data || true
-	[ -f distrib.data.starpu_sgemm_gemm.0.492beed5.33177600.pdf ] || true
+	$PREFIX/../../tools/starpu_paje_draw_histogram $STARPU_FXT_PREFIX/paje.trace || true
+	$PREFIX/../../tools/starpu_paje_state_stats $STARPU_FXT_PREFIX/paje.trace || true
+	$PREFIX/../../tools/starpu_paje_summary $STARPU_FXT_PREFIX/paje.trace || true
+	$PREFIX/../../tools/starpu_codelet_histo_profile $STARPU_FXT_PREFIX/distrib.data || true
+	[ -f $STARPU_FXT_PREFIX/distrib.data.starpu_sgemm_gemm.0.492beed5.33177600.pdf ] || true
 
 	if [ -x $PREFIX/../../tools/starpu_replay ]; then
-		$STARPU_LAUNCH $PREFIX/../../tools/starpu_replay tasks.rec
+		$STARPU_LAUNCH $PREFIX/../../tools/starpu_replay $STARPU_FXT_PREFIX/tasks.rec
 	fi
 
-	[ ! -x $PREFIX/../../tools/starpu_perfmodel_recdump ] || $STARPU_LAUNCH $PREFIX/../../tools/starpu_perfmodel_recdump tasks.rec -o perfs2.rec
-	[ -f perfs2.rec ]
+	[ ! -x $PREFIX/../../tools/starpu_perfmodel_recdump ] || $STARPU_LAUNCH $PREFIX/../../tools/starpu_perfmodel_recdump $STARPU_FXT_PREFIX/tasks.rec -o $STARPU_FXT_PREFIX/perfs2.rec
+	[ -f $STARPU_FXT_PREFIX/perfs2.rec ]
 fi
 
+[ ! -x $PREFIX/../../tools/starpu_perfmodel_display ] || $STARPU_LAUNCH $PREFIX/../../tools/starpu_perfmodel_display -s starpu_sgemm_gemm
+[ ! -x $PREFIX/../../tools/starpu_perfmodel_display ] || $STARPU_LAUNCH $PREFIX/../../tools/starpu_perfmodel_display -x -s starpu_sgemm_gemm
+[ ! -x $PREFIX/../../tools/starpu_perfmodel_recdump ] || $STARPU_LAUNCH $PREFIX/../../tools/starpu_perfmodel_recdump -o $STARPU_FXT_PREFIX/perfs.rec
+[ -f $STARPU_FXT_PREFIX/perfs.rec ]

+ 16 - 0
examples/native_fortran/nf_vector.f90

@@ -23,6 +23,7 @@ program nf_vector
         integer, dimension(:), allocatable, target :: vb
         integer :: i
 
+        type(c_ptr) :: perfmodel_vec   ! a pointer for the perfmodel structure
         type(c_ptr) :: cl_vec   ! a pointer for the codelet structure
         type(c_ptr) :: dh_va    ! a pointer for the 'va' vector data handle
         type(c_ptr) :: dh_vb    ! a pointer for the 'vb' vector data handle
@@ -48,12 +49,24 @@ program nf_vector
                 stop 77
         end if
 
+        ! allocate an empty perfmodel structure
+        perfmodel_vec = fstarpu_perfmodel_allocate()
+
+        ! set the perfmodel symbol
+        call fstarpu_perfmodel_set_symbol(perfmodel_vec, C_CHAR_"my_vec_sym"//C_NULL_CHAR)
+
+        ! set the perfmodel type
+        call fstarpu_perfmodel_set_type(perfmodel_vec, FSTARPU_HISTORY_BASED)
+
         ! allocate an empty codelet structure
         cl_vec = fstarpu_codelet_allocate()
 
         ! set the codelet name
         call fstarpu_codelet_set_name(cl_vec, C_CHAR_"my_vec_codelet"//C_NULL_CHAR)
 
+        ! set the codelet perfmodel
+        call fstarpu_codelet_set_model(cl_vec, perfmodel_vec)
+
         ! add a CPU implementation function to the codelet
         call fstarpu_codelet_add_cpu_func(cl_vec, C_FUNLOC(cl_cpu_func_vec))
 
@@ -98,6 +111,9 @@ program nf_vector
         ! shut StarPU down
         call fstarpu_shutdown()
 
+        ! free perfmodel structure (must be called after fstarpu_shutdown)
+        call fstarpu_perfmodel_free(perfmodel_vec)
+
         deallocate(vb)
         deallocate(va)
 

+ 29 - 5
examples/scheduler/schedulers.sh

@@ -1,4 +1,4 @@
-#!/bin/sh
+#!/bin/bash
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2012-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
@@ -36,9 +36,33 @@ else
 	SCHEDULERS=`$basedir/../../tools/starpu_sched_display | grep -v heteroprio`
 fi
 
-for sched in $SCHEDULERS
-do
+run()
+{
+    sched=$1
     echo "cholesky.$sched"
-    STARPU_SCHED=$sched $STARPU_LAUNCH $basedir/../cholesky/cholesky_tag -size $((960*3)) -nblocks 3
+    STARPU_SCHED=$sched $STARPU_LAUNCH $basedir/../cholesky/cholesky_tag -size $((320*3)) -nblocks 3
     check_success $?
-done
+}
+
+case "$MAKEFLAGS" in	# choose between parallel and sequential runs from the make job options
+    *\ -j1[0-9]*\ *|*\ -j[2-9]*\ *)	# a space-delimited -jN with N >= 2 is present
+	for sched in $SCHEDULERS
+	do
+		run $sched &	# one background job per scheduler
+	done
+	while true
+	do
+		wait -n	# block until any one background job finishes
+		RET=$?
+		if [ $RET = 127 ] ; then break ; fi	# 127 is taken to mean no child is left to wait for
+		check_success $RET	# otherwise RET is the exit status of the job that just finished
+	done
+    ;;
+
+    *)	# no parallel make detected: run the schedulers one after the other
+	for sched in $SCHEDULERS
+	do
+		run $sched
+	done
+    ;;
+esac

+ 4 - 3
examples/stencil/Makefile.am

@@ -16,9 +16,10 @@
 include $(top_srcdir)/starpu.mk
 
 AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
-LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(HWLOC_LIBS) @LIBS@ $(FXT_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
-AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@ $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
+AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@
+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ $(HWLOC_LIBS) $(FXT_LIBS)
+LIBS += $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
 
 if STARPU_USE_MPI
 LIBS += $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
@@ -73,7 +74,7 @@ if !STARPU_HAVE_WINDOWS
 ## test loader program
 if !STARPU_CROSS_COMPILING
 LOADER			=	loader
-loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
+loader_CPPFLAGS 	= 	$(AM_CPPFLAGS) -I$(top_builddir)/src/
 LOADER_BIN		=	./$(LOADER)
 loader_SOURCES		=	../../tests/loader.c
 noinst_PROGRAMS		+=	loader

+ 57 - 0
include/fstarpu_mod.f90

@@ -92,6 +92,14 @@ module fstarpu_mod
         type(c_ptr), bind(C) :: FSTARPU_CUDA_ASYNC
         type(c_ptr), bind(C) :: FSTARPU_OPENCL_ASYNC
 
+        !type(c_ptr), bind(C) :: FSTARPU_PER_WORKER
+        !type(c_ptr), bind(C) :: FSTARPU_PER_ARCH
+        !type(c_ptr), bind(C) :: FSTARPU_PER_COMMON
+        type(c_ptr), bind(C) :: FSTARPU_HISTORY_BASED
+        type(c_ptr), bind(C) :: FSTARPU_REGRESSION_BASED
+        type(c_ptr), bind(C) :: FSTARPU_NL_REGRESSION_BASED
+        type(c_ptr), bind(C) :: FSTARPU_MULTIPLE_REGRESSION_BASED
+
         ! (some) portable iso_c_binding types
         type(c_ptr), bind(C) :: FSTARPU_SZ_C_DOUBLE
         type(c_ptr), bind(C) :: FSTARPU_SZ_C_FLOAT
@@ -649,6 +657,18 @@ module fstarpu_mod
                         character(c_char), intent(in) :: cl_name
                 end subroutine fstarpu_codelet_set_name
 
+                subroutine fstarpu_codelet_set_model (cl, cl_perfmodel) bind(C) ! set the perfmodel of a codelet
+                        use iso_c_binding, only: c_ptr
+                        type(c_ptr), value, intent(in) :: cl ! codelet pointer, passed by value
+                        type(c_ptr), value, intent(in) :: cl_perfmodel ! perfmodel pointer, passed by value
+                end subroutine fstarpu_codelet_set_model
+
+                subroutine fstarpu_codelet_set_energy_model (cl, cl_perfmodel) bind(C) ! presumably the energy model setter - confirm on C side
+                        use iso_c_binding, only: c_ptr
+                        type(c_ptr), value, intent(in) :: cl ! codelet pointer, passed by value
+                        type(c_ptr), value, intent(in) :: cl_perfmodel ! perfmodel pointer, passed by value
+                end subroutine fstarpu_codelet_set_energy_model
+
                 subroutine fstarpu_codelet_add_cpu_func (cl, f_ptr) bind(C)
                         use iso_c_binding, only: c_ptr, c_funptr
                         type(c_ptr), value, intent(in) :: cl
@@ -714,6 +734,28 @@ module fstarpu_mod
                         type(c_ptr), value, intent(in) :: where ! C function expects an intptr_t
                 end subroutine fstarpu_codelet_set_where
 
+                function fstarpu_perfmodel_allocate () bind(C) ! allocate an empty perfmodel structure
+                        use iso_c_binding, only: c_ptr
+                        type(c_ptr) :: fstarpu_perfmodel_allocate ! C pointer to the new structure
+                end function fstarpu_perfmodel_allocate
+
+                subroutine fstarpu_perfmodel_free (model) bind(C) ! free a perfmodel structure
+                        use iso_c_binding, only: c_ptr
+                        type(c_ptr), value, intent(in) :: model ! perfmodel pointer, passed by value
+                end subroutine fstarpu_perfmodel_free
+
+                subroutine fstarpu_perfmodel_set_symbol (model, model_symbol) bind(C) ! set the perfmodel symbol
+                        use iso_c_binding, only: c_ptr, c_char
+                        type(c_ptr), value, intent(in) :: model ! perfmodel pointer, passed by value
+                        character(c_char), intent(in) :: model_symbol ! symbol string; callers append C_NULL_CHAR
+                end subroutine fstarpu_perfmodel_set_symbol
+
+                subroutine fstarpu_perfmodel_set_type (model, type) bind(C) ! set the perfmodel type
+                        use iso_c_binding, only: c_ptr
+                        type(c_ptr), value, intent(in) :: model ! perfmodel pointer, passed by value
+                        type(c_ptr), value, intent(in) :: type ! C function expects an intptr_t, e.g. FSTARPU_HISTORY_BASED
+                end subroutine fstarpu_perfmodel_set_type
+
                 ! == starpu_data_interface.h ==
 
                 ! uintptr_t starpu_malloc_on_node_flags(unsigned dst_node, size_t size, int flags);
@@ -2434,6 +2476,21 @@ module fstarpu_mod
                         FSTARPU_OPENCL_ASYNC = &
                             fstarpu_get_constant(C_CHAR_"FSTARPU_OPENCL_ASYNC"//C_NULL_CHAR)
 
+                        !FSTARPU_PER_WORKER = &
+                        !        fstarpu_get_constant(C_CHAR_"FSTARPU_PER_WORKER"//C_NULL_CHAR)
+                        !FSTARPU_PER_ARCH = &
+                        !        fstarpu_get_constant(C_CHAR_"FSTARPU_PER_ARCH"//C_NULL_CHAR)
+                        !FSTARPU_PER_COMMON = &
+                        !        fstarpu_get_constant(C_CHAR_"FSTARPU_PER_COMMON"//C_NULL_CHAR)
+                        FSTARPU_HISTORY_BASED = &
+                                fstarpu_get_constant(C_CHAR_"FSTARPU_HISTORY_BASED"//C_NULL_CHAR)
+                        FSTARPU_REGRESSION_BASED = &
+                                fstarpu_get_constant(C_CHAR_"FSTARPU_REGRESSION_BASED"//C_NULL_CHAR)
+                        FSTARPU_NL_REGRESSION_BASED = &
+                                fstarpu_get_constant(C_CHAR_"FSTARPU_NL_REGRESSION_BASED"//C_NULL_CHAR)
+                        FSTARPU_MULTIPLE_REGRESSION_BASED = &
+                                fstarpu_get_constant(C_CHAR_"FSTARPU_MULTIPLE_REGRESSION_BASED"//C_NULL_CHAR)
+
                         ! Initialize size constants as 'c_ptr'
                         FSTARPU_SZ_C_DOUBLE        = sz_to_p(c_sizeof(FSTARPU_SZ_C_DOUBLE_dummy))
                         FSTARPU_SZ_C_FLOAT        = sz_to_p(c_sizeof(FSTARPU_SZ_C_FLOAT_dummy))

+ 2 - 0
include/starpu_fxt.h

@@ -131,7 +131,9 @@ void starpu_fxt_start_profiling(void);
    start recording it again, etc.
 */
 void starpu_fxt_stop_profiling(void);
+
 void starpu_fxt_write_data_trace(char *filename_in);
+void starpu_fxt_write_data_trace_in_dir(char *filename_in, char *dir);
 
 /**
     Wrapper to get value of env variable STARPU_FXT_TRACE

+ 12 - 0
include/starpu_perfmodel.h

@@ -312,6 +312,13 @@ struct starpu_perfmodel
 void starpu_perfmodel_init(struct starpu_perfmodel *model);
 
 /**
+   Deinitialize the performance model structure \p model. This must be called
+   before deallocating the structure. You will probably want to call
+   starpu_perfmodel_unload_model() before calling this function, to save the perfmodel.
+*/
+int starpu_perfmodel_deinit(struct starpu_perfmodel *model);
+
+/**
    Load the performance model found in the file named \p filename. \p model has to be
    completely zero, and will be filled with the information stored in the given file.
 */
@@ -333,6 +340,11 @@ int starpu_perfmodel_load_symbol(const char *symbol, struct starpu_perfmodel *mo
 int starpu_perfmodel_unload_model(struct starpu_perfmodel *model);
 
 /**
+	Save the performance model in its file.
+*/
+void starpu_save_history_based_model(struct starpu_perfmodel *model);
+
+/**
   Fills \p path (supposed to be \p maxlen long) with the full path to the
   performance model file for symbol \p symbol.  This path can later on be used
   for instance with starpu_perfmodel_load_file() .

+ 15 - 1
include/starpu_task.h

@@ -596,6 +596,20 @@ struct starpu_codelet
 	unsigned color;
 
 	/**
+	   Optional field, the default value is <c>NULL</c>. This is a
+	   function pointer of prototype <c>void (*f)(void *)</c>
+	   which specifies a possible callback. If this pointer is
+	   non-<c>NULL</c>, the callback function is executed on the
+	   host after the execution of the task. If the task defines a
+	   callback, the codelet callback is not called, unless called
+	   within the task callback function.
+	   The callback is passed the value contained in the
+	   starpu_task::callback_arg field. No callback is executed if
+	   the field is set to <c>NULL</c>.
+	*/
+	void (*callback_func)(void *);
+
+	/**
 	   Various flags for the codelet.
 	 */
 	int flags;
@@ -813,7 +827,7 @@ struct starpu_task
 	   <c>NULL</c>.
 
 	   With starpu_task_insert() and alike this can be specified thanks to
-	   ::STARPU_CALLBACK_ARG followed by the function pointer, or thanks to
+	   ::STARPU_CALLBACK_ARG followed by the argument pointer, or thanks to
 	   ::STARPU_CALLBACK_WITH_ARG or
 	   ::STARPU_CALLBACK_WITH_ARG_NFREE followed by the function
 	   pointer and the argument.

+ 3 - 0
include/starpu_worker.h

@@ -304,6 +304,9 @@ struct starpu_tree* starpu_workers_get_tree(void);
 
 unsigned starpu_worker_get_sched_ctx_list(int worker, unsigned **sched_ctx);
 
+/**
+   Return whether worker \p workerid is currently blocked in a parallel task.
+ */
 unsigned starpu_worker_is_blocked_in_parallel(int workerid);
 
 unsigned starpu_worker_is_slave_somewhere(int workerid);

+ 3 - 3
julia/examples/Makefile.am

@@ -20,7 +20,7 @@ noinst_PROGRAMS		=
 if STARPU_HAVE_WINDOWS
 LOADER_BIN		=
 else
-loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
+loader_CPPFLAGS 	= 	$(AM_CPPFLAGS) -I$(top_builddir)/src/
 if !STARPU_SIMGRID
 LOADER			=	loader
 LOADER_BIN		=	$(abs_top_builddir)/julia/examples/$(LOADER)
@@ -96,9 +96,9 @@ endif
 endif
 
 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
-LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ ../src/libstarpujulia-@STARPU_EFFECTIVE_VERSION@.la -lm @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
-AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ ../src/libstarpujulia-@STARPU_EFFECTIVE_VERSION@.la -lm $(FXT_LIBS) $(MAGMA_LIBS)
+LIBS += $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
 
 check_PROGRAMS = $(LOADER) $(starpu_julia_EXAMPLES)
 SHELL_TESTS	=

+ 2 - 2
julia/src/Makefile.am

@@ -19,9 +19,9 @@ include $(top_srcdir)/starpu-notests.mk
 CLEANFILES = *.gcno *.gcda
 
 AM_CFLAGS = $(GLOBAL_AM_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(HWLOC_CFLAGS) $(FXT_CFLAGS) -fPIC
-LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ `@JULIA@ $(top_srcdir)/julia/src/openblas_ldflags.jl`
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_builddir)/src -I$(top_srcdir)/src -I$(top_srcdir)/julia/src
-AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) -no-undefined
+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ `@JULIA@ $(top_srcdir)/julia/src/openblas_ldflags.jl`
+LIBS += $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS)
 
 SUBDIRS = dynamic_compiler
 

+ 2 - 2
julia/src/dynamic_compiler/Makefile.am

@@ -20,9 +20,9 @@ AM_CPPFLAGS = -I$(abs_top_srcdir)/include/ -I$(abs_top_builddir)/src -I$(abs_top
 
 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS)
 AM_CFLAGS += -fPIC -O3 -g -DSTRIDE=${STRIDE} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
-LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@
+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@
+LIBS += -L @STARPU_BUILD_DIR@/julia/src/.libs/ -lstarpujulia-1.3
 CUDA_CFLAGS = $(STARPU_CUDA_CPPFLAGS) -Wno-deprecated-gpu-targets
-LDFLAGS = -L @STARPU_BUILD_DIR@/julia/src/.libs/ -lstarpujulia-1.3
 EXTERNLIB=extern_tasks.so
 GENERATEDLIB=generated_tasks.so
 

+ 45 - 0
mpi/GNUmakefile.in

@@ -0,0 +1,45 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+SUBDIRS=
+
+@STARPU_BUILD_EXAMPLES_TRUE@SUBDIRS += examples
+@STARPU_BUILD_TESTS_TRUE@SUBDIRS += tests
+
+check: check-recursive
+
+# Run "make check" in each subdirectory with the -j value found in MAKEFLAGS divided
+# by 4 (never below -j1), since mpirun will start 4 processes in the tests and examples
+@STARPU_SIMGRID_FALSE@check-recursive:
+@STARPU_SIMGRID_FALSE@	RET=0 ; \
+@STARPU_SIMGRID_FALSE@	NJOBS=`printf %s "$(MAKEFLAGS)" | sed -ne 's/.*-j \?\([0-9]\+\).*/\1/p'` ; \
+@STARPU_SIMGRID_FALSE@	JOBS="" ; \
+@STARPU_SIMGRID_FALSE@	if [ -n "$$NJOBS" ] ; then \
+@STARPU_SIMGRID_FALSE@		if [ "$$NJOBS" -ge 4 ] ; then \
+@STARPU_SIMGRID_FALSE@			JOBS="-j$$(($$NJOBS / 4))" ; \
+@STARPU_SIMGRID_FALSE@		else \
+@STARPU_SIMGRID_FALSE@			JOBS="-j1" ; \
+@STARPU_SIMGRID_FALSE@		fi ; \
+@STARPU_SIMGRID_FALSE@	fi ; \
+@STARPU_SIMGRID_FALSE@	for i in $(SUBDIRS) ; do \
+@STARPU_SIMGRID_FALSE@		$(MAKE) check -C $$i MAKEFLAGS="$(MAKEFLAGS) $$JOBS" || RET=1; \
+@STARPU_SIMGRID_FALSE@	done ; \
+@STARPU_SIMGRID_FALSE@	exit $$RET
+
+%: force
+	@$(MAKE) -f Makefile $@
+
+force: ;

+ 0 - 18
mpi/Makefile.am

@@ -33,21 +33,3 @@ versinclude_HEADERS = 					\
 	include/starpu_mpi.h				\
 	include/starpu_mpi_lb.h				\
 	include/fstarpu_mpi_mod.f90
-
-if !STARPU_SIMGRID
-check-recursive:
-	RET=0 ; \
-	NJOBS=`printf %s "$(MAKEFLAGS)" | sed -ne 's/.*-j \?\([0-9]\+\).*/\1/p'` ; \
-	JOBS="" ; \
-	if [ -n "$$NJOBS" ] ; then \
-		if [ "$$NJOBS" -ge 4 ] ; then \
-			JOBS="-j$$(($$NJOBS / 4))" ; \
-		else \
-			JOBS="-j1" ; \
-		fi ; \
-	fi ; \
-	for i in $(SUBDIRS) ; do \
-		$(MAKE) check -C $$i MAKEFLAGS="$(MAKEFLAGS) $$JOBS" || RET=1; \
-	done ; \
-	exit $$RET
-endif

+ 3 - 3
mpi/examples/Makefile.am

@@ -26,7 +26,7 @@ noinst_PROGRAMS		=
 if STARPU_HAVE_WINDOWS
 LOADER_BIN		=
 else
-loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
+loader_CPPFLAGS 	= 	$(AM_CPPFLAGS) -I$(top_builddir)/src/
 if !STARPU_SIMGRID
 LOADER			=	loader
 LOADER_BIN		=	$(abs_top_builddir)/mpi/examples/$(LOADER)
@@ -108,9 +108,9 @@ endif
 endif
 
 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
-LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ ../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la -lm @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include
-AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ ../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la -lm $(FXT_LIBS) $(MAGMA_LIBS)
+LIBS += $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
 
 ###################
 # Stencil example #

+ 1 - 1
mpi/examples/matrix_mult/mm.c

@@ -128,7 +128,7 @@ static void register_matrices()
 	int mr = (comm_rank == 0) ? STARPU_MAIN_RAM : -1;
 
 	/* mpi tag used for the block */
-	int tag = 0;
+	starpu_mpi_tag_t tag = 0;
 
 	int b_row,b_col;
 

+ 4 - 1
mpi/examples/mpi_lu/plu_example.c

@@ -133,7 +133,10 @@ static void parse_args(int rank, int argc, char **argv)
 
 #ifdef STARPU_HAVE_VALGRIND_H
 	if (RUNNING_ON_VALGRIND)
-		size = 16;
+	{
+		size = 4;
+		nblocks = 4;
+	}
 #endif
 }
 

+ 4 - 1
mpi/examples/mpi_lu/plu_implicit_example.c

@@ -112,7 +112,10 @@ static void parse_args(int argc, char **argv)
 
 #ifdef STARPU_HAVE_VALGRIND_H
 	if (RUNNING_ON_VALGRIND)
-		size = 16;
+	{
+		size = 4;
+		nblocks = 4;
+	}
 #endif
 }
 

+ 4 - 1
mpi/examples/mpi_lu/plu_outofcore_example.c

@@ -123,7 +123,10 @@ static void parse_args(int argc, char **argv)
 
 #ifdef STARPU_HAVE_VALGRIND_H
 	if (RUNNING_ON_VALGRIND)
-		size = 16;
+	{
+		size = 4;
+		nblocks = 4;
+	}
 #endif
 }
 

+ 9 - 9
mpi/examples/mpi_lu/pxlu.c

@@ -90,7 +90,7 @@ static struct starpu_task *create_task(starpu_tag_t id)
 
 /* Send handle to every node appearing in the mask, and unlock tag once the
  * transfers are done. */
-static void send_data_to_mask(starpu_data_handle_t handle, int *rank_mask, int mpi_tag, starpu_tag_t tag)
+static void send_data_to_mask(starpu_data_handle_t handle, int *rank_mask, starpu_mpi_tag_t mpi_tag, starpu_tag_t tag)
 {
 	unsigned cnt = 0;
 
@@ -134,7 +134,7 @@ static void send_data_to_mask(starpu_data_handle_t handle, int *rank_mask, int m
 struct recv_when_done_callback_arg
 {
 	int source;
-	int mpi_tag;
+	starpu_mpi_tag_t mpi_tag;
 	starpu_data_handle_t handle;
 	starpu_tag_t unlocked_tag;
 };
@@ -150,7 +150,7 @@ static void callback_receive_when_done(void *_arg)
 }
 
 static void receive_when_deps_are_done(unsigned ndeps, starpu_tag_t *deps_tags,
-				int source, int mpi_tag,
+				int source, starpu_mpi_tag_t mpi_tag,
 				starpu_data_handle_t handle,
 				starpu_tag_t partial_tag,
 				starpu_tag_t unlocked_tag)
@@ -218,7 +218,7 @@ static void create_task_11_recv(unsigned k)
 #else
 	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_11_block_handle)(k);
 #endif
-	int mpi_tag = MPI_TAG11(k);
+	starpu_mpi_tag_t mpi_tag = MPI_TAG11(k);
 	starpu_tag_t partial_tag = TAG11_SAVE_PARTIAL(k);
 	starpu_tag_t unlocked_tag = TAG11_SAVE(k);
 
@@ -260,7 +260,7 @@ static void callback_task_11_real(void *_arg)
 	/* Send the block to those nodes */
 	starpu_data_handle_t block_handle = STARPU_PLU(get_block_handle)(k, k);
 	starpu_tag_t tag = TAG11_SAVE(k);
-	int mpi_tag = MPI_TAG11(k);
+	starpu_mpi_tag_t mpi_tag = MPI_TAG11(k);
 	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
 
 	free(arg);
@@ -380,7 +380,7 @@ static void create_task_12_recv(unsigned k, unsigned j)
 #else
 	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_12_block_handle)(j,k);
 #endif
-	int mpi_tag = MPI_TAG12(k, j);
+	starpu_mpi_tag_t mpi_tag = MPI_TAG12(k, j);
 	starpu_tag_t partial_tag = TAG12_SAVE_PARTIAL(k, j);
 	starpu_tag_t unlocked_tag = TAG12_SAVE(k, j);
 
@@ -415,7 +415,7 @@ static void callback_task_12_real(void *_arg)
 	/* Send the block to those nodes */
 	starpu_data_handle_t block_handle = STARPU_PLU(get_block_handle)(k, j);
 	starpu_tag_t tag = TAG12_SAVE(k, j);
-	int mpi_tag = MPI_TAG12(k, j);
+	starpu_mpi_tag_t mpi_tag = MPI_TAG12(k, j);
 	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
 
 	free(arg);
@@ -564,7 +564,7 @@ static void create_task_21_recv(unsigned k, unsigned i)
 #else
 	starpu_data_handle_t block_handle = STARPU_PLU(get_tmp_21_block_handle)(i, k);
 #endif
-	int mpi_tag = MPI_TAG21(k, i);
+	starpu_mpi_tag_t mpi_tag = MPI_TAG21(k, i);
 	starpu_tag_t partial_tag = TAG21_SAVE_PARTIAL(k, i);
 	starpu_tag_t unlocked_tag = TAG21_SAVE(k, i);
 
@@ -600,7 +600,7 @@ static void callback_task_21_real(void *_arg)
 	/* Send the block to those nodes */
 	starpu_data_handle_t block_handle = STARPU_PLU(get_block_handle)(i, k);
 	starpu_tag_t tag = TAG21_SAVE(k, i);
-	int mpi_tag = MPI_TAG21(k, i);
+	starpu_mpi_tag_t mpi_tag = MPI_TAG21(k, i);
 	send_data_to_mask(block_handle, rank_mask, mpi_tag, tag);
 
 	free(arg);

+ 4 - 5
mpi/src/Makefile.am

@@ -24,9 +24,10 @@ BUILT_SOURCES =
 CLEANFILES = *.gcno *.gcda *.linkinfo
 
 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) $(NMAD_CFLAGS)
-LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS) $(NMAD_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/src/ -I$(top_builddir)/src -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/mpi/src
-AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS) $(NMAD_LDFLAGS)
+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ $(FXT_LIBS) $(MAGMA_LIBS) $(NMAD_LIBS)
+LIBS += $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS) $(NMAD_LDFLAGS)
+LIBS += $(MPICC_LDFLAGS) $(FXT_LDFLAGS)
 
 ldflags =
 
@@ -54,10 +55,8 @@ endif STARPU_HAVE_WINDOWS
 
 lib_LTLIBRARIES = libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 
-libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_LIBADD = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la
 libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_LDFLAGS = $(ldflags) -no-undefined					\
-  -version-info $(LIBSTARPUMPI_INTERFACE_CURRENT):$(LIBSTARPUMPI_INTERFACE_REVISION):$(LIBSTARPUMPI_INTERFACE_AGE) \
-  $(MPICC_LDFLAGS) $(FXT_LDFLAGS)
+  -version-info $(LIBSTARPUMPI_INTERFACE_CURRENT):$(LIBSTARPUMPI_INTERFACE_REVISION):$(LIBSTARPUMPI_INTERFACE_AGE)
 noinst_HEADERS =					\
 	starpu_mpi_private.h				\
 	starpu_mpi_fxt.h				\

+ 22 - 20
mpi/src/load_balancer/policy/data_movements_interface.c

@@ -23,7 +23,7 @@
 
 #if defined(STARPU_USE_MPI_MPI)
 
-int **data_movements_get_ref_tags_table(starpu_data_handle_t handle)
+starpu_mpi_tag_t **data_movements_get_ref_tags_table(starpu_data_handle_t handle)
 {
 	struct data_movements_interface *dm_interface =
 		(struct data_movements_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
@@ -45,7 +45,7 @@ int **data_movements_get_ref_ranks_table(starpu_data_handle_t handle)
 		return NULL;
 }
 
-int *data_movements_get_tags_table(starpu_data_handle_t handle)
+starpu_mpi_tag_t *data_movements_get_tags_table(starpu_data_handle_t handle)
 {
 	struct data_movements_interface *dm_interface =
 		(struct data_movements_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
@@ -94,8 +94,8 @@ int data_movements_reallocate_tables(starpu_data_handle_t handle, int size)
 
 	if (dm_interface->size)
 	{
-		_STARPU_MPI_MALLOC(dm_interface->tags, size*sizeof(int));
-		_STARPU_MPI_MALLOC(dm_interface->ranks, size*sizeof(int));
+		_STARPU_MPI_MALLOC(dm_interface->tags, size*sizeof(*dm_interface->tags));
+		_STARPU_MPI_MALLOC(dm_interface->ranks, size*sizeof(*dm_interface->ranks));
 	}
 
 	return 0 ;
@@ -129,14 +129,15 @@ static starpu_ssize_t data_movements_allocate_data_on_node(void *data_interface,
 {
 	struct data_movements_interface *dm_interface = (struct data_movements_interface *) data_interface;
 
-	int *addr_tags;
+	starpu_mpi_tag_t *addr_tags;
 	int *addr_ranks;
-	starpu_ssize_t requested_memory = dm_interface->size * sizeof(int);
+	starpu_ssize_t requested_memory_tags = dm_interface->size * sizeof(starpu_mpi_tag_t);
+	starpu_ssize_t requested_memory_ranks = dm_interface->size * sizeof(int);
 
-	addr_tags = (int*) starpu_malloc_on_node(node, requested_memory);
+	addr_tags = (starpu_mpi_tag_t*) starpu_malloc_on_node(node, requested_memory_tags);
 	if (!addr_tags)
 		goto fail_tags;
-	addr_ranks = (int*) starpu_malloc_on_node(node, requested_memory);
+	addr_ranks = (int*) starpu_malloc_on_node(node, requested_memory_ranks);
 	if (!addr_ranks)
 		goto fail_ranks;
 
@@ -144,10 +145,10 @@ static starpu_ssize_t data_movements_allocate_data_on_node(void *data_interface,
 	dm_interface->tags = addr_tags;
 	dm_interface->ranks = addr_ranks;
 
-	return 2*requested_memory;
+	return requested_memory_tags+requested_memory_ranks;
 
 fail_ranks:
-	starpu_free_on_node(node, (uintptr_t) addr_tags, requested_memory);
+	starpu_free_on_node(node, (uintptr_t) addr_tags, requested_memory_tags);
 fail_tags:
 	return -ENOMEM;
 }
@@ -155,10 +156,11 @@ fail_tags:
 static void data_movements_free_data_on_node(void *data_interface, unsigned node)
 {
 	struct data_movements_interface *dm_interface = (struct data_movements_interface *) data_interface;
-	starpu_ssize_t requested_memory = dm_interface->size * sizeof(int);
+	starpu_ssize_t requested_memory_tags = dm_interface->size * sizeof(starpu_mpi_tag_t);
+	starpu_ssize_t requested_memory_ranks = dm_interface->size * sizeof(int);
 
-	starpu_free_on_node(node, (uintptr_t) dm_interface->tags, requested_memory);
-	starpu_free_on_node(node, (uintptr_t) dm_interface->ranks, requested_memory);
+	starpu_free_on_node(node, (uintptr_t) dm_interface->tags, requested_memory_tags);
+	starpu_free_on_node(node, (uintptr_t) dm_interface->ranks, requested_memory_ranks);
 }
 
 static size_t data_movements_get_size(starpu_data_handle_t handle)
@@ -166,7 +168,7 @@ static size_t data_movements_get_size(starpu_data_handle_t handle)
 	size_t size;
 	struct data_movements_interface *dm_interface = (struct data_movements_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 
-	size = (dm_interface->size * 2 * sizeof(int)) + sizeof(int);
+	size = (dm_interface->size * sizeof(starpu_mpi_tag_t)) + (dm_interface->size * sizeof(int)) + sizeof(int);
 	return size;
 }
 
@@ -192,8 +194,8 @@ static int data_movements_pack_data(starpu_data_handle_t handle, unsigned node,
 		memcpy(data, &dm_interface->size, sizeof(int));
 		if (dm_interface->size)
 		{
-			memcpy(data+sizeof(int), dm_interface->tags, (dm_interface->size*sizeof(int)));
-			memcpy(data+sizeof(int)+(dm_interface->size*sizeof(int)), dm_interface->ranks, dm_interface->size*sizeof(int));
+			memcpy(data+sizeof(int), dm_interface->tags, (dm_interface->size*sizeof(starpu_mpi_tag_t)));
+			memcpy(data+sizeof(int)+(dm_interface->size*sizeof(starpu_mpi_tag_t)), dm_interface->ranks, dm_interface->size*sizeof(int));
 		}
 	}
 
@@ -216,8 +218,8 @@ static int data_movements_unpack_data(starpu_data_handle_t handle, unsigned node
 
 	if (dm_interface->size)
 	{
-		memcpy(dm_interface->tags, data+sizeof(int), dm_interface->size*sizeof(int));
-		memcpy(dm_interface->ranks, data+sizeof(int)+(dm_interface->size*sizeof(int)), dm_interface->size*sizeof(int));
+		memcpy(dm_interface->tags, data+sizeof(int), dm_interface->size*sizeof(starpu_mpi_tag_t));
+		memcpy(dm_interface->ranks, data+sizeof(int)+(dm_interface->size*sizeof(starpu_mpi_tag_t)), dm_interface->size*sizeof(int));
 	}
 
     return 0;
@@ -233,7 +235,7 @@ static int copy_any_to_any(void *src_interface, unsigned src_node,
 
 	if (starpu_interface_copy((uintptr_t) src_data_movements->tags, 0, src_node,
 				    (uintptr_t) dst_data_movements->tags, 0, dst_node,
-				     src_data_movements->size*sizeof(int),
+				     src_data_movements->size*sizeof(starpu_mpi_tag_t),
 				     async_data))
 		ret = -EAGAIN;
 	if (starpu_interface_copy((uintptr_t) src_data_movements->ranks, 0, src_node,
@@ -265,7 +267,7 @@ static struct starpu_data_interface_ops interface_data_movements_ops =
 	.describe = NULL
 };
 
-void data_movements_data_register(starpu_data_handle_t *handleptr, unsigned home_node, int *ranks, int *tags, int size)
+void data_movements_data_register(starpu_data_handle_t *handleptr, unsigned home_node, int *ranks, starpu_mpi_tag_t *tags, int size)
 {
 	struct data_movements_interface data_movements =
 	{

+ 4 - 4
mpi/src/load_balancer/policy/data_movements_interface.h

@@ -25,20 +25,20 @@
 struct data_movements_interface
 {
 	/** Data tags table */
-	int *tags;
+	starpu_mpi_tag_t *tags;
 	/** Ranks table (where to move the corresponding data) */
 	int *ranks;
 	/** Size of the tables */
 	int size;
 };
 
-void data_movements_data_register(starpu_data_handle_t *handle, unsigned home_node, int *ranks, int *tags, int size);
+void data_movements_data_register(starpu_data_handle_t *handle, unsigned home_node, int *ranks, starpu_mpi_tag_t *tags, int size);
 
-int **data_movements_get_ref_tags_table(starpu_data_handle_t handle);
+starpu_mpi_tag_t **data_movements_get_ref_tags_table(starpu_data_handle_t handle);
 int **data_movements_get_ref_ranks_table(starpu_data_handle_t handle);
 int data_movements_reallocate_tables(starpu_data_handle_t handle, int size);
 
-int *data_movements_get_tags_table(starpu_data_handle_t handle);
+starpu_mpi_tag_t *data_movements_get_tags_table(starpu_data_handle_t handle);
 int *data_movements_get_ranks_table(starpu_data_handle_t handle);
 int data_movements_get_size_tables(starpu_data_handle_t handle);
 

+ 6 - 6
mpi/src/load_balancer/policy/load_heat_propagation.c

@@ -27,14 +27,14 @@
 
 #if defined(STARPU_USE_MPI_MPI)
 
-static int TAG_LOAD(int n)
+static starpu_mpi_tag_t TAG_LOAD(int n)
 {
-	return (n+1) << 24;
+	return ((starpu_mpi_tag_t) n+1) << 24;
 }
 
-static int TAG_MOV(int n)
+static starpu_mpi_tag_t TAG_MOV(int n)
 {
-	return (n+1) << 20;
+	return ((starpu_mpi_tag_t) n+1) << 20;
 }
 
 /* Hash table of local pieces of data that has been moved out of the local MPI
@@ -132,7 +132,7 @@ static void balance(starpu_data_handle_t load_data_cpy)
 
 			if (nhandles)
 			{
-				int *tags = data_movements_get_tags_table(data_movements_handles[my_rank]);
+				starpu_mpi_tag_t *tags = data_movements_get_tags_table(data_movements_handles[my_rank]);
 				int *ranks = data_movements_get_ranks_table(data_movements_handles[my_rank]);
 
 				for (n = 0; n < nhandles; n++)
@@ -564,7 +564,7 @@ static int deinit_heat()
 		_STARPU_DEBUG("Move back %u data on node %d ..\n", ndata_to_move_back, my_rank);
 		data_movements_reallocate_tables(data_movements_handles[my_rank], ndata_to_move_back);
 
-		int *tags = data_movements_get_tags_table(data_movements_handles[my_rank]);
+		starpu_mpi_tag_t *tags = data_movements_get_tags_table(data_movements_handles[my_rank]);
 		int *ranks = data_movements_get_ranks_table(data_movements_handles[my_rank]);
 
 		int n = 0;

+ 1 - 0
mpi/src/mpi/starpu_mpi_mpi.c

@@ -1513,6 +1513,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		starpu_pthread_wait_wait(&_starpu_mpi_thread_wait);
 		STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 #endif
+		STARPU_VALGRIND_YIELD();
 	}
 
 	_STARPU_MPI_TRACE_POLLING_END();

+ 9 - 0
mpi/src/nmad/starpu_mpi_nmad.c

@@ -634,9 +634,18 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 
 	/* Tell pioman to use a bound thread for communication progression:
 	 * share the same core as StarPU's MPI thread, the MPI thread has very low activity with NMAD backend */
+#ifdef HAVE_PIOM_LTASK_SET_BOUND_THREAD_OS_INDEXES
+	/* We prefer to give the OS index of the core, because StarPU can have
+	 * a different vision of the topology, especially if STARPU_WORKERS_GETBIND
+	 * is enabled */
+	int indexes[1] = { starpu_get_pu_os_index((unsigned) _starpu_mpi_thread_cpuid) };
+	if (!_starpu_mpi_nobind)
+		piom_ltask_set_bound_thread_os_indexes(HWLOC_OBJ_PU, indexes, 1);
+#else
 	int indexes[1] = { _starpu_mpi_thread_cpuid };
 	if (!_starpu_mpi_nobind)
 		piom_ltask_set_bound_thread_indexes(HWLOC_OBJ_PU, indexes, 1);
+#endif
 
 	/* Register some hooks for communication progress if needed */
 	int polling_point_prog, polling_point_idle;

+ 14 - 12
mpi/src/starpu_mpi.c

@@ -325,22 +325,23 @@ starpu_mpi_tag_t starpu_mpi_data_get_tag(starpu_data_handle_t data)
 
 void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg)
 {
-	int me, rank, tag;
+	int me, rank;
+	starpu_mpi_tag_t data_tag;
 
 	rank = starpu_mpi_data_get_rank(data_handle);
 	if (rank == -1)
 	{
-		_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register() or starpu_mpi_data_register()\n");
+		_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register() or starpu_mpi_data_register_comm()\n");
 	}
 
 	starpu_mpi_comm_rank(comm, &me);
 	if (node == rank)
 		return;
 
-	tag = starpu_mpi_data_get_tag(data_handle);
-	if (tag == -1)
+	data_tag = starpu_mpi_data_get_tag(data_handle);
+	if (data_tag == -1)
 	{
-		_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register() or starpu_mpi_data_register()\n");
+		_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register() or starpu_mpi_data_register_comm()\n");
 	}
 
 	if (me == node)
@@ -350,7 +351,7 @@ void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t da
 		if (already_received == 0)
 		{
 			_STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data_handle, rank);
-			starpu_mpi_irecv_detached(data_handle, rank, tag, comm, callback, arg);
+			starpu_mpi_irecv_detached(data_handle, rank, data_tag, comm, callback, arg);
 		}
 	}
 	else if (me == rank)
@@ -360,14 +361,15 @@ void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t da
 		if (already_sent == 0)
 		{
 			_STARPU_MPI_DEBUG(1, "Sending data %p to %d\n", data_handle, node);
-			starpu_mpi_isend_detached(data_handle, node, tag, comm, NULL, NULL);
+			starpu_mpi_isend_detached(data_handle, node, data_tag, comm, NULL, NULL);
 		}
 	}
 }
 
 void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle, int node)
 {
-	int me, rank, tag;
+	int me, rank;
+	starpu_mpi_tag_t data_tag;
 
 	rank = starpu_mpi_data_get_rank(data_handle);
 	if (rank == -1)
@@ -379,8 +381,8 @@ void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle
 	if (node == rank)
 		return;
 
-	tag = starpu_mpi_data_get_tag(data_handle);
-	if (tag == -1)
+	data_tag = starpu_mpi_data_get_tag(data_handle);
+	if (data_tag == -1)
 	{
 		_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
 	}
@@ -393,7 +395,7 @@ void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle
 		if (already_received == 0)
 		{
 			_STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data_handle, rank);
-			starpu_mpi_recv(data_handle, rank, tag, comm, &status);
+			starpu_mpi_recv(data_handle, rank, data_tag, comm, &status);
 		}
 	}
 	else if (me == rank)
@@ -403,7 +405,7 @@ void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle
 		if (already_sent == 0)
 		{
 			_STARPU_MPI_DEBUG(1, "Sending data %p to %d\n", data_handle, node);
-			starpu_mpi_send(data_handle, node, tag, comm);
+			starpu_mpi_send(data_handle, node, data_tag, comm);
 		}
 	}
 }

+ 11 - 0
mpi/src/starpu_mpi_private.c

@@ -15,6 +15,7 @@
  */
 
 #include <starpu_mpi_private.h>
+#include <core/topology.h>
 
 int _starpu_debug_rank=-1;
 int _starpu_debug_level_min=0;
@@ -70,4 +71,14 @@ void _starpu_mpi_env_init(void)
 	_starpu_mpi_mem_throttle = starpu_get_env_number_default("STARPU_MPI_MEM_THROTTLE", 0);
 	_starpu_debug_level_min = starpu_get_env_number_default("STARPU_MPI_DEBUG_LEVEL_MIN", 0);
 	_starpu_debug_level_max = starpu_get_env_number_default("STARPU_MPI_DEBUG_LEVEL_MAX", 0);
+
+	int mpi_thread_coreid = starpu_get_env_number_default("STARPU_MPI_THREAD_COREID", -1);
+	if (_starpu_mpi_thread_cpuid >= 0 && mpi_thread_coreid >= 0) /* both given: the CPUID value is kept, COREID is only applied when CPUID is -1 */
+	{
+		_STARPU_DISP("Warning: STARPU_MPI_THREAD_CPUID and STARPU_MPI_THREAD_COREID cannot be set at the same time. STARPU_MPI_THREAD_CPUID will be used.\n");
+	}
+	if (_starpu_mpi_thread_cpuid == -1 && mpi_thread_coreid >= 0)
+	{
+		_starpu_mpi_thread_cpuid = mpi_thread_coreid * _starpu_get_nhyperthreads();
+	}
 }

+ 5 - 5
mpi/src/starpu_mpi_task_insert.c

@@ -797,15 +797,15 @@ void _starpu_mpi_redux_data_recv_callback(void *callback_arg)
 void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle, int prio)
 {
 	int me, rank, nb_nodes;
-	starpu_mpi_tag_t tag;
+	starpu_mpi_tag_t data_tag;
 
 	rank = starpu_mpi_data_get_rank(data_handle);
-	tag = starpu_mpi_data_get_tag(data_handle);
+	data_tag = starpu_mpi_data_get_tag(data_handle);
 	if (rank == -1)
 	{
 		_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
 	}
-	if (tag == -1)
+	if (data_tag == -1)
 	{
 		_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
 	}
@@ -851,7 +851,7 @@ void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 				struct _starpu_mpi_redux_data_args *args;
 				_STARPU_MPI_MALLOC(args, sizeof(struct _starpu_mpi_redux_data_args));
 				args->data_handle = data_handle;
-				args->data_tag = tag;
+				args->data_tag = data_tag;
 				args->node = i;
 				args->comm = comm;
 
@@ -878,7 +878,7 @@ void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 	else
 	{
 		_STARPU_MPI_DEBUG(1, "Sending redux handle to %d ...\n", rank);
-		starpu_mpi_isend_detached_prio(data_handle, rank, tag, prio, comm, NULL, NULL);
+		starpu_mpi_isend_detached_prio(data_handle, rank, data_tag, prio, comm, NULL, NULL);
 		starpu_task_insert(data_handle->init_cl, STARPU_W, data_handle, 0);
 	}
 	/* FIXME: In order to prevent simultaneous receive submissions

+ 5 - 3
mpi/tests/Makefile.am

@@ -24,7 +24,7 @@ noinst_PROGRAMS		=
 if STARPU_HAVE_WINDOWS
 LOADER_BIN		=
 else
-loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
+loader_CPPFLAGS 	= 	$(AM_CPPFLAGS) -I$(top_builddir)/src/
 if !STARPU_SIMGRID
 LOADER			=	loader
 LOADER_BIN		=	$(abs_top_builddir)/mpi/tests/$(LOADER)
@@ -84,9 +84,9 @@ endif
 endif
 
 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
-LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ ../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la @LIBS@ $(FXT_LIBS) $(MAGMA_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_srcdir)/mpi/include -I$(top_srcdir)/mpi/src -I$(top_srcdir)/src -I$(top_builddir)/src -I$(top_srcdir)/examples/
-AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ ../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la $(FXT_LIBS) $(MAGMA_LIBS)
+LIBS += $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
 
 ########################
 # Unit testcases       #
@@ -109,6 +109,7 @@ starpu_mpi_TESTS +=				\
 	insert_task_owner			\
 	insert_task_owner2			\
 	insert_task_owner_data			\
+	insert_task_tags			\
 	matrix					\
 	matrix2					\
 	mpi_barrier				\
@@ -205,6 +206,7 @@ noinst_PROGRAMS +=				\
 	insert_task_count			\
 	insert_task_dyn_handles			\
 	insert_task_seq				\
+	insert_task_tags			\
 	multiple_send				\
 	mpi_scatter_gather			\
 	mpi_reduction				\

+ 84 - 0
mpi/tests/insert_task_tags.c

@@ -0,0 +1,84 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include <math.h>
+#include "helper.h"
+
+void func_cpu(void *descr[], void *_args)
+{
+	(void) _args;
+	(void) descr;
+
+	FPRINTF_MPI(stderr, "Hello\n");
+}
+
+struct starpu_codelet mycodelet =
+{
+	.cpu_funcs = {func_cpu},
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
+	.model = &starpu_perfmodel_nop,
+	.name = "insert_task_tags"
+};
+
+int main(int argc, char **argv)
+{
+	int ret, rank, err;
+	int x=32;
+	starpu_data_handle_t handle0;
+	starpu_data_handle_t handle1;
+	int64_t *value;
+
+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+
+	if (rank != 0 && rank != 1)
+		goto end;
+
+	starpu_variable_data_register(&handle0, STARPU_MAIN_RAM, (uintptr_t)&x, sizeof(x));
+	starpu_variable_data_register(&handle1, STARPU_MAIN_RAM, (uintptr_t)&x, sizeof(x));
+
+	starpu_mpi_comm_get_attr(MPI_COMM_WORLD, STARPU_MPI_TAG_UB, &value, &err);
+	assert(err == 1);
+
+	starpu_mpi_data_register(handle0, (*value)-1, 1);
+	starpu_mpi_data_register(handle1, (*value)-2, 1);
+
+	err = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet,
+				     STARPU_EXECUTE_ON_NODE, 0,
+				     STARPU_RW, handle0,
+				     0);
+	assert(err == 0);
+
+	err = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet,
+				     STARPU_EXECUTE_ON_NODE, 1,
+				     STARPU_RW, handle1,
+				     0);
+	assert(err == 0);
+
+	FPRINTF_MPI(stderr, "Waiting ...\n");
+	starpu_task_wait_for_all();
+	starpu_data_unregister(handle0);
+	starpu_data_unregister(handle1);
+
+end:
+	starpu_mpi_shutdown();
+
+	return 0;
+}
+

+ 1 - 1
mpi/tests/ring.c

@@ -99,7 +99,7 @@ int main(int argc, char **argv)
 
 	for (loop = 0; loop < nloops; loop++)
 	{
-		int tag = loop*size + rank;
+		starpu_mpi_tag_t tag = ((starpu_mpi_tag_t) loop)*size + rank;
 
 		if (loop == 0 && rank == 0)
 		{

+ 1 - 1
mpi/tests/ring_async.c

@@ -99,7 +99,7 @@ int main(int argc, char **argv)
 
 	for (loop = 0; loop < nloops; loop++)
 	{
-		int tag = loop*size + rank;
+		starpu_mpi_tag_t tag = ((starpu_mpi_tag_t) loop)*size + rank;
 
 		if (loop == 0 && rank == 0)
 		{

+ 1 - 1
mpi/tests/ring_async_implicit.c

@@ -92,7 +92,7 @@ int main(int argc, char **argv)
 
 	for (loop = 0; loop < nloops; loop++)
 	{
-		int tag = loop*size + rank;
+		starpu_mpi_tag_t tag = ((starpu_mpi_tag_t) loop)*size + rank;
 
 		if (loop == 0 && rank == 0)
 		{

+ 1 - 1
mpi/tests/ring_sync.c

@@ -99,7 +99,7 @@ int main(int argc, char **argv)
 
 	for (loop = 0; loop < nloops; loop++)
 	{
-		int tag = loop*size + rank;
+		starpu_mpi_tag_t tag = ((starpu_mpi_tag_t) loop)*size + rank;
 
 		if (loop == 0 && rank == 0)
 		{

+ 1 - 1
mpi/tests/ring_sync_detached.c

@@ -112,7 +112,7 @@ int main(int argc, char **argv)
 
 	for (loop = 0; loop < nloops; loop++)
 	{
-		int tag = loop*size + rank;
+		starpu_mpi_tag_t tag = ((starpu_mpi_tag_t) loop)*size + rank;
 
 		if (loop == 0 && rank == 0)
 		{

+ 3 - 3
mpi/tests/user_defined_datatype.c

@@ -26,9 +26,9 @@
 #  define ELEMENTS 1000
 #endif
 
-typedef void (*test_func)(starpu_data_handle_t *, int, int, int);
+typedef void (*test_func)(starpu_data_handle_t *, int, int, starpu_mpi_tag_t);
 
-void test_handle_irecv_isend_detached(starpu_data_handle_t *handles, int nb_handles, int rank, int tag)
+void test_handle_irecv_isend_detached(starpu_data_handle_t *handles, int nb_handles, int rank, starpu_mpi_tag_t tag)
 {
 	int i;
 	(void)rank;
@@ -42,7 +42,7 @@ void test_handle_irecv_isend_detached(starpu_data_handle_t *handles, int nb_hand
 		starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, handles[i], 0, NULL, NULL);
 }
 
-void test_handle_recv_send(starpu_data_handle_t *handles, int nb_handles, int rank, int tag)
+void test_handle_recv_send(starpu_data_handle_t *handles, int nb_handles, int rank, starpu_mpi_tag_t tag)
 {
 	int i;
 

+ 4 - 3
mpi/tools/Makefile.am

@@ -16,14 +16,15 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 #
-include $(top_srcdir)/starpu.mk
+include $(top_srcdir)/starpu-notests.mk
 
 SUBDIRS =
 
 AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(STARPU_COI_CPPFLAGS) $(GLOBAL_AM_CFLAGS)
-LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la @LIBS@ $(FXT_LIBS)
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/tools/ -I$(top_srcdir)/mpi/ -I$(top_srcdir)/mpi/include -I$(top_builddir)/src -I$(top_srcdir)/src -DSTARPU_REPLAY_MPI
-AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@ $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
+AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@
+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la $(FXT_LIBS)
+LIBS += $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
 
 CC=$(CC_OR_MPICC)
 CCLD=$(CC_OR_MPICC)

+ 2 - 2
sc_hypervisor/examples/Makefile.am

@@ -16,9 +16,9 @@
 include $(top_srcdir)/starpu.mk
 
 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS)
-LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@ $(top_builddir)/sc_hypervisor/src/libsc_hypervisor.la
 AM_CPPFLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/examples -I$(top_builddir)/include -I$(top_srcdir)/sc_hypervisor/include -I$(top_srcdir)/sc_hypervisor/examples
-AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_GLPK_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ $(top_builddir)/sc_hypervisor/src/libsc_hypervisor.la
+LIBS += $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_GLPK_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
 
 noinst_PROGRAMS =				\
 	app_driven_test/app_driven_test		\

+ 2 - 4
sc_hypervisor/src/Makefile.am

@@ -17,14 +17,12 @@
 include $(top_srcdir)/starpu-notests.mk
 
 AM_CFLAGS = -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS) $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) $(GLOBAL_AM_CFLAGS)
-LIBS = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la
 AM_CPPFLAGS = -I$(top_builddir)/include -I$(top_srcdir)/include -I$(top_srcdir)/sc_hypervisor/include/ -I$(top_srcdir)/sc_hypervisor/src
-AM_LDFLAGS = $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@
+LIBS += $(STARPU_CUDA_LDFLAGS) $(STARPU_OPENCL_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_SCIF_LDFLAGS)
 
 lib_LTLIBRARIES = libsc_hypervisor.la
 
-libsc_hypervisor_la_LIBADD = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la
-
 libsc_hypervisor_la_SOURCES = 				\
 	sc_hypervisor.c					\
 	sc_config.c					\

+ 4 - 4
socl/examples/Makefile.am

@@ -16,8 +16,8 @@
 include $(top_srcdir)/starpu.mk
 
 AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS)
-LIBS = $(top_builddir)/socl/src/libsocl-@STARPU_EFFECTIVE_VERSION@.la $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la
-AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS)
+LIBS += $(top_builddir)/socl/src/libsocl-@STARPU_EFFECTIVE_VERSION@.la $(top_builddir)/src/@LIBSTARPU_LINK@
+LIBS += $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS)
 
 
 SOCL_EXAMPLES	=
@@ -31,7 +31,7 @@ check_PROGRAMS	=	$(SOCL_EXAMPLES)
 if !STARPU_HAVE_WINDOWS
 ## test loader program
 LOADER			=	loader
-loader_CPPFLAGS 	=	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
+loader_CPPFLAGS 	=	$(AM_CPPFLAGS) -I$(top_builddir)/src/
 LOADER_BIN		=	$(abs_top_builddir)/socl/examples/$(LOADER)
 loader_SOURCES		=	../../tests/loader.c
 noinst_PROGRAMS		+=	loader
@@ -72,7 +72,7 @@ matmul_matmul_SOURCES = matmul/matmul.c
 matmul_matmul_LDADD = -lm
 mansched_mansched_SOURCES = mansched/mansched.c
 
-#mandelbrot_mandelbrot_CPPFLAGS = $(AM_CPPFLAGS) $(AM_CFLAGS)
+#mandelbrot_mandelbrot_CPPFLAGS = $(AM_CPPFLAGS)
 #if STARPU_HAVE_X11
 #mandelbrot_mandelbrot_CPPFLAGS += $(X_CFLAGS)
 #mandelbrot_mandelbrot_LDADD = $(X_PRE_LIBS) $(X_LIBS) -lX11 $(X_EXTRA_LIBS)

+ 1 - 1
socl/examples/basic/basic.c

@@ -192,7 +192,7 @@ int main(int UNUSED(argc), char** UNUSED(argv)) {
       check(err, "clGetEventProfilingInfo");\
       err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &t1, NULL);\
       check(err, "clGetEventProfilingInfo");\
-      printf("Profiling %s: %lu nanoseconds\n", label, t1-t0);\
+      printf("Profiling %s: %llu nanoseconds\n", label, (unsigned long long) (t1-t0));\
    } while (0);
 
    DURATION(eventW1, "first buffer writing");

+ 6 - 6
socl/examples/clinfo/clinfo.c

@@ -189,7 +189,7 @@ main(void) {
    cl_ulong val; \
    err = clGetDeviceInfo(devices[j], CL_D, sizeof(val), &val, NULL); \
    checkErr(err, "clGetDeviceInfo(" #CL_D ")"); \
-   printf(str, val); \
+   printf(str, (unsigned long long) val); \
 }
 
 #define GET_BOOL(CL_D,str) { \
@@ -223,7 +223,7 @@ main(void) {
                GET_UINT(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, "  Preferred vector width double:\t\t %u\n")
                GET_UINT(CL_DEVICE_MAX_CLOCK_FREQUENCY, "  Max clock frequency:\t\t\t\t %uMHz\n")
                GET_UINT(CL_DEVICE_ADDRESS_BITS, "  Address bits:\t\t\t\t\t %ubits\n")
-               GET_ULONG(CL_DEVICE_MAX_MEM_ALLOC_SIZE, "  Max memory allocation:\t\t\t %lu bytes\n")
+               GET_ULONG(CL_DEVICE_MAX_MEM_ALLOC_SIZE, "  Max memory allocation:\t\t\t %llu bytes\n")
 
                GET_BOOL(CL_DEVICE_IMAGE_SUPPORT, "  Image support:\t\t\t\t %s\n")
 
@@ -258,9 +258,9 @@ main(void) {
                }
 
                GET_UINT(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, "  Cache line size:\t\t\t\t %u bytes\n")
-               GET_ULONG(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, "  Cache size:\t\t\t\t\t %lu bytes\n")
-               GET_ULONG(CL_DEVICE_GLOBAL_MEM_SIZE, "  Global memory size:\t\t\t\t %lu bytes\n")
-               GET_ULONG(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, "  Constant buffer size:\t\t\t\t %lu bytes\n")
+               GET_ULONG(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, "  Cache size:\t\t\t\t\t %llu bytes\n")
+               GET_ULONG(CL_DEVICE_GLOBAL_MEM_SIZE, "  Global memory size:\t\t\t\t %llu bytes\n")
+               GET_ULONG(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, "  Constant buffer size:\t\t\t\t %llu bytes\n")
                GET_UINT(CL_DEVICE_MAX_CONSTANT_ARGS, "  Max number of constant args:\t\t\t %u\n")
 
                {
@@ -278,7 +278,7 @@ main(void) {
                   }
                }
 
-               GET_ULONG(CL_DEVICE_LOCAL_MEM_SIZE, "  Local memory size:\t\t\t\t %lu bytes\n")
+               GET_ULONG(CL_DEVICE_LOCAL_MEM_SIZE, "  Local memory size:\t\t\t\t %llu bytes\n")
                GET_SIZET(CL_DEVICE_PROFILING_TIMER_RESOLUTION, "  Profiling timer resolution:\t\t\t %u\n")
                GET_BOOL_CUSTOM(CL_DEVICE_ENDIAN_LITTLE, "  Device endianess:\t\t\t\t %s\n", "Little", "Big")
                GET_BOOL(CL_DEVICE_AVAILABLE, "  Available:\t\t\t\t\t %s\n")

+ 2 - 2
socl/src/Makefile.am

@@ -19,9 +19,9 @@ include $(top_srcdir)/starpu-notests.mk
 CLEANFILES = *.gcno *.gcda
 
 AM_CFLAGS = $(GLOBAL_AM_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(HWLOC_CFLAGS) $(FXT_CFLAGS)
-libsocl_@STARPU_EFFECTIVE_VERSION@_la_LIBADD = $(top_builddir)/src/libstarpu-@STARPU_EFFECTIVE_VERSION@.la
 AM_CPPFLAGS = -DBUILDING_SOCL -I$(top_srcdir)/include/ -I$(top_builddir)/include -I$(top_builddir)/src -I$(top_srcdir)/src -I$(top_srcdir)/socl/src
-AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) -no-undefined
+LIBS += $(top_builddir)/src/@LIBSTARPU_LINK@ \
+	    $(STARPU_OPENCL_LDFLAGS) $(FXT_LDFLAGS)
 
 SUBDIRS =
 

+ 5 - 4
src/Makefile.am

@@ -17,6 +17,10 @@
 
 include $(top_srcdir)/starpu-notests.mk
 
+AM_CFLAGS = $(GLOBAL_AM_CFLAGS)
+AM_CPPFLAGS = -I$(top_srcdir)/include/ -DBUILDING_STARPU  $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(STARPU_FPGA_CPPFLAGS) $(FXT_CFLAGS) $(STARPU_COI_CPPFLAGS) $(STARPU_SCIF_CPPFLAGS) $(STARPU_RCCE_CFLAGS) $(STARPU_RCCE_CPPFLAGS) -DSTARPU_DATADIR='"$(datadir)"'
+LIBS += -lm $(FXT_LDFLAGS) $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(LIBSTARPU_LDFLAGS) $(PAPI_LIBS) $(STARPU_FPGA_LDFLAGS)
+
 SUBDIRS =
 
 CLEANFILES = *.gcno *.gcda *.linkinfo
@@ -55,10 +59,7 @@ endif STARPU_HAVE_WINDOWS
 
 lib_LTLIBRARIES = libstarpu-@STARPU_EFFECTIVE_VERSION@.la
 
-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_CPPFLAGS = -I$(top_srcdir)/include/ -DBUILDING_STARPU  $(GLOBAL_AM_CFLAGS) $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(STARPU_FPGA_CPPFLAGS) $(FXT_CFLAGS) $(STARPU_COI_CPPFLAGS) $(STARPU_SCIF_CPPFLAGS) $(STARPU_RCCE_CFLAGS) $(STARPU_RCCE_CPPFLAGS) -DSTARPU_DATADIR='"$(datadir)"'
-
-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_LIBADD = -lm $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(LIBSTARPU_LDFLAGS) $(STARPU_FPGA_LDFLAGS)
-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_LDFLAGS = $(ldflags) $(FXT_LDFLAGS) -no-undefined									\
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_LDFLAGS = $(ldflags) -no-undefined									\
   -version-info $(libstarpu_so_version)
 
 if STARPU_HAVE_DARWIN

+ 1 - 1
src/common/fxt.c

@@ -99,7 +99,7 @@ static void _starpu_profile_set_tracefile(void)
 	else
 		_starpu_mkpath_and_check(fxt_prefix, S_IRWXU);
 
-	char suffix[127];
+	char suffix[64];
 	char *fxt_suffix = starpu_getenv("STARPU_FXT_SUFFIX");
 	if (!fxt_suffix)
 	{

+ 17 - 11
src/common/fxt.h

@@ -235,6 +235,8 @@
 #define _STARPU_FUT_DATA_STATE_SHARED     0x5184
 
 #define _STARPU_FUT_DATA_REQUEST_CREATED   0x5185
+#define _STARPU_FUT_PAPI_TASK_EVENT_VALUE   0x5186
+#define _STARPU_FUT_TASK_EXCLUDE_FROM_DAG   0x5187
 
 
 /* Predefined FUT key masks */
@@ -259,8 +261,6 @@
 #define _STARPU_FUT_KEYMASK_HYP            FUT_KEYMASK18
 #define _STARPU_FUT_KEYMASK_HYP_VERBOSE    FUT_KEYMASK19
 
-#define _STARPU_FUT_PAPI_TASK_EVENT_VALUE   0x5186
-
 extern unsigned long _starpu_job_cnt;
 
 static inline unsigned long _starpu_fxt_get_job_id(void)
@@ -844,18 +844,23 @@ do {									\
 #define _STARPU_TRACE_GHOST_TASK_DEPS(ghost_prev_id, job_succ)		\
 	_STARPU_FUT_FULL_PROBE4STR(_STARPU_FUT_KEYMASK_TASK_VERBOSE, _STARPU_FUT_TASK_DEPS, (ghost_prev_id), (job_succ)->job_id, (job_succ)->task->type, 1, "ghost")
 
-#define _STARPU_TRACE_TASK_NAME(job)						\
-do {										\
-	unsigned exclude_from_dag = (job)->exclude_from_dag;			\
+#define _STARPU_TRACE_TASK_EXCLUDE_FROM_DAG(job)			\
+	do {								\
+	unsigned exclude_from_dag = (job)->exclude_from_dag;		\
+	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_TASK_EXCLUDE_FROM_DAG, (job)->job_id, (long unsigned)exclude_from_dag); \
+} while(0)
+
+#define _STARPU_TRACE_TASK_NAME(job)					\
+	do {								\
         const char *model_name = _starpu_job_get_task_name((job));                       \
 	if (model_name)					                        \
 	{									\
-		_STARPU_FUT_FULL_PROBE4STR(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_TASK_NAME, (job)->job_id, _starpu_gettid(), (long unsigned)exclude_from_dag, 1, model_name);\
+		_STARPU_FUT_FULL_PROBE1STR(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_TASK_NAME, (job)->job_id, model_name);\
 	}									\
 	else {									\
-		FUT_FULL_PROBE4(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_TASK_NAME, (job)->job_id, _starpu_gettid(), (long unsigned)exclude_from_dag, 0);\
+		_STARPU_FUT_FULL_PROBE1STR(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_TASK_NAME, (job)->job_id, "unknown");\
 	}									\
-} while(0);
+} while(0)
 
 #define _STARPU_TRACE_TASK_COLOR(job)						\
 do { \
@@ -1269,8 +1274,8 @@ do {										\
 #define _STARPU_TRACE_SCHED_COMPONENT_CONNECT(parent, child)		\
 	FUT_RAW_ALWAYS_PROBE2(FUT_CODE(_STARPU_FUT_SCHED_COMPONENT_CONNECT,2), parent, child);
 
-#define _STARPU_TRACE_SCHED_COMPONENT_PUSH(from, to, task)		\
-	FUT_FULL_PROBE5(_STARPU_FUT_KEYMASK_SCHED, _STARPU_FUT_SCHED_COMPONENT_PUSH, _starpu_gettid(), from, to, task, (task)->priority);
+#define _STARPU_TRACE_SCHED_COMPONENT_PUSH(from, to, task, prio)		\
+	FUT_FULL_PROBE5(_STARPU_FUT_KEYMASK_SCHED, _STARPU_FUT_SCHED_COMPONENT_PUSH, _starpu_gettid(), from, to, task, prio);
 
 #define _STARPU_TRACE_SCHED_COMPONENT_PULL(from, to, task)		\
 	FUT_FULL_PROBE5(_STARPU_FUT_KEYMASK_SCHED, _STARPU_FUT_SCHED_COMPONENT_PULL, _starpu_gettid(), from, to, task, (task)->priority);
@@ -1328,6 +1333,7 @@ do {										\
 #define _STARPU_TRACE_TAG_DEPS(a, b)		do {(void)(a); (void)(b);} while(0)
 #define _STARPU_TRACE_TASK_DEPS(a, b)		do {(void)(a); (void)(b);} while(0)
 #define _STARPU_TRACE_GHOST_TASK_DEPS(a, b)	do {(void)(a); (void)(b);} while(0)
+#define _STARPU_TRACE_TASK_EXCLUDE_FROM_DAG(a)	do {(void)(a);} while(0)
 #define _STARPU_TRACE_TASK_NAME(a)		do {(void)(a);} while(0)
 #define _STARPU_TRACE_TASK_COLOR(a)		do {(void)(a);} while(0)
 #define _STARPU_TRACE_TASK_DONE(a)		do {(void)(a);} while(0)
@@ -1421,7 +1427,7 @@ do {										\
 #define _STARPU_TRACE_HYPERVISOR_END()                  do {} while(0)
 #define _STARPU_TRACE_SCHED_COMPONENT_NEW(component)	do {(void)(component);} while (0)
 #define _STARPU_TRACE_SCHED_COMPONENT_CONNECT(parent, child)	do {(void)(parent); (void)(child);} while (0)
-#define _STARPU_TRACE_SCHED_COMPONENT_PUSH(from, to, task)	do {(void)(from); (void)(to); (void)(task);} while (0)
+#define _STARPU_TRACE_SCHED_COMPONENT_PUSH(from, to, task, prio)	do {(void)(from); (void)(to); (void)(task); (void)(prio);} while (0)
 #define _STARPU_TRACE_SCHED_COMPONENT_PULL(from, to, task)	do {(void)(from); (void)(to); (void)(task);} while (0)
 #define _STARPU_TRACE_HANDLE_DATA_REGISTER(handle)	do {(void)(handle);} while (0)
 #define _STARPU_TRACE_HANDLE_DATA_UNREGISTER(handle)	do {(void)(handle);} while (0)

+ 11 - 5
src/core/jobs.c

@@ -62,6 +62,7 @@ void _starpu_exclude_task_from_dag(struct starpu_task *task)
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
 
 	j->exclude_from_dag = 1;
+	_STARPU_TRACE_TASK_EXCLUDE_FROM_DAG(j);
 }
 
 /* create an internal struct _starpu_job structure to encapsulate the task */
@@ -387,6 +388,13 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 	if (_starpu_graph_record)
 		_starpu_graph_drop_job(j);
 
+	/* Get callback pointer for codelet before notifying dependencies, in
+	   case dependencies free the codelet (see starpu_data_unregister for
+	   instance) */
+	void (*callback)(void *) = task->callback_func;
+	if (!callback && task->cl)
+		callback = task->cl->callback_func;
+
 	/* Task does not have a cl, but has explicit data dependencies, we need
 	 * to tell them that we will not exist any more before notifying the
 	 * tasks waiting for us
@@ -404,8 +412,6 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 			_starpu_spin_unlock(&handle->header_lock);
 	}
 
-	_STARPU_TRACE_TASK_NAME(j);
-
 	/* If this is a continuation, we do not notify task/tag dependencies
 	 * now. Task/tag dependencies will be notified only when the continued
 	 * task fully completes */
@@ -424,7 +430,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 	{
 		/* the callback is executed after the dependencies so that we may remove the tag
 		 * of the task itself */
-		if (task->callback_func)
+		if (callback)
 		{
 			int profiling = starpu_profiling_status_get();
 			if (profiling && task->profiling_info)
@@ -434,7 +440,6 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 			 * within the callback */
 			_starpu_set_local_worker_status(STATUS_CALLBACK);
 
-
 			/* Perhaps we have nested callbacks (eg. with chains of empty
 			 * tasks). So we store the current task and we will restore it
 			 * later. */
@@ -443,7 +448,8 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 			_starpu_set_current_task(task);
 
 			_STARPU_TRACE_START_CALLBACK(j);
-			task->callback_func(task->callback_arg);
+			if (callback)
+				callback(task->callback_arg);
 			_STARPU_TRACE_END_CALLBACK(j);
 
 			_starpu_set_current_task(current_task);

+ 2 - 1
src/core/perfmodel/perfmodel_bus.c

@@ -2684,13 +2684,14 @@ static void write_bus_platform_file_content(int version)
 			"   <prop id=\"network/TCP%cgamma\" value=\"-1\"></prop>\n"
 			"   <prop id=\"network/latency%cfactor\" value=\"1\"></prop>\n"
 			"   <prop id=\"network/bandwidth%cfactor\" value=\"1\"></prop>\n"
+			"   <prop id=\"network/weight%cS\" value=\"0.0\"></prop>\n"
 			" </config>\n"
 			" <AS  id=\"AS0\"  routing=\"Full\">\n"
 			"   <host id=\"MAIN\" %s=\"1%s\"/>\n",
 			version == 3
 			? "http://simgrid.gforge.inria.fr/simgrid.dtd"
 			: "http://simgrid.gforge.inria.fr/simgrid/simgrid.dtd",
-			version, dash, dash, dash, speed, flops);
+			version, dash, dash, dash, dash, speed, flops);
 
 	for (i = 0; i < ncpus; i++)
 		/* TODO: host memory for out-of-core simulation */

+ 16 - 3
src/core/perfmodel/perfmodel_history.c

@@ -1158,7 +1158,7 @@ void starpu_perfmodel_get_model_path(const char *symbol, char *path, size_t maxl
 }
 
 #ifndef STARPU_SIMGRID
-static void save_history_based_model(struct starpu_perfmodel *model)
+void starpu_save_history_based_model(struct starpu_perfmodel *model)
 {
 	STARPU_ASSERT(model);
 	STARPU_ASSERT(model->symbol);
@@ -1202,7 +1202,7 @@ static void _starpu_dump_registered_models(void)
 	     node  = _starpu_perfmodel_list_next(node))
 	{
 		if (node->model->is_init)
-			save_history_based_model(node->model);
+			starpu_save_history_based_model(node->model);
 	}
 
 	STARPU_PTHREAD_RWLOCK_UNLOCK(&registered_models_rwlock);
@@ -1229,12 +1229,18 @@ void _starpu_initialize_registered_performance_models(void)
 	unsigned ncuda =  conf->topology.nhwcudagpus;
 	unsigned nopencl = conf->topology.nhwopenclgpus;
 	unsigned nmic = 0;
+#if STARPU_MAXMICDEVS > 0 || STARPU_MAXMPIDEVS > 0
 	unsigned i;
+#endif
+#if STARPU_MAXMICDEVS > 0
 	for(i = 0; i < conf->topology.nhwmicdevices; i++)
 		nmic += conf->topology.nhwmiccores[i];
+#endif
 	unsigned nmpi = 0;
+#if STARPU_MAXMPIDEVS > 0
 	for(i = 0; i < conf->topology.nhwmpidevices; i++)
 		nmpi += conf->topology.nhwmpicores[i];
+#endif
 
 	// We used to allocate 2**(ncores + ncuda + nopencl + nmic + nmpi), this is too big
 	// We now allocate only 2*(ncores + ncuda + nopencl + nmic + nmpi), and reallocate when necessary in starpu_perfmodel_arch_comb_add
@@ -1511,6 +1517,13 @@ int starpu_perfmodel_unload_model(struct starpu_perfmodel *model)
 		model->symbol = NULL;
 	}
 
+	starpu_perfmodel_deinit(model);
+
+	return 0;
+}
+
+int starpu_perfmodel_deinit(struct starpu_perfmodel *model){
+
 	_starpu_deinitialize_performance_model(model);
 	free(model->state);
 	model->state = NULL;
@@ -1928,7 +1941,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 					{
 						char archname[STR_SHORT_LENGTH];
 						starpu_perfmodel_get_arch_name(arch, archname, sizeof(archname), impl);
-						_STARPU_DISP("Too big deviation for model %s on %s: %fus vs average %fus, %u such errors against %u samples (%+f%%), flushing the performance model. Use the STARPU_HISTORY_MAX_ERROR environement variable to control the threshold (currently %d%%)\n", model->symbol, archname, measured, entry->mean, entry->nerror, entry->nsample, measured * 100. / entry->mean - 100, historymaxerror);
+						_STARPU_DISP("Too big deviation for model %s on %s: %fus vs average %fus, %u such errors against %u samples (%+f%%), flushing the performance model. Use the STARPU_HISTORY_MAX_ERROR environment variable to control the threshold (currently %d%%)\n", model->symbol, archname, measured, entry->mean, entry->nerror, entry->nsample, measured * 100. / entry->mean - 100, historymaxerror);
 						entry->sum = 0.0;
 						entry->sum2 = 0.0;
 						entry->nsample = 0;

+ 8 - 3
src/core/task.c

@@ -870,6 +870,7 @@ int _starpu_task_submit(struct starpu_task *task, int nodeps)
 	_STARPU_LOG_IN();
 	STARPU_ASSERT(task);
 	STARPU_ASSERT_MSG(task->magic == _STARPU_TASK_MAGIC, "Tasks must be created with starpu_task_create, or initialized with starpu_task_init.");
+	STARPU_ASSERT_MSG(starpu_is_initialized(), "starpu_init must be called (and return no error) before submitting tasks.");
 
 	int ret;
 	{
@@ -940,6 +941,7 @@ int _starpu_task_submit(struct starpu_task *task, int nodeps)
 		_STARPU_TRACE_TASK_SUBMIT(j,
 			_starpu_get_sched_ctx_struct(task->sched_ctx)->iterations[0],
 			_starpu_get_sched_ctx_struct(task->sched_ctx)->iterations[1]);
+		_STARPU_TRACE_TASK_NAME(j);
 	}
 
 	/* If this is a continuation, we don't modify the implicit data dependencies detected earlier. */
@@ -976,10 +978,13 @@ int _starpu_task_submit(struct starpu_task *task, int nodeps)
 
 	/* If profiling is activated, we allocate a structure to store the
 	 * appropriate info. */
-	struct starpu_profiling_task_info *info;
+	struct starpu_profiling_task_info *info = task->profiling_info;
 	int profiling = starpu_profiling_status_get();
-	info = _starpu_allocate_profiling_info_if_needed(task);
-	task->profiling_info = info;
+	if (!info)
+	{
+		info = _starpu_allocate_profiling_info_if_needed(task);
+		task->profiling_info = info;
+	}
 
 	/* The task is considered as blocked until we are sure there remains no
 	 * dependency. */

+ 28 - 4
src/core/topology.c

@@ -962,9 +962,17 @@ static void _starpu_initialize_workers_bindid(struct _starpu_machine_config *con
 	unsigned i;
 
 	struct _starpu_machine_topology *topology = &config->topology;
+	int nhyperthreads = topology->nhwpus / topology->nhwcpus;
+	unsigned bind_on_core = 0;
+	int scale = 1;
 
 	config->current_bindid = 0;
 
+	if (starpu_getenv("STARPU_WORKERS_CPUID") && starpu_getenv("STARPU_WORKERS_COREID"))
+	{
+		_STARPU_DISP("Warning: STARPU_WORKERS_CPUID and STARPU_WORKERS_COREID cannot be set at the same time. STARPU_WORKERS_CPUID will be used.\n");
+	}
+
 	/* conf->workers_bindid indicates the successive logical PU identifier that
 	 * should be used to bind the workers. It should be either filled
 	 * according to the user's explicit parameters (from starpu_conf) or
@@ -974,6 +982,16 @@ static void _starpu_initialize_workers_bindid(struct _starpu_machine_config *con
 
 	/* what do we use, explicit value, env. variable, or round-robin ? */
 	strval = starpu_getenv("STARPU_WORKERS_CPUID");
+	if (strval == NULL)
+	{
+		strval = starpu_getenv("STARPU_WORKERS_COREID");
+		if (strval)
+		{
+			bind_on_core = 1;
+			scale = nhyperthreads;
+		}
+	}
+
 	if (strval)
 	{
 		/* STARPU_WORKERS_CPUID certainly contains less entries than
@@ -994,7 +1012,7 @@ static void _starpu_initialize_workers_bindid(struct _starpu_machine_config *con
 				val = strtol(strval, &endptr, 10);
 				if (endptr != strval)
 				{
-					topology->workers_bindid[i] = (unsigned)(val % topology->nhwpus);
+					topology->workers_bindid[i] = (unsigned)((val * scale) % topology->nhwpus);
 					strval = endptr;
 					if (*strval == '-')
 					{
@@ -1008,14 +1026,14 @@ static void _starpu_initialize_workers_bindid(struct _starpu_machine_config *con
 						}
 						else
 						{
-							endval = topology->nhwpus-1;
+							endval = (bind_on_core ? topology->nhwcpus : topology->nhwpus) - 1;
 							if (*strval)
 								strval++;
 						}
 						for (val++; val <= endval && i < STARPU_NMAXWORKERS-1; val++)
 						{
 							i++;
-							topology->workers_bindid[i] = (unsigned)(val % topology->nhwpus);
+							topology->workers_bindid[i] = (unsigned)((val * scale) % topology->nhwpus);
 						}
 					}
 					if (*strval == ',')
@@ -1054,7 +1072,6 @@ static void _starpu_initialize_workers_bindid(struct _starpu_machine_config *con
 		int nth_per_core = starpu_get_env_number_default("STARPU_NTHREADS_PER_CORE", 1);
 		int k;
 		int nbindids=0;
-		int nhyperthreads = topology->nhwpus / topology->nhwcpus;
 		STARPU_ASSERT_MSG(nth_per_core > 0 && nth_per_core <= nhyperthreads , "Incorrect number of hyperthreads");
 
 		i = 0; /* PU number currently assigned */
@@ -3279,3 +3296,10 @@ hwloc_topology_t starpu_get_hwloc_topology(void)
 	return config->topology.hwtopology;
 }
 #endif
+
+/* Return topology.nhwpus / topology.nhwcpus for the current machine
+ * configuration, which topology.h documents as the number of hyperthreads
+ * per core.
+ * NOTE(review): assumes topology.nhwcpus is non-zero at call time -- confirm. */
+unsigned _starpu_get_nhyperthreads(void)
+{
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+
+	return config->topology.nhwpus / config->topology.nhwcpus;
+}

+ 3 - 0
src/core/topology.h

@@ -59,6 +59,9 @@ unsigned _starpu_topology_get_nhwpu(struct _starpu_machine_config *config);
 /** returns the number of NUMA nodes */
 unsigned _starpu_topology_get_nnumanodes(struct _starpu_machine_config *config);
 
+/** returns the number of hyperthreads per core */
+unsigned _starpu_get_nhyperthreads(void);
+
 #ifdef STARPU_HAVE_HWLOC
 /** Small convenient function to filter hwloc topology depending on HWLOC API version */
 void _starpu_topology_filter(hwloc_topology_t topology);

+ 12 - 3
src/core/workers.c

@@ -223,7 +223,7 @@ char ***_starpu_get_argv()
 
 int starpu_is_initialized(void)
 {
-	return initialized == INITIALIZED;
+	return initialized != UNINITIALIZED;
 }
 
 void starpu_wait_initialized(void)
@@ -1597,8 +1597,6 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 
 	_starpu_timing_init();
 
-	_starpu_profiling_init();
-
 	_starpu_load_bus_performance_files();
 
 	/* Depending on whether we are a MP sink or not, we must build the
@@ -1629,6 +1627,8 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 		return ret;
 	}
 
+	_starpu_profiling_init();
+
 	_starpu_task_init();
 
 	for (worker = 0; worker < _starpu_config.topology.nworkers; worker++)
@@ -1680,6 +1680,15 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 	STARPU_PTHREAD_MUTEX_UNLOCK(&init_mutex);
 
 	int main_thread_cpuid = starpu_get_env_number_default("STARPU_MAIN_THREAD_CPUID", -1);
+	int main_thread_coreid = starpu_get_env_number_default("STARPU_MAIN_THREAD_COREID", -1);
+	if (main_thread_cpuid >= 0 && main_thread_coreid >= 0)
+	{
+		_STARPU_DISP("Warning: STARPU_MAIN_THREAD_CPUID and STARPU_MAIN_THREAD_COREID cannot be set at the same time. STARPU_MAIN_THREAD_CPUID will be used.\n");
+	}
+	if (main_thread_cpuid == -1 && main_thread_coreid >= 0)
+	{
+		main_thread_cpuid = main_thread_coreid * _starpu_get_nhyperthreads();
+	}
 	int main_thread_bind = starpu_get_env_number_default("STARPU_MAIN_THREAD_BIND", 0);
 	int main_thread_activity = STARPU_NONACTIVETHREAD;
 	if (main_thread_bind)

+ 17 - 28
src/core/workers.h

@@ -818,6 +818,10 @@ static inline void _starpu_worker_process_block_in_parallel_requests(struct _sta
 	}
 }
 
+#ifdef STARPU_SPINLOCK_CHECK
+#define _starpu_worker_enter_sched_op(worker) __starpu_worker_enter_sched_op((worker), __FILE__, __LINE__, __starpu_func__)
+static inline void __starpu_worker_enter_sched_op(struct _starpu_worker * const worker, const char*file, int line, const char* func)
+#else
 /** Mark the beginning of a scheduling operation by the worker. No worker
  * blocking operations on parallel tasks and no scheduling context change
  * operations must be performed on contexts containing the worker, on
@@ -834,9 +838,6 @@ static inline void _starpu_worker_process_block_in_parallel_requests(struct _sta
  *
  * Must be called with worker's sched_mutex held.
  */
-#ifdef STARPU_SPINLOCK_CHECK
-static inline void __starpu_worker_enter_sched_op(struct _starpu_worker * const worker, const char*file, int line, const char* func)
-#else
 static inline void _starpu_worker_enter_sched_op(struct _starpu_worker * const worker)
 #endif
 {
@@ -879,18 +880,17 @@ static inline void _starpu_worker_enter_sched_op(struct _starpu_worker * const w
 	worker->relax_on_func = func;
 #endif
 }
-#ifdef STARPU_SPINLOCK_CHECK
-#define _starpu_worker_enter_sched_op(worker) __starpu_worker_enter_sched_op((worker), __FILE__, __LINE__, __starpu_func__)
-#endif
 
-/** Mark the end of a scheduling operation by the worker.
- *
- * Must be called with worker's sched_mutex held.
- */
 void _starpu_worker_apply_deferred_ctx_changes(void);
+
 #ifdef STARPU_SPINLOCK_CHECK
+#define _starpu_worker_leave_sched_op(worker) __starpu_worker_leave_sched_op((worker), __FILE__, __LINE__, __starpu_func__)
 static inline void __starpu_worker_leave_sched_op(struct _starpu_worker * const worker, const char*file, int line, const char* func)
 #else
+/** Mark the end of a scheduling operation by the worker.
+ *
+ * Must be called with worker's sched_mutex held.
+ */
 static inline void _starpu_worker_leave_sched_op(struct _starpu_worker * const worker)
 #endif
 {
@@ -905,9 +905,6 @@ static inline void _starpu_worker_leave_sched_op(struct _starpu_worker * const w
 	STARPU_PTHREAD_COND_BROADCAST(&worker->sched_cond);
 	_starpu_worker_apply_deferred_ctx_changes();
 }
-#ifdef STARPU_SPINLOCK_CHECK
-#define _starpu_worker_leave_sched_op(worker) __starpu_worker_leave_sched_op((worker), __FILE__, __LINE__, __starpu_func__)
-#endif
 
 static inline int _starpu_worker_sched_op_pending(void)
 {
@@ -977,11 +974,12 @@ static inline void _starpu_worker_leave_changing_ctx_op(struct _starpu_worker *
 	STARPU_PTHREAD_COND_BROADCAST(&worker->sched_cond);
 }
 
-/** Temporarily allow other worker to access current worker state, when still scheduling,
- * but the scheduling has not yet been made or is already done */
 #ifdef STARPU_SPINLOCK_CHECK
+#define _starpu_worker_relax_on() __starpu_worker_relax_on(__FILE__, __LINE__, __starpu_func__)
 static inline void __starpu_worker_relax_on(const char*file, int line, const char* func)
 #else
+/** Temporarily allow other workers to access current worker state, when still scheduling,
+ * but the scheduling has not yet been made or is already done */
 static inline void _starpu_worker_relax_on(void)
 #endif
 {
@@ -1005,15 +1003,13 @@ static inline void _starpu_worker_relax_on(void)
 	STARPU_PTHREAD_COND_BROADCAST(&worker->sched_cond);
 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
 }
-#ifdef STARPU_SPINLOCK_CHECK
-#define _starpu_worker_relax_on() __starpu_worker_relax_on(__FILE__, __LINE__, __starpu_func__)
-#endif
 #define starpu_worker_relax_on _starpu_worker_relax_on
 
-/** Same, but with current worker mutex already held */
 #ifdef STARPU_SPINLOCK_CHECK
+#define _starpu_worker_relax_on_locked(worker) __starpu_worker_relax_on_locked(worker,__FILE__, __LINE__, __starpu_func__)
 static inline void __starpu_worker_relax_on_locked(struct _starpu_worker *worker, const char*file, int line, const char* func)
 #else
+/** Same, but with current worker mutex already held */
 static inline void _starpu_worker_relax_on_locked(struct _starpu_worker *worker)
 #endif
 {
@@ -1032,11 +1028,9 @@ static inline void _starpu_worker_relax_on_locked(struct _starpu_worker *worker)
 #endif
 	STARPU_PTHREAD_COND_BROADCAST(&worker->sched_cond);
 }
-#ifdef STARPU_SPINLOCK_CHECK
-#define _starpu_worker_relax_on_locked(worker) __starpu_worker_relax_on_locked(worker,__FILE__, __LINE__, __starpu_func__)
-#endif
 
 #ifdef STARPU_SPINLOCK_CHECK
+#define _starpu_worker_relax_off() __starpu_worker_relax_off(__FILE__, __LINE__, __starpu_func__)
 static inline void __starpu_worker_relax_off(const char*file, int line, const char* func)
 #else
 static inline void _starpu_worker_relax_off(void)
@@ -1063,12 +1057,10 @@ static inline void _starpu_worker_relax_off(void)
 #endif
 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
 }
-#ifdef STARPU_SPINLOCK_CHECK
-#define _starpu_worker_relax_off() __starpu_worker_relax_off(__FILE__, __LINE__, __starpu_func__)
-#endif
 #define starpu_worker_relax_off _starpu_worker_relax_off
 
 #ifdef STARPU_SPINLOCK_CHECK
+#define _starpu_worker_relax_off_locked() __starpu_worker_relax_off_locked(__FILE__, __LINE__, __starpu_func__)
 static inline void __starpu_worker_relax_off_locked(const char*file, int line, const char* func)
 #else
 static inline void _starpu_worker_relax_off_locked(void)
@@ -1093,9 +1085,6 @@ static inline void _starpu_worker_relax_off_locked(void)
 	worker->relax_off_func = func;
 #endif
 }
-#ifdef STARPU_SPINLOCK_CHECK
-#define _starpu_worker_relax_off_locked() __starpu_worker_relax_off_locked(__FILE__, __LINE__, __starpu_func__)
-#endif
 
 static inline int _starpu_worker_get_relax_state(void)
 {

+ 3 - 0
src/datawizard/coherency.h

@@ -341,6 +341,9 @@ void _starpu_release_nowhere_task_output(struct _starpu_job *j);
 
 struct _starpu_worker;
 STARPU_ATTRIBUTE_WARN_UNUSED_RESULT
+/** Fetch the data parameters for task \p task.
+ * Setting \p async to 1 makes it possible to only start the fetches, and to
+ * call _starpu_fetch_task_input_tail() later, when the transfers are finished */
 int _starpu_fetch_task_input(struct starpu_task *task, struct _starpu_job *j, int async);
 void _starpu_fetch_task_input_tail(struct starpu_task *task, struct _starpu_job *j, struct _starpu_worker *worker);
 void _starpu_fetch_nowhere_task_input(struct _starpu_job *j);

+ 10 - 0
src/datawizard/datawizard.h

@@ -34,8 +34,18 @@
 
 #include <core/dependencies/implicit_data_deps.h>
 
+/** Make data transfers progress on node \p memory_node.
+ *
+ * If \p push_requests is 1, it can start new transfers
+ *
+ * If \p may_alloc is 1, it can allocate destination data for transfers
+ * (this is not possible e.g. when spinning for a handle lock)
+ */
 int ___starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsigned push_requests);
+/** Call ___starpu_datawizard_progress() for all memory nodes driven by the
+ * current worker */
 int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests);
+/** Call __starpu_datawizard_progress with push_requests = 1 */
 void _starpu_datawizard_progress(unsigned may_alloc);
 
 #endif // __DATAWIZARD_H__

+ 1 - 1
src/datawizard/filters.c

@@ -561,7 +561,7 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_data_filter *f)
 {
 	unsigned nparts = _starpu_data_partition_nparts(initial_handle, f);
-	STARPU_ASSERT_MSG(initial_handle->nchildren == 0, "there should not be mutiple filters applied on the same data %p, futher filtering has to be done on children", initial_handle);
+	STARPU_ASSERT_MSG(initial_handle->nchildren == 0, "there should not be multiple filters applied on the same data %p, further filtering has to be done on children", initial_handle);
 	STARPU_ASSERT_MSG(initial_handle->nplans == 0, "partition planning and synchronous partitioning is not supported");
 
 	initial_handle->children = NULL;

+ 29 - 2
src/datawizard/interfaces/data_interface.c

@@ -30,6 +30,9 @@
 #include <util/openmp_runtime_support.h>
 #endif
 
+static struct starpu_data_interface_ops **_id_to_ops_array;
+static unsigned _id_to_ops_array_size;
+
 /* Entry in the `registered_handles' hash table.  */
 struct handle_entry
 {
@@ -50,6 +53,8 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 void _starpu_data_interface_init(void)
 {
 	_starpu_spin_init(&registered_handles_lock);
+	_id_to_ops_array_size = 20;
+	_STARPU_MALLOC(_id_to_ops_array, _id_to_ops_array_size * sizeof(struct starpu_data_interface_ops *));
+	/* _starpu_data_interface_get_ops() compares table slots against NULL,
+	 * so give every slot a defined value instead of relying on whatever
+	 * the allocator returned. */
+	unsigned ops_slot;
+	for (ops_slot = 0; ops_slot < _id_to_ops_array_size; ops_slot++)
+		_id_to_ops_array[ops_slot] = NULL;
 
 	/* Just for testing purpose */
 	if (starpu_get_env_number_default("STARPU_GLOBAL_ARBITER", 0) > 0)
@@ -66,6 +71,7 @@ void _starpu_data_interface_shutdown()
 	}
 
 	_starpu_spin_destroy(&registered_handles_lock);
+	free(_id_to_ops_array);
 
 	HASH_ITER(hh, registered_handles, entry, tmp)
 	{
@@ -138,8 +144,16 @@ struct starpu_data_interface_ops *_starpu_data_interface_get_ops(unsigned interf
 			return &starpu_interface_multiformat_ops;
 
 		default:
-			STARPU_ABORT();
-			return NULL;
+		{
+			/* Ids not handled by the cases above are looked up in
+			 * _id_to_ops_array, indexed by their offset from
+			 * STARPU_MAX_INTERFACE_ID. Valid offsets are
+			 * 0 .. _id_to_ops_array_size-1, hence the >= test. */
+			unsigned ops_index = interface_id-STARPU_MAX_INTERFACE_ID;
+			if (ops_index >= _id_to_ops_array_size || _id_to_ops_array[ops_index]==NULL)
+			{
+				_STARPU_MSG("There is no 'struct starpu_data_interface_ops' registered for interface %u\n", interface_id);
+				STARPU_ABORT();
+				return NULL;
+			}
+			else
+				return _id_to_ops_array[ops_index];
+		}
 	}
 }
 
@@ -548,10 +562,23 @@ void starpu_data_register(starpu_data_handle_t *handleptr, int home_node,
 	STARPU_ASSERT(handleptr);
 	*handleptr = handle;
 
+	/* check the interfaceid is set */
+	STARPU_ASSERT(ops->interfaceid != STARPU_UNKNOWN_INTERFACE_ID);
+
 	/* fill the interface fields with the appropriate method */
 	STARPU_ASSERT(ops->register_data_handle);
 	ops->register_data_handle(handle, home_node, data_interface);
 
+	if ((unsigned)ops->interfaceid >= STARPU_MAX_INTERFACE_ID)
+	{
+		if ((unsigned)ops->interfaceid > _id_to_ops_array_size)
+		{
+			_id_to_ops_array_size *= 2;
+			_STARPU_REALLOC(_id_to_ops_array, _id_to_ops_array_size * sizeof(struct starpu_data_interface_ops *));
+		}
+		_id_to_ops_array[ops->interfaceid-STARPU_MAX_INTERFACE_ID] = ops;
+	}
+
 	_starpu_register_new_data(handle, home_node, 0);
 	_STARPU_TRACE_HANDLE_DATA_REGISTER(handle);
 }

+ 114 - 60
src/debug/traces/starpu_fxt.c

@@ -279,6 +279,7 @@ static void task_dump(struct task_info *task, struct starpu_fxt_options *options
 		for (i = 0; i < task->ndata; i++)
 			fprintf(tasks_file, " %lu", task->data[i].size);
 		fprintf(tasks_file, "\n");
+		free(task->data);
 	}
 	fprintf(tasks_file, "MPIRank: %d\n", task->mpi_rank);
 	fprintf(tasks_file, "\n");
@@ -447,23 +448,23 @@ static const char *get_worker_color(int workerid)
 	return worker_colors[workerid];
 }
 
-static unsigned get_colour_symbol_red(char *name)
+static unsigned get_color_symbol_red(char *name)
 {
-	/* choose some colour ... that's disguting yes */
+	/* choose some color ... that's disgusting yes */
 	uint32_t hash_symbol = starpu_hash_crc32c_string(name, 0);
 	return (unsigned)starpu_hash_crc32c_string("red", hash_symbol) % 1024;
 }
 
-static unsigned get_colour_symbol_green(char *name)
+static unsigned get_color_symbol_green(char *name)
 {
-	/* choose some colour ... that's disguting yes */
+	/* choose some color ... that's disgusting yes */
 	uint32_t hash_symbol = starpu_hash_crc32c_string(name, 0);
 	return (unsigned)starpu_hash_crc32c_string("green", hash_symbol) % 1024;
 }
 
-static unsigned get_colour_symbol_blue(char *name)
+static unsigned get_color_symbol_blue(char *name)
 {
-	/* choose some colour ... that's disguting yes */
+	/* choose some color ... that's disgusting yes */
 	uint32_t hash_symbol = starpu_hash_crc32c_string(name, 0);
 	return (unsigned)starpu_hash_crc32c_string("blue", hash_symbol) % 1024;
 }
@@ -680,9 +681,7 @@ static int prefixTOnodeid (const char *prefix)
 	//if we are a single-node trace, prefix is empty, so return 0
 	if (strcmp(prefix, "")==0) return 0;
 
-	char *str;
-	_STARPU_MALLOC(str, sizeof(char)*strlen(prefix));
-	strncpy (str, prefix, strlen(prefix));
+	char *str = strdup(prefix);
 	str[strlen(prefix)-1] = '\0';
 	unsigned long nodeid = atoi(str);
 	free(str);
@@ -1420,10 +1419,10 @@ static void create_paje_state_if_not_found(char *name, unsigned color, struct st
 
 	_starpu_symbol_name_list_push_front(&symbol_list, entry);
 
-	/* choose some colour ... that's disguting yes */
-	unsigned hash_symbol_red = get_colour_symbol_red(name);
-	unsigned hash_symbol_green = get_colour_symbol_green(name);
-	unsigned hash_symbol_blue = get_colour_symbol_blue(name);
+	/* choose some color ... that's disgusting yes */
+	unsigned hash_symbol_red = get_color_symbol_red(name);
+	unsigned hash_symbol_green = get_color_symbol_green(name);
+	unsigned hash_symbol_blue = get_color_symbol_blue(name);
 
 	uint32_t hash_sum = hash_symbol_red + hash_symbol_green + hash_symbol_blue;
 
@@ -1544,7 +1543,7 @@ static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_op
 	unsigned long has_name = ev->param[4];
 	char *name = has_name?get_fxt_string(ev, 5):"unknown";
 
-	snprintf(_starpu_last_codelet_symbol[worker], sizeof(_starpu_last_codelet_symbol[worker])-1, "%s", name);
+	snprintf(_starpu_last_codelet_symbol[worker], sizeof(_starpu_last_codelet_symbol[worker]), "%.*s", (int) sizeof(_starpu_last_codelet_symbol[worker])-1, name);
 	_starpu_last_codelet_symbol[worker][sizeof(_starpu_last_codelet_symbol[worker])-1] = 0;
 	last_codelet_parameter[worker] = 0;
 
@@ -1621,7 +1620,7 @@ static void handle_codelet_data(struct fxt_ev_64 *ev STARPU_ATTRIBUTE_UNUSED, st
 	if (num >= MAX_PARAMETERS)
 		return;
 	char *name = get_fxt_string(ev, 1);
-	snprintf(last_codelet_parameter_description[worker][num], sizeof(last_codelet_parameter_description[worker][num])-1, "%s", name);
+	snprintf(last_codelet_parameter_description[worker][num], sizeof(last_codelet_parameter_description[worker][num]), "%.*s", (int) sizeof(last_codelet_parameter_description[worker][num])-1, name);
 	last_codelet_parameter_description[worker][num][sizeof(last_codelet_parameter_description[worker][num])-1] = 0;
 }
 
@@ -1766,7 +1765,7 @@ static void handle_end_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_opti
 	size_t codelet_size = ev->param[1];
 	uint32_t codelet_hash = ev->param[2];
 	long unsigned int threadid = ev->param[4];
-	char *name = get_fxt_string(ev, 4);
+	char *name = get_fxt_string(ev, 5);
 
 	const char *state = "I";
 	if (find_sync(prefixTOnodeid(prefix), threadid))
@@ -1852,7 +1851,7 @@ static void handle_end_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_opti
 		snprintf(dumped_codelets[dumped_codelets_count - 1].symbol, sizeof(dumped_codelets[dumped_codelets_count - 1].symbol)-1, "%s", _starpu_last_codelet_symbol[worker]);
 		dumped_codelets[dumped_codelets_count - 1].symbol[sizeof(dumped_codelets[dumped_codelets_count - 1].symbol)-1] = 0;
 		dumped_codelets[dumped_codelets_count - 1].workerid = worker;
-		snprintf(dumped_codelets[dumped_codelets_count - 1].perfmodel_archname, sizeof(dumped_codelets[dumped_codelets_count - 1].perfmodel_archname)-1, "%s", name);
+		snprintf(dumped_codelets[dumped_codelets_count - 1].perfmodel_archname, sizeof(dumped_codelets[dumped_codelets_count - 1].perfmodel_archname), "%.*s", (int) sizeof(dumped_codelets[dumped_codelets_count - 1].perfmodel_archname)-1, name);
 		dumped_codelets[dumped_codelets_count - 1].perfmodel_archname[sizeof(dumped_codelets[dumped_codelets_count - 1].perfmodel_archname)-1] = 0;
 		dumped_codelets[dumped_codelets_count - 1].size = codelet_size;
 		dumped_codelets[dumped_codelets_count - 1].hash = codelet_hash;
@@ -2826,7 +2825,7 @@ static void handle_task_deps(struct fxt_ev_64 *ev, struct starpu_fxt_options *op
 	task->ndeps++;
 
 	/* There is a dependency between both job id : dep_prev -> dep_succ */
-	if (show_task(task, options))
+	if (show_task(task, options) && show_task(prev_task, options))
 	{
 		if (!options->label_deps) name = NULL;
 		/* We should show the name of the predecessor, then. */
@@ -2862,47 +2861,49 @@ static void handle_task_color(struct fxt_ev_64 *ev, struct starpu_fxt_options *o
 	task->color = color;
 }
 
-static void handle_task_name(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+static void handle_task_exclude_from_dag(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 {
-	char *prefix = options->file_prefix;
-
 	unsigned long job_id = ev->param[0];
+	unsigned exclude_from_dag = ev->param[1];
+
 	struct task_info *task = get_task(job_id, options->file_rank);
+	task->exclude_from_dag = exclude_from_dag;
+}
 
-	unsigned long has_name = ev->param[3];
-	char *name = has_name?get_fxt_string(ev,4):"unknown";
+static void handle_task_name(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+	unsigned long job_id = ev->param[0];
+	char *name = get_fxt_string(ev,1);
 
-        int worker;
-        worker = find_worker_id(prefixTOnodeid(prefix), ev->param[1]);
+	char *prefix = options->file_prefix;
+	struct task_info *task = get_task(job_id, options->file_rank);
+        int worker = find_worker_id(prefixTOnodeid(prefix), ev->param[1]);
 
-	const char *colour;
+	const char *color;
 	char buffer[32];
 	if (task->color != 0)
 	{
 		snprintf(buffer, sizeof(buffer), "#%06x", task->color);
-		colour = &buffer[0];
+		color = &buffer[0];
 	}
 	else if (options->per_task_colour)
 	{
 		snprintf(buffer, sizeof(buffer), "#%x%x%x",
-			 get_colour_symbol_red(name)/4,
-			 get_colour_symbol_green(name)/4,
-			 get_colour_symbol_blue(name)/4);
-		colour = &buffer[0];
+			 get_color_symbol_red(name)/4,
+			 get_color_symbol_green(name)/4,
+			 get_color_symbol_blue(name)/4);
+		color = &buffer[0];
 	}
 	else
 	{
-		colour= (worker < 0)?"#aaaaaa":get_worker_color(worker);
+		color= (worker < 0)?"#aaaaaa":get_worker_color(worker);
 	}
 
 	if (!task->name)
 		task->name = strdup(name);
 
-	unsigned exclude_from_dag = ev->param[2];
-	task->exclude_from_dag = exclude_from_dag;
-
-	if (!exclude_from_dag && show_task(task, options))
-		_starpu_fxt_dag_set_task_name(options->file_prefix, job_id, name, colour);
+	if (!task->exclude_from_dag && show_task(task, options))
+		_starpu_fxt_dag_set_task_name(options->file_prefix, job_id, task->name, color);
 }
 
 static void handle_task_done(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
@@ -2937,22 +2938,22 @@ static void handle_tag_done(struct fxt_ev_64 *ev, struct starpu_fxt_options *opt
         int worker;
         worker = find_worker_id(prefixTOnodeid(prefix), ev->param[1]);
 
-	const char *colour;
+	const char *color;
 	char buffer[32];
 	if (options->per_task_colour)
 	{
 		snprintf(buffer, sizeof(buffer), "%.4f,%.4f,%.4f",
-			 get_colour_symbol_red(name)/1024.0,
-			 get_colour_symbol_green(name)/1024.0,
-			 get_colour_symbol_blue(name)/1024.0);
-		colour = &buffer[0];
+			 get_color_symbol_red(name)/1024.0,
+			 get_color_symbol_green(name)/1024.0,
+			 get_color_symbol_blue(name)/1024.0);
+		color = &buffer[0];
 	}
 	else
 	{
-		colour= (worker < 0)?"white":get_worker_color(worker);
+		color= (worker < 0)?"white":get_worker_color(worker);
 	}
 
-	_starpu_fxt_dag_set_tag_done(options->file_prefix, tag_id, colour);
+	_starpu_fxt_dag_set_tag_done(options->file_prefix, tag_id, color);
 }
 
 static void handle_mpi_barrier(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
@@ -3354,14 +3355,20 @@ static
 void _starpu_fxt_process_bandwidth(struct starpu_fxt_options *options)
 {
 	char *prefix = options->file_prefix;
+	struct _starpu_communication *itor, *next;
 
 	/* Loop through completed communications */
-	while (!_starpu_communication_list_empty(&communication_list)
-			&& _starpu_communication_list_begin(&communication_list)->peer)
+	for (itor = _starpu_communication_list_begin(&communication_list);
+		itor != _starpu_communication_list_end(&communication_list);
+		itor = next)
 	{
-		struct _starpu_communication*itor;
+		next = _starpu_communication_list_next(itor);
+
+		if (!itor->peer)
+			continue;
+
 		/* This communication is complete */
-		itor = _starpu_communication_list_pop_front(&communication_list);
+		_starpu_communication_list_erase(&communication_list, itor);
 
 		current_bandwidth_out_per_node[itor->src_node] +=  itor->bandwidth;
 		if (out_paje_file)
@@ -3700,6 +3707,10 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 				handle_task_submit_event(&ev, options, ev.param[0], NULL);
 				break;
 
+			case _STARPU_FUT_TASK_EXCLUDE_FROM_DAG:
+				handle_task_exclude_from_dag(&ev, options);
+				break;
+
 			case _STARPU_FUT_TASK_NAME:
 				handle_task_name(&ev, options);
 				break;
@@ -4136,10 +4147,10 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 			_starpu_fxt_process_computations(options);
 	}
 
+	unsigned i;
 	if (!options->no_flops)
 	{
-		/* computations are supposed to be over, drop any pending comp */
-		unsigned i;
+		/* computations are supposed to be over, unref any pending comp */
 		for (i = 0; i < STARPU_NMAXWORKERS; i++)
 		{
 			struct _starpu_computation *comp = ongoing_computation[i];
@@ -4147,13 +4158,23 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 			{
 				STARPU_ASSERT(!comp->peer);
 				_starpu_computation_list_erase(&computation_list, comp);
-				ongoing_computation[i] = 0;
 			}
 		}
 		/* And flush completed computations */
 		_starpu_fxt_process_computations(options);
 	}
 
+	for (i = 0; i < STARPU_NMAXWORKERS; i++)
+	{
+		struct _starpu_computation *comp = ongoing_computation[i];
+		if (comp)
+		{
+			STARPU_ASSERT(!comp->peer);
+			_starpu_computation_delete(comp);
+			ongoing_computation[i] = 0;
+		}
+	}
+
 	if (out_paje_file && !options->no_bus)
 	{
 		while (!_starpu_communication_list_empty(&communication_list)) {
@@ -4184,7 +4205,6 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 
 	if (out_paje_file && !options->no_flops)
 	{
-		unsigned i;
 		for (i = 0; i < STARPU_NMAXWORKERS; i++)
 		{
 			if (last_codelet_end[i] != 0.0)
@@ -4228,12 +4248,23 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 		}
 	}
 
+	for (i = 0; i < STARPU_NMAXWORKERS; i++)
+		free(options->worker_archtypes[i].devices);
+
+#ifdef HAVE_FXT_BLOCKEV_LEAVE
+	fxt_blockev_leave(block);
+#endif
+
 	/* Close the trace file */
+#ifdef HAVE_FXT_CLOSE
+	fxt_close(fut);
+#else
 	if (close(fd_in))
 	{
 	        perror("close failed :");
 	        exit(-1);
 	}
+#endif
 }
 
 /* Initialize FxT options to default values */
@@ -4651,12 +4682,20 @@ uint64_t _starpu_fxt_find_start_time(char *filename_in)
 	int ret = fxt_next_ev(block, FXT_EV_TYPE_64, (struct fxt_ev *)&ev);
 	STARPU_ASSERT (ret == FXT_EV_OK);
 
+#ifdef HAVE_FXT_BLOCKEV_LEAVE
+	fxt_blockev_leave(block);
+#endif
+
 	/* Close the trace file */
+#ifdef HAVE_FXT_CLOSE
+	fxt_close(fut);
+#else
 	if (close(fd_in))
 	{
 	        perror("close failed :");
 	        exit(-1);
 	}
+#endif
 	return (ev.time);
 }
 
@@ -4838,7 +4877,7 @@ struct starpu_data_trace_kernel
 
 static FILE *codelet_list;
 
-static void write_task(struct parse_task pt)
+static void write_task(char *dir, struct parse_task pt)
 {
 	struct starpu_data_trace_kernel *kernel;
 	char *codelet_name = pt.codelet_name;
@@ -4848,11 +4887,13 @@ static void write_task(struct parse_task pt)
 	{
 		_STARPU_MALLOC(kernel, sizeof(*kernel));
 		kernel->name = strdup(codelet_name);
+		char filename[256];
+		snprintf(filename, sizeof(filename), "%s/%s", dir, kernel->name);
 		//fprintf(stderr, "%s\n", kernel->name);
-		kernel->file = fopen(codelet_name, "w+");
+		kernel->file = fopen(filename, "w+");
 		if(!kernel->file)
 		{
-			STARPU_ABORT_MSG("Failed to open '%s' (err %s)", codelet_name, strerror(errno));
+			STARPU_ABORT_MSG("Failed to open '%s' (err %s)", filename, strerror(errno));
 		}
 		HASH_ADD_STR(kernels, name, kernel);
 		fprintf(codelet_list, "%s\n", codelet_name);
@@ -4861,7 +4902,7 @@ static void write_task(struct parse_task pt)
 	fprintf(kernel->file, "%lf %u %u\n", time, pt.data_total, pt.workerid);
 }
 
-void starpu_fxt_write_data_trace(char *filename_in)
+void starpu_fxt_write_data_trace_in_dir(char *filename_in, char *dir)
 {
 	int fd_in;
 	fd_in = open(filename_in, O_RDONLY);
@@ -4878,10 +4919,12 @@ void starpu_fxt_write_data_trace(char *filename_in)
 	        exit(-1);
 	}
 
-	codelet_list = fopen("codelet_list", "w+");
+	char filename_out[512];
+	snprintf(filename_out, sizeof(filename_out), "%s/codelet_list", dir);
+	codelet_list = fopen(filename_out, "w+");
 	if(!codelet_list)
 	{
-		STARPU_ABORT_MSG("Failed to open '%s' (err %s)", "codelet_list", strerror(errno));
+		STARPU_ABORT_MSG("Failed to open '%s' (err %s)", filename_out, strerror(errno));
 	}
 
 	fxt_blockev_t block;
@@ -4922,7 +4965,7 @@ void starpu_fxt_write_data_trace(char *filename_in)
 			workerid = ev.param[3];
 			assert(workerid != -1);
 			tasks[workerid].exec_time = ev.time - tasks[workerid].exec_time;
-			write_task(tasks[workerid]);
+			write_task(dir, tasks[workerid]);
 			break;
 
 		case _STARPU_FUT_DATA_LOAD:
@@ -4939,14 +4982,19 @@ void starpu_fxt_write_data_trace(char *filename_in)
 		}
 	}
 
+#ifdef HAVE_FXT_BLOCKEV_LEAVE
+	fxt_blockev_leave(block);
+#endif
+
 #ifdef HAVE_FXT_CLOSE
 	fxt_close(fut);
-#endif
+#else
 	if (close(fd_in))
 	{
 	        perror("close failed :");
 	        exit(-1);
 	}
+#endif
 
 	if(fclose(codelet_list))
 	{
@@ -4968,4 +5016,10 @@ void starpu_fxt_write_data_trace(char *filename_in)
 	}
 
 }
+
+void starpu_fxt_write_data_trace(char *filename_in)
+{
+	starpu_fxt_write_data_trace_in_dir(filename_in, ".");
+}
+
 #endif // STARPU_USE_FXT

+ 4 - 2
src/drivers/cpu/driver_cpu.c

@@ -115,11 +115,13 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 			}
 #else
 #  ifdef STARPU_PAPI
-			_starpu_profiling_papi_task_start_counters(task);
+			if (rank == 0)
+				_starpu_profiling_papi_task_start_counters(task);
 #  endif
 			func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
 #  ifdef STARPU_PAPI
-			_starpu_profiling_papi_task_stop_counters(task);
+			if (rank == 0)
+				_starpu_profiling_papi_task_stop_counters(task);
 #  endif
 #endif
 			_STARPU_TRACE_END_EXECUTING();

+ 2 - 0
src/drivers/driver_common/driver_common.h

@@ -32,6 +32,8 @@ void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j,
 void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_worker *worker_args,
 					struct starpu_perfmodel_arch* perf_arch, int profiling);
 
+/** Get from the scheduler a task to be executed on the worker \p workerid */
 struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int workerid, unsigned memnode);
+/** Get from the scheduler tasks to be executed on the workers \p workers */
 int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_task ** tasks, int nworker, unsigned memnode);
 #endif // __DRIVER_COMMON_H__

+ 24 - 3
src/profiling/profiling.c

@@ -44,6 +44,7 @@ static unsigned worker_registered_executing_start[STARPU_NMAXWORKERS];
 static struct timespec executing_start_date[STARPU_NMAXWORKERS];
 
 #ifdef STARPU_PAPI
+static starpu_pthread_mutex_t papi_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
 static int papi_events[PAPI_MAX_HWCTRS];
 static int papi_nevents = 0;
 static int warned_component_unavailable = 0;
@@ -145,6 +146,7 @@ void _starpu_profiling_init(void)
 	}
 
 #ifdef STARPU_PAPI
+		STARPU_PTHREAD_MUTEX_LOCK(&papi_mutex);
 		int retval = PAPI_library_init(PAPI_VER_CURRENT);
 		if (retval != PAPI_VER_CURRENT)
 		{
@@ -159,11 +161,18 @@ void _starpu_profiling_init(void)
 		char *conf_papi_events;
 		char *papi_event_name;
 		conf_papi_events = starpu_getenv("STARPU_PROF_PAPI_EVENTS");
+		papi_nevents = 0;
 		if (conf_papi_events != NULL)
 		{
 			while ((papi_event_name = strtok_r(conf_papi_events, " ,", &conf_papi_events)))
 			{
-				_STARPU_DEBUG("Loading PAPI Event:%s\n", papi_event_name);
+				if (papi_nevents == PAPI_MAX_HWCTRS)
+				{
+				      _STARPU_MSG("Too many requested papi counters, ignoring %s\n", papi_event_name);
+				      continue;
+				}
+
+				_STARPU_DEBUG("Loading PAPI Event: %s\n", papi_event_name);
 				retval = PAPI_event_name_to_code ((char*)papi_event_name, &papi_events[papi_nevents]);
 				if (retval != PAPI_OK)
 				      _STARPU_MSG("Failed to codify papi event [%s], error: %s.\n", papi_event_name, PAPI_strerror(retval));
@@ -171,6 +180,7 @@ void _starpu_profiling_init(void)
 					papi_nevents++;
 			}
 		}
+		STARPU_PTHREAD_MUTEX_UNLOCK(&papi_mutex);
 #endif
 
 }
@@ -183,9 +193,10 @@ void _starpu_profiling_papi_task_start_counters(struct starpu_task *task)
 
 	struct starpu_profiling_task_info *profiling_info;
 	profiling_info = task->profiling_info;
-	if (profiling_info)
+	if (profiling_info && papi_nevents)
 	{
 		profiling_info->papi_event_set = PAPI_NULL;
+		STARPU_PTHREAD_MUTEX_LOCK(&papi_mutex);
 		PAPI_create_eventset(&profiling_info->papi_event_set);
 		for(int i=0; i<papi_nevents; i++)
 		{
@@ -199,6 +210,7 @@ void _starpu_profiling_papi_task_start_counters(struct starpu_task *task)
 		}
 		PAPI_reset(profiling_info->papi_event_set);
 		PAPI_start(profiling_info->papi_event_set);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&papi_mutex);
 	}
 }
 
@@ -210,8 +222,9 @@ void _starpu_profiling_papi_task_stop_counters(struct starpu_task *task)
 	struct starpu_profiling_task_info *profiling_info;
 	profiling_info = task->profiling_info;
 
-	if (profiling_info)
+	if (profiling_info && papi_nevents)
 	{
+		STARPU_PTHREAD_MUTEX_LOCK(&papi_mutex);
 		PAPI_stop(profiling_info->papi_event_set, profiling_info->papi_values);
 		for(int i=0; i<papi_nevents; i++)
 		{
@@ -219,6 +232,7 @@ void _starpu_profiling_papi_task_stop_counters(struct starpu_task *task)
 		}
 		PAPI_cleanup_eventset(profiling_info->papi_event_set);
 		PAPI_destroy_eventset(&profiling_info->papi_event_set);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&papi_mutex);
 	}
 }
 #endif
@@ -240,6 +254,13 @@ void _starpu_profiling_terminate(void)
 	{
 		STARPU_PTHREAD_MUTEX_DESTROY(&worker_info_mutex[worker]);
 	}
+#ifdef STARPU_PAPI
+	/* free the resources used by PAPI */
+	STARPU_PTHREAD_MUTEX_LOCK(&papi_mutex);
+	PAPI_shutdown();
+	STARPU_PTHREAD_MUTEX_UNLOCK(&papi_mutex);
+#endif
+
 }
 
 /*

+ 2 - 1
src/sched_policies/component_sched.c

@@ -370,9 +370,10 @@ int starpu_sched_tree_push_task(struct starpu_task * task)
 int starpu_sched_component_push_task(struct starpu_sched_component *from STARPU_ATTRIBUTE_UNUSED, struct starpu_sched_component *to, struct starpu_task *task)
 {
 	int pushback;
+	int priority = task->priority;
 	pushback = to->push_task(to, task);
 	if (!pushback)
-		_STARPU_TRACE_SCHED_COMPONENT_PUSH(from, to, task);
+		_STARPU_TRACE_SCHED_COMPONENT_PUSH(from, to, task, priority);
 	return pushback;
 }
 

+ 50 - 0
src/util/fstarpu.c

@@ -103,6 +103,13 @@ static const intptr_t fstarpu_starpu_codelet_simgrid_execute_and_inject	= STARPU
 static const intptr_t fstarpu_starpu_cuda_async	= STARPU_CUDA_ASYNC;
 static const intptr_t fstarpu_starpu_opencl_async	= STARPU_OPENCL_ASYNC;
 
+//static const intptr_t fstarpu_per_worker	= STARPU_PER_WORKER;
+//static const intptr_t fstarpu_per_arch		= STARPU_PER_ARCH;
+//static const intptr_t fstarpu_per_common	= STARPU_COMMON;
+static const intptr_t fstarpu_history_based	= STARPU_HISTORY_BASED;
+static const intptr_t fstarpu_regression_based	= STARPU_REGRESSION_BASED;
+static const intptr_t fstarpu_nl_regression_based	= STARPU_NL_REGRESSION_BASED;
+static const intptr_t fstarpu_multiple_regression_based	= STARPU_MULTIPLE_REGRESSION_BASED;
 
 intptr_t fstarpu_get_constant(char *s)
 {
@@ -187,6 +194,14 @@ intptr_t fstarpu_get_constant(char *s)
 	else if (!strcmp(s, "FSTARPU_CUDA_ASYNC"))	{ return fstarpu_starpu_cuda_async; }
 	else if (!strcmp(s, "FSTARPU_OPENCL_ASYNC"))	{ return fstarpu_starpu_opencl_async; }
 
+//	else if (!strcmp(s, "FSTARPU_PER_WORKER"))	{ return fstarpu_per_worker; }
+//	else if (!strcmp(s, "FSTARPU_PER_ARCH"))	{ return fstarpu_per_arch; }
+//	else if (!strcmp(s, "FSTARPU_COMMON"))	{ return fstarpu_per_common; }
+	else if (!strcmp(s, "FSTARPU_HISTORY_BASED"))	{ return fstarpu_history_based; }
+	else if (!strcmp(s, "FSTARPU_REGRESSION_BASED"))	{ return fstarpu_regression_based; }
+	else if (!strcmp(s, "FSTARPU_NL_REGRESSION_BASED"))	{ return fstarpu_nl_regression_based; }
+	else if (!strcmp(s, "FSTARPU_MULTIPLE_REGRESSION_BASED"))	{ return fstarpu_multiple_regression_based; }
+
 	else { _STARPU_ERROR("unknown constant"); }
 }
 
@@ -281,6 +296,16 @@ void fstarpu_codelet_set_name(struct starpu_codelet *cl, const char *cl_name)
 	cl->name = cl_name;
 }
 
+void fstarpu_codelet_set_model(struct starpu_codelet *cl, struct starpu_perfmodel *cl_perfmodel)
+{
+	cl->model = cl_perfmodel;
+}
+
+void fstarpu_codelet_set_energy_model(struct starpu_codelet *cl, struct starpu_perfmodel *cl_perfmodel)
+{
+	cl->energy_model = cl_perfmodel;
+}
+
 void fstarpu_codelet_add_cpu_func(struct starpu_codelet *cl, void *f_ptr)
 {
 	const size_t max_cpu_funcs = sizeof(cl->cpu_funcs)/sizeof(cl->cpu_funcs[0])-1;
@@ -419,6 +444,31 @@ void fstarpu_codelet_set_where(struct starpu_codelet *cl, intptr_t where)
 	cl->where = (uint32_t)where;
 }
 
+STARPU_ATTRIBUTE_MALLOC
+struct starpu_perfmodel *fstarpu_perfmodel_allocate(void)
+{
+	struct starpu_perfmodel *model;
+	_STARPU_CALLOC(model, 1, sizeof(*model));
+	return model;
+}
+
+void fstarpu_perfmodel_free(struct starpu_perfmodel *model)
+{
+	memset(model, 0, sizeof(*model));
+	free(model);
+}
+
+void fstarpu_perfmodel_set_symbol(struct starpu_perfmodel *model, const char *model_symbol)
+{
+	model->symbol = model_symbol;
+}
+
+void fstarpu_perfmodel_set_type(struct starpu_perfmodel *model, intptr_t type)
+{
+	STARPU_ASSERT(type == fstarpu_history_based || type == fstarpu_regression_based || type == fstarpu_nl_regression_based || type == fstarpu_multiple_regression_based);
+	model->type = type;
+}
+
 void * fstarpu_variable_get_ptr(void *buffers[], int i)
 {
 	return (void *)STARPU_VARIABLE_GET_PTR(buffers[i]);

+ 0 - 0
starpufft/src/Makefile.am


이 변경점에서 너무 많은 파일들이 변경되어 몇몇 파일들은 표시되지 않았습니다.