浏览代码

Merge branch 'fpga' of gitlab.inria.fr:starpu/starpu into fpga

Samuel Thibault 4 年之前
父节点
当前提交
084fe2d984
共有 100 个文件被更改,包括 2760 次插入1029 次删除
  1. 17 8
      .gitlab-ci.yml
  2. 2 0
      AUTHORS
  3. 7 0
      ChangeLog
  4. 36 9
      configure.ac
  5. 1 1
      contrib/ci.inria.fr/job-1-check.sh
  6. 10 3
      doc/doxygen/Makefile.am
  7. 15 15
      doc/doxygen/chapters/210_check_list_performance.doxy
  8. 9 4
      doc/doxygen/chapters/310_data_management.doxy
  9. 53 12
      doc/doxygen/chapters/320_scheduling.doxy
  10. 104 10
      doc/doxygen/chapters/380_offline_performance_tools.doxy
  11. 1 1
      doc/doxygen/chapters/400_python.doxy
  12. 2 43
      doc/doxygen/chapters/497_eclipse_plugin.doxy
  13. 118 1
      doc/doxygen/chapters/501_environment_variables.doxy
  14. 7 0
      doc/doxygen/chapters/510_configure_options.doxy
  15. 二进制
      doc/doxygen/chapters/images/eclipse_hello_hgraph.png
  16. 二进制
      doc/doxygen/chapters/images/eclipse_hello_paje_trace.png
  17. 二进制
      doc/doxygen/chapters/images/eclipse_hello_plugin.png
  18. 二进制
      doc/doxygen/chapters/images/eclipse_hello_svg_graph.png
  19. 二进制
      doc/doxygen/chapters/images/eclipse_hello_vite.png
  20. 1 1
      doc/doxygen/dev/starpu_check_include.sh
  21. 1 0
      doc/doxygen/doxygen-config.cfg.in
  22. 2 1
      doc/doxygen/doxygen.cfg
  23. 1 0
      doc/doxygen/refman.tex
  24. 43 0
      doc/doxygen/sectionNumbering.py
  25. 4 2
      doc/doxygen_dev/Makefile.am
  26. 1 1
      doc/doxygen_dev/dev/starpu_check_missing.sh
  27. 1 0
      doc/doxygen_dev/doxygen-config.cfg.in
  28. 2 1
      doc/doxygen_dev/doxygen.cfg
  29. 6 5
      doc/tutorial/Makefile
  30. 19 15
      doc/tutorial/vector_scal_opencl.c
  31. 4 4
      eclipse-plugin/src/starpu/handlers/SvgHandler.java
  32. 1 1
      eclipse-plugin/src/starpu/handlers/TraceGenHandler.java
  33. 5 2
      examples/Makefile.am
  34. 9 8
      examples/filters/fblock.c
  35. 7 1
      examples/filters/fmultiple_manual.c
  36. 0 244
      examples/filters/fplan_notautomatic.c
  37. 1 1
      examples/filters/fread.c
  38. 4 0
      examples/filters/frecursive.c
  39. 191 0
      examples/filters/ftensor.c
  40. 497 0
      examples/filters/shadow4d.c
  41. 9 0
      examples/spmd/vector_scal_spmd.c
  42. 84 2
      include/schedulers/starpu_heteroprio.h
  43. 1 0
      include/starpu.h
  44. 25 1
      include/starpu_data.h
  45. 93 8
      include/starpu_data_filters.h
  46. 5 0
      include/starpu_task.h
  47. 15 0
      include/starpu_worker.h
  48. 33 11
      mpi/examples/Makefile.am
  49. 1 1
      mpi/examples/benchs/abstract_sendrecv_bench.c
  50. 1 1
      mpi/examples/benchs/abstract_sendrecv_bench.h
  51. 1 1
      mpi/examples/benchs/bcast_bench.c
  52. 1 1
      mpi/examples/benchs/bench_helper.c
  53. 1 1
      mpi/examples/benchs/bench_helper.h
  54. 1 1
      mpi/examples/benchs/burst.c
  55. 1 1
      mpi/examples/benchs/burst_helper.h
  56. 2 1
      mpi/examples/benchs/gemm_helper.c
  57. 1 1
      mpi/examples/benchs/gemm_helper.h
  58. 1 1
      mpi/examples/benchs/sendrecv_parallel_tasks_bench.c
  59. 1 1
      mpi/examples/cg/cg.c
  60. 1 1
      mpi/examples/mpi_redux/mpi_redux.c
  61. 184 0
      mpi/examples/mpi_redux/mpi_redux_tree.c
  62. 77 78
      mpi/examples/native_fortran/nf_mpi_redux.f90
  63. 240 0
      mpi/examples/native_fortran/nf_mpi_redux_tree.f90
  64. 1 1
      mpi/examples/native_fortran/nf_redux_test.f90
  65. 97 0
      mpi/examples/user_datatype/user_datatype_interface.c
  66. 19 0
      mpi/include/fstarpu_mpi_mod.f90
  67. 12 0
      mpi/include/starpu_mpi.h
  68. 2 0
      mpi/src/mpi/starpu_mpi_mpi.c
  69. 1 1
      mpi/src/mpi/starpu_mpi_mpi.h
  70. 5 4
      mpi/src/nmad/starpu_mpi_nmad.c
  71. 1 1
      mpi/src/nmad/starpu_mpi_nmad.h
  72. 1 1
      mpi/src/nmad/starpu_mpi_nmad_backend.c
  73. 1 1
      mpi/src/nmad/starpu_mpi_nmad_unknown_datatype.c
  74. 3 4
      mpi/src/starpu_mpi.c
  75. 1 0
      mpi/src/starpu_mpi_coop_sends.c
  76. 11 0
      mpi/src/starpu_mpi_fortran.c
  77. 1 0
      mpi/src/starpu_mpi_private.c
  78. 0 6
      mpi/src/starpu_mpi_private.h
  79. 151 168
      mpi/src/starpu_mpi_task_insert.c
  80. 1 1
      mpi/tests/insert_task_tags.c
  81. 1 1
      mpi/tests/nothing.c
  82. 2 0
      src/Makefile.am
  83. 41 4
      src/common/fxt.c
  84. 18 0
      src/common/fxt.h
  85. 314 314
      src/common/rbtree.c
  86. 1 1
      src/core/combined_workers.c
  87. 1 0
      src/core/dependencies/data_arbiter_concurrency.c
  88. 1 0
      src/core/dependencies/data_concurrency.c
  89. 1 1
      src/core/disk_ops/unistd/disk_unistd_global.c
  90. 10 0
      src/core/jobs.c
  91. 6 0
      src/core/perfmodel/perfmodel.c
  92. 1 0
      src/core/perfmodel/perfmodel.h
  93. 2 2
      src/core/perfmodel/perfmodel_bus.c
  94. 10 6
      src/core/sched_ctx.h
  95. 2 1
      src/core/sched_policy.c
  96. 18 4
      src/core/task.c
  97. 59 0
      src/core/workers.c
  98. 2 0
      src/core/workers.h
  99. 1 1
      src/datawizard/coherency.c
  100. 0 0
      src/datawizard/coherency.h

+ 17 - 8
.gitlab-ci.yml

@@ -30,20 +30,29 @@ build:
       when: never  # Prevent pipeline run for push event
     - when: always # Run pipeline for all other cases
 
-check:
+.check_template:
   stage: deploy
-  script:
-    - ./contrib/gitlab/deploy.sh
+  needs: [build]
+  dependencies:
+    - build
   rules:
     - if: '$CI_PIPELINE_SOURCE == "push"'
       when: never  # Prevent pipeline run for push event
     - when: always # Run pipeline for all other cases
 
+check_linux:
+  extends: .check_template
+  tags : ["linux"]
+  script:
+    - ./contrib/gitlab/deploy.sh
+
+check_macosx:
+  extends: .check_template
+  tags : ["macosx"]
+  script:
+    - ./contrib/gitlab/deploy.sh
+
 simgrid:
-  stage: deploy
+  extends: .check_template
   script:
     - ./contrib/gitlab/simgrid.sh
-  rules:
-    - if: '$CI_PIPELINE_SOURCE == "push"'
-      when: never  # Prevent pipeline run for push event
-    - when: always # Run pipeline for all other cases

+ 2 - 0
AUTHORS

@@ -11,6 +11,7 @@ Collin Nicolas, Inria, <nicolas.collin@inria.fr>
 Danjean Vincent, University Grenoble Alpes, <Vincent.Danjean@ens-lyon.org>
 Denis Alexandre, Inria, <alexandre.denis@inria.fr>
 Eyraud-Dubois Lionel, Inria, <lionel.eyraud-dubois@inria.fr>
+Flint Clément, Inria, <clement.flint@inria.fr>
 Furmento Nathalie, CNRS, <nathalie.furmento@labri.fr>
 Guermouche Amina, Télécom SudParis, <amina.guermouche@inria.fr>
 Guilbaud Adrien, Inria, <adrien.guilbaud@inria.fr>
@@ -30,6 +31,7 @@ Nakov Stojce, Inria, <stojce.nakov@inria.fr>
 Namyst Raymond, Université de Bordeaux, <raymond.namyst@labri.fr>
 Nesi Lucas Leandro, Federal University of Rio Grande do Sul (UFRGS), <llnesi@inf.ufrgs.br>
 Pablo Joris, Inria, <joris.pablo@orange.fr>
+Paillat Ludovic, Inria, <ludovic.paillat@inria.fr>
 Pasqualinotto Damien, Université de Bordeaux, <dam.pasqualinotto@wanadoo.fr>
 Pinto Vinicius Garcia, <vgpinto@inf.ufrgs.br>
 Pitoiset Samuel, Inria, <samuel.pitoiset@inria.fr>

+ 7 - 0
ChangeLog

@@ -53,6 +53,7 @@ New features:
   * Add peek_data interface method.
   * Add STARPU_MPI_REDUX
   * Add starpu_data_query_status2 function.
+  * Add starpu_data_evict_from_node function.
   * Add a StarPU Eclipse Plugin
 
 Small features:
@@ -64,6 +65,10 @@ Small features:
   * Deprecate starpu_free() and add new function starpu_free_noflag()
     to specify allocated size.
 
+Changes:
+  * The redux codelet should expose the STARPU_COMMUTE flag, since StarPU
+    actually uses commutability.
+
 Small changes:
   * Add a synthetic energy efficiency testcase.
   * Make reduction methods want the commute flag.
@@ -76,6 +81,8 @@ Small changes:
   - Change the default value for configure option --enable-maxcpus to
     auto. It allows StarPU to automatically use the number of CPUs
     on the build machine.
+  - New option --worker for tool starpu_machine_display to only
+    display workers of a specific type
 
 StarPU 1.3.8
 ====================================================================

+ 36 - 9
configure.ac

@@ -146,7 +146,7 @@ AC_ARG_ENABLE(link-with-maxeler, [AS_HELP_STRING([--disable-link-with-maxeler],
 
 if test x$enable_fpga = xyes
 then
-   	fpga_include_dir="${fpga_dir}/include"
+	fpga_include_dir="${fpga_dir}/include"
 	fpga_lib_dir="${fpga_dir}/lib"
 
 	SAVED_CPPFLAGS="${CPPFLAGS}"
@@ -154,12 +154,12 @@ then
 
 	STARPU_FPGA_CPPFLAGS="-I$fpga_include_dir"
 	if test x$link_with_riffa = xyes; then
-   	   STARPU_FPGA_LDFLAGS="-L$fpga_lib_dir -lfpga -lriffa -lrt -lm"
+	   STARPU_FPGA_LDFLAGS="-L$fpga_lib_dir -lfpga -lriffa -lrt -lm"
 	elif test x$link_with_maxeler = xyes; then
 	   STARPU_FPGA_CPPFLAGS="`slic-config --cflags | sed s/\'//g | sed "s/-I /-I/"` $STARPU_FPGA_CPPFLAGS"
-   	   STARPU_FPGA_LDFLAGS="`slic-config --libs | sed s/\'//g | sed "s/-L /-L/" | sed "s/-L /-L/"`"
+	   STARPU_FPGA_LDFLAGS="`slic-config --libs | sed s/\'//g | sed "s/-L /-L/" | sed "s/-L /-L/"`"
 	else
-   	   STARPU_FPGA_LDFLAGS="-L$fpga_lib_dir -lfpga -lrt -lm"
+	   STARPU_FPGA_LDFLAGS="-L$fpga_lib_dir -lfpga -lrt -lm"
 	fi
 
 	CPPFLAGS="${CPPFLAGS} ${STARPU_FPGA_CPPFLAGS} "
@@ -169,7 +169,7 @@ then
 	AC_HAVE_LIBRARY([fpga],[have_valid_fpga=yes],[have_valid_fpga=no])
 
 	if test x$have_valid_fpga = xyes; then
-   	   	AC_COMPILE_IFELSE(
+		AC_COMPILE_IFELSE(
 			[AC_LANG_PROGRAM([[#include<fpga.h>]],[[hello()]])]
 			[have_valid_fpga="yes"],
 			[have_valid_fpga="no"]
@@ -192,7 +192,7 @@ then
 	# in case FPGA was explicitly required, but is not available, this is an error
 	if test x$enable_fpga = xyes -a x$have_valid_fpga = xno; then
 		AC_MSG_ERROR([cannot find FPGA])
-    	fi
+	fi
 	# now we enable FPGA if and only if a proper setup is available
 	enable_fpga=$have_valid_fpga
 fi
@@ -202,7 +202,7 @@ AC_MSG_RESULT($enable_fpga)
 AC_SUBST(STARPU_USE_FPGA,$enable_fpga)
 AM_CONDITIONAL(STARPU_USE_FPGA,test x$enable_fpga = xyes)
 if test x$enable_fpga = xyes; then
-   	AC_DEFINE(STARPU_USE_FPGA,[1],[FPGA support is activated])
+	AC_DEFINE(STARPU_USE_FPGA,[1],[FPGA support is activated])
 fi
 
 
@@ -1161,11 +1161,16 @@ AC_ARG_ENABLE(maxcpus, [AS_HELP_STRING([--enable-maxcpus=<number>],
 			maxcpus=$enableval, maxcpus=auto)
 if test x$maxcpus == xauto
 then
-	maxcpus=$(getconf _NPROCESSORS_ONLN 2>/dev/null)
-	if test x$maxcpus = x
+	confcpu=$(getconf _NPROCESSORS_ONLN 2>/dev/null)
+	if test x$confcpu = x
 	then
 		AC_MSG_ERROR([cannot get the number of CPUS, please specify a numerical value with --enable-maxcpus])
 	fi
+	maxcpus=2
+	while test $maxcpus -lt $confcpu
+	do
+		maxcpus=`expr $maxcpus \* 2`
+	done
 fi
 AC_MSG_RESULT($maxcpus)
 AC_DEFINE_UNQUOTED(STARPU_MAXCPUS, [$maxcpus], [Maximum number of CPUs supported])
@@ -2153,6 +2158,21 @@ AM_CONDITIONAL([STARPU_USE_AYUDAME2], [test "x$enable_ayudame2" = "xyes"])
 STARPU_FXT_EVENT_DEFINES="`grep -E '#define\s+_STARPU_(MPI_)?FUT_' ${srcdir}/src/common/fxt.h ${srcdir}/mpi/src/starpu_mpi_fxt.h | grep 0x | grep -v 0x1 | cut -d : -f 2`"
 AC_SUBST([STARPU_FXT_EVENT_DEFINES])
 
+# Heteroprio works better if it can store information based on the program's name
+AC_MSG_CHECKING(whether the target supports program_invocation_short_name)
+AC_LINK_IFELSE([AC_LANG_SOURCE(
+	[
+		#include <stdio.h>
+		#include <errno.h>
+		int main() {
+			printf("%s\n", program_invocation_short_name);
+			return 0;
+		}
+	])],
+	[AC_DEFINE([STARPU_HAVE_PROGRAM_INVOCATION_SHORT_NAME], [1], [variable program_invocation_short_name is available]) AC_MSG_RESULT(yes)],
+	AC_MSG_RESULT(no)
+)
+
 ###############################################################################
 #                                                                             #
 #                  Miscellaneous options for StarPU                           #
@@ -3444,6 +3464,13 @@ AM_CONDITIONAL(STARPU_AVAILABLE_DOC, [test x$available_doc != xno])
 AM_CONDITIONAL(STARPU_BUILD_DOC_PDF, [test x$enable_build_doc_pdf != xno])
 AM_CONDITIONAL(STARPU_AVAILABLE_DOC_PDF, [test x$available_doc_pdf != xno])
 
+if test x$enable_build_doc_pdf != xno ; then
+   DOC_GENERATE_LATEX=YES
+else
+   DOC_GENERATE_LATEX=NO
+fi
+AC_SUBST(DOC_GENERATE_LATEX)
+
 ###############################################################################
 #                                                                             #
 #                                Julia                                        #

+ 1 - 1
contrib/ci.inria.fr/job-1-check.sh

@@ -45,7 +45,7 @@ cd $basename
 
 test -d $basename && chmod -R u+rwX $basename && rm -rf $basename
 tar xfz ../$tarball
-touch --date="last hour" $(find $basename)
+touch --date="last hour" $(find $basename) || true
 cd $basename
 mkdir build
 cd build

+ 10 - 3
doc/doxygen/Makefile.am

@@ -149,7 +149,12 @@ images = 	\
 	chapters/images/eclipse_hello_build.png	\
 	chapters/images/eclipse_hello_run.png	\
 	chapters/images/eclipse_hello_fxt.png	\
-	chapters/images/eclipse_hello_graph.png
+	chapters/images/eclipse_hello_graph.png	\
+	chapters/images/eclipse_hello_vite.png	\
+	chapters/images/eclipse_hello_svg_graph.png	\
+	chapters/images/eclipse_hello_plugin.png	\
+	chapters/images/eclipse_hello_paje_trace.png	\
+	chapters/images/eclipse_hello_hgraph.png
 
 if STARPU_BUILD_DOC
 EXTRA_DIST += \
@@ -273,12 +278,14 @@ $(DOX_TAG): $(dox_inputs)
 	@if test -f html/navtree.js ; then $(SED) -i 's/\[ "Files", "Files.html", null \]/\[ "", "Files.html", null \]/' html/navtree.js ; fi
 	@$(SED) -i 's/.*"Files.html".*//' html/pages.html
 	@if test -f latex/main.tex ; then mv latex/main.tex latex/index.tex ; fi
-	@$(SED) -i '/\\begin{titlepage}/,$$d' $(DOX_LATEX_DIR)/refman.tex
-	@cat $(top_srcdir)/doc/doxygen/refman.tex >> $(DOX_LATEX_DIR)/refman.tex
+	@if test -f $(DOX_LATEX_DIR)/refman.tex ; then $(SED) -i '/\\begin{titlepage}/,$$d' $(DOX_LATEX_DIR)/refman.tex ; fi
+	@if test -f $(DOX_LATEX_DIR)/refman.tex ; then cat $(top_srcdir)/doc/doxygen/refman.tex >> $(DOX_LATEX_DIR)/refman.tex ; fi
+	$(top_srcdir)/doc/doxygen/sectionNumbering.py $(top_srcdir)/doc/doxygen/chapters/ $(DOX_HTML_DIR)
 
 $(DOX_HTML_DIR): $(DOX_TAG)
 
 $(DOX_PDF): $(DOX_TAG) refman.tex $(images)
+	$(MKDIR_P) $(DOX_LATEX_DIR)
 	@cp $(top_srcdir)/doc/doxygen/chapters/version.sty $(DOX_LATEX_DIR)
 	@cp $(top_srcdir)/doc/doxygen/chapters/images/*pdf $(DOX_LATEX_DIR)
 	@echo $(PDFLATEX) $(DOX_LATEX_DIR)/refman.tex

+ 15 - 15
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -99,6 +99,21 @@ cudaStreamSynchronize(starpu_cuda_get_local_stream());
 as well as the use of \c cudaMemcpyAsync(), etc. for each CUDA operation one needs
 to use a version that takes a stream parameter.
 
+If the kernel uses its own non-default stream, one can synchronize this stream
+with the StarPU-provided stream this way:
+
+\code{.c}
+cudaEvent_t event;
+call_kernel_with_its_own_stream()
+cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
+cudaEventRecord(event, get_kernel_stream());
+cudaStreamWaitEvent(starpu_cuda_get_local_stream(), event, 0);
+cudaEventDestroy(event);
+\endcode
+
+This code makes the StarPU-provided stream wait for a new event, which will be
+triggered by the completion of the kernel.
+
 Unfortunately, some CUDA libraries do not have stream variants of
 kernels. This will seriously lower the potential for overlapping.
 If some CUDA calls are made without specifying this local stream,
@@ -129,21 +144,6 @@ able to submit and complete data transfers while kernels are executing, instead
 kernel submission. The kernel just has to make sure that StarPU can use the
 local stream to synchronize with the kernel startup and completion.
 
-If the kernel uses its own non-default stream, one can synchronize this stream
-with the StarPU-provided stream this way:
-
-\code{.c}
-cudaEvent_t event;
-call_kernel_with_its_own_stream()
-cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
-cudaEventRecord(event, get_kernel_stream());
-cudaStreamWaitEvent(starpu_cuda_get_local_stream(), event, 0);
-cudaEventDestroy(event);
-\endcode
-
-This code makes the StarPU-provided stream wait for a new event, which will be
-triggered by the completion of the kernel.
-
 Using the flag ::STARPU_CUDA_ASYNC also permits to enable concurrent kernel
 execution, on cards which support it (Kepler and later, notably). This is
 enabled by setting the environment variable \ref STARPU_NWORKER_PER_CUDA to the

+ 9 - 4
doc/doxygen/chapters/310_data_management.doxy

@@ -529,12 +529,12 @@ When we want to switch to the vertical slice view, all we need to do is bring
 coherency between them by running an empty task on the home node of the data:
 
 \code{.c}
-void empty(void *buffers[], void *cl_arg)
-{ }
 struct starpu_codelet cl_switch =
 {
-	.cpu_funcs = {empty},
-	.nbuffers = STARPU_VARIABLE_NBUFFERS,
+	.where = STARPU_NOWHERE,
+	.nbuffers = 3,
+	.specific_nodes = 1,
+	.nodes = { STARPU_MAIN_RAM, STARPU_MAIN_RAM, STARPU_MAIN_RAM },
 };
 
 ret = starpu_task_insert(&cl_switch, STARPU_RW, handle,
@@ -552,6 +552,11 @@ Again, we prefer to make sure that we don't accidentally access the matrix throu
 starpu_data_invalidate_submit(handle);
 \endcode
 
+Note: when enabling a set of handles in this way, the handles of the set must
+not overlap, i.e. they must not have any part of the data in
+common, otherwise StarPU will not properly handle concurrent accesses between
+them.
+
 And now we can start using vertical slices, etc.
 
 \section DataPointers Handles data buffer pointers

+ 53 - 12
doc/doxygen/chapters/320_scheduling.doxy

@@ -374,7 +374,11 @@ The tasks are stored inside buckets, where each bucket corresponds to a priority
 worker uses an indirect access array to know the order in which it should access the buckets. Moreover,
 all the tasks inside a bucket must be compatible with all the processing units that may access it (at least).
 
-As an example, see the following code where we have 5 types of tasks.
+These priorities are now automatically assigned by Heteroprio in auto calibration mode using heuristics.
+If you want to set these priorities manually, you can change \ref STARPU_HETEROPRIO_USE_AUTO_CALIBRATION
+and follow the example below.
+
+In this example code, we have 5 types of tasks.
 CPU workers can compute all of them, but CUDA workers can only execute
 tasks of types 0 and 1, and are expected to be 20 and 30 times
 faster than the CPU, respectively.
@@ -388,7 +392,7 @@ starpu_conf_init(&conf);
 conf.sched_policy_name = "heteroprio";
  // Inform StarPU about the function that will init the priorities in Heteroprio
  // where init_heteroprio is a function to implement
-conf.sched_policy_init = &init_heteroprio;
+conf.sched_policy_callback = &init_heteroprio;
  // Do other things with conf if needed, then init StarPU
 starpu_init(&conf);
 \endcode
@@ -396,31 +400,31 @@ starpu_init(&conf);
 \code{.c}
 void init_heteroprio(unsigned sched_ctx) {
   // CPU uses 5 buckets and visits them in the natural order
-  starpu_heteroprio_set_nb_prios(ctx, STARPU_CPU_WORKER, 5);
+  starpu_heteroprio_set_nb_prios(sched_ctx, STARPU_CPU_WORKER, 5);
   // It uses direct mapping idx => idx
   for(unsigned idx = 0; idx < 5; ++idx){
-    starpu_heteroprio_set_mapping(ctx, STARPU_CPU_WORKER, idx, idx);
+    starpu_heteroprio_set_mapping(sched_ctx, STARPU_CPU_WORKER, idx, idx);
     // If there is no CUDA worker we must tell that CPU is faster
-    starpu_heteroprio_set_faster_arch(ctx, STARPU_CPU_WORKER, idx);
+    starpu_heteroprio_set_faster_arch(sched_ctx, STARPU_CPU_WORKER, idx);
   }
 
   if(starpu_cuda_worker_get_count()){
     // CUDA is enabled and uses 2 buckets
-    starpu_heteroprio_set_nb_prios(ctx, STARPU_CUDA_WORKER, 2);
+    starpu_heteroprio_set_nb_prios(sched_ctx, STARPU_CUDA_WORKER, 2);
     // CUDA will first look at bucket 1
-    starpu_heteroprio_set_mapping(ctx, STARPU_CUDA_WORKER, 0, 1);
+    starpu_heteroprio_set_mapping(sched_ctx, STARPU_CUDA_WORKER, 0, 1);
     // CUDA will then look at bucket 2
-    starpu_heteroprio_set_mapping(ctx, STARPU_CUDA_WORKER, 1, 2);
+    starpu_heteroprio_set_mapping(sched_ctx, STARPU_CUDA_WORKER, 1, 2);
 
     // For bucket 1 CUDA is the fastest
-    starpu_heteroprio_set_faster_arch(ctx, STARPU_CUDA_WORKER, 1);
+    starpu_heteroprio_set_faster_arch(sched_ctx, STARPU_CUDA_WORKER, 1);
     // And CPU is 30 times slower
-    starpu_heteroprio_set_arch_slow_factor(ctx, STARPU_CPU_WORKER, 1, 30.0f);
+    starpu_heteroprio_set_arch_slow_factor(sched_ctx, STARPU_CPU_WORKER, 1, 30.0f);
 
     // For bucket 0 CUDA is the fastest
-    starpu_heteroprio_set_faster_arch(ctx, STARPU_CUDA_WORKER, 0);
+    starpu_heteroprio_set_faster_arch(sched_ctx, STARPU_CUDA_WORKER, 0);
     // And CPU is 20 times slower
-    starpu_heteroprio_set_arch_slow_factor(ctx, STARPU_CPU_WORKER, 0, 20.0f);
+    starpu_heteroprio_set_arch_slow_factor(sched_ctx, STARPU_CPU_WORKER, 0, 20.0f);
   }
 }
 \endcode
@@ -431,4 +435,41 @@ So, in the given example, the priority of a task will be between 0 and 4 include
 However, tasks of priorities 0-1 must provide CPU and CUDA kernels, and
 tasks of priorities 2-4 must provide CPU kernels (at least).
 
+\subsection LAHeteroprio Using locality aware Heteroprio
+
+Heteroprio supports a mode where locality is evaluated to guide the distribution
+of the tasks (see https://peerj.com/articles/cs-190.pdf).
+Currently, this mode is available using the dedicated function or an environment variable
+\ref STARPU_HETEROPRIO_USE_LA, and can be configured using environment variables.
+\code{.c}
+void starpu_heteroprio_set_use_locality(unsigned sched_ctx_id, unsigned use_locality);
+\endcode
+
+In this mode, multiple strategies are available to determine which memory node's workers are the most qualified for executing a specific task. This strategy can be set with \ref STARPU_LAHETEROPRIO_PUSH
+and available strategies are:
+- WORKER: the worker which pushed the task is preferred for the execution.
+- LcS: the node with the shortest data transfer time (estimated by StarPU) is the most qualified
+- LS_SDH: the node with the smallest data amount to be transferred will be preferred.
+- LS_SDH2: similar to LS_SDH, but data in write access is counted in a quadratic manner to give them more importance.
+- LS_SDHB: similar to LS_SDH, but data in write access is balanced with a coefficient (its value is set to 1000) and
+for the same amount of data, the one with less pieces of data to be transferred will be preferred.
+- LC_SMWB: similar to LS_SDH, but the amount of data in write access gets multiplied by a coefficient which gets closer to 2
+as the amount of data in read access gets larger than the data in write access.
+- AUTO: strategy by default, this one selects the best strategy and changes it in runtime to improve performance
+
+Other environment variables to configure LaHeteroprio are documented in \ref ConfiguringLaHeteroprio
+
+\subsection AutoHeteroprio Using Heteroprio in auto-calibration mode
+
+In this mode, Heteroprio saves data about each program execution, in order to improve future ones.
+By default, these files are stored in the folder used by perfmodel, but this can be changed using the
+\ref STARPU_HETEROPRIO_DATA_DIR environment variable. You can also specify the data filename directly using
+\ref STARPU_HETEROPRIO_DATA_FILE.
+
+Additionally, to assign priorities to tasks, Heteroprio needs a way to detect that some tasks are similar.
+By default, Heteroprio looks for tasks with the same perfmodel, or with the same codelet's name if no perfmodel was assigned.
+This behavior can be changed to only consider the codelet's name by setting
+\ref STARPU_HETEROPRIO_CODELET_GROUPING_STRATEGY to <c>1</c>
+
+Other environment variables to configure AutoHeteroprio are documented in \ref ConfiguringAutoHeteroprio
 */

+ 104 - 10
doc/doxygen/chapters/380_offline_performance_tools.doxy

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
- * Copyright (C) 2020       Federal University of Rio Grande do Sul (UFRGS)
+ * Copyright (C) 2020,2021  Federal University of Rio Grande do Sul (UFRGS)
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -284,10 +284,10 @@ flow of tasks between the components of the modular scheduler.
 
 \subsubsection TimeBetweenSendRecvDataUse Analyzing Time Between MPI Data Transfer and Use by Tasks
 
-<c>starpu_fxt_tool</c> produces a file called <c>comms.rec</c> which describes all 
-MPI communications. The script <c>starpu_send_recv_data_use.py</c> uses this file 
-and <c>tasks.rec</c> in order to produce two graphs: the first one shows durations 
-between the reception of data and their usage by a task and the second one plots the 
+<c>starpu_fxt_tool</c> produces a file called <c>comms.rec</c> which describes all
+MPI communications. The script <c>starpu_send_recv_data_use.py</c> uses this file
+and <c>tasks.rec</c> in order to produce two graphs: the first one shows durations
+between the reception of data and their usage by a task and the second one plots the
 same graph but with elapsed time between send and usage of a data by the sender.
 
 \image html trace_recv_use.png
@@ -403,7 +403,7 @@ starpu_perfmodel_load_symbol(). The source code of the tool
 
 An XML output can also be printed by using the <c>-x</c> option:
 \verbatim
-$ tools/starpu_perfmodel_display -x -s non_linear_memset_regression_based 
+$ tools/starpu_perfmodel_display -x -s non_linear_memset_regression_based
 <?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE StarPUPerfmodel SYSTEM "starpu-perfmodel.dtd">
 <!-- symbol non_linear_memset_regression_based -->
@@ -429,7 +429,7 @@ models. It writes a <c>.gp</c> file in the current directory, to be
 run with the tool <c>gnuplot</c>, which shows the corresponding curve.
 
 \verbatim
-$ tools/starpu_perfmodel_plot -s non_linear_memset_regression_based 
+$ tools/starpu_perfmodel_plot -s non_linear_memset_regression_based
 $ gnuplot starpu_non_linear_memset_regression_based.gp
 $ gv starpu_non_linear_memset_regression_based.eps
 \endverbatim
@@ -650,12 +650,12 @@ $ starpu_paje_sort paje.trace
 Performance counter values could be obtained from the PAPI framework if
 <c>./configure</c> detected the libpapi.
 
-In Debian, packages <c>libpapi-dev</c> and <c>libpapi5.7</c> provide required
-files.  Package <c>papi-tools</c> contains a set of useful tools, for example
+In Debian, the <c>libpapi-dev</c> package provides the required
+files.  Additionally, the <c>papi-tools</c> package contains a set of useful tools, for example
 <c>papi_avail</c> to see which counters are available.
 
 To be able to use Papi counters, one may need to reduce the level of the kernel
-parameter <c>kernel.perf_event_paranoid</c> to at least 2. See
+parameter <c>kernel.perf_event_paranoid</c> to 2 or below. See
 https://www.kernel.org/doc/html/latest/admin-guide/perf-security.html for the
 security impact of this parameter.
 
@@ -841,6 +841,100 @@ An example of visualization follows:
 \image html starvz_visu_r.png
 \image latex starvz_visu_r.pdf "" width=\textwidth
 
+\section EclipsePluginUsage StarPU Eclipse Plugin
+
+The StarPU Eclipse Plugin provides the ability to generate the
+different traces directly from the Eclipse IDE. Once StarPU has been
+configured and installed with its Eclipse plugin (see Section \ref
+EclipsePlugin), you first need to set up your environment for StarPU.
+
+\verbatim
+cd $HOME/usr/local/starpu
+source ./bin/starpu_env
+\endverbatim
+
+To generate traces from the application, it is necessary to set \ref
+STARPU_FXT_TRACE to 1.
+
+\verbatim
+export STARPU_FXT_TRACE=1
+\endverbatim
+
+The eclipse workspace together with an example is available in \c
+lib/starpu/eclipse-plugin.
+
+\verbatim
+cd ./lib/starpu/eclipse-plugin
+eclipse -data workspace
+\endverbatim
+
+You can then open the file \c hello/hello.c, and build the application
+by pressing \c Ctrl-B.
+
+\image html eclipse_hello_build.png
+\image latex eclipse_hello_build.png "" width=\textwidth
+
+The application can now be executed.
+
+\image html eclipse_hello_run.png
+\image latex eclipse_hello_run.png "" width=\textwidth
+
+After executing the C/C++ StarPU application, one can use the StarPU
+plugin to generate and visualise the task graph of the application.
+The StarPU Eclipse plugin is available either through the icons in the
+upper toolbar, or from the dropdown menu \c StarPU.
+
+\image html eclipse_hello_plugin.png
+\image latex eclipse_hello_plugin.png "" width=\textwidth
+
+To start, one first needs to run the StarPU FxT tool, either through
+the \c FxT icon of the toolbar, or from the menu \c StarPU / <c>StarPU
+FxT Tool</c>. This will call the tool \c starpu_fxt_tool to generate
+traces for your application execution.
+
+A message dialog box is displayed to confirm the generation of the
+different traces.
+
+\image html eclipse_hello_fxt.png
+\image latex eclipse_hello_fxt.png "" width=\textwidth
+
+One of the generated files is a Paje trace which can be viewed with
+ViTE, a trace explorer. To open and visualise the file \c paje.trace with
+ViTE, one can select the second command of the StarPU menu, which is
+named <c>Generate Paje Trace</c>, or click on the second icon named
+<c>Trace</c> in the toolbar.
+
+\image html eclipse_hello_paje_trace.png
+\image latex eclipse_hello_paje_trace.png "" width=\textwidth
+
+\image html eclipse_hello_vite.png
+\image latex eclipse_hello_vite.png "" width=\textwidth
+
+Another generated trace file is a task graph described using the DOT
+language. It is possible to get a graphical output of the graph by
+calling the <c>graphviz library</c>. To do this, one can click on the
+third command of StarPU menu. A task graph of the application in
+the \c png format is then generated.
+
+\image html eclipse_hello_graph.png
+\image latex eclipse_hello_graph.png "" width=\textwidth
+
+In the StarPU Eclipse plugin, one can display the task graph directly from
+Eclipse, or through a web browser. To do this, there is another
+command named <c>Generate SVG graph</c> in the StarPU menu, or HGraph
+in the Eclipse toolbar.
+
+From the HTML file, you can see the task graph, and by clicking on a
+task name, it will open the C file in which the task submission was
+called (if you have an editor which understands the syntax \c
+href="file.c#123").
+
+\image html eclipse_hello_svg_graph.png
+\image latex eclipse_hello_svg_graph.png "" width=\textwidth
+
+\image html eclipse_hello_hgraph.png
+\image latex eclipse_hello_hgraph.png "" width=\textwidth
+
 \section MemoryFeedback Memory Feedback
 
 It is possible to enable memory statistics. To do so, you need to pass

+ 1 - 1
doc/doxygen/chapters/400_python.doxy

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 2 - 43
doc/doxygen/chapters/497_eclipse_plugin.doxy

@@ -80,48 +80,7 @@ $ ls $HOME/usr/local/eclipse/java-2021-03/eclipse/dropins
 StarPU_1.0.0.202105272056.jar
 \endverbatim
 
-\section PluginUsage Using The StarPU Eclipse Plugin
-
-You first need to set up your environment for StarPU.
-
-\verbatim
-cd $HOME/usr/local/starpu
-source ./bin/starpu_env
-\endverbatim
-
-The eclipse workspace together with an example is available in \c lib/starpu/eclipse-plugin.
-
-\verbatim
-cd ./lib/starpu/eclipse-plugin
-eclipse -data workspace
-\endverbatim
-
-You can then open the file \c hello/hello.c, and build the application by pressing \c Ctrl-B.
-
-\image html eclipse_hello_build.png
-\image latex eclipse_hello_build.png "" width=10cm
-
-The application can now be executed.
-
-\image html eclipse_hello_run.png
-\image latex eclipse_hello_run.png "" width=10cm
-
-The StarPU plugin eclipse is either available through the icons in the upper toolbar, or from the dropdown menu \c StarPU.
-
-You first need to run the StarPU FxT tool, either through the \c FxT icon, or from the menu \c StarPU / \c StarPU Fxt Tool.
-This will call the tool \c starpu_fxt_tool to generate traces for your application execution.
-
-\image html eclipse_hello_fxt.png
-\image latex eclipse_hello_fxt.png "" width=10cm
-
-You can then visualise the Paje trace with the tool \c vite, or display the graph task directly from eclipse.
-
-\image html eclipse_hello_graph.png
-\image latex eclipse_hello_graph.png "" width=10cm
-
-or through a web browser. From the HTML file, you can see the graph
-task, and by clicking on a task name, it will open the C file in which
-the task submission was called (if you have an editor which
-understands the syntax \c href="file.c#123").
+You can now go to Section \ref EclipsePluginUsage to see how to use
+the plugin.
 
 */

+ 118 - 1
doc/doxygen/chapters/501_environment_variables.doxy

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  * Copyright (C) 2016       Uppsala University
- * Copyright (C) 2020       Federal University of Rio Grande do Sul (UFRGS)
+ * Copyright (C) 2020,2021  Federal University of Rio Grande do Sul (UFRGS)
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -646,6 +646,123 @@ Specify which PAPI events should be recorded in the trace (\ref PapiCounters).
 
 </dl>
 
+\section ConfiguringHeteroprio Configuring The Heteroprio Scheduler
+
+\subsection ConfiguringLaHeteroprio Configuring LAHeteroprio
+<dl>
+
+<dt>STARPU_HETEROPRIO_USE_LA</dt>
+<dd>
+\anchor STARPU_HETEROPRIO_USE_LA
+\addindex __env__STARPU_HETEROPRIO_USE_LA
+Enable the locality aware mode of Heteroprio which guides the distribution of tasks to workers
+in order to reduce the data transfers between memory nodes.
+</dd>
+
+<dt>STARPU_LAHETEROPRIO_PUSH</dt>
+<dd>
+\anchor STARPU_LAHETEROPRIO_PUSH
+\addindex __env__STARPU_LAHETEROPRIO_PUSH
+Choose between the different push strategies for locality aware Heteroprio:
+WORKER, LcS, LS_SDH, LS_SDH2, LS_SDHB, LC_SMWB, AUTO (by default: AUTO). These are detailed in
+\ref LAHeteroprio.
+</dd>
+
+<dt>STARPU_LAHETEROPRIO_S_[ARCH]</dt>
+<dd>
+\anchor STARPU_LAHETEROPRIO_S_[ARCH]
+\addindex __env__STARPU_LAHETEROPRIO_S_arch
+Specify the number of memory nodes contained in an affinity group. An affinity
+group will be composed of the closest memory nodes to a worker of a given architecture,
+and this worker will look for tasks available inside these memory nodes, before
+considering stealing tasks outside this group.
+ARCH can be CPU, CUDA, OPENCL, MICC, SCC, MPI_MS, etc.
+</dd>
+
+<dt>STARPU_LAHETEROPRIO_PRIO_STEP_[ARCH]</dt>
+<dd>
+\anchor STARPU_LAHETEROPRIO_PRIO_STEP_[ARCH]
+\addindex __env__STARPU_LAHETEROPRIO_PRIO_STEP_arch
+Specify the number of buckets in the local memory node in which a worker will look for
+available tasks, before this worker starts looking for tasks in other memory nodes' buckets.
+ARCH indicates that this number is specific to a given arch which can be:
+CPU, CUDA, OPENCL, MICC, SCC, MPI_MS, etc.
+</dd>
+
+</dl>
+
+\subsection ConfiguringAutoHeteroprio Configuring AutoHeteroprio
+<dl>
+
+<dt>STARPU_HETEROPRIO_USE_AUTO_CALIBRATION</dt>
+<dd>
+\anchor STARPU_HETEROPRIO_USE_AUTO_CALIBRATION
+\addindex __env__STARPU_HETEROPRIO_USE_AUTO_CALIBRATION
+Enable the auto calibration mode of Heteroprio which assigns priorities to tasks automatically.
+</dd>
+
+<dt>STARPU_HETEROPRIO_DATA_DIR</dt>
+<dd>
+\anchor STARPU_HETEROPRIO_DATA_DIR
+\addindex __env__STARPU_HETEROPRIO_DATA_DIR
+Specify the path of the directory where Heteroprio stores data about program executions.
+By default, these are stored in the same directory used by perfmodel.
+</dd>
+
+<dt>STARPU_HETEROPRIO_DATA_FILE</dt>
+<dd>
+\anchor STARPU_HETEROPRIO_DATA_FILE
+\addindex __env__STARPU_HETEROPRIO_DATA_FILE
+Specify the filename where Heteroprio will save data about the current program's execution.
+</dd>
+
+<dt>STARPU_HETEROPRIO_CODELET_GROUPING_STRATEGY</dt>
+<dd>
+\anchor STARPU_HETEROPRIO_CODELET_GROUPING_STRATEGY
+\addindex __env__STARPU_HETEROPRIO_CODELET_GROUPING_STRATEGY
+Choose how Heteroprio groups similar tasks. It can be <c>0</c> to group
+the tasks with the same perfmodel or the same codelet's name if no perfmodel was assigned.
+Or, it could be <c>1</c> to group the tasks only by codelet's name.
+</dd>
+
+<dt>STARPU_AUTOHETEROPRIO_PRINT_DATA_ON_UPDATE</dt>
+<dd>
+\anchor STARPU_AUTOHETEROPRIO_PRINT_DATA_ON_UPDATE
+\addindex __env__STARPU_AUTOHETEROPRIO_PRINT_DATA_ON_UPDATE
+Enable the printing of priorities' data every time they get updated.
+</dd>
+
+<dt>STARPU_AUTOHETEROPRIO_PRINT_AFTER_ORDERING</dt>
+<dd>
+\anchor STARPU_AUTOHETEROPRIO_PRINT_AFTER_ORDERING
+\addindex __env__STARPU_AUTOHETEROPRIO_PRINT_AFTER_ORDERING
+Enable the printing of priorities' order for each architecture every time there's a reordering.
+</dd>
+
+<dt>STARPU_AUTOHETEROPRIO_PRIORITY_ORDERING_POLICY</dt>
+<dd>
+\anchor STARPU_AUTOHETEROPRIO_PRIORITY_ORDERING_POLICY
+\addindex __env__STARPU_AUTOHETEROPRIO_PRIORITY_ORDERING_POLICY
+Specify the heuristic which will be used to assign priorities automatically.
+It should be an integer between 0 and 27.
+</dd>
+
+<dt>STARPU_AUTOHETEROPRIO_ORDERING_INTERVAL</dt>
+<dd>
+\anchor STARPU_AUTOHETEROPRIO_ORDERING_INTERVAL
+\addindex __env__STARPU_AUTOHETEROPRIO_ORDERING_INTERVAL
+Specify the period (in number of tasks pushed) between priority reordering operations.
+</dd>
+
+<dt>STARPU_AUTOHETEROPRIO_FREEZE_GATHERING</dt>
+<dd>
+\anchor STARPU_AUTOHETEROPRIO_FREEZE_GATHERING
+\addindex __env__STARPU_AUTOHETEROPRIO_FREEZE_GATHERING
+Disable data gathering from task executions.
+</dd>
+
+</dl>
+
 \section Extensions Extensions
 
 <dl>

+ 7 - 0
doc/doxygen/chapters/510_configure_options.doxy

@@ -488,6 +488,13 @@ Enable OpenMP Support (\ref OpenMPRuntimeSupport)
 Enable cluster Support (\ref ClusteringAMachine)
 </dd>
 
+<dt>--enable-eclipse-plugin</dt>
+<dd>
+\anchor enable-eclipse-plugin
+\addindex __configure__--enable-eclipse-plugin
+Enable the StarPU Eclipse Plugin. See \ref EclipsePlugin to know how to install Eclipse.
+</dd>
+
 </dl>
 
 \section AdvancedConfiguration Advanced Configuration

二进制
doc/doxygen/chapters/images/eclipse_hello_hgraph.png


二进制
doc/doxygen/chapters/images/eclipse_hello_paje_trace.png


二进制
doc/doxygen/chapters/images/eclipse_hello_plugin.png


二进制
doc/doxygen/chapters/images/eclipse_hello_svg_graph.png


二进制
doc/doxygen/chapters/images/eclipse_hello_vite.png


+ 1 - 1
doc/doxygen/dev/starpu_check_include.sh

@@ -1,7 +1,7 @@
 #!/bin/bash
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+# Copyright (C) 2020-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by

+ 1 - 0
doc/doxygen/doxygen-config.cfg.in

@@ -88,3 +88,4 @@ INPUT_FILTER           = @top_builddir@/doc/doxygen/doxygen_filter.sh
 
 IMAGE_PATH             = @top_srcdir@/doc/doxygen/chapters/images
 
+GENERATE_LATEX         = @DOC_GENERATE_LATEX@

+ 2 - 1
doc/doxygen/doxygen.cfg

@@ -1319,7 +1319,8 @@ EXTRA_SEARCH_MAPPINGS  =
 # If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
 # generate Latex output.
 
-GENERATE_LATEX         = YES
+#defined in doxygen-config.cfg
+#GENERATE_LATEX         = YES
 
 # The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
 # If a relative path is entered the value of OUTPUT_DIRECTORY will be

+ 1 - 0
doc/doxygen/refman.tex

@@ -269,6 +269,7 @@ Documentation License”.
 \input{group__API__Master__Slave}
 \input{group__API__Random__Functions}
 \input{group__API__Sink}
+\input{group__API__HeteroPrio}
 
 \chapter{File Index}
 \input{files}

+ 43 - 0
doc/doxygen/sectionNumbering.py

@@ -0,0 +1,43 @@
+#!/usr/bin/python3
+
+import os
+import operator
+import sys
+
+files = {}
+for x in os.listdir(sys.argv[1]):
+    if x.endswith(".doxy"):
+        with open(sys.argv[1]+x, "r", encoding="utf-8") as fin:
+            for line in fin.readlines():
+                if "\page" in line:
+                    line = line.replace("/*! \page ", "").strip()
+                    files[x] = line[0:line.index(" ")]+".html"
+
+sfiles= dict(sorted(files.items(), key=operator.itemgetter(0)))
+htmlfiles = ["index.html"]
+htmlfiles.extend(sfiles.values())
+
+htmldir=sys.argv[2]+"/"
+
+chapter=0
+for x in htmlfiles:
+    chapter+=1
+    section=0
+    with open(htmldir+x, "r", encoding="utf-8") as fin:
+        with open(htmldir+x+".count.html", "w", encoding="utf-8") as fout:
+            for line in fin.readlines():
+                if "<div class=\"title\">" in line:
+                    line = line.replace("<div class=\"title\">", "<div class=\"title\">"+str(chapter)+". ")
+                if "<h1>" in line:
+                    section += 1
+                    line = line.replace("<h1>", "<h1>" + str(chapter) + "." + str(section))
+                    subsection = 0
+                if "<h2>" in line:
+                    subsection += 1
+                    line = line.replace("<h2>", "<h2>" + str(chapter) + "." + str(section) + "." + str(subsection))
+                    subsubsection = 0
+                if "<h3>" in line:
+                    subsubsection += 1
+                    line = line.replace("<h3>", "<h3>" + str(chapter) + "." + str(section) + "." + str(subsection) + "." + str(subsubsection))
+                fout.write(line)
+    os.rename(htmldir+x+".count.html", htmldir+x)

+ 4 - 2
doc/doxygen_dev/Makefile.am

@@ -211,10 +211,12 @@ $(DOX_TAG): $(dox_inputs)
 	@if test -f html_dev/navtree.js ; then $(SED) -i 's/\[ "Files", "Files.html", null \]/\[ "", "Files.html", null \]/' html_dev/navtree.js ; fi
 	@$(SED) -i 's/.*"Files.html".*//' html_dev/pages.html
 	@if test -f latex/main.tex ; then mv latex/main.tex latex/index.tex ; fi
-	@$(SED) -i '/\\begin{titlepage}/,$$d' $(DOX_LATEX_DIR)/refman.tex
-	@cat $(top_srcdir)/doc/doxygen_dev/refman.tex >> $(DOX_LATEX_DIR)/refman.tex
+	@if test -f $(DOX_LATEX_DIR)/refman.tex ; then $(SED) -i '/\\begin{titlepage}/,$$d' $(DOX_LATEX_DIR)/refman.tex ; fi
+	@if test -f $(DOX_LATEX_DIR)/refman.tex ; then cat $(top_srcdir)/doc/doxygen_dev/refman.tex >> $(DOX_LATEX_DIR)/refman.tex ; fi
+	$(top_srcdir)/doc/doxygen/sectionNumbering.py $(top_srcdir)/doc/doxygen_dev/chapters/ $(DOX_HTML_DIR)
 
 $(DOX_PDF): $(DOX_TAG) refman.tex
+	$(MKDIR_P) $(DOX_LATEX_DIR)
 	@cp $(top_srcdir)/doc/doxygen_dev/chapters/version.sty $(DOX_LATEX_DIR)
 	@cp $(top_srcdir)/doc/doxygen_dev/modules.tex $(DOX_LATEX_DIR)
 	@echo $(PDFLATEX) $(DOX_LATEX_DIR)/refman.tex

+ 1 - 1
doc/doxygen_dev/dev/starpu_check_missing.sh

@@ -1,7 +1,7 @@
 #!/bin/bash
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+# Copyright (C) 2020-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by

+ 1 - 0
doc/doxygen_dev/doxygen-config.cfg.in

@@ -134,3 +134,4 @@ INPUT_FILTER           = @top_builddir@/doc/doxygen_dev/doxygen_filter.sh
 
 #IMAGE_PATH             = @top_srcdir@/doc/doxygen_dev/chapters/images
 
+GENERATE_LATEX         = @DOC_GENERATE_LATEX@

+ 2 - 1
doc/doxygen_dev/doxygen.cfg

@@ -1319,7 +1319,8 @@ EXTRA_SEARCH_MAPPINGS  =
 # If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
 # generate Latex output.
 
-GENERATE_LATEX         = YES
+#defined in doxygen-config.cfg
+#GENERATE_LATEX         = YES
 
 # The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
 # If a relative path is entered the value of OUTPUT_DIRECTORY will be

+ 6 - 5
doc/tutorial/Makefile

@@ -13,12 +13,12 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 #
-CFLAGS          +=      $$(pkg-config --cflags starpu-1.1)
-LDLIBS          +=      $$(pkg-config --libs starpu-1.1)
+CFLAGS          +=      $$(pkg-config --cflags starpu-1.3)
+LDLIBS          +=      $$(pkg-config --libs starpu-1.3)
 
-HAS_CUDA	=	$(shell pkg-config --libs starpu-1.1 |grep -i cuda)
+HAS_CUDA	=	$(shell starpu_machine_display | grep "CUDA" | grep -v "No CUDA worker" | head -1)
 NVCC		?=	nvcc
-HAS_OPENCL	=	$(shell pkg-config --libs starpu-1.1 |grep -i opencl)
+HAS_OPENCL	=	$(shell starpu_machine_display | grep "OpenCL" | grep -v "No OpenCL worker" | head -1)
 
 %.o: %.cu
 	nvcc $(CFLAGS) $< -c
@@ -35,7 +35,8 @@ else
 VECTOR_SCAL_COMPILER		=	$(CC)
 endif
 ifneq ($(strip $(HAS_OPENCL)),)
-VECTOR_SCAL_PREREQUISITES += vector_scal_opencl.o
+VECTOR_SCAL_PREREQUISITES 	+=	vector_scal_opencl.o
+LDLIBS				+=	-lOpenCL
 endif
 
 vector_scal: $(VECTOR_SCAL_PREREQUISITES)

+ 19 - 15
doc/tutorial/vector_scal_opencl.c

@@ -21,38 +21,42 @@ extern struct starpu_opencl_program programs;
 void vector_scal_opencl(void *buffers[], void *_args)
 {
 	float *factor = _args;
-	int id, devid, err;
+	int id, devid;
+	cl_int err;
 	cl_kernel kernel;
 	cl_command_queue queue;
-	cl_event event;
 
 	/* length of the vector */
-	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+	unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
 	/* OpenCL copy of the vector pointer */
 	cl_mem val = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
 
 	id = starpu_worker_get_id();
 	devid = starpu_worker_get_devid(id);
 
-	err = starpu_opencl_load_kernel(&kernel, &queue, &programs,
-					"vector_mult_opencl", devid);   /* Name of the codelet defined above */
+	err = starpu_opencl_load_kernel(&kernel, &queue, &programs, "vector_mult_opencl", devid);
 	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
-	err = clSetKernelArg(kernel, 0, sizeof(val), &val);
-	err |= clSetKernelArg(kernel, 1, sizeof(n), &n);
+	err = clSetKernelArg(kernel, 0, sizeof(n), &n);
+	err |= clSetKernelArg(kernel, 1, sizeof(val), &val);
 	err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
 	if (err) STARPU_OPENCL_REPORT_ERROR(err);
 
 	{
-		size_t global=1;
-		size_t local=1;
-		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
-		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
-	}
+		size_t global=n;
+		size_t local;
+		size_t s;
+		cl_device_id device;
+
+		starpu_opencl_get_device(devid, &device);
 
-	clFinish(queue);
-	starpu_opencl_collect_stats(event);
-	clReleaseEvent(event);
+		err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+		if (local > global) local=global;
+		else global = (global + local-1) / local * local;
 
+		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
+		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
+	}
 	starpu_opencl_release_kernel(kernel);
 }

+ 4 - 4
eclipse-plugin/src/starpu/handlers/SvgHandler.java

@@ -54,10 +54,10 @@ public class SvgHandler extends AbstractHandler {
 				else
 				{
 					String map = TraceUtils.readFileToString(workDir + "/output.map");
-					Pattern p = Pattern.compile("href=\"[a-zA-Z./]+#");
-					IPath ipath = ((IPathEditorInput) input).getPath().makeAbsolute();
-					String chemin = ipath.toString();
-					String replaceBy = "href=\"" + chemin + "#";
+					Pattern p = Pattern.compile("href=\"([^#\"/]+/)*");
+					IPath ipath = ((IPathEditorInput) input).getPath().makeAbsolute().removeLastSegments(1);
+					String path = ipath.toString();
+					String replaceBy = "href=\"" + path + "/";
 					map = p.matcher(map).replaceAll(replaceBy);
 
 					PrintWriter pw = new PrintWriter(workDir + "/output.html");

+ 1 - 1
eclipse-plugin/src/starpu/handlers/TraceGenHandler.java

@@ -55,7 +55,7 @@ public class TraceGenHandler extends AbstractHandler {
 				if (!f.isFile())
 					throw new Exception("File <" + inputfilename + "> does not exist. Have you run your application?");
 
-				String[] command = {"starpu_fxt_tool", "-i", inputfilename, "-d", TraceUtils.getRandomDirectoryName()};
+				String[] command = {"starpu_fxt_tool", "-i", inputfilename, "-d", TraceUtils.getRandomDirectoryName(), "-c", "-no-acquire"};
 				TraceUtils.runCommand(command);
 			} catch (Exception e) {
 				TraceUtils.displayMessage("Error: " + e.toString());

+ 5 - 2
examples/Makefile.am

@@ -162,9 +162,11 @@ if !STARPU_USE_MPI_MASTER_SLAVE
 SHELL_TESTS			+=	scheduler/schedulers.sh
 SHELL_TESTS			+=	scheduler/schedulers_context.sh
 if !STARPU_NO_BLAS_LIB
+if STARPU_USE_FXT
 SHELL_TESTS			+=	mult/sgemm.sh
 endif
 endif
+endif
 
 check_PROGRAMS		=	$(STARPU_EXAMPLES)
 noinst_PROGRAMS		=
@@ -236,6 +238,7 @@ STARPU_EXAMPLES +=				\
 	cpp/add_vectors_interface		\
 	filters/fread				\
 	filters/fvector				\
+	filters/ftensor				\
 	filters/fblock				\
 	filters/fmatrix				\
 	filters/fmultiple_manual		\
@@ -243,7 +246,6 @@ STARPU_EXAMPLES +=				\
 	filters/fmultiple_submit_readonly	\
 	filters/fmultiple_submit_implicit	\
 	filters/frecursive			\
-	filters/fplan_notautomatic		\
 	tag_example/tag_example			\
 	tag_example/tag_example2		\
 	tag_example/tag_example3		\
@@ -626,7 +628,8 @@ endif
 examplebin_PROGRAMS +=				\
 	filters/shadow				\
 	filters/shadow2d			\
-	filters/shadow3d
+	filters/shadow3d			\
+	filters/shadow4d
 
 #############################
 # Custom multiformat filter #

+ 9 - 8
examples/filters/fblock.c

@@ -157,11 +157,8 @@ int main(void)
                 task->cl_arg_size = sizeof(multiplier);
 
                 ret = starpu_task_submit(task);
-                if (ret)
-		{
-                        FPRINTF(stderr, "Error when submitting task\n");
-                        exit(ret);
-                }
+                if (ret == -ENODEV) goto enodev;
+                STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
         }
 
         /* Unpartition the data, unregister it from StarPU and shutdown */
@@ -178,8 +175,12 @@ int main(void)
         FPRINTF(stderr, "OUT Block\n");
         print_block(block, NX, NY, NZ, NX, NX*NY);
 
-	free(block);
+        free(block);
+        
+        starpu_shutdown();
+        return 0;
 
-	starpu_shutdown();
-	return 0;
+enodev:
+        starpu_shutdown();
+        return 77;
 }

+ 7 - 1
examples/filters/fmultiple_manual.c

@@ -117,9 +117,15 @@ void empty(void *buffers[], void *cl_arg)
 
 struct starpu_codelet cl_switch =
 {
+#if 1
+	/* Check for the values */
 	.cpu_funcs = {empty},
+#else
+	/* For production code: we do not need to actually execute anything */
+	.where = STARPU_NOWHERE,
+#endif
 	.nbuffers = STARPU_VARIABLE_NBUFFERS,
-	.name = "switch"
+	.name = "switch",
 };
 
 int main(void)

+ 0 - 244
examples/filters/fplan_notautomatic.c

@@ -1,244 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2018-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include <starpu.h>
-
-#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
-
-#define NX    9
-#define PARTS 3
-
-struct starpu_codelet task_codelet;
-
-// CPU implementations
-void task_cpu(void *descr[], void *args)
-{
-	int *values = (int*)STARPU_VECTOR_GET_PTR(descr[0]);
-	int nx = STARPU_VECTOR_GET_NX(descr[0]);
-	int i, add;
-	char message[10000];
-	int cur = 0;
-
-	starpu_codelet_unpack_args(args, &add);
-
-	cur += snprintf(&message[cur], 10000-cur, "Values ");
-	for(i=0 ; i<nx ; i++)
-	{
-		values[i] += add;
-		cur += snprintf(&message[cur], 10000-cur, "%d ", values[i]);
-	}
-	FPRINTF(stderr, "%s\n", message);
-}
-
-void split_callback(void *arg)
-{
-	(void)arg;
-	struct starpu_task *task = starpu_task_get_current();
-	starpu_data_handle_t value_handle, sub_handles[PARTS];
-
-	starpu_codelet_unpack_args(task->cl_arg, &value_handle, &sub_handles);
-
-	FPRINTF(stderr, "[callback] Partition for handle %p into handles %p %p and %p\n", value_handle, sub_handles[0], sub_handles[1], sub_handles[2]);
-
-	starpu_data_partition_submit_sequential_consistency(value_handle, PARTS, sub_handles, 0);
-}
-
-void supertask_callback(void *arg)
-{
-	(void)arg;
-	starpu_data_handle_t sub_handles[PARTS];
-	int add;
-	struct starpu_task *task = starpu_task_get_current();
-
-	starpu_codelet_unpack_args(task->cl_arg, &sub_handles, &add);
-
-	FPRINTF(stderr, "Submitting tasks on %d subdata (add %d)\n", PARTS, add);
-
-	int i;
-	for(i=0 ; i<PARTS ; i++)
-	{
-		int ret = starpu_task_insert(&task_codelet,
-					     STARPU_RW, sub_handles[i],
-					     STARPU_VALUE, &add, sizeof(add),
-					     0);
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
-	}
-}
-
-void release(void *arg)
-{
-	struct starpu_task *task = (struct starpu_task *)arg;
-	starpu_task_end_dep_release(task);
-}
-
-void merge_callback(void *arg)
-{
-	(void)arg;
-	struct starpu_task *task = starpu_task_get_current();
-
-	starpu_data_handle_t value_handle, sub_handles[PARTS];
-	starpu_codelet_unpack_args(task->cl_arg, &value_handle, &sub_handles);
-
-	FPRINTF(stderr, "Unpartition for handle %p from handles %p %p and %p\n", value_handle, sub_handles[0], sub_handles[1], sub_handles[2]);
-
-	starpu_data_unpartition_submit_sequential_consistency_cb(value_handle, PARTS, sub_handles, STARPU_MAIN_RAM, 0, release, task);
-}
-
-// Codelets
-struct starpu_codelet task_codelet =
-{
-	.cpu_funcs = {task_cpu},
-	.nbuffers = 1,
-	.modes = {STARPU_RW},
-	.name = "task_codelet"
-};
-
-struct starpu_codelet supertask_codelet =
-{
-	.where= STARPU_NOWHERE,
-	.nbuffers = 1,
-	.modes = {STARPU_RW},
-	.name = "supertask_codelet"
-};
-
-struct starpu_codelet split_codelet =
-{
-	.where= STARPU_NOWHERE,
-	.nbuffers = 1,
-	.modes = {STARPU_RW},
-	.name = "split_codelet"
-};
-
-struct starpu_codelet merge_codelet =
-{
-	.where= STARPU_NOWHERE,
-	.nbuffers = 1,
-	.modes = {STARPU_RW},
-	.name = "merge_codelet"
-};
-
-int main(void)
-{
-	int ret, i;
-	int values[NX];
-	int check[NX];
-	int add1=1;
-	int add2=2;
-	starpu_data_handle_t value_handle;
-	starpu_data_handle_t sub_handles[PARTS];
-
-	ret = starpu_init(NULL);
-	if (ret == -ENODEV)
-		exit(77);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-
-	if (starpu_cpu_worker_get_count() == 0)
-	{
-		FPRINTF(stderr, "We need at least 1 CPU worker.\n");
-		starpu_shutdown();
-		return 77;
-	}
-
-	struct starpu_data_filter f =
-	{
-		.filter_func = starpu_vector_filter_block,
-		.nchildren = PARTS
-	};
-
-	values[NX-1] = 2;
-	for(i=NX-2 ; i>= 0 ; i--) values[i] = values[i+1] * 2;
-	for(i=0 ; i<NX ; i++) check[i] = values[i] + (2 * add1) + (2 * add2);
-
-	starpu_vector_data_register(&value_handle, STARPU_MAIN_RAM, (uintptr_t)&values[0], NX, sizeof(values[0]));
-	starpu_data_partition_plan(value_handle, &f, sub_handles);
-
-	// tell StarPU not to partition data, the application will decide itself when to do it
-	starpu_data_partition_not_automatic(value_handle);
-	for(i=0 ; i<PARTS ; i++)
-		starpu_data_partition_not_automatic(sub_handles[i]);
-
-	// insert a task on the whole data
-	ret = starpu_task_insert(&task_codelet, STARPU_RW, value_handle,
-				 STARPU_VALUE, &add1, sizeof(add1),
-				 STARPU_NAME, "task_1", 0);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
-
-	// insert a task to split the data
-	ret = starpu_task_insert(&split_codelet, STARPU_RW, value_handle,
-				 STARPU_VALUE, &value_handle, sizeof(starpu_data_handle_t),
-				 STARPU_VALUE, sub_handles, PARTS*sizeof(starpu_data_handle_t),
-				 STARPU_NAME, "split",
-				 STARPU_PROLOGUE_CALLBACK, split_callback,
-				 0);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
-
-	// insert a task that will work on the subdata
-	ret = starpu_task_insert(&supertask_codelet, STARPU_RW, value_handle,
-				 STARPU_VALUE, sub_handles, PARTS*sizeof(starpu_data_handle_t),
-				 STARPU_VALUE, &add1, sizeof(add1),
-				 STARPU_NAME, "supertask_1",
-				 STARPU_PROLOGUE_CALLBACK, supertask_callback,
-				 0);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
-
-	// insert another task that will work on the subdata
-	ret = starpu_task_insert(&supertask_codelet, STARPU_RW, value_handle,
-				 STARPU_VALUE, sub_handles, PARTS*sizeof(starpu_data_handle_t),
-				 STARPU_VALUE, &add2, sizeof(add2),
-				 STARPU_NAME, "supertask_2",
-				 STARPU_PROLOGUE_CALLBACK, supertask_callback,
-				 0);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
-
-	// insert a task to merge the data
-	ret = starpu_task_insert(&merge_codelet, STARPU_RW, value_handle,
-				 STARPU_VALUE, &value_handle, sizeof(starpu_data_handle_t),
-				 STARPU_VALUE, sub_handles, PARTS*sizeof(starpu_data_handle_t),
-				 STARPU_NAME, "merge",
-				 STARPU_PROLOGUE_CALLBACK, merge_callback,
-				 STARPU_TASK_END_DEP, 1,
-				 0);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
-
-	// insert a task that will work on the whole data
-	ret = starpu_task_insert(&task_codelet, STARPU_RW, value_handle,
-				 STARPU_VALUE, &add2, sizeof(add2),
-				 STARPU_NAME, "task_2", 0);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
-
-	starpu_task_wait_for_all();
-	starpu_data_partition_clean(value_handle, PARTS, sub_handles);
-	starpu_data_unregister(value_handle);
-
-	FPRINTF(stderr, "Values : ");
-	for(i=0 ; i<NX ; i++)
-	{
-		FPRINTF(stderr, "%d ", values[i]);
-	}
-	FPRINTF(stderr, "\n");
-	for(i=0 ; i<NX ; i++)
-	{
-		if (values[i] != check[i])
-		{
-			FPRINTF(stderr, "Incorrect value for %d. %d != %d\n", i, values[i], check[i]);
-			ret = 1;
-		}
-	}
-
-	starpu_shutdown();
-
-	return ret;
-}

+ 1 - 1
examples/filters/fread.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 4 - 0
examples/filters/frecursive.c

@@ -114,6 +114,7 @@ int main(void)
 	ret = starpu_task_insert(&cl,
 				 STARPU_RW, subhandle_l1[0],
 				 STARPU_VALUE, &factor, sizeof(factor),
+				 STARPU_NAME, "sub-matrix1",
 				 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 	for (i=1; i<PARTS; i++)
@@ -121,6 +122,8 @@ int main(void)
 		ret = starpu_task_insert(&cl,
 					 STARPU_RW, subhandle_l2[i][0],
 					 STARPU_VALUE, &factor, sizeof(factor),
+					 STARPU_NAME, "sub-matrix2",
+
 					 0);
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 		for (j=1; j<PARTS; j++)
@@ -130,6 +133,7 @@ int main(void)
 				ret = starpu_task_insert(&cl,
 							 STARPU_RW, subhandle_l3[i][j][k],
 							 STARPU_VALUE, &factor, sizeof(factor),
+							 STARPU_NAME, "sub-matrix3",
 							 0);
 				STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 			}

+ 191 - 0
examples/filters/ftensor.c

@@ -0,0 +1,191 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This exemplifies how to use partitioning filters.  Here we just split a 4D
+ * matrix into 4D slices (along the X axis), and run a dumb kernel on them.
+ */
+
+#include <starpu.h>
+
+#define NX    6
+#define NY    5
+#define NZ    4
+#define NT    3
+#define PARTS 2
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+/* Codelet kernel: store the integer pointed to by cl_arg into every element of the tensor in buffers[0] */
+void cpu_func(void *buffers[], void *cl_arg)
+{
+    const int *factor = (int *) cl_arg;
+    int *data = (int *)STARPU_TENSOR_GET_PTR(buffers[0]);
+    int nx = (int)STARPU_TENSOR_GET_NX(buffers[0]);
+    int ny = (int)STARPU_TENSOR_GET_NY(buffers[0]);
+    int nz = (int)STARPU_TENSOR_GET_NZ(buffers[0]);
+    int nt = (int)STARPU_TENSOR_GET_NT(buffers[0]);
+    unsigned ldy = STARPU_TENSOR_GET_LDY(buffers[0]);
+    unsigned ldz = STARPU_TENSOR_GET_LDZ(buffers[0]);
+    unsigned ldt = STARPU_TENSOR_GET_LDT(buffers[0]);
+    int x, y, z, t;
+
+    /* Visit the elements with T as the outermost loop and X as the innermost one;
+     * ldy, ldz and ldt are used as element offsets between consecutive Y, Z and T indices */
+    for (t = 0; t < nt; t++)
+        for (z = 0; z < nz; z++)
+            for (y = 0; y < ny; y++)
+            {
+                /* first element of the current X row */
+                int *row = &data[(t*ldt)+(z*ldz)+(y*ldy)];
+                for (x = 0; x < nx; x++)
+                    row[x] = *factor;
+            }
+}
+
+/* Dump a tensor on stderr: one line per X row, plus an extra newline after each Z slice, each T slice and the whole tensor */
+void print_tensor(int *tensor, int nx, int ny, int nz, int nt, unsigned ldy, unsigned ldz, unsigned ldt)
+{
+        int x, y, z, t;
+        /* %p requires a void pointer argument, hence the cast */
+        FPRINTF(stderr, "tensor=%p nx=%d ny=%d nz=%d nt=%d ldy=%u ldz=%u ldt=%u\n", (void *) tensor, nx, ny, nz, nt, ldy, ldz, ldt);
+        for(t=0 ; t<nt ; t++)
+        {
+            for(z=0 ; z<nz ; z++)
+            {
+                for(y=0 ; y<ny ; y++)
+                {
+                    for(x=0 ; x<nx ; x++)
+                        FPRINTF(stderr, "%2d ", tensor[(t*ldt)+(z*ldz)+(y*ldy)+x]);
+                    FPRINTF(stderr,"\n");
+                }
+                FPRINTF(stderr,"\n");
+            }
+            FPRINTF(stderr,"\n");
+        }
+        FPRINTF(stderr,"\n");
+}
+
+/* Print the tensor behind a data handle: fetch the pointer, the four dimensions and
+ * the three leading dimensions through the starpu_tensor_get_*() accessors and hand them to print_tensor() */
+void print_data(starpu_data_handle_t tensor_handle)
+{
+    print_tensor((int *)starpu_tensor_get_local_ptr(tensor_handle),
+                 starpu_tensor_get_nx(tensor_handle),
+                 starpu_tensor_get_ny(tensor_handle),
+                 starpu_tensor_get_nz(tensor_handle),
+                 starpu_tensor_get_nt(tensor_handle),
+                 starpu_tensor_get_local_ldy(tensor_handle),
+                 starpu_tensor_get_local_ldz(tensor_handle),
+                 starpu_tensor_get_local_ldt(tensor_handle));
+}
+
+int main(void)
+{
+    int *tensor,n=0;
+    int i, j, k, l;
+    int ret;
+
+    starpu_data_handle_t handle;
+    struct starpu_codelet cl =
+    {
+        .cpu_funcs = {cpu_func},
+        .cpu_funcs_name = {"cpu_func"},
+        .nbuffers = 1,
+        .modes = {STARPU_RW},
+        .name = "tensor_scal"
+    };
+
+    ret = starpu_init(NULL);
+    if (ret == -ENODEV)
+        return 77;
+    STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+    /* Allocate only once StarPU is up, so that bailing out above leaks nothing; fill with consecutive integers */
+    tensor = (int*)malloc(NX*NY*NZ*NT*sizeof(tensor[0]));
+    assert(tensor);
+    for(l=0 ; l<NT ; l++)
+    {
+        for(k=0 ; k<NZ ; k++)
+        {
+            for(j=0 ; j<NY ; j++)
+            {
+                for(i=0 ; i<NX ; i++)
+                {
+                    tensor[(l*NX*NY*NZ)+(k*NX*NY)+(j*NX)+i] = n++;
+                }
+            }
+        }
+    }
+
+    /* Declare data to StarPU */
+    starpu_tensor_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)tensor, NX, NX*NY, NX*NY*NZ, NX, NY, NZ, NT, sizeof(int));
+    FPRINTF(stderr, "IN  Tensor\n");
+    print_data(handle);
+
+    /* Partition the tensor in PARTS sub-tensors */
+    struct starpu_data_filter f =
+    {
+        .filter_func = starpu_tensor_filter_block,
+        .nchildren = PARTS
+    };
+    starpu_data_partition(handle, &f);
+
+    FPRINTF(stderr,"Nb of partitions : %d\n",starpu_data_get_nb_children(handle));
+
+    for(i=0 ; i<starpu_data_get_nb_children(handle) ; i++)
+    {
+        starpu_data_handle_t stensor = starpu_data_get_sub_data(handle, 1, i);
+        FPRINTF(stderr, "Sub tensor %d\n", i);
+        print_data(stensor);
+    }
+
+    /* Submit a task on each sub-tensor */
+    for(i=0 ; i<starpu_data_get_nb_children(handle) ; i++)
+    {
+        int multiplier=i;
+        struct starpu_task *task = starpu_task_create();
+
+        FPRINTF(stderr,"Dealing with sub-tensor %d\n", i);
+        task->cl = &cl;
+        task->synchronous = 1;
+        task->callback_func = NULL;
+        task->handles[0] = starpu_data_get_sub_data(handle, 1, i);
+        task->cl_arg = &multiplier;
+        task->cl_arg_size = sizeof(multiplier);
+
+        ret = starpu_task_submit(task);
+        if (ret == -ENODEV) goto enodev;
+        STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+    }
+
+    /* Unpartition the data, unregister it from StarPU and shutdown */
+    starpu_data_unpartition(handle, STARPU_MAIN_RAM);
+    print_data(handle);
+    starpu_data_unregister(handle);
+
+    /* Print result tensor */
+    FPRINTF(stderr, "OUT Tensor\n");
+    print_tensor(tensor, NX, NY, NZ, NT, NX, NX*NY, NX*NY*NZ);
+
+    free(tensor);
+    starpu_shutdown();
+    return 0;
+    /* starpu_task_submit() returned -ENODEV: the handle is still registered, so shut down first, then free the buffer */
+enodev:
+    starpu_shutdown();
+    free(tensor);
+    return 77;
+}

+ 497 - 0
examples/filters/shadow4d.c

@@ -0,0 +1,497 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2010       Mehdi Juhoor
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This exemplifies the use of the 4D matrix shadow filters: a source "matrix" of
+ * NX*NY*NZ*NT elements (plus SHADOW wrap-around elements) is partitioned into
+ * matrices with some shadowing, and these are copied into a destination
+ * "matrix2" of
+ * PARTSX*PARTSY*PARTSZ*PARTST*((NX/PARTSX+2*SHADOWX)*(NY/PARTSY+2*SHADOWY)*(NZ/PARTSZ+2*SHADOWZ)*(NT/PARTST+2*SHADOWT))
+ * elements, partitioned in the traditional way, thus showing how shadowing
+ * shows up.
+ */
+
+#include <starpu.h>
+
+/* Shadow width */
+#define SHADOWX 2
+#define SHADOWY 2
+#define SHADOWZ 1
+#define SHADOWT 1
+#define NX    6
+#define NY    6
+#define NZ    2
+#define NT    2
+#define PARTSX 2
+#define PARTSY 2
+#define PARTSZ 2
+#define PARTST 2
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+void cpu_func(void *buffers[], void *cl_arg) /* CPU codelet: copy tensor buffers[0] into tensor buffers[1], element by element */
+{
+    (void)cl_arg; /* unused */
+        /* leading dimensions (ldy, ldz, ldt) and sizes (x, y, z, t) of the shadowed source tensor */
+        unsigned ldy = STARPU_TENSOR_GET_LDY(buffers[0]);
+        unsigned ldz = STARPU_TENSOR_GET_LDZ(buffers[0]);
+        unsigned ldt = STARPU_TENSOR_GET_LDT(buffers[0]);
+        unsigned x = STARPU_TENSOR_GET_NX(buffers[0]);
+        unsigned y = STARPU_TENSOR_GET_NY(buffers[0]);
+        unsigned z = STARPU_TENSOR_GET_NZ(buffers[0]);
+        unsigned t = STARPU_TENSOR_GET_NT(buffers[0]);
+        /* data pointer of the shadowed source tensor, accessed as int elements */
+        int *val = (int *)STARPU_TENSOR_GET_PTR(buffers[0]);
+
+        /* leading dimensions (ldy2, ldz2, ldt2) and sizes (x2, y2, z2, t2) of the destination tensor */
+        unsigned ldy2 = STARPU_TENSOR_GET_LDY(buffers[1]);
+        unsigned ldz2 = STARPU_TENSOR_GET_LDZ(buffers[1]);
+        unsigned ldt2 = STARPU_TENSOR_GET_LDT(buffers[1]);
+        unsigned x2 = STARPU_TENSOR_GET_NX(buffers[1]);
+        unsigned y2 = STARPU_TENSOR_GET_NY(buffers[1]);
+        unsigned z2 = STARPU_TENSOR_GET_NZ(buffers[1]);
+        unsigned t2 = STARPU_TENSOR_GET_NT(buffers[1]);
+        /* data pointer of the destination tensor, accessed as int elements */
+        int *val2 = (int *)STARPU_TENSOR_GET_PTR(buffers[1]);
+
+    unsigned i, j, k, l;
+
+    /* Source and destination must have the same extent in every dimension */
+    STARPU_ASSERT(x == x2);
+    STARPU_ASSERT(y == y2);
+    STARPU_ASSERT(z == z2);
+    STARPU_ASSERT(t == t2);
+    for (l = 0; l < t; l++) /* element-wise copy; each side is indexed with its own leading dimensions */
+        for (k = 0; k < z; k++)
+            for (j = 0; j < y; j++)
+                for (i = 0; i < x; i++)
+                    val2[l*ldt2+k*ldz2+j*ldy2+i] = val[l*ldt+k*ldz+j*ldy+i];
+}
+
+#ifdef STARPU_USE_CUDA
+void cuda_func(void *buffers[], void *cl_arg) /* CUDA codelet: copy tensor buffers[0] into tensor buffers[1] with one 2D copy per (k, l) plane */
+{
+    (void)cl_arg; /* unused */
+        /* leading dimensions (ldy, ldz, ldt) and sizes (x, y, z, t) of the shadowed source tensor */
+        unsigned ldy = STARPU_TENSOR_GET_LDY(buffers[0]);
+        unsigned ldz = STARPU_TENSOR_GET_LDZ(buffers[0]);
+        unsigned ldt = STARPU_TENSOR_GET_LDT(buffers[0]);
+        unsigned x = STARPU_TENSOR_GET_NX(buffers[0]);
+        unsigned y = STARPU_TENSOR_GET_NY(buffers[0]);
+        unsigned z = STARPU_TENSOR_GET_NZ(buffers[0]);
+        unsigned t = STARPU_TENSOR_GET_NT(buffers[0]);
+        /* data pointer of the shadowed source tensor, accessed as int elements */
+        int *val = (int *)STARPU_TENSOR_GET_PTR(buffers[0]);
+
+        /* leading dimensions (ldy2, ldz2, ldt2) and sizes (x2, y2, z2, t2) of the destination tensor */
+        unsigned ldy2 = STARPU_TENSOR_GET_LDY(buffers[1]);
+        unsigned ldz2 = STARPU_TENSOR_GET_LDZ(buffers[1]);
+        unsigned ldt2 = STARPU_TENSOR_GET_LDT(buffers[1]);
+        unsigned x2 = STARPU_TENSOR_GET_NX(buffers[1]);
+        unsigned y2 = STARPU_TENSOR_GET_NY(buffers[1]);
+        unsigned z2 = STARPU_TENSOR_GET_NZ(buffers[1]);
+        unsigned t2 = STARPU_TENSOR_GET_NT(buffers[1]);
+        /* data pointer of the destination tensor, accessed as int elements */
+        int *val2 = (int *)STARPU_TENSOR_GET_PTR(buffers[1]);
+
+    unsigned k, l;
+    cudaError_t cures;
+
+    /* Source and destination must have the same extent in every dimension */
+    STARPU_ASSERT(x == x2);
+    STARPU_ASSERT(y == y2);
+    STARPU_ASSERT(z == z2);
+    STARPU_ASSERT(t == t2);
+    for (l = 0; l < t; l++)
+    {
+        for (k = 0; k < z; k++)
+        {
+            cures = cudaMemcpy2DAsync(val2+k*ldz2+l*ldt2, ldy2*sizeof(*val2), val+k*ldz+l*ldt, ldy*sizeof(*val), /* dst, dst pitch in bytes, src, src pitch in bytes */
+                    x*sizeof(*val), y, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream()); /* row width in bytes, number of rows */
+            STARPU_ASSERT(!cures);
+        }
+    }
+        /* the copies are only queued on the stream; this function does not wait for them */
+}
+#endif
+
+int main(void) /* Fill a shadowed 4D source, copy it block by block into matrix2 through StarPU tasks, then check the result */
+{
+    unsigned i, j, k, l, m, n, p, q;
+    int matrix[NT + 2*SHADOWT][NZ + 2*SHADOWZ][NY + 2*SHADOWY][NX + 2*SHADOWX];
+    int matrix2[NT + PARTST*2*SHADOWT][NZ + PARTSZ*2*SHADOWZ][NY + PARTSY*2*SHADOWY][NX + PARTSX*2*SHADOWX]; /* each of the PARTS* blocks per dimension carries its own two shadow borders */
+    starpu_data_handle_t handle, handle2;
+    int ret;
+
+    struct starpu_codelet cl =
+    {
+        .cpu_funcs = {cpu_func},
+        .cpu_funcs_name = {"cpu_func"},
+#ifdef STARPU_USE_CUDA
+        .cuda_funcs = {cuda_func},
+        .cuda_flags = {STARPU_CUDA_ASYNC},
+#endif
+        .nbuffers = 2,
+        .modes = {STARPU_R, STARPU_W}
+    };
+
+    memset(matrix, -1, sizeof(matrix)); /* every cell starts as -1; the loops below overwrite interior and shadows */
+    for(l=1 ; l<=NT ; l++)
+        for(k=1 ; k<=NZ ; k++)
+            for(j=1 ; j<=NY ; j++)
+                for(i=1 ; i<=NX ; i++)
+                    matrix[SHADOWT+l-1][SHADOWZ+k-1][SHADOWY+j-1][SHADOWX+i-1] = i+j+k+l;
+
+    /* Wrap-around fill of the shadow along one dimension at a time: X, then Y, Z, T */
+    for (l = SHADOWT ; l<SHADOWT+NT ; l++)
+        for (k = SHADOWZ ; k<SHADOWZ+NZ ; k++)
+            for (j = SHADOWY ; j<SHADOWY+NY ; j++)
+                for(i=0 ; i<SHADOWX ; i++)
+                {
+                    matrix[l][k][j][i] = matrix[l][k][j][i+NX];
+                    matrix[l][k][j][SHADOWX+NX+i] = matrix[l][k][j][SHADOWX+i];
+                }
+    for (l = SHADOWT ; l<SHADOWT+NT ; l++)
+        for(k=SHADOWZ ; k<SHADOWZ+NZ ; k++)
+            for(j=0 ; j<SHADOWY ; j++)
+                for(i=SHADOWX ; i<SHADOWX+NX ; i++)
+                {
+                    matrix[l][k][j][i] = matrix[l][k][j+NY][i];
+                    matrix[l][k][SHADOWY+NY+j][i] = matrix[l][k][SHADOWY+j][i];
+                }
+    for (l = SHADOWT ; l<SHADOWT+NT ; l++)
+        for(k=0 ; k<SHADOWZ ; k++)
+            for(j=SHADOWY ; j<SHADOWY+NY ; j++)
+                for(i=SHADOWX ; i<SHADOWX+NX ; i++)
+                {
+                    matrix[l][k][j][i] = matrix[l][k+NZ][j][i];
+                    matrix[l][SHADOWZ+NZ+k][j][i] = matrix[l][SHADOWZ+k][j][i];
+                }
+    for (l = 0 ; l<SHADOWT ; l++)
+        for(k=SHADOWZ ; k<SHADOWZ+NZ ; k++)
+            for(j=SHADOWY ; j<SHADOWY+NY ; j++)
+                for(i=SHADOWX ; i<SHADOWX+NX ; i++)
+                {
+                    matrix[l][k][j][i] = matrix[l+NT][k][j][i];
+                    matrix[SHADOWT+NT+l][k][j][i] = matrix[SHADOWT+l][k][j][i];
+                }
+
+    /* Wrap-around fill of the shadow along pairs of dimensions: XY, XZ, YZ, XT, YT, ZT */
+    for (l = SHADOWT ; l<SHADOWT+NT ; l++)
+        for (k = SHADOWZ ; k<SHADOWZ+NZ ; k++)
+            for(j=0 ; j<SHADOWY ; j++)
+                for(i=0 ; i<SHADOWX ; i++)
+                {
+                    matrix[l][k][j][i] = matrix[l][k][j+NY][i+NX];
+                    matrix[l][k][j][SHADOWX+NX+i] = matrix[l][k][j+NY][SHADOWX+i];
+                    matrix[l][k][SHADOWY+NY+j][i] = matrix[l][k][SHADOWY+j][i+NX];
+                    matrix[l][k][SHADOWY+NY+j][SHADOWX+NX+i] = matrix[l][k][SHADOWY+j][SHADOWX+i];
+                }
+
+    for (l = SHADOWT ; l<SHADOWT+NT ; l++)
+        for (k=0 ; k<SHADOWZ ; k++)
+            for(j = SHADOWY ; j<SHADOWY+NY ; j++)
+                for(i=0 ; i<SHADOWX ; i++)
+                {
+                    matrix[l][k][j][i] = matrix[l][k+NZ][j][i+NX];
+                    matrix[l][k][j][SHADOWX+NX+i] = matrix[l][k+NZ][j][SHADOWX+i];
+                    matrix[l][SHADOWZ+NZ+k][j][i] = matrix[l][SHADOWZ+k][j][i+NX];
+                    matrix[l][SHADOWZ+NZ+k][j][SHADOWX+NX+i] = matrix[l][SHADOWZ+k][j][SHADOWX+i];
+                }
+
+    for (l = SHADOWT ; l<SHADOWT+NT ; l++)
+        for (k=0 ; k<SHADOWZ ; k++)
+            for(j=0 ; j<SHADOWY ; j++)
+                for(i=SHADOWX ; i<SHADOWX+NX ; i++)
+                {
+                    matrix[l][k][j][i] = matrix[l][k+NZ][j+NY][i];
+                    matrix[l][k][SHADOWY+NY+j][i] = matrix[l][k+NZ][SHADOWY+j][i];
+                    matrix[l][SHADOWZ+NZ+k][j][i] = matrix[l][SHADOWZ+k][j+NY][i];
+                    matrix[l][SHADOWZ+NZ+k][SHADOWY+NY+j][i] = matrix[l][SHADOWZ+k][SHADOWY+j][i];
+                }
+
+    for (l=0 ; l<SHADOWT ; l++)
+        for (k = SHADOWZ ; k<SHADOWZ+NZ ; k++)
+            for(j = SHADOWY ; j<SHADOWY+NY ; j++)
+                for(i=0 ; i<SHADOWX ; i++)
+                {
+                    matrix[l][k][j][i] = matrix[l+NT][k][j][i+NX];
+                    matrix[l][k][j][SHADOWX+NX+i] = matrix[l+NT][k][j][SHADOWX+i];
+                    matrix[SHADOWT+NT+l][k][j][i] = matrix[SHADOWT+l][k][j][i+NX];
+                    matrix[SHADOWT+NT+l][k][j][SHADOWX+NX+i] = matrix[SHADOWT+l][k][j][SHADOWX+i];
+                }
+
+    for (l=0 ; l<SHADOWT ; l++)
+        for (k = SHADOWZ ; k<SHADOWZ+NZ ; k++)
+            for(j=0 ; j<SHADOWY ; j++)
+                for(i=SHADOWX ; i<SHADOWX+NX ; i++)
+                {
+                    matrix[l][k][j][i] = matrix[l+NT][k][j+NY][i];
+                    matrix[l][k][SHADOWY+NY+j][i] = matrix[l+NT][k][SHADOWY+j][i];
+                    matrix[SHADOWT+NT+l][k][j][i] = matrix[SHADOWT+l][k][j+NY][i];
+                    matrix[SHADOWT+NT+l][k][SHADOWY+NY+j][i] = matrix[SHADOWT+l][k][SHADOWY+j][i];
+                }
+
+    for (l=0 ; l<SHADOWT ; l++)
+        for(k=0 ; k<SHADOWZ ; k++)
+            for (j = SHADOWY ; j<SHADOWY+NY ; j++)
+                for(i=SHADOWX ; i<SHADOWX+NX ; i++)
+                {
+                    matrix[l][k][j][i] = matrix[l+NT][k+NZ][j][i];
+                    matrix[l][SHADOWZ+NZ+k][j][i] = matrix[l+NT][SHADOWZ+k][j][i];
+                    matrix[SHADOWT+NT+l][k][j][i] = matrix[SHADOWT+l][k+NZ][j][i];
+                    matrix[SHADOWT+NT+l][SHADOWZ+NZ+k][j][i] = matrix[SHADOWT+l][SHADOWZ+k][j][i];
+                }
+
+    /* Wrap-around fill of the shadow along triples of dimensions: XYZ, XYT, XZT, YZT */
+    for (l = SHADOWT ; l<SHADOWT+NT ; l++)
+        for (k=0 ; k<SHADOWZ ; k++)
+            for(j=0 ; j<SHADOWY ; j++)
+                for(i=0 ; i<SHADOWX ; i++)
+                {
+                    matrix[l][k][j][i] = matrix[l][k+NZ][j+NY][i+NX];
+                    matrix[l][k][j][SHADOWX+NX+i] = matrix[l][k+NZ][j+NY][SHADOWX+i];
+                    matrix[l][k][SHADOWY+NY+j][i] = matrix[l][k+NZ][SHADOWY+j][i+NX];
+                    matrix[l][k][SHADOWY+NY+j][SHADOWX+NX+i] = matrix[l][k+NZ][SHADOWY+j][SHADOWX+i];
+                    matrix[l][SHADOWZ+NZ+k][j][i] = matrix[l][SHADOWZ+k][j+NY][i+NX];
+                    matrix[l][SHADOWZ+NZ+k][j][SHADOWX+NX+i] = matrix[l][SHADOWZ+k][j+NY][SHADOWX+i];
+                    matrix[l][SHADOWZ+NZ+k][SHADOWY+NY+j][i] = matrix[l][SHADOWZ+k][SHADOWY+j][i+NX];
+                    matrix[l][SHADOWZ+NZ+k][SHADOWY+NY+j][SHADOWX+NX+i] = matrix[l][SHADOWZ+k][SHADOWY+j][SHADOWX+i];
+                }
+    for (l=0 ; l<SHADOWT ; l++)
+        for (k = SHADOWZ ; k<SHADOWZ+NZ ; k++)
+            for(j=0 ; j<SHADOWY ; j++)
+                for(i=0 ; i<SHADOWX ; i++)
+                {
+                    matrix[l][k][j][i] = matrix[l+NT][k][j+NY][i+NX];
+                    matrix[l][k][j][SHADOWX+NX+i] = matrix[l+NT][k][j+NY][SHADOWX+i];
+                    matrix[l][k][SHADOWY+NY+j][i] = matrix[l+NT][k][SHADOWY+j][i+NX];
+                    matrix[SHADOWT+NT+l][k][j][i] = matrix[SHADOWT+l][k][j+NY][i+NX];
+                    matrix[l][k][SHADOWY+NY+j][SHADOWX+NX+i] = matrix[l+NT][k][SHADOWY+j][SHADOWX+i];
+                    matrix[SHADOWT+NT+l][k][j][SHADOWX+NX+i] = matrix[SHADOWT+l][k][j+NY][SHADOWX+i];
+                    matrix[SHADOWT+NT+l][k][SHADOWY+NY+j][i] = matrix[SHADOWT+l][k][SHADOWY+j][i+NX];
+                    matrix[SHADOWT+NT+l][k][SHADOWY+NY+j][SHADOWX+NX+i] = matrix[SHADOWT+l][k][SHADOWY+j][SHADOWX+i];
+                }
+    for (l=0 ; l<SHADOWT ; l++)
+        for(k=0 ; k<SHADOWZ ; k++)
+            for (j = SHADOWY ; j<SHADOWY+NY ; j++)
+                for(i=0 ; i<SHADOWX ; i++)
+                {
+                    matrix[l][k][j][i] = matrix[l+NT][k+NZ][j][i+NX];
+                    matrix[l][k][j][SHADOWX+NX+i] = matrix[l+NT][k+NZ][j][SHADOWX+i];
+                    matrix[l][SHADOWZ+NZ+k][j][i] = matrix[l+NT][SHADOWZ+k][j][i+NX];
+                    matrix[SHADOWT+NT+l][k][j][i] = matrix[SHADOWT+l][k+NZ][j][i+NX];
+                    matrix[l][SHADOWZ+NZ+k][j][SHADOWX+NX+i] = matrix[l+NT][SHADOWZ+k][j][SHADOWX+i];
+                    matrix[SHADOWT+NT+l][k][j][SHADOWX+NX+i] = matrix[SHADOWT+l][k+NZ][j][SHADOWX+i];
+                    matrix[SHADOWT+NT+l][SHADOWZ+NZ+k][j][i] = matrix[SHADOWT+l][SHADOWZ+k][j][i+NX];
+                    matrix[SHADOWT+NT+l][SHADOWZ+NZ+k][j][SHADOWX+NX+i] = matrix[SHADOWT+l][SHADOWZ+k][j][SHADOWX+i];
+                }
+    for (l=0 ; l<SHADOWT ; l++)
+        for(k=0 ; k<SHADOWZ ; k++)
+            for(j=0 ; j<SHADOWY ; j++)
+                for(i=SHADOWX ; i<SHADOWX+NX ; i++)
+                {
+                    matrix[l][k][j][i] = matrix[l+NT][k+NZ][j+NY][i];
+                    matrix[l][k][SHADOWY+NY+j][i] = matrix[l+NT][k+NZ][SHADOWY+j][i];
+                    matrix[l][SHADOWZ+NZ+k][j][i] = matrix[l+NT][SHADOWZ+k][j+NY][i];
+                    matrix[SHADOWT+NT+l][k][j][i] = matrix[SHADOWT+l][k+NZ][j+NY][i];
+                    matrix[l][SHADOWZ+NZ+k][SHADOWY+NY+j][i] = matrix[l+NT][SHADOWZ+k][SHADOWY+j][i];
+                    matrix[SHADOWT+NT+l][k][SHADOWY+NY+j][i] = matrix[SHADOWT+l][k+NZ][SHADOWY+j][i];
+                    matrix[SHADOWT+NT+l][SHADOWZ+NZ+k][j][i] = matrix[SHADOWT+l][SHADOWZ+k][j+NY][i];
+                    matrix[SHADOWT+NT+l][SHADOWZ+NZ+k][SHADOWY+NY+j][i] = matrix[SHADOWT+l][SHADOWZ+k][SHADOWY+j][i];
+                }
+
+    /* Wrap-around fill of the 16 corner regions, shadowed in all four dimensions at once */
+    for(l=0 ; l<SHADOWT ; l++)
+        for(k=0 ; k<SHADOWZ ; k++)
+            for(j=0 ; j<SHADOWY ; j++)
+                for(i=0 ; i<SHADOWX ; i++)
+                {
+                    matrix[l][k][j][i] = matrix[l+NT][k+NZ][j+NY][i+NX];
+                    matrix[l][k][j][SHADOWX+NX+i] = matrix[l+NT][k+NZ][j+NY][SHADOWX+i];
+                    matrix[l][k][SHADOWY+NY+j][i] = matrix[l+NT][k+NZ][SHADOWY+j][i+NX];
+                    matrix[l][k][SHADOWY+NY+j][SHADOWX+NX+i] = matrix[l+NT][k+NZ][SHADOWY+j][SHADOWX+i];
+                    matrix[l][SHADOWZ+NZ+k][j][i] = matrix[l+NT][SHADOWZ+k][j+NY][i+NX];
+                    matrix[l][SHADOWZ+NZ+k][j][SHADOWX+NX+i] = matrix[l+NT][SHADOWZ+k][j+NY][SHADOWX+i];
+                    matrix[l][SHADOWZ+NZ+k][SHADOWY+NY+j][i] = matrix[l+NT][SHADOWZ+k][SHADOWY+j][i+NX];
+                    matrix[l][SHADOWZ+NZ+k][SHADOWY+NY+j][SHADOWX+NX+i] = matrix[l+NT][SHADOWZ+k][SHADOWY+j][SHADOWX+i];
+                    matrix[SHADOWT+NT+l][k][j][i] = matrix[SHADOWT+l][k+NZ][j+NY][i+NX];
+                    matrix[SHADOWT+NT+l][k][j][SHADOWX+NX+i] = matrix[SHADOWT+l][k+NZ][j+NY][SHADOWX+i];
+                    matrix[SHADOWT+NT+l][k][SHADOWY+NY+j][i] = matrix[SHADOWT+l][k+NZ][SHADOWY+j][i+NX];
+                    matrix[SHADOWT+NT+l][SHADOWZ+NZ+k][j][i] = matrix[SHADOWT+l][SHADOWZ+k][j+NY][i+NX];
+                    matrix[SHADOWT+NT+l][k][SHADOWY+NY+j][SHADOWX+NX+i] = matrix[SHADOWT+l][k+NZ][SHADOWY+j][SHADOWX+i];
+                    matrix[SHADOWT+NT+l][SHADOWZ+NZ+k][j][SHADOWX+NX+i] = matrix[SHADOWT+l][SHADOWZ+k][j+NY][SHADOWX+i];
+                    matrix[SHADOWT+NT+l][SHADOWZ+NZ+k][SHADOWY+NY+j][i] = matrix[SHADOWT+l][SHADOWZ+k][SHADOWY+j][i+NX];
+                    matrix[SHADOWT+NT+l][SHADOWZ+NZ+k][SHADOWY+NY+j][SHADOWX+NX+i] = matrix[SHADOWT+l][SHADOWZ+k][SHADOWY+j][SHADOWX+i];
+                }
+
+    FPRINTF(stderr,"IN  Matrix:\n");
+    for(l=0 ; l<NT + 2*SHADOWT ; l++)
+    {
+        for(k=0 ; k<NZ + 2*SHADOWZ ; k++)
+        {
+            for(j=0 ; j<NY + 2*SHADOWY ; j++)
+            {
+                for(i=0 ; i<NX + 2*SHADOWX ; i++)
+                    FPRINTF(stderr, "%5d ", matrix[l][k][j][i]);
+                FPRINTF(stderr,"\n");
+            }
+            FPRINTF(stderr,"\n\n");
+        }
+        FPRINTF(stderr,"\n\n");
+    }
+    FPRINTF(stderr,"\n");
+
+    ret = starpu_init(NULL);
+    if (ret == -ENODEV)
+        exit(77);
+    STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+    /* Declare source matrix to StarPU (leading dimensions ldy, ldz, ldt, then sizes nx, ny, nz, nt, then element size) */
+    starpu_tensor_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)matrix,
+            NX + 2*SHADOWX, (NX + 2*SHADOWX) * (NY + 2*SHADOWY), (NX + 2*SHADOWX) * (NY + 2*SHADOWY) * (NZ + 2*SHADOWZ),
+            NX + 2*SHADOWX, NY + 2*SHADOWY, NZ + 2*SHADOWZ, NT + 2*SHADOWT,
+            sizeof(matrix[0][0][0][0]));
+
+    /* Declare destination matrix to StarPU, with the same argument layout */
+    starpu_tensor_data_register(&handle2, STARPU_MAIN_RAM, (uintptr_t)matrix2,
+            NX + PARTSX*2*SHADOWX, (NX + PARTSX*2*SHADOWX) * (NY + PARTSY*2*SHADOWY), (NX + PARTSX*2*SHADOWX) * (NY + PARTSY*2*SHADOWY) * (NZ + PARTSZ*2*SHADOWZ),
+            NX + PARTSX*2*SHADOWX, NY + PARTSY*2*SHADOWY, NZ + PARTSZ*2*SHADOWZ, NT + PARTST*2*SHADOWT,
+            sizeof(matrix2[0][0][0][0]));
+
+    /* Partition the source matrix in PARTST*PARTSZ*PARTSY*PARTSX sub-matrices with shadows */
+    /* NOTE: the resulting handles should only be used in read-only mode,
+     * as StarPU will not know how the overlapping parts would have to be
+     * combined. */
+    struct starpu_data_filter ft =
+    {
+        .filter_func = starpu_tensor_filter_time_block_shadow,
+        .nchildren = PARTST,
+        .filter_arg_ptr = (void*)(uintptr_t) SHADOWT /* Shadow width */
+    };
+    struct starpu_data_filter fz =
+    {
+        .filter_func = starpu_tensor_filter_depth_block_shadow,
+        .nchildren = PARTSZ,
+        .filter_arg_ptr = (void*)(uintptr_t) SHADOWZ /* Shadow width */
+    };
+    struct starpu_data_filter fy =
+    {
+        .filter_func = starpu_tensor_filter_vertical_block_shadow,
+        .nchildren = PARTSY,
+        .filter_arg_ptr = (void*)(uintptr_t) SHADOWY /* Shadow width */
+    };
+    struct starpu_data_filter fx =
+    {
+        .filter_func = starpu_tensor_filter_block_shadow,
+        .nchildren = PARTSX,
+        .filter_arg_ptr = (void*)(uintptr_t) SHADOWX /* Shadow width */
+    };
+    starpu_data_map_filters(handle, 4, &ft, &fz, &fy, &fx);
+
+    /* Partition the destination matrix in PARTST*PARTSZ*PARTSY*PARTSX sub-matrices */
+    struct starpu_data_filter ft2 =
+    {
+        .filter_func = starpu_tensor_filter_time_block,
+        .nchildren = PARTST,
+    };
+    struct starpu_data_filter fz2 =
+    {
+        .filter_func = starpu_tensor_filter_depth_block,
+        .nchildren = PARTSZ,
+    };
+    struct starpu_data_filter fy2 =
+    {
+        .filter_func = starpu_tensor_filter_vertical_block,
+        .nchildren = PARTSY,
+    };
+    struct starpu_data_filter fx2 =
+    {
+        .filter_func = starpu_tensor_filter_block,
+        .nchildren = PARTSX,
+    };
+    starpu_data_map_filters(handle2, 4, &ft2, &fz2, &fy2, &fx2);
+
+
+    /* Submit one synchronous task per pair of (shadowed source, destination) sub-matrices */
+    for (l=0; l<PARTST; l++)
+    {
+        for (k=0; k<PARTSZ; k++)
+        {
+            for (j=0; j<PARTSY; j++)
+            {
+                for (i=0; i<PARTSX; i++)
+                {
+                    starpu_data_handle_t sub_handle = starpu_data_get_sub_data(handle, 4, l, k, j, i);
+                    starpu_data_handle_t sub_handle2 = starpu_data_get_sub_data(handle2, 4, l, k, j, i);
+                    struct starpu_task *task = starpu_task_create();
+
+                    task->handles[0] = sub_handle;
+                    task->handles[1] = sub_handle2;
+                    task->cl = &cl;
+                    task->synchronous = 1;
+
+                    ret = starpu_task_submit(task);
+                    if (ret == -ENODEV) goto enodev;
+                    STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+                }
+            }
+        }
+    }
+
+    starpu_data_unpartition(handle, STARPU_MAIN_RAM);
+    starpu_data_unpartition(handle2, STARPU_MAIN_RAM);
+    starpu_data_unregister(handle);
+    starpu_data_unregister(handle2);
+    starpu_shutdown();
+
+    FPRINTF(stderr,"OUT Matrix:\n");
+    for(l=0 ; l<NT + PARTST*2*SHADOWT ; l++)
+    {
+        for(k=0 ; k<NZ + PARTSZ*2*SHADOWZ ; k++)
+        {
+            for(j=0 ; j<NY + PARTSY*2*SHADOWY ; j++)
+            {
+                for(i=0 ; i<NX + PARTSX*2*SHADOWX ; i++)
+                {
+                    FPRINTF(stderr, "%5d ", matrix2[l][k][j][i]);
+                }
+                FPRINTF(stderr,"\n");
+            }
+            FPRINTF(stderr,"\n\n");
+        }
+        FPRINTF(stderr,"\n\n");
+    }
+    FPRINTF(stderr,"\n");
+    for(l=0 ; l<PARTST ; l++) /* check that every destination block equals the matching shadowed window of the source */
+        for(k=0 ; k<PARTSZ ; k++)
+            for(j=0 ; j<PARTSY ; j++)
+                for(i=0 ; i<PARTSX ; i++)
+                    for (q=0 ; q<NT/PARTST + 2*SHADOWT ; q++)
+                        for (p=0 ; p<NZ/PARTSZ + 2*SHADOWZ ; p++)
+                            for (n=0 ; n<NY/PARTSY + 2*SHADOWY ; n++)
+                                for (m=0 ; m<NX/PARTSX + 2*SHADOWX ; m++)
+                                    STARPU_ASSERT(matrix2[l*(NT/PARTST+2*SHADOWT)+q][k*(NZ/PARTSZ+2*SHADOWZ)+p][j*(NY/PARTSY+2*SHADOWY)+n][i*(NX/PARTSX+2*SHADOWX)+m] ==
+                                            matrix[l*(NT/PARTST)+q][k*(NZ/PARTSZ)+p][j*(NY/PARTSY)+n][i*(NX/PARTSX)+m]);
+
+    return 0;
+
+enodev:
+    FPRINTF(stderr, "WARNING: No one can execute this task\n");
+    starpu_shutdown();
+    return 77;
+}

+ 9 - 0
examples/spmd/vector_scal_spmd.c

@@ -112,6 +112,15 @@ int main(void)
 	conf.single_combined_worker = 1;
 	conf.sched_policy_name = "pheft";
 
+	{
+		ret = starpu_init(NULL);
+		if (ret == -ENODEV) return 77;
+		conf.ncpus = starpu_cpu_worker_get_count();
+		conf.ncpus /= 2;
+		starpu_shutdown();
+	}
+
+
 	ret = starpu_init(&conf);
 	if (ret == -ENODEV) return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

+ 84 - 2
include/schedulers/starpu_heteroprio.h

@@ -24,13 +24,86 @@ extern "C"
 {
 #endif
 
-#define STARPU_HETEROPRIO_MAX_PRIO 100
+/**
+   @defgroup API_HeteroPrio Heteroprio Scheduler
+   @brief This is the interface for the heteroprio scheduler
+   @{
+ */
 
 #define STARPU_HETEROPRIO_MAX_PREFETCH 2
 #if STARPU_HETEROPRIO_MAX_PREFETCH <= 0
 #error STARPU_HETEROPRIO_MAX_PREFETCH == 1 means no prefetch so STARPU_HETEROPRIO_MAX_PREFETCH must >= 1
 #endif
 
+#define STARPU_AUTOHETEROPRIO_PRIORITY_ORDERING_POLICY_COUNT 28 /* number of enumerators in the enum below */
+
+enum starpu_autoheteroprio_priority_ordering_policy /* values are implicit, starting at 0; the trailing numbers are index markers */
+{
+	STARPU_HETEROPRIO_NOD_TIME_COMBINATION, // 0
+	STARPU_HETEROPRIO_BEST_NODS_SCORE,
+	STARPU_HETEROPRIO_BEST_NODS,
+	STARPU_HETEROPRIO_URT_PURE,
+	STARPU_HETEROPRIO_URT,
+	STARPU_HETEROPRIO_URT_2, // 5
+	STARPU_HETEROPRIO_URT_DOT_DIFF_PURE,
+	STARPU_HETEROPRIO_URT_DOT_DIFF_PURE_2,
+	STARPU_HETEROPRIO_URT_DOT_REL_DIFF_PURE,
+	STARPU_HETEROPRIO_URT_DOT_REL_DIFF_PURE_2,
+	STARPU_HETEROPRIO_URT_DOT_DIFF_2, // 10
+	STARPU_HETEROPRIO_URT_DOT_DIFF_3,
+	STARPU_HETEROPRIO_URT_DOT_DIFF_4,
+	STARPU_HETEROPRIO_URT_DOT_DIFF_5,
+	STARPU_HETEROPRIO_URT_DOT_DIFF_6,
+	STARPU_HETEROPRIO_URT_DOT_DIFF_7, // 15
+	STARPU_HETEROPRIO_URT_DOT_DIFF_8,
+	STARPU_HETEROPRIO_URT_DOT_DIFF_9,
+	STARPU_HETEROPRIO_URT_DOT_DIFF_10,
+	STARPU_HETEROPRIO_URT_DOT_DIFF_11,
+	STARPU_HETEROPRIO_URTS_PER_SECONDS, // 20
+	STARPU_HETEROPRIO_URTS_PER_SECONDS_2,
+	STARPU_HETEROPRIO_URTS_PER_SECONDS_DIFF,
+	STARPU_HETEROPRIO_URTS_TIME_RELEASED_DIFF,
+	STARPU_HETEROPRIO_URTS_TIME_COMBINATION,
+	STARPU_HETEROPRIO_NODS_PER_SECOND, // 25
+	STARPU_HETEROPRIO_NODS_TIME_RELEASED,
+	STARPU_HETEROPRIO_NODS_TIME_RELEASED_DIFF // 27, last one
+};
+
+static const char starpu_autoheteroprio_priority_ordering_policy_names[STARPU_AUTOHETEROPRIO_PRIORITY_ORDERING_POLICY_COUNT][64] = /* one printable name per enumerator of enum starpu_autoheteroprio_priority_ordering_policy, in declaration order; each slot is a 64-byte buffer */
+{
+	"STARPU_HETEROPRIO_NOD_TIME_COMBINATION",
+	"STARPU_HETEROPRIO_BEST_NODS_SCORE",
+	"STARPU_HETEROPRIO_BEST_NODS",
+	"STARPU_HETEROPRIO_URT_PURE",
+	"STARPU_HETEROPRIO_URT",
+	"STARPU_HETEROPRIO_URT_2",
+	"STARPU_HETEROPRIO_URT_DOT_DIFF_PURE",
+	"STARPU_HETEROPRIO_URT_DOT_DIFF_PURE_2",
+	"STARPU_HETEROPRIO_URT_DOT_REL_DIFF_PURE",
+	"STARPU_HETEROPRIO_URT_DOT_REL_DIFF_PURE_2",
+	"STARPU_HETEROPRIO_URT_DOT_DIFF_2",
+	"STARPU_HETEROPRIO_URT_DOT_DIFF_3",
+	"STARPU_HETEROPRIO_URT_DOT_DIFF_4",
+	"STARPU_HETEROPRIO_URT_DOT_DIFF_5",
+	"STARPU_HETEROPRIO_URT_DOT_DIFF_6",
+	"STARPU_HETEROPRIO_URT_DOT_DIFF_7",
+	"STARPU_HETEROPRIO_URT_DOT_DIFF_8",
+	"STARPU_HETEROPRIO_URT_DOT_DIFF_9",
+	"STARPU_HETEROPRIO_URT_DOT_DIFF_10",
+	"STARPU_HETEROPRIO_URT_DOT_DIFF_11",
+	"STARPU_HETEROPRIO_URTS_PER_SECONDS",
+	"STARPU_HETEROPRIO_URTS_PER_SECONDS_2",
+	"STARPU_HETEROPRIO_URTS_PER_SECONDS_DIFF",
+	"STARPU_HETEROPRIO_URTS_TIME_RELEASED_DIFF",
+	"STARPU_HETEROPRIO_URTS_TIME_COMBINATION",
+	"STARPU_HETEROPRIO_NODS_PER_SECOND",
+	"STARPU_HETEROPRIO_NODS_TIME_RELEASED",
+	"STARPU_HETEROPRIO_NODS_TIME_RELEASED_DIFF"
+};
+
+/** Set if heteroprio should use data locality or not */
+void starpu_heteroprio_set_use_locality(unsigned sched_ctx_id, unsigned use_locality);
+
 /** Tell how many prio there are for a given arch */
 void starpu_heteroprio_set_nb_prios(unsigned sched_ctx_id, enum starpu_worker_archtype arch, unsigned max_prio);
 
@@ -40,11 +113,20 @@ void starpu_heteroprio_set_mapping(unsigned sched_ctx_id, enum starpu_worker_arc
 /** Tell which arch is the faster for the tasks of a bucket (optional) */
 void starpu_heteroprio_set_faster_arch(unsigned sched_ctx_id, enum starpu_worker_archtype arch, unsigned bucket_id);
 
-/** Tell how slow is a arch for the tasks of a bucket (optional) */ 
+/** Tell how slow an arch is for the tasks of a bucket (optional) */
 void starpu_heteroprio_set_arch_slow_factor(unsigned sched_ctx_id, enum starpu_worker_archtype arch, unsigned bucket_id, float slow_factor);
 
+/** One memory node will be one wgroup */
+void starpu_heteroprio_map_wgroup_memory_nodes(unsigned sched_ctx_id);
+
+/** Print the current setup groups */
+void starpu_heteroprio_print_wgroups(FILE *stream, unsigned sched_ctx_id);
+
+/** @} */
+
 #ifdef __cplusplus
 }
 #endif
 
 #endif /* __STARPU_SCHEDULER_HETEROPRIO_H__ */
+

+ 1 - 0
include/starpu.h

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2021       Federal University of Rio Grande do Sul (UFRGS)
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 25 - 1
include/starpu_data.h

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2021       Federal University of Rio Grande do Sul (UFRGS)
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -117,7 +118,9 @@ enum starpu_data_access_mode
 			            When inserting these tasks through the
 				    MPI layer however, the access mode needs
 				    to be ::STARPU_MPI_REDUX. */
-	STARPU_ACCESS_MODE_MAX=(1<<8) /**< The purpose of ::STARPU_ACCESS_MODE_MAX is to
+	STARPU_NOPLAN=(1<<8),	/**< Disable automatic submission of asynchronous
+				    partitioning/unpartitioning */
+	STARPU_ACCESS_MODE_MAX=(1<<9) /**< The purpose of ::STARPU_ACCESS_MODE_MAX is to
 					be the maximum of this enum. */
 };
 
@@ -492,6 +495,15 @@ unsigned starpu_data_is_on_node(starpu_data_handle_t handle, unsigned node);
 void starpu_data_wont_use(starpu_data_handle_t handle);
 
 /**
+   Advise StarPU to evict \p handle from the memory node \p node.
+   StarPU will thus write its value back to its home node, before evicting it.
+   This may however fail if e.g. some task is still working on it.
+
+   Return 0 if the eviction was successful, -1 otherwise.
+*/
+int starpu_data_evict_from_node(starpu_data_handle_t handle, unsigned node);
+
+/**
    Set the write-through mask of the data \p handle (and
    its children), i.e. a bitmask of nodes where the data should be always
    replicated after modification. It also prevents the data from being
@@ -606,6 +618,18 @@ void starpu_data_set_user_data(starpu_data_handle_t handle, void* user_data);
 void *starpu_data_get_user_data(starpu_data_handle_t handle);
 
 /**
+   Set the field \c sched_data for the \p handle to \p sched_data . It can
+   then be retrieved with starpu_data_get_sched_data(). \p sched_data can be any
+   scheduler-defined value.
+*/
+void starpu_data_set_sched_data(starpu_data_handle_t handle, void* sched_data);
+
+/**
+   Retrieve the field \c sched_data previously set for the \p handle.
+*/
+void *starpu_data_get_sched_data(starpu_data_handle_t handle);
+
+/**
   Check whether data \p handle can be evicted now from node \p node
 */
 int starpu_data_can_evict(starpu_data_handle_t handle, unsigned node, enum starpu_is_prefetch is_prefetch);

+ 93 - 8
include/starpu_data_filters.h

@@ -240,6 +240,12 @@ void starpu_data_partition_submit(starpu_data_handle_t initial_handle, unsigned
 void starpu_data_partition_readonly_submit(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children);
 
 /**
+ * Similar to starpu_data_partition_readonly_submit(), but allows
+ * specifying the coherency to be used for the main data \p initial_handle
+ */
+void starpu_data_partition_readonly_submit_sequential_consistency(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children, int sequential_consistency);
+
+/**
    Assume that a partitioning of \p initial_handle has already been submited
    in readonly mode through starpu_data_partition_readonly_submit(), and will upgrade
    that partitioning into read-write mode for the \p children, by invalidating \p
@@ -295,12 +301,6 @@ void starpu_data_partition_submit_sequential_consistency(starpu_data_handle_t in
 */
 void starpu_data_unpartition_submit_sequential_consistency(starpu_data_handle_t initial_handle, unsigned nparts, starpu_data_handle_t *children, int gathering_node, int sequential_consistency);
 
-/**
-   Disable the automatic partitioning of the data \p handle for which
-   a asynchronous plan has previously been submitted
-*/
-void starpu_data_partition_not_automatic(starpu_data_handle_t handle);
-
 /** @} */
 
 /**
@@ -527,16 +527,101 @@ void starpu_block_filter_depth_block(void *father_interface, void *child_interfa
 */
 void starpu_block_filter_depth_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
 
+/** @} */
+
+/**
+   @name Predefined Tensor Filter Functions
+   Predefined partitioning functions for tensor
+   data.
+   @{
+*/
+
+/**
+  Partition a tensor along the X dimension, thus getting
+  (x/\p nparts ,y,z,t) tensors. If \p nparts does not divide x, the last
+  sub-tensor contains the remainder.
+ */
+void starpu_tensor_filter_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+
+/**
+   Partition a tensor along the X dimension, with a
+   shadow border <c>filter_arg_ptr</c>, thus getting
+   ((x-2*shadow)/\p nparts +2*shadow,y,z,t) tensors. If \p nparts does not
+   divide x, the last sub-tensor contains the remainder.
+
+   <b>IMPORTANT</b>:
+   This can only be used for read-only access, as no coherency is
+   enforced for the shadowed parts.
+*/
+void starpu_tensor_filter_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+
+/**
+   Partition a tensor along the Y dimension, thus getting
+   (x,y/\p nparts ,z,t) tensors. If \p nparts does not divide y, the last
+   sub-tensor contains the remainder.
+ */
+void starpu_tensor_filter_vertical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+
+/**
+   Partition a tensor along the Y dimension, with a
+   shadow border <c>filter_arg_ptr</c>, thus getting
+   (x,(y-2*shadow)/\p nparts +2*shadow,z,t) tensors. If \p nparts does not
+   divide y, the last sub-tensor contains the remainder.
+
+   <b>IMPORTANT</b>:
+   This can only be used for read-only access, as no coherency is
+   enforced for the shadowed parts.
+*/
+void starpu_tensor_filter_vertical_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+
+/**
+   Partition a tensor along the Z dimension, thus getting
+   (x,y,z/\p nparts,t) tensors. If \p nparts does not divide z, the last
+   sub-tensor contains the remainder.
+*/
+void starpu_tensor_filter_depth_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+
+/**
+   Partition a tensor along the Z dimension, with a
+   shadow border <c>filter_arg_ptr</c>, thus getting
+   (x,y,(z-2*shadow)/\p nparts +2*shadow,t) tensors. If \p nparts does not
+   divide z, the last sub-tensor contains the remainder.
+
+   <b>IMPORTANT</b>:
+   This can only be used for read-only access, as no coherency is
+   enforced for the shadowed parts.
+*/
+void starpu_tensor_filter_depth_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+
+/**
+   Partition a tensor along the T dimension, thus getting
+   (x,y,z,t/\p nparts) tensors. If \p nparts does not divide t, the last
+   sub-tensor contains the remainder.
+*/
+void starpu_tensor_filter_time_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+
+/**
+   Partition a tensor along the T dimension, with a
+   shadow border <c>filter_arg_ptr</c>, thus getting
+   (x,y,z,(t-2*shadow)/\p nparts +2*shadow) tensors. If \p nparts does not
+   divide t, the last sub-tensor contains the remainder.
+
+   <b>IMPORTANT</b>:
+   This can only be used for read-only access, as no coherency is
+   enforced for the shadowed parts.
+*/
+void starpu_tensor_filter_time_block_shadow(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+
 /**
    Given an integer \p n, \p nparts the number of parts it must be divided in, \p id the
    part currently considered, determines the \p chunk_size and the \p offset, taking
    into account the size of the elements stored in the data structure \p elemsize
-   and \p ld, the leading dimension, which is most often 1.
+   and \p blocksize, which is most often 1.
  */
 void
 starpu_filter_nparts_compute_chunk_size_and_offset(unsigned n, unsigned nparts,
 					     size_t elemsize, unsigned id,
-					     unsigned ld, unsigned *chunk_size,
+					     unsigned blocksize, unsigned *chunk_size,
 					     size_t *offset);
 
 /** @} */

+ 5 - 0
include/starpu_task.h

@@ -1808,6 +1808,11 @@ void starpu_task_ft_success(struct starpu_task *meta_task);
 */
 void starpu_task_watchdog_set_hook(void (*hook)(void *), void *hook_arg);
 
+/**
+ * Return the given status as a string
+ */
+char *starpu_task_status_get_as_string(enum starpu_task_status status);
+
 /** @} */
 
 #ifdef __cplusplus

+ 15 - 0
include/starpu_worker.h

@@ -160,6 +160,16 @@ extern struct starpu_worker_collection starpu_worker_list;
 extern struct starpu_worker_collection starpu_worker_tree;
 
 /**
+   Return true if type matches one of StarPU's defined worker architectures
+*/
+unsigned starpu_worker_archtype_is_valid(enum starpu_worker_archtype type);
+
+/**
+   Convert a mask of architectures to a worker archtype
+*/
+enum starpu_worker_archtype starpu_arch_mask_to_worker_archtype(unsigned mask);
+
+/**
    Return the number of workers (i.e. processing units executing
    StarPU tasks). The return value should be at most \ref
    STARPU_NMAXWORKERS.
@@ -259,6 +269,11 @@ int starpu_worker_get_by_type(enum starpu_worker_archtype type, int num);
 int starpu_worker_get_by_devid(enum starpu_worker_archtype type, int devid);
 
 /**
+   Return true if worker type can execute this task
+*/
+unsigned starpu_worker_type_can_execute_task(enum starpu_worker_archtype worker_type, const struct starpu_task *task);
+
+/**
    Get the name of the worker \p id. StarPU associates a unique human
    readable string to each processing unit. This function copies at
    most the \p maxlen first bytes of the unique string associated to

+ 33 - 11
mpi/examples/Makefile.am

@@ -312,17 +312,22 @@ endif
 ########################
 
 examplebin_PROGRAMS +=		\
-	mpi_redux/mpi_redux
+	mpi_redux/mpi_redux     \
+	mpi_redux/mpi_redux_tree
 
 mpi_redux_mpi_redux_SOURCES	=		\
-	mpi_redux/mpi_redux.c
+	mpi_redux/mpi_redux.c                   
+mpi_redux_mpi_redux_tree_SOURCES	=		\
+	mpi_redux/mpi_redux_tree.c
 
 mpi_redux_mpi_redux_LDADD =			\
 	-lm
-
+mpi_redux_mpi_redux_tree_LDADD =			\
+	-lm
 if !STARPU_SIMGRID
 starpu_mpi_EXAMPLES +=				\
-	mpi_redux/mpi_redux
+	mpi_redux/mpi_redux                     \
+	mpi_redux/mpi_redux_tree
 endif
 
 ##########################################
@@ -377,20 +382,30 @@ endif
 
 if STARPU_HAVE_MPIFORT
 if !STARPU_SANITIZE
-examplebin_PROGRAMS +=		\
-	native_fortran/nf_mpi_redux
+examplebin_PROGRAMS +=				\
+	native_fortran/nf_mpi_redux		\
+	native_fortran/nf_mpi_redux_tree
 
-native_fortran_nf_mpi_redux_SOURCES	=			\
+native_fortran_nf_mpi_redux_SOURCES =		\
 	native_fortran/fstarpu_mpi_mod.f90	\
 	native_fortran/fstarpu_mod.f90		\
 	native_fortran/nf_mpi_redux.f90	
 
-native_fortran_nf_mpi_redux_LDADD =					\
+native_fortran_nf_mpi_redux_LDADD =		\
+	-lm
+
+native_fortran_nf_mpi_redux_tree_SOURCES =	\
+	native_fortran/fstarpu_mpi_mod.f90	\
+	native_fortran/fstarpu_mod.f90		\
+	native_fortran/nf_mpi_redux_tree.f90	
+
+native_fortran_nf_mpi_redux_tree_LDADD =	\
 	-lm
 
 if !STARPU_SIMGRID
 starpu_mpi_EXAMPLES +=				\
-	native_fortran/nf_mpi_redux
+	native_fortran/nf_mpi_redux		\
+	native_fortran/nf_mpi_redux_tree
 endif
 endif
 endif
@@ -441,7 +456,8 @@ starpu_mpi_EXAMPLES	+=			\
 examplebin_PROGRAMS +=				\
 	user_datatype/user_datatype		\
 	user_datatype/user_datatype2		\
-	user_datatype/user_datatype_early
+	user_datatype/user_datatype_early	\
+	user_datatype/user_datatype_interface
 
 user_datatype_user_datatype_SOURCES =		\
 	user_datatype/user_datatype.c		\
@@ -455,11 +471,16 @@ user_datatype_user_datatype_early_SOURCES =	\
 	user_datatype/user_datatype_early.c	\
 	user_datatype/my_interface.c
 
+user_datatype_user_datatype_interface_SOURCES =	\
+	user_datatype/user_datatype_interface.c	\
+	user_datatype/my_interface.c
+
 if !STARPU_SIMGRID
 starpu_mpi_EXAMPLES	+=			\
 	user_datatype/user_datatype2		\
 	user_datatype/user_datatype_early	\
-	user_datatype/user_datatype
+	user_datatype/user_datatype		\
+	user_datatype/user_datatype_interface
 endif
 
 ###################
@@ -513,6 +534,7 @@ native_fortran/nf_mm_task_build.o: nf_mm_cl.mod fstarpu_mpi_mod.mod fstarpu_mod.
 native_fortran/nf_basic_ring.o: fstarpu_mpi_mod.mod fstarpu_mod.mod
 native_fortran/nf_redux_test.o: fstarpu_mpi_mod.mod fstarpu_mod.mod
 native_fortran/nf_mpi_redux.o: fstarpu_mpi_mod.mod fstarpu_mod.mod
+native_fortran/nf_mpi_redux_tree.o: fstarpu_mpi_mod.mod fstarpu_mod.mod
 endif
 endif
 

+ 1 - 1
mpi/examples/benchs/abstract_sendrecv_bench.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020-2021 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
mpi/examples/benchs/abstract_sendrecv_bench.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
mpi/examples/benchs/bcast_bench.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2021       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
mpi/examples/benchs/bench_helper.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020-2021 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
mpi/examples/benchs/bench_helper.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
mpi/examples/benchs/burst.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020-2021 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
mpi/examples/benchs/burst_helper.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 2 - 1
mpi/examples/benchs/gemm_helper.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020-2021 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -14,6 +14,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include <limits.h>
 #include <common/blas.h>
 #include "../../examples/mult/simple.h"
 #include "helper.h"

+ 1 - 1
mpi/examples/benchs/gemm_helper.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
mpi/examples/benchs/sendrecv_parallel_tasks_bench.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020-2021 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
mpi/examples/cg/cg.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2021       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
mpi/examples/mpi_redux/mpi_redux.c

@@ -92,7 +92,7 @@ static struct starpu_codelet task_red_cl =
 {
 	.cpu_funcs = { cl_cpu_task_red },
 	.nbuffers = 2,
-	.modes = { STARPU_RW, STARPU_R },
+	.modes = { STARPU_RW|STARPU_COMMUTE, STARPU_R },
 	.name = "task_red"
 };
 

+ 184 - 0
mpi/examples/mpi_redux/mpi_redux_tree.c

@@ -0,0 +1,184 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2016-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ * This example illustrates how to use the STARPU_MPI_REDUX mode
+ * and compare it with the standard STARPU_REDUX.
+ *
+ * In order to make this comparison salient, the init codelet is not
+ * a task that sets the handle to a neutral element but rather depends
+ * on the working node.
+ * This is not a proper way to use a reduction pattern however it
+ * can be analogous to the cost/weight of each contribution.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <assert.h>
+#include <math.h>
+#include <starpu.h>
+#include <starpu_mpi.h>
+#include "helper.h"
+#include <unistd.h>
+
+static void cl_cpu_work(void *handles[], void*arg)
+{
+	(void)arg;
+	double *a = (double *)STARPU_VARIABLE_GET_PTR(handles[0]);
+	double *b = (double *)STARPU_VARIABLE_GET_PTR(handles[1]);
+	sleep(2);
+	printf("work_cl (rank:%d,worker:%d) %f =>",starpu_mpi_world_rank(), starpu_worker_get_id(), *a);
+	*a = 3.0 + *a + *b;
+	printf("%f\n",*a);
+}
+
+static struct starpu_codelet work_cl =
+{
+	.cpu_funcs = { cl_cpu_work },
+	.nbuffers = 2,
+	.modes = { STARPU_REDUX, STARPU_R },
+	.name = "task_init"
+};
+
+static struct starpu_codelet mpi_work_cl =
+{
+	.cpu_funcs = { cl_cpu_work },
+	.nbuffers = 2,
+	.modes = { STARPU_RW | STARPU_COMMUTE, STARPU_R },
+	.name = "task_init-mpi"
+};
+
+static void cl_cpu_task_init(void *handles[], void*arg)
+{
+	(void) arg;
+	double *a = (double *)STARPU_VARIABLE_GET_PTR(handles[0]);
+	sleep(1);
+	printf("init_cl (rank:%d,worker:%d) %d (was %f)\n", starpu_mpi_world_rank(), starpu_worker_get_id(), starpu_mpi_world_rank(), *a);
+	*a = starpu_mpi_world_rank();
+}
+
+static struct starpu_codelet task_init_cl =
+{
+	.cpu_funcs = { cl_cpu_task_init },
+	.nbuffers = 1,
+	.modes = { STARPU_W },
+	.name = "task_init"
+};
+
+static void cl_cpu_task_red(void *handles[], void*arg)
+{
+	(void) arg;
+	double *ad = (double *)STARPU_VARIABLE_GET_PTR(handles[0]);
+	double *as = (double *)STARPU_VARIABLE_GET_PTR(handles[1]);
+	sleep(2);
+	printf("red_cl (rank:%d,worker:%d) %f ; %f --> %f\n", starpu_mpi_world_rank(), starpu_worker_get_id(), *as, *ad, *as+*ad);
+	*ad = *ad + *as;
+}
+
+static struct starpu_codelet task_red_cl =
+{
+	.cpu_funcs = { cl_cpu_task_red },
+	.nbuffers = 2,
+	.modes = { STARPU_RW|STARPU_COMMUTE, STARPU_R },
+	.name = "task_red"
+};
+
+int main(int argc, char *argv[])
+{
+	int comm_rank, comm_size;
+	/* Initialize StarPU and the StarPU-MPI layer */
+	starpu_fxt_autostart_profiling(0);
+	int ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_ini_conft");
+
+	int nworkers = starpu_cpu_worker_get_count();
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &comm_size);
+	if (comm_size < 2)
+	{
+        	FPRINTF(stderr, "We need at least 2 nodes.\n");
+        	starpu_mpi_shutdown();
+       		return STARPU_TEST_SKIPPED;
+	}
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &comm_rank);
+
+	double a, b[comm_size];
+	starpu_data_handle_t a_h, b_h[comm_size];
+	double work_coef = 2;
+	enum starpu_data_access_mode codelet_mode;
+	enum starpu_data_access_mode task_mode;
+	int arity,j,work_node;
+    	starpu_mpi_tag_t tag = 0;
+	for (arity = 2 ; arity < comm_size ; arity++)
+	{
+		starpu_mpi_barrier(MPI_COMM_WORLD);
+		task_mode = STARPU_MPI_REDUX;
+		if (comm_rank == 0)
+		{
+			a = 1.0;
+			printf("init a = %f\n", a);
+			starpu_variable_data_register(&a_h, STARPU_MAIN_RAM, (uintptr_t)&a, sizeof(double));
+			for (j=0;j<comm_size;j++)
+				starpu_variable_data_register(&b_h[j], -1, 0, sizeof(double));
+		}
+		else
+		{
+			b[comm_rank] = 1.0 / (comm_rank + 1.0);
+			printf("init b_%d = %f\n", comm_rank, b[comm_rank]);
+			starpu_variable_data_register(&a_h, -1, 0, sizeof(double));
+			for (j=0;j<comm_size;j++)
+			{
+				if (j == comm_rank)
+					starpu_variable_data_register(&b_h[j], STARPU_MAIN_RAM, (uintptr_t)&b[j], sizeof(double));
+				else
+					starpu_variable_data_register(&b_h[j], -1, 0, sizeof(double));
+			}
+		}
+		starpu_mpi_data_register(a_h, tag++, 0);
+		for (j=0;j<comm_size;j++)
+			starpu_mpi_data_register(b_h[j], tag++, j);
+
+		starpu_data_set_reduction_methods(a_h, &task_red_cl, &task_init_cl);
+		starpu_fxt_start_profiling();
+		for (work_node=1; work_node < comm_size;work_node++)
+		{
+			for (j=1;j<=work_coef*nworkers;j++)
+			{
+			    starpu_mpi_task_insert(MPI_COMM_WORLD,
+				&mpi_work_cl,
+				task_mode, a_h,
+				STARPU_R, b_h[work_node],
+				STARPU_EXECUTE_ON_NODE, work_node,
+				0);
+			}
+		}
+		starpu_mpi_redux_data_tree(MPI_COMM_WORLD, a_h, arity);
+		starpu_mpi_wait_for_all(MPI_COMM_WORLD);
+		starpu_mpi_barrier(MPI_COMM_WORLD);
+		if (comm_rank == 0)
+		{
+			double tmp = 0.0;
+			for (work_node = 1; work_node < comm_size ; work_node++)
+				tmp += 1.0 / (work_node + 1.0);
+			printf("computed result ---> %f expected %f\n", a, 1.0 + (comm_size - 1.0)*(comm_size)/2.0 + work_coef*nworkers*((comm_size-1)*3.0 + tmp));
+		}
+		starpu_data_unregister(a_h);
+		for (work_node=0; work_node < comm_size;work_node++)
+			starpu_data_unregister(b_h[work_node]);
+		starpu_mpi_barrier(MPI_COMM_WORLD);
+	}
+	starpu_mpi_shutdown();
+	return 0;
+}

+ 77 - 78
mpi/examples/native_fortran/nf_mpi_redux.f90

@@ -43,7 +43,7 @@ program nf_mpi_redux
   comm_size  = fstarpu_mpi_world_size()
   if (comm_size.lt.2) then
     write(*,'(" ")')
-    write(*,'("This application is meant to run with at least two nodes.")')
+    write(*,'("This application is meant to run with at least two nodes (found ",i4," ; i am ",i4,").")') comm_size, comm_w_rank
     stop 2
   end if
   allocate(b(comm_size-1), bhdl(comm_size-1))
@@ -58,7 +58,7 @@ program nf_mpi_redux
   task_red_cl = fstarpu_codelet_allocate()
   call fstarpu_codelet_set_name(task_red_cl, namered)
   call fstarpu_codelet_add_cpu_func(task_red_cl,C_FUNLOC(cl_cpu_task_red))
-  call fstarpu_codelet_add_buffer(task_red_cl, FSTARPU_RW)
+  call fstarpu_codelet_add_buffer(task_red_cl, FSTARPU_RW.ior.FSTARPU_COMMUTE)
   call fstarpu_codelet_add_buffer(task_red_cl, FSTARPU_R)
 
   task_ini_cl = fstarpu_codelet_allocate()
@@ -70,91 +70,90 @@ program nf_mpi_redux
 
   do trial=1,2
 
-  if (trial.eq.1) then
-        write(*,*) "Using STARPU_MPI_REDUX"
-        codelet_mode = FSTARPU_RW.ior.FSTARPU_COMMUTE
-        task_mode = FSTARPU_MPI_REDUX
-  else if (trial.eq.2) then
-        write(*,*) "Using STARPU_REDUX"
-        codelet_mode = FSTARPU_REDUX
-        task_mode = FSTARPU_REDUX
-  end if
-  ! allocate and fill codelet structs
-  work_cl = fstarpu_codelet_allocate()
-  call fstarpu_codelet_set_name(work_cl, name)
-  call fstarpu_codelet_add_cpu_func(work_cl, C_FUNLOC(cl_cpu_task))
-  call fstarpu_codelet_add_buffer(work_cl, codelet_mode)
-  call fstarpu_codelet_add_buffer(work_cl, FSTARPU_R)
-  err = fstarpu_mpi_barrier(comm_world)
-
-  if(comm_w_rank.eq.0) then
-    write(*,'(" ")')
-    a = 1.0
-    write(*,*) "init a = ", a
-  else
-    b(comm_w_rank) = 1.0 / (comm_w_rank + 1.0)
-    write(*,*) "init b_",comm_w_rank,"=", b(comm_w_rank), " AT ", &
-c_loc(bhdl(comm_w_rank)) ! This is not really meaningful
-  end if
-
-  err = fstarpu_mpi_barrier(comm_world)
-
-  tag = 0
-  if(comm_w_rank.eq.0) then
-    call fstarpu_variable_data_register(ahdl, 0, c_loc(a),c_sizeof(a))
-    do i=1,comm_size-1
-        call fstarpu_variable_data_register(bhdl(i), -1, c_null_ptr,c_sizeof(b(i)))
-    end do
-  else
-    call fstarpu_variable_data_register(ahdl, -1, c_null_ptr,c_sizeof(a))
+    if (trial.eq.2) then
+          write(*,*) "Using STARPU_MPI_REDUX"
+          codelet_mode = FSTARPU_RW.ior.FSTARPU_COMMUTE
+          task_mode = FSTARPU_MPI_REDUX
+    else if (trial.eq.1) then
+          write(*,*) "Using STARPU_REDUX"
+          codelet_mode = FSTARPU_REDUX
+          task_mode = FSTARPU_REDUX
+    end if
+    ! allocate and fill codelet structs
+    work_cl = fstarpu_codelet_allocate()
+    call fstarpu_codelet_set_name(work_cl, name)
+    call fstarpu_codelet_add_cpu_func(work_cl, C_FUNLOC(cl_cpu_task))
+    call fstarpu_codelet_add_buffer(work_cl, codelet_mode)
+    call fstarpu_codelet_add_buffer(work_cl, FSTARPU_R)
+    err = fstarpu_mpi_barrier(comm_world)
+
+    if(comm_w_rank.eq.0) then
+      write(*,'(" ")')
+      a = 1.0
+      write(*,*) "init a = ", a
+    else
+      b(comm_w_rank) = 1.0 / (comm_w_rank + 1.0)
+      write(*,*) "init b_",comm_w_rank,"=", b(comm_w_rank)
+    end if
+
+    err = fstarpu_mpi_barrier(comm_world)
+
+    tag = 0
+    if(comm_w_rank.eq.0) then
+      call fstarpu_variable_data_register(ahdl, 0, c_loc(a),c_sizeof(a))
+      do i=1,comm_size-1
+          call fstarpu_variable_data_register(bhdl(i), -1, c_null_ptr,c_sizeof(b(i)))
+      end do
+    else
+      call fstarpu_variable_data_register(ahdl, -1, c_null_ptr,c_sizeof(a))
+      do i=1,comm_size-1
+        if (i.eq.comm_w_rank) then
+          call fstarpu_variable_data_register(bhdl(i), 0, c_loc(b(i)),c_sizeof(b(i)))
+        else
+          call fstarpu_variable_data_register(bhdl(i), -1, c_null_ptr,c_sizeof(b(i)))
+        end if
+      end do
+    end if
+    call fstarpu_mpi_data_register(ahdl,  tag,  0)
     do i=1,comm_size-1
-      if (i.eq.comm_w_rank) then
-        call fstarpu_variable_data_register(bhdl(i), 0, c_loc(b(i)),c_sizeof(b(i)))
-      else
-        call fstarpu_variable_data_register(bhdl(i), -1, c_null_ptr,c_sizeof(b(i)))
-      end if
+       call fstarpu_mpi_data_register(bhdl(i), tag+i,i)
     end do
-  end if
-  call fstarpu_mpi_data_register(ahdl,  tag,  0)
-  do i=1,comm_size-1
-     call fstarpu_mpi_data_register(bhdl(i), tag+i,i)
-  end do
 
-  tag = tag + comm_size
+    tag = tag + comm_size
 
-  call fstarpu_data_set_reduction_methods(ahdl,task_red_cl,task_ini_cl)
+    call fstarpu_data_set_reduction_methods(ahdl,task_red_cl,task_ini_cl)
 
-  err = fstarpu_mpi_barrier(comm_world)
+    err = fstarpu_mpi_barrier(comm_world)
 
 
-  call fstarpu_fxt_start_profiling()
-  do w_node=1,comm_size-1
-    do i=1,work_coef*nworkers
-      call fstarpu_mpi_task_insert( (/ c_loc(comm_world),   &
-             work_cl,                                         &
-             task_mode, ahdl,                            &
-             FSTARPU_R, bhdl(w_node),                      &
-             FSTARPU_EXECUTE_ON_NODE, c_loc(w_node),          &
-             C_NULL_PTR /))
+    call fstarpu_fxt_start_profiling()
+    do w_node=1,comm_size-1
+      do i=1,work_coef*nworkers
+        call fstarpu_mpi_task_insert( (/ c_loc(comm_world),   &
+               work_cl,                                         &
+               task_mode, ahdl,                            &
+               FSTARPU_R, bhdl(w_node),                      &
+               FSTARPU_EXECUTE_ON_NODE, c_loc(w_node),          &
+               C_NULL_PTR /))
+      end do
     end do
-  end do
-  call fstarpu_mpi_redux_data(comm_world, ahdl)
-  err = fstarpu_mpi_wait_for_all(comm_world)
-
-  if(comm_w_rank.eq.0) then
-    tmp = 0
+    call fstarpu_mpi_redux_data(comm_world, ahdl)
+    err = fstarpu_mpi_wait_for_all(comm_world)
+
+    if(comm_w_rank.eq.0) then
+      tmp = 0
+      do w_node=1,comm_size-1
+        tmp = tmp + 1.0 / (w_node+1.0)
+      end do
+      write(*,*) 'computed result ---> ',a, "expected =",&
+        1.0 + (comm_size-1.0)*(comm_size)/2.0 + work_coef*nworkers*((comm_size-1.0)*3.0 + tmp)
+    end if
+    err = fstarpu_mpi_barrier(comm_world)
+    call fstarpu_data_unregister(ahdl)
     do w_node=1,comm_size-1
-      tmp = tmp + 1.0 / (w_node+1.0)
+      call fstarpu_data_unregister(bhdl(w_node))
     end do
-    write(*,*) 'computed result ---> ',a, "expected =",&
-      1.0 + (comm_size-1.0)*(comm_size)/2.0 + work_coef*nworkers*((comm_size-1.0)*3.0 + tmp)
-  end if
-  err = fstarpu_mpi_barrier(comm_world)
-  call fstarpu_data_unregister(ahdl)
-  do w_node=1,comm_size-1
-    call fstarpu_data_unregister(bhdl(w_node))
-  end do
-  call fstarpu_codelet_free(work_cl)
+    call fstarpu_codelet_free(work_cl)
 
   end do
 
@@ -166,7 +165,7 @@ c_loc(bhdl(comm_w_rank)) ! This is not really meaningful
   err = fstarpu_mpi_shutdown()
   call fstarpu_shutdown()
   deallocate(b, bhdl)
-  stop
+  stop 0
 
 contains
 

+ 240 - 0
mpi/examples/native_fortran/nf_mpi_redux_tree.f90

@@ -0,0 +1,240 @@
+! StarPU --- Runtime system for heterogeneous multicore architectures.
+!
+! Copyright (C) 2016-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+!
+! StarPU is free software; you can redistribute it and/or modify
+! it under the terms of the GNU Lesser General Public License as published by
+! the Free Software Foundation; either version 2.1 of the License, or (at
+! your option) any later version.
+!
+! StarPU is distributed in the hope that it will be useful, but
+! WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+!
+! See the GNU Lesser General Public License in COPYING.LGPL for more details.
+!
+program nf_mpi_redux
+  use iso_c_binding
+  use fstarpu_mod
+  use fstarpu_mpi_mod
+
+  implicit none
+
+  integer, target                         :: ret, np, i, j, arity
+  type(c_ptr)                             :: work_cl, task_rw_cl,task_red_cl, task_ini_cl
+  character(kind=c_char,len=*), parameter :: name=C_CHAR_"task"//C_NULL_CHAR
+  character(kind=c_char,len=*), parameter :: namered=C_CHAR_"task_red"//C_NULL_CHAR
+  character(kind=c_char,len=*), parameter :: nameini=C_CHAR_"task_ini"//C_NULL_CHAR
+  real(kind(1.d0)), target                :: a,tmp
+  real(kind(1.d0)), target, allocatable   :: b(:)
+  integer(kind=8)                         :: tag, err
+  type(c_ptr), target                     :: ahdl
+  type(c_ptr), target, allocatable        :: bhdl(:)
+  type(c_ptr)                             :: task_mode, codelet_mode
+  integer, target                         :: comm_world,comm_w_rank, comm_size
+  integer(c_int), target                  :: w_node, nworkers, work_coef
+
+  !call fstarpu_fxt_autostart_profiling(0)
+  ret = fstarpu_init(c_null_ptr)
+  ret = fstarpu_mpi_init(1)
+
+  comm_world = fstarpu_mpi_world_comm()
+  comm_w_rank  = fstarpu_mpi_world_rank()
+  comm_size  = fstarpu_mpi_world_size()
+  allocate(b(comm_size-1), bhdl(comm_size-1))
+  nworkers = fstarpu_worker_get_count()
+  if (nworkers.lt.1) then
+    write(*,'(" ")')
+    write(*,'("This application is meant to run with at least one worker per node.")')
+    stop 2
+  end if
+
+  ! allocate and fill reduction codelets
+  task_red_cl = fstarpu_codelet_allocate()
+  call fstarpu_codelet_set_name(task_red_cl, namered)
+  call fstarpu_codelet_add_cpu_func(task_red_cl,C_FUNLOC(cl_cpu_task_red))
+  call fstarpu_codelet_add_buffer(task_red_cl, FSTARPU_RW.ior.FSTARPU_COMMUTE)
+  call fstarpu_codelet_add_buffer(task_red_cl, FSTARPU_R)
+
+  task_ini_cl = fstarpu_codelet_allocate()
+  call fstarpu_codelet_set_name(task_ini_cl, nameini)
+  call fstarpu_codelet_add_cpu_func(task_ini_cl,C_FUNLOC(cl_cpu_task_ini))
+  call fstarpu_codelet_add_buffer(task_ini_cl, FSTARPU_W)
+
+  work_coef=2
+
+  codelet_mode = FSTARPU_RW.ior.FSTARPU_COMMUTE
+  task_mode = FSTARPU_MPI_REDUX
+  ! allocate and fill codelet structs
+  work_cl = fstarpu_codelet_allocate()
+  call fstarpu_codelet_set_name(work_cl, name)
+  call fstarpu_codelet_add_cpu_func(work_cl, C_FUNLOC(cl_cpu_task))
+  call fstarpu_codelet_add_buffer(work_cl, codelet_mode)
+  call fstarpu_codelet_add_buffer(work_cl, FSTARPU_R)
+  err = fstarpu_mpi_barrier(comm_world)
+
+  do arity=2,comm_size
+
+    if(comm_w_rank.eq.0) then
+      write(*,'(" ")')
+      a = 1.0
+      write(*,*) "init a = ", a
+    else
+      b(comm_w_rank) = 1.0 / (comm_w_rank + 1.0)
+      write(*,*) "init b_",comm_w_rank,"=", b(comm_w_rank)
+    end if
+
+    err = fstarpu_mpi_barrier(comm_world)
+
+    tag = 0
+    if(comm_w_rank.eq.0) then
+      call fstarpu_variable_data_register(ahdl, 0, c_loc(a),c_sizeof(a))
+      do i=1,comm_size-1
+          call fstarpu_variable_data_register(bhdl(i), -1, c_null_ptr,c_sizeof(b(i)))
+      end do
+    else
+      call fstarpu_variable_data_register(ahdl, -1, c_null_ptr,c_sizeof(a))
+      do i=1,comm_size-1
+        if (i.eq.comm_w_rank) then
+          call fstarpu_variable_data_register(bhdl(i), 0, c_loc(b(i)),c_sizeof(b(i)))
+        else
+          call fstarpu_variable_data_register(bhdl(i), -1, c_null_ptr,c_sizeof(b(i)))
+        end if
+      end do
+    end if
+    call fstarpu_mpi_data_register(ahdl,  tag,  0)
+    do i=1,comm_size-1
+       call fstarpu_mpi_data_register(bhdl(i), tag+i,i)
+    end do
+
+    tag = tag + comm_size
+
+    call fstarpu_data_set_reduction_methods(ahdl,task_red_cl,task_ini_cl)
+
+    err = fstarpu_mpi_barrier(comm_world)
+
+
+    call fstarpu_fxt_start_profiling()
+    do w_node=1,comm_size-1
+      do i=1,work_coef*nworkers
+        call fstarpu_mpi_task_insert( (/ c_loc(comm_world),   &
+               work_cl,                                         &
+               task_mode, ahdl,                            &
+               FSTARPU_R, bhdl(w_node),                      &
+               FSTARPU_EXECUTE_ON_NODE, c_loc(w_node),          &
+               C_NULL_PTR /))
+      end do
+    end do
+    call fstarpu_mpi_redux_data_tree(comm_world, ahdl, arity)
+    err = fstarpu_mpi_wait_for_all(comm_world)
+
+    if(comm_w_rank.eq.0) then
+      tmp = 0
+      do w_node=1,comm_size-1
+        tmp = tmp + 1.0 / (w_node+1.0)
+      end do
+      write(*,*) 'computed result ---> ',a, "expected =",&
+        1.0 + (comm_size-1.0)*(comm_size)/2.0 + work_coef*nworkers*((comm_size-1.0)*3.0 + tmp)
+    end if
+    err = fstarpu_mpi_barrier(comm_world)
+    call fstarpu_data_unregister(ahdl)
+    do w_node=1,comm_size-1
+       call fstarpu_data_unregister(bhdl(w_node))
+    end do
+
+    call fstarpu_fxt_stop_profiling()
+  end do
+
+  call fstarpu_codelet_free(work_cl)
+  call fstarpu_codelet_free(task_red_cl)
+  call fstarpu_codelet_free(task_ini_cl)
+
+
+  err = fstarpu_mpi_shutdown()
+  call fstarpu_shutdown()
+  deallocate(b, bhdl)
+  stop 0
+
+contains
+
+  ! CPU implementation of the work codelet.
+  ! Adds 3.0 plus the value held in buffer 1 to the value held in buffer 0,
+  ! after a busy-wait of about one second, and prints the old and new values.
+  !   buffers : StarPU buffer descriptors; buffer 0 is read and written,
+  !             buffer 1 is only read
+  !   cl_args : unused
+  recursive subroutine cl_cpu_task (buffers, cl_args) bind(C)
+    use iso_c_binding       ! C interfacing module
+    use fstarpu_mod         ! StarPU interfacing module
+    implicit none
+
+    type(c_ptr), value, intent(in) :: buffers, cl_args ! cl_args is unused
+    ! NOTE: ret and i are declared but never referenced in this subroutine
+    integer(c_int) :: ret, worker_id
+    integer        :: comm_rank
+    integer, target :: i
+    real(kind(1.d0)), pointer :: a, b
+    real(kind(1.d0))          :: old_a
+
+    ! worker id and MPI rank are only used for the trace message below
+    worker_id = fstarpu_worker_get_id()
+    comm_rank  = fstarpu_mpi_world_rank()
+
+    ! map the two variable buffers onto Fortran pointers
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 0), a)
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 1), b)
+    ! busy-wait so that the task has a visible duration
+    call nf_sleep(1.d0)
+    old_a = a
+    a = old_a + 3.0 + b
+    write(*,*) "task   (c_w_rank:",comm_rank," worker_id:",worker_id,") from ",old_a,"to",a
+
+    return
+  end subroutine cl_cpu_task
+
+  ! CPU implementation of the reduction codelet.
+  ! Accumulates the value of buffer 1 ("as", source) into buffer 0
+  ! ("ad", destination), then busy-waits for about one second and prints
+  ! the source, the previous destination and the new destination values.
+  !   buffers : StarPU buffer descriptors (0: destination, 1: source)
+  !   cl_args : unused
+  recursive subroutine cl_cpu_task_red (buffers, cl_args) bind(C)
+    use iso_c_binding       ! C interfacing module
+    use fstarpu_mod         ! StarPU interfacing module
+    implicit none
+
+    type(c_ptr), value, intent(in) :: buffers, cl_args ! cl_args is unused
+    ! NOTE: ret is declared but never referenced in this subroutine
+    integer(c_int) :: ret, worker_id
+    integer, target                         :: comm_rank
+    real(kind(1.d0)), pointer :: as, ad
+    real(kind(1.d0))           :: old_ad
+    ! worker id and MPI rank are only used for the trace message below
+    worker_id = fstarpu_worker_get_id()
+    comm_rank  = fstarpu_mpi_world_rank()
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 0), ad)
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 1), as)
+    ! keep the previous destination value for the trace message
+    old_ad = ad
+    ad = ad + as
+    call nf_sleep(1.d0)
+    write(*,*) "red_cl (c_w_rank:",comm_rank,"worker_id:",worker_id,")",as, old_ad, ' ---> ',ad
+
+    return
+  end subroutine cl_cpu_task_red
+
+  ! CPU implementation of the initialization codelet.
+  ! After a busy-wait of about half a second, prints the previous content of
+  ! buffer 0 and overwrites it with the MPI world rank of the calling process.
+  !   buffers : StarPU buffer descriptors (0: variable to initialize)
+  !   cl_args : unused
+  recursive subroutine cl_cpu_task_ini (buffers, cl_args) bind(C)
+    use iso_c_binding       ! C interfacing module
+    use fstarpu_mod         ! StarPU interfacing module
+    implicit none
+
+    type(c_ptr), value, intent(in) :: buffers, cl_args
+        ! cl_args is unused
+    ! NOTE: ret is declared but never referenced in this subroutine
+    integer(c_int) :: ret, worker_id
+    integer, target                         :: comm_rank
+    real(kind(1.d0)), pointer :: a
+    worker_id = fstarpu_worker_get_id()
+    comm_rank  = fstarpu_mpi_world_rank()
+    call c_f_pointer(fstarpu_variable_get_ptr(buffers, 0), a)
+    call nf_sleep(0.5d0)
+    ! As this codelet is run by each worker in the REDUX mode case
+    ! this initialization makes salient the number of copies spawned
+    ! (the message is printed before the assignment so that "was" shows
+    ! the content found in the buffer on entry)
+    write(*,*) "ini_cl (c_w_rank:",comm_rank,"worker_id:",worker_id,") set to", comm_rank, "(was",a,")"
+    a = comm_rank
+    return
+  end subroutine cl_cpu_task_ini
+
+  ! Busy-wait helper: spins on system_clock until strictly more than
+  ! t seconds have elapsed since entry.
+  !   t : duration to wait, in seconds (read only; declared without intent)
+  subroutine nf_sleep(t)
+    implicit none
+    integer :: t_start, t_end, t_rate
+    real(kind(1.d0))     :: ta, t
+    ! reference tick count taken on entry
+    call system_clock(t_start)
+    do
+       call system_clock(t_end, t_rate)
+       ! elapsed time in seconds = tick difference / ticks per second
+       ta = real(t_end-t_start)/real(t_rate)
+       if(ta.gt.t) return
+    end do
+  end subroutine nf_sleep
+
+end program

+ 1 - 1
mpi/examples/native_fortran/nf_redux_test.f90

@@ -71,7 +71,7 @@ program main
   task_red_cl = fstarpu_codelet_allocate()
   call fstarpu_codelet_set_name(task_red_cl, namered)
   call fstarpu_codelet_add_cpu_func(task_red_cl,C_FUNLOC(cl_cpu_task_red))
-  call fstarpu_codelet_add_buffer(task_red_cl, FSTARPU_RW)
+  call fstarpu_codelet_add_buffer(task_red_cl, FSTARPU_RW.ior.FSTARPU_COMMUTE)
   call fstarpu_codelet_add_buffer(task_red_cl, FSTARPU_R)
 
   task_ini_cl = fstarpu_codelet_allocate()

+ 97 - 0
mpi/examples/user_datatype/user_datatype_interface.c

@@ -0,0 +1,97 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020-2021 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "my_interface.h"
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+/*
+ * Exchange three handles of the user-defined "my_data" type between MPI
+ * rank 0 (sender) and MPI rank 1 (receiver).
+ *
+ * Rank 0 sends my_handle1, my_handle2 and my_handle3 with tags 10, 12 and 14.
+ * Rank 1 posts the receptions in the reverse order (a non-blocking receive
+ * for tag 14, then blocking receives for tags 12 and 10) and displays
+ * my_handle3 before and after the reception.
+ *
+ * Returns 0 on success, 77 when run with fewer than 2 MPI processes or
+ * without any CPU worker. Any failing StarPU call is reported through
+ * STARPU_CHECK_RETURN_VALUE.
+ */
+int main(int argc, char **argv)
+{
+	int rank, nodes;
+	int ret=0;
+	struct starpu_my_data my_data;
+	struct starpu_my_data my_data2 = {.d = 77, .c = 'x'};
+	starpu_data_handle_t my_handle1;
+	starpu_data_handle_t my_handle2;
+	starpu_data_handle_t my_handle3;
+
+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &nodes);
+
+	if (nodes < 2 || (starpu_cpu_worker_get_count() == 0))
+	{
+		if (rank == 0)
+		{
+			if (nodes < 2)
+				fprintf(stderr, "We need at least 2 processes.\n");
+			else
+				fprintf(stderr, "We need at least 1 CPU.\n");
+		}
+		starpu_mpi_shutdown();
+		return 77;
+	}
+
+	/* The content of my_data differs between the sender and the other
+	 * ranks, so that the display on rank 1 shows whether it was updated */
+	if (rank == 0)
+	{
+		my_data.d = 42;
+		my_data.c = 'n';
+	}
+	else
+	{
+		my_data.d = 0;
+		my_data.c = 'z';
+	}
+
+	/* my_handle1 and my_handle2 are both registered on my_data2 */
+	starpu_my_data_register(&my_handle1, STARPU_MAIN_RAM, &my_data2);
+	starpu_my_data_register(&my_handle2, STARPU_MAIN_RAM, &my_data2);
+	starpu_my_data_register(&my_handle3, STARPU_MAIN_RAM, &my_data);
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	if (rank == 0)
+	{
+		ret = starpu_mpi_send(my_handle1, 1, 10, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+		ret = starpu_mpi_send(my_handle2, 1, 12, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+		ret = starpu_mpi_send(my_handle3, 1, 14, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_send");
+	}
+	else if (rank == 1)
+	{
+		starpu_mpi_req req;
+
+		ret = starpu_task_insert(&starpu_my_data_display_codelet, STARPU_VALUE, "node1 initial value", strlen("node1 initial value")+1, STARPU_R, my_handle3, 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+
+		/* Receptions are posted in the reverse order of the sends */
+		ret = starpu_mpi_irecv(my_handle3, &req, 0, 14, MPI_COMM_WORLD);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_irecv");
+		ret = starpu_mpi_recv(my_handle2, 0, 12, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
+		ret = starpu_mpi_recv(my_handle1, 0, 10, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
+		ret = starpu_mpi_wait(&req, NULL);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_wait");
+
+		/* NOTE(review): "rceived" looks like a typo for "received"; the
+		 * label is left untouched because the size of the buffer used by
+		 * the display codelet is not visible from this file */
+		ret = starpu_task_insert(&starpu_my_data_display_codelet, STARPU_VALUE, "node1 rceived value", strlen("node1 rceived value")+1, STARPU_R, my_handle3, 0);
+		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+	}
+
+	starpu_task_wait_for_all();
+	starpu_mpi_wait_for_all(MPI_COMM_WORLD);
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	starpu_data_unregister(my_handle1);
+	starpu_data_unregister(my_handle2);
+	starpu_data_unregister(my_handle3);
+
+	starpu_mpi_shutdown();
+
+	return 0;
+}

+ 19 - 0
mpi/include/fstarpu_mpi_mod.f90

@@ -304,6 +304,25 @@ module fstarpu_mpi_mod
                         integer(c_int), value, intent(in) :: prio
                 end subroutine fstarpu_mpi_redux_data_prio
 
+                ! void starpu_mpi_redux_data_tree(MPI_Comm comm, starpu_data_handle_t data_handle, int arity);
+                !   mpi_comm : MPI communicator, passed by value as a C int
+                !   dh       : handle of the data to reduce
+                !   arity    : arity of the reduction tree
+                subroutine fstarpu_mpi_redux_data_tree(mpi_comm,dh, arity) bind(C)
+                        use iso_c_binding
+                        implicit none
+                        integer(c_int), value, intent(in) :: mpi_comm
+                        type(c_ptr), value, intent(in) :: dh
+                        integer(c_int), value, intent(in) :: arity
+                end subroutine fstarpu_mpi_redux_data_tree
+
+                ! void starpu_mpi_redux_data_prio_tree(MPI_Comm comm, starpu_data_handle_t data_handle, int prio, int arity);
+                !   mpi_comm : MPI communicator, passed by value as a C int
+                !   dh       : handle of the data to reduce
+                !   prio     : priority given to the reduction
+                !   arity    : arity of the reduction tree
+                subroutine fstarpu_mpi_redux_data_prio_tree(mpi_comm,dh, prio, arity) bind(C)
+                        use iso_c_binding
+                        implicit none
+                        integer(c_int), value, intent(in) :: mpi_comm
+                        type(c_ptr), value, intent(in) :: dh
+                        integer(c_int), value, intent(in) :: prio
+                        integer(c_int), value, intent(in) :: arity
+                end subroutine fstarpu_mpi_redux_data_prio_tree
+
                 ! int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg);
                 function fstarpu_mpi_scatter_detached (dhs, cnt, root, mpi_comm, scallback, sarg, rcallback, rarg) bind(C)
                         use iso_c_binding

+ 12 - 0
mpi/include/starpu_mpi.h

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2021       Federal University of Rio Grande do Sul (UFRGS)
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -796,6 +797,17 @@ void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle);
 void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle, int prio);
 
 /**
+   Perform a reduction on the given data \p data_handle.
+   Nodes perform the reduction in a tree-based fashion.
+   The tree used is an \p arity -ary tree.
+*/
+void starpu_mpi_redux_data_tree(MPI_Comm comm, starpu_data_handle_t data_handle, int arity);
+
+/**
+   Similar to starpu_mpi_redux_data_tree(), but takes a priority \p prio.
+*/
+void starpu_mpi_redux_data_prio_tree(MPI_Comm comm, starpu_data_handle_t data_handle, int prio, int arity);
+/**
    Scatter data among processes of the communicator based on the
    ownership of the data. For each data of the array \p data_handles,
    the process \p root sends the data to the process owning this data.

+ 2 - 0
mpi/src/mpi/starpu_mpi_mpi.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2009-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  * Copyright (C) 2017       Guillaume Beauchamp
+ * Copyright (C) 2021       Federal University of Rio Grande do Sul (UFRGS)
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -1291,6 +1292,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		/* We need to record our ID in the trace before the main thread makes any MPI call */
 		_STARPU_MPI_TRACE_START(argc_argv->rank, argc_argv->world_size);
 		starpu_profiling_set_id(argc_argv->rank);
+		_starpu_profiling_set_mpi_worldsize(argc_argv->world_size);
 		_starpu_mpi_add_sync_point_in_fxt();
 	}
 #endif //STARPU_USE_FXT

+ 1 - 1
mpi/src/mpi/starpu_mpi_mpi.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 5 - 4
mpi/src/nmad/starpu_mpi_nmad.c

@@ -429,7 +429,7 @@ void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req* req)
 	_STARPU_MPI_LOG_OUT();
 }
 
-void _starpu_mpi_handle_request_termination_callback(nm_sr_event_t event, const nm_sr_event_info_t* event_info, void* ref)
+void _starpu_mpi_handle_request_termination_callback(nm_sr_event_t event STARPU_ATTRIBUTE_UNUSED, const nm_sr_event_info_t* event_info STARPU_ATTRIBUTE_UNUSED, void* ref)
 {
 	_starpu_mpi_handle_request_termination(ref);
 }
@@ -447,7 +447,7 @@ void _starpu_mpi_coop_sends_build_tree(struct _starpu_mpi_coop_sends *coop_sends
 }
 #endif
 
-void _starpu_mpi_submit_coop_sends(struct _starpu_mpi_coop_sends *coop_sends, int submit_control, int submit_data)
+void _starpu_mpi_submit_coop_sends(struct _starpu_mpi_coop_sends *coop_sends, int submit_control STARPU_ATTRIBUTE_UNUSED, int submit_data)
 {
 	unsigned i, n = coop_sends->n;
 
@@ -530,6 +530,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		/* We need to record our ID in the trace before the main thread makes any MPI call */
 		_STARPU_MPI_TRACE_START(argc_argv->rank, argc_argv->world_size);
 		starpu_profiling_set_id(argc_argv->rank);
+		_starpu_profiling_set_mpi_worldsize(argc_argv->world_size);
 		_starpu_mpi_add_sync_point_in_fxt();
 	}
 #endif //STARPU_USE_FXT
@@ -717,12 +718,12 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 
 	if(polling_point_prog)
 	{
-		starpu_progression_hook_register((unsigned (*)(void *))&piom_ltask_schedule, (void *)&polling_point_prog);
+		starpu_progression_hook_register((void *)&piom_ltask_schedule, (void *)&polling_point_prog);
 	}
 
 	if(polling_point_idle)
 	{
-		starpu_idle_hook_register((unsigned (*)(void *))&piom_ltask_schedule, (void *)&polling_point_idle);
+		starpu_idle_hook_register((void *)&piom_ltask_schedule, (void *)&polling_point_idle);
 	}
 
 	/* Launch thread used for nmad callbacks */

+ 1 - 1
mpi/src/nmad/starpu_mpi_nmad.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
mpi/src/nmad/starpu_mpi_nmad_backend.c

@@ -55,7 +55,7 @@ void _starpu_mpi_nmad_backend_request_init(struct _starpu_mpi_req *req)
 	req->backend->data_request = NM_SR_REQUEST_NULL;
 }
 
-void _starpu_mpi_nmad_backend_request_fill(struct _starpu_mpi_req *req, MPI_Comm comm, int is_internal_req)
+void _starpu_mpi_nmad_backend_request_fill(struct _starpu_mpi_req *req, MPI_Comm comm, int is_internal_req STARPU_ATTRIBUTE_UNUSED)
 {
 	/* this function gives session and gate: */
 	nm_mpi_nmad_dest(&req->backend->session, &req->backend->gate, comm, req->node_tag.node.rank);

+ 1 - 1
mpi/src/nmad/starpu_mpi_nmad_unknown_datatype.c

@@ -88,7 +88,7 @@ void _starpu_mpi_isend_unknown_datatype(struct _starpu_mpi_req *req)
  * Receive
  **********************************************/
 
-static void _starpu_mpi_unknown_datatype_recv_callback(nm_sr_event_t event, const nm_sr_event_info_t* p_info, void* ref)
+static void _starpu_mpi_unknown_datatype_recv_callback(nm_sr_event_t event, const nm_sr_event_info_t* p_info STARPU_ATTRIBUTE_UNUSED, void* ref)
 {
 	STARPU_ASSERT_MSG(!((event & NM_SR_EVENT_FINALIZED) && (event & NM_SR_EVENT_RECV_DATA)), "Both events can't be triggered at the same time !");
 

+ 3 - 4
mpi/src/starpu_mpi.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
- * Copyright (C) 2019       Federal University of Rio Grande do Sul (UFRGS)
+ * Copyright (C) 2019,2021  Federal University of Rio Grande do Sul (UFRGS)
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -393,10 +393,9 @@ void _starpu_mpi_data_clear(starpu_data_handle_t data_handle)
 	_mpi_backend._starpu_mpi_backend_data_clear(data_handle);
 	_starpu_mpi_cache_data_clear(data_handle);
 	_starpu_spin_destroy(&data->coop_lock);
-	if (data->redux_map != REDUX_CONTRIB)
-		free(data->redux_map);
+	free(data->redux_map);
+	data->redux_map = NULL;
 	free(data);
-	data_handle->mpi_data = NULL;
 }
 
 struct _starpu_mpi_data *_starpu_mpi_data_get(starpu_data_handle_t data_handle)

+ 1 - 0
mpi/src/starpu_mpi_coop_sends.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2021       Federal University of Rio Grande do Sul (UFRGS)
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 11 - 0
mpi/src/starpu_mpi_fortran.c

@@ -119,11 +119,22 @@ void fstarpu_mpi_redux_data(MPI_Fint comm, starpu_data_handle_t data_handle)
 {
 	starpu_mpi_redux_data(MPI_Comm_f2c(comm), data_handle);
 }
+
 void fstarpu_mpi_redux_data_prio(MPI_Fint comm, starpu_data_handle_t data_handle, int prio)
 {
 	starpu_mpi_redux_data_prio(MPI_Comm_f2c(comm), data_handle, prio);
 }
 
+/* Fortran entry point for starpu_mpi_redux_data_tree(): converts the Fortran
+ * communicator handle with MPI_Comm_f2c() and forwards the other arguments
+ * unchanged. */
+void fstarpu_mpi_redux_data_tree(MPI_Fint comm, starpu_data_handle_t data_handle, int arity)
+{
+	starpu_mpi_redux_data_tree(MPI_Comm_f2c(comm), data_handle, arity);
+}
+
+/* Fortran entry point for starpu_mpi_redux_data_prio_tree(): converts the
+ * Fortran communicator handle with MPI_Comm_f2c() and forwards the other
+ * arguments unchanged. */
+void fstarpu_mpi_redux_data_prio_tree(MPI_Fint comm, starpu_data_handle_t data_handle, int prio, int arity)
+{
+	starpu_mpi_redux_data_prio_tree(MPI_Comm_f2c(comm), data_handle, prio, arity);
+}
+
 /* scatter/gather */
 int fstarpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int cnt, int root, MPI_Fint comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
 {

+ 1 - 0
mpi/src/starpu_mpi_private.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020       Federal University of Rio Grande do Sul (UFRGS)
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 0 - 6
mpi/src/starpu_mpi_private.h

@@ -203,12 +203,6 @@ struct _starpu_mpi_coop_sends
 	long pre_sync_jobid;
 };
 
-/** cf. redux_map field : this is the value
- * put in this field whenever a node contributes
- * to the reduction of the data.
- * Only the owning node keeps track of all the contributing nodes. */
-#define REDUX_CONTRIB ((char*) -1)
-
 /** Initialized in starpu_mpi_data_register_comm */
 struct _starpu_mpi_data
 {

+ 151 - 168
mpi/src/starpu_mpi_task_insert.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2021       Federal University of Rio Grande do Sul (UFRGS)
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -628,16 +629,13 @@ int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struc
 		if ((descrs[i].mode & STARPU_REDUX || descrs[i].mode & STARPU_MPI_REDUX) && descrs[i].handle)
 		{
 			struct _starpu_mpi_data *mpi_data = (struct _starpu_mpi_data *) descrs[i].handle->mpi_data;
-			if (me == starpu_mpi_data_get_rank(descrs[i].handle))
-			{
-				int size;
-				starpu_mpi_comm_size(comm, &size);
-				if (mpi_data->redux_map == NULL)
-					_STARPU_CALLOC(mpi_data->redux_map, size, sizeof(mpi_data->redux_map[0]));
-				mpi_data->redux_map [xrank] = 1;
-			}
-			else if (me == xrank)
-				mpi_data->redux_map = REDUX_CONTRIB;
+			int rrank = starpu_mpi_data_get_rank(descrs[i].handle);
+			int size;
+			starpu_mpi_comm_size(comm, &size);
+			if (mpi_data->redux_map == NULL)
+				_STARPU_CALLOC(mpi_data->redux_map, size, sizeof(mpi_data->redux_map[0]));
+			mpi_data->redux_map [xrank] = 1;
+			mpi_data->redux_map [rrank] = 1;
 		}
 		_starpu_mpi_exchange_data_after_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm);
 		_starpu_mpi_clear_data_after_execution(descrs[i].handle, descrs[i].mode, me, do_execute);
@@ -751,6 +749,13 @@ int starpu_mpi_task_post_build(MPI_Comm comm, struct starpu_codelet *codelet, ..
 	return _starpu_mpi_task_postbuild_v(comm, xrank, do_execute, descrs, nb_data, prio);
 }
 
+/* Codelet without any implementation (STARPU_NOWHERE) declaring two buffers,
+ * the first one accessed in R mode and the second one in W mode.
+ * The reduction code inserts it with the reduced handle as first buffer and
+ * the temporary receive handle as second buffer, before posting the
+ * reception into that temporary handle. */
+struct starpu_codelet _starpu_mpi_redux_data_synchro_cl =
+{
+	.where = STARPU_NOWHERE,
+	.modes = {STARPU_R, STARPU_W},
+	.nbuffers = 2
+};
+
 struct _starpu_mpi_redux_data_args
 {
 	starpu_data_handle_t data_handle;
@@ -762,86 +767,14 @@ struct _starpu_mpi_redux_data_args
 	long taskC_jobid;
 };
 
-void _starpu_mpi_redux_data_dummy_func(void *buffers[], void *cl_arg)
-{
-	(void)buffers;
-	(void)cl_arg;
-}
-
-/* Dummy cost function for simgrid */
-static double cost_function(struct starpu_task *task, unsigned nimpl)
-{
-	(void)task;
-	(void)nimpl;
-	return 0.000001;
-}
-static struct starpu_perfmodel dumb_model =
-{
-	.type		= STARPU_COMMON,
-	.cost_function	= cost_function
-};
-
-/* FIXME: we can probably use STARPU_NOWHERE for these? */
-static
-struct starpu_codelet _starpu_mpi_redux_data_read_cl =
-{
-	.cpu_funcs = {_starpu_mpi_redux_data_dummy_func},
-	.cuda_funcs = {_starpu_mpi_redux_data_dummy_func},
-	.opencl_funcs = {_starpu_mpi_redux_data_dummy_func},
-	.nbuffers = 1,
-	.modes = {STARPU_R},
-	.model = &dumb_model,
-	.name = "_starpu_mpi_redux_data_read_cl"
-};
-
-struct starpu_codelet _starpu_mpi_redux_data_readwrite_cl =
-{
-	.cpu_funcs = {_starpu_mpi_redux_data_dummy_func},
-	.cuda_funcs = {_starpu_mpi_redux_data_dummy_func},
-	.opencl_funcs = {_starpu_mpi_redux_data_dummy_func},
-	.nbuffers = 1,
-	.modes = {STARPU_RW},
-	.model = &dumb_model,
-	.name = "_starpu_mpi_redux_data_write_cl"
-};
-
-static
-void _starpu_mpi_redux_data_detached_callback(void *arg)
-{
-	struct _starpu_mpi_redux_data_args *args = (struct _starpu_mpi_redux_data_args *) arg;
-
-	STARPU_TASK_SET_HANDLE(args->taskB, args->new_handle, 1);
-	int ret = starpu_task_submit(args->taskB);
-	STARPU_ASSERT(ret == 0);
-
-	starpu_data_unregister_submit(args->new_handle);
-	free(args);
-}
-
-static
-void _starpu_mpi_redux_data_recv_callback(void *callback_arg)
-{
-	struct _starpu_mpi_redux_data_args *args = (struct _starpu_mpi_redux_data_args *) callback_arg;
-	starpu_data_register_same(&args->new_handle, args->data_handle);
-
-	starpu_mpi_irecv_detached_sequential_consistency(args->new_handle, args->node, args->data_tag, args->comm, _starpu_mpi_redux_data_detached_callback, args, 0);
-}
-
-
 void _starpu_mpi_redux_fill_post_sync_jobid(const void * const redux_data_args, long * const post_sync_jobid)
 {
 	*post_sync_jobid = ((const struct _starpu_mpi_redux_data_args *) redux_data_args)->taskC_jobid;
 }
 
-
 /* TODO: this should rather be implicitly called by starpu_mpi_task_insert when
- * a data previously accessed in REDUX mode gets accessed in R mode. */
-/* FIXME: In order to prevent simultaneous receive submissions
- * on the same handle, we need to wait that all the starpu_mpi
- * tasks are done before submitting next tasks. The current
- * version of the implementation does not support multiple
- * simultaneous receive requests on the same handle.*/
-void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle, int prio)
+ * a data previously accessed in (MPI_)REDUX mode gets accessed in R mode. */
+void starpu_mpi_redux_data_prio_tree(MPI_Comm comm, starpu_data_handle_t data_handle, int prio, int arity)
 {
 	int me, rank, nb_nodes;
 	starpu_mpi_tag_t data_tag;
@@ -857,106 +790,156 @@ void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 	{
 		_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
 	}
-
+	if (mpi_data->redux_map == NULL)
+	{
+		_STARPU_MPI_DEBUG(5, "We do not contribute to the data being reduced.\n");
+		return;
+	}
 	starpu_mpi_comm_rank(comm, &me);
 	starpu_mpi_comm_size(comm, &nb_nodes);
 
-	_STARPU_MPI_DEBUG(50, "Doing reduction for data %p on node %d with %d nodes ...\n", data_handle, rank, nb_nodes);
-	// need to count how many nodes have the data in redux mode
-	if (me == rank)
+	int current_level, nb_contrib, next_nb_contrib;
+	int i, j, step, node;
+	char root_in_step, me_in_step;
+	// https://stackoverflow.com/questions/109023/how-to-count-the-number-of-set-bits-in-a-32-bit-integer
+	// https://stackoverflow.com/a/109025
+	// see hamming weight
+	//nb_contrib = std::popcount(mpi_data->redux_map); // most preferable
+	nb_contrib=0;
+	for (i=0;i<nb_nodes;i++)
+	{
+		_STARPU_MPI_DEBUG(5, "mpi_data->redux_map[%d] = %d\n", i, mpi_data->redux_map[i]);
+		if (mpi_data->redux_map[i]) nb_contrib++;
+	}
+	if (nb_contrib == 0)
+	{
+		/* Nothing to do! */
+		return;
+	}
+	if (arity < 2)
+	{
+		arity = nb_contrib;
+	}
+	_STARPU_MPI_DEBUG(5, "There is %d contributors\n", nb_contrib);
+	int contributors[nb_contrib];
+	int reducing_node;
+	j=0;
+	for (i=0;i<nb_nodes;i++)
 	{
-		int i,j;
-		_STARPU_MPI_DEBUG(50, "Who is in the map ?\n");
-		for (j = 0; j<nb_nodes; j++)
+		_STARPU_MPI_DEBUG(5, "%d in reduction ? %d\n", i, mpi_data->redux_map[i]);
+		if (mpi_data->redux_map[i])
 		{
-			_STARPU_MPI_DEBUG(50, "%d is in the map ? %d\n", j, mpi_data->redux_map[j]);
+			contributors[j++] = i;
 		}
+	}
+	for (i=0;i<nb_contrib;i++)
+	{
+		_STARPU_MPI_DEBUG(5, "%dth contributor = %d\n", i, contributors[i]);
+	}
 
-		// taskC depends on all taskBs created
-		// Creating synchronization task and use its jobid for tracing
-		struct starpu_task *taskC = starpu_task_create();
-		const long taskC_jobid = starpu_task_get_job_id(taskC);
-		taskC->cl = &_starpu_mpi_redux_data_readwrite_cl;
-		STARPU_TASK_SET_HANDLE(taskC, data_handle, 0);
-
-		for(i=0 ; i<nb_nodes ; i++)
-		{
-			if (i != rank && mpi_data->redux_map[i])
+	_STARPU_MPI_DEBUG(15, "mpi_redux _ STARTING with %d-ary tree \n", arity);
+	current_level = 0;
+	while (nb_contrib != 1)
+	{
+		_STARPU_MPI_DEBUG(5, "%dth level in the reduction \n", current_level);
+		if (nb_contrib%arity == 0) next_nb_contrib = nb_contrib/arity;
+		else next_nb_contrib = nb_contrib/arity + 1;
+		for (step = 0; step < next_nb_contrib; step++)
+		{
+			root_in_step = 0;
+			me_in_step = 0;
+			for (node = step*arity ; node < nb_contrib && node < (step+1)*arity ; node++)
+			{
+				if (contributors[node] == rank) root_in_step = 1;
+				if (contributors[node] == me) me_in_step = 1;
+			}
+			/* FIXME: if the root node is not in the step, then we agree the node
+			 * with the lowest id reduces the step : we could agree on another
+			 * node to better load balance in the case of multiple reductions involving
+			 * the same sets of nodes
+			 * FIX: We chose to use the tag%arity-th contributor in the step
+			 */
+			if (root_in_step)
+			{
+				reducing_node = rank;
+			}
+			else if (step*arity + data_tag%arity < nb_contrib)
 			{
-				_STARPU_MPI_DEBUG(5, "%d takes part in the reduction of %p \n", i, data_handle);
-				/* We need to make sure all is
-				 * executed after data_handle finished
-				 * its last read access, we hence do
-				 * the following:
-				 * - submit an empty task A reading
-				 * data_handle whose callback submits
-				 * the mpi comm with sequential
-				 * consistency set to 0, whose
-				 * callback submits the redux_cl task
-				 * B with sequential consistency set
-				 * to 0,
-				 * - submit an empty task C reading
-				 * and writing data_handle and
-				 * depending on task B, just to replug
-				 * with implicit data dependencies
-				 * with tasks inserted after this
-				 * reduction.
-				 */
-
-				struct _starpu_mpi_redux_data_args *args;
-				_STARPU_MPI_MALLOC(args, sizeof(struct _starpu_mpi_redux_data_args));
-				args->data_handle = data_handle;
-				args->data_tag = data_tag;
-				args->node = i;
-				args->comm = comm;
-
-				args->taskC_jobid = taskC_jobid;
-
-				// We need to create taskB early as
-				// taskC declares a dependancy on it
-				args->taskB = starpu_task_create();
-				args->taskB->cl = args->data_handle->redux_cl;
-				args->taskB->sequential_consistency = 0;
-				STARPU_TASK_SET_HANDLE(args->taskB, args->data_handle, 0);
-
-				starpu_task_declare_deps_array(taskC, 1, &args->taskB);
-
-				// Submit taskA
-				starpu_task_insert(&_starpu_mpi_redux_data_read_cl,
-						   STARPU_R, data_handle,
-						   STARPU_CALLBACK_WITH_ARG_NFREE, _starpu_mpi_redux_data_recv_callback, args,
-						   0);
+				reducing_node = contributors[step*arity + data_tag%arity];
 			}
 			else
 			{
-				_STARPU_MPI_DEBUG(5, "%d is not in the map or is me\n", i);
+				reducing_node = contributors[step*arity];
 			}
-		}
 
-		int ret = starpu_task_submit(taskC);
-		STARPU_ASSERT(ret == 0);
-	}
-	else if (mpi_data->redux_map)
-	{
-		STARPU_ASSERT(mpi_data->redux_map == REDUX_CONTRIB);
-		_STARPU_MPI_DEBUG(5, "Sending redux handle to %d ...\n", rank);
-		starpu_mpi_isend_detached_prio(data_handle, rank, data_tag, prio, comm, NULL, NULL);
-		starpu_data_invalidate_submit(data_handle);
-	}
-	else
-	{
-		_STARPU_MPI_DEBUG(5, "I am not in the map of %d, I am %d ...\n", rank, me);
-	}
-	if (mpi_data->redux_map != NULL)
-	{
-		_STARPU_MPI_DEBUG(100, "waiting for redux tasks with %d\n", rank);
-		starpu_task_wait_for_all();
+			if (me == reducing_node)
+			{
+				_STARPU_MPI_DEBUG(5, "mpi_redux _ %dth level, %dth step ; chose %d node\n", current_level, step, reducing_node);
+				for (node = step*arity ; node < nb_contrib && node < (step+1)*arity ; node++)
+				{
+					if (me != contributors[node])
+					{
+						_STARPU_MPI_DEBUG(5, "%d takes part in the reduction of %p towards %d (%dth level ; %dth step) \n",
+								  contributors[node], data_handle, reducing_node, current_level, step);
+						/* We need to make sure all is
+						 * executed after data_handle finished
+						 * its last read access, we hence do
+						 * the following:
+						 * - submit an empty task A reading
+						 * data_handle
+						 * - submit the reducing task B
+						 * reading and writing data_handle and
+						 * depending on task A through sequential
+						 * consistency
+						 */
+						starpu_data_handle_t new_handle;
+						starpu_data_register_same(&new_handle, data_handle);
+						/* Task A */
+				       	        starpu_task_insert(&_starpu_mpi_redux_data_synchro_cl,
+									STARPU_R, data_handle,
+									STARPU_W, new_handle, 0);
+				       	        starpu_mpi_irecv_detached_prio(new_handle, contributors[node], data_tag, prio, comm, NULL, NULL);
+					        /* Task B */
+				       		starpu_task_insert(data_handle->redux_cl, STARPU_RW|STARPU_COMMUTE, data_handle, STARPU_R, new_handle, 0);
+						starpu_data_unregister_submit(new_handle);
+					}
+				}
+			}
+			else if (me_in_step)
+			{
+				_STARPU_MPI_DEBUG(5, "Sending redux handle to %d ...\n", reducing_node);
+				starpu_mpi_isend_detached_prio(data_handle, reducing_node, data_tag, prio, comm, NULL, NULL);
+				starpu_data_invalidate_submit(data_handle);
+			}
+			contributors[step] = reducing_node;
+		}
+		nb_contrib = next_nb_contrib;
+		current_level++;
 	}
-	if (me == rank)
-		free(mpi_data->redux_map);
-	mpi_data->redux_map = NULL;
 }
+
 void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
 {
 	return starpu_mpi_redux_data_prio(comm, data_handle, 0);
 }
+
+/* Reduce data_handle with an arity-ary tree, using priority 0.
+ * Same as starpu_mpi_redux_data_prio_tree() with prio set to 0. */
+void starpu_mpi_redux_data_tree(MPI_Comm comm, starpu_data_handle_t data_handle, int arity)
+{
+	/* No "return <expr>;" here: the function returns void */
+	starpu_mpi_redux_data_prio_tree(comm, data_handle, 0, arity);
+}
+
+/* Reduce data_handle with priority prio, using a tree whose arity is the
+ * number of nodes flagged in the redux_map of the handle, i.e. all the
+ * contributions are gathered in a single level.
+ * Does nothing when no redux_map was allocated for the handle on this node. */
+void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle, int prio)
+{
+	int nb_nodes, nb_contrib, i;
+	struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
+
+	/* redux_map is only allocated when a task accessing the handle in
+	 * (MPI_)REDUX mode is inserted: it must not be indexed before checking
+	 * it, exactly as starpu_mpi_redux_data_prio_tree() does */
+	if (mpi_data->redux_map == NULL)
+	{
+		_STARPU_MPI_DEBUG(5, "We do not contribute to the data being reduced.\n");
+		return;
+	}
+
+	starpu_mpi_comm_size(comm, &nb_nodes);
+	/* Count the nodes taking part in the reduction */
+	nb_contrib=0;
+	for (i=0;i<nb_nodes;i++)
+	{
+		if (mpi_data->redux_map[i])
+		{
+			nb_contrib++;
+		}
+	}
+	/* No "return <expr>;" here: the function returns void */
+	starpu_mpi_redux_data_prio_tree(comm, data_handle, prio, nb_contrib);
+}

+ 1 - 1
mpi/tests/insert_task_tags.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
mpi/tests/nothing.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2020-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 2 - 0
src/Makefile.am

@@ -102,6 +102,7 @@ noinst_HEADERS = 						\
 	core/detect_combined_workers.h				\
 	sched_policies/helper_mct.h				\
 	sched_policies/fifo_queues.h				\
+	sched_policies/heteroprio.h				\
 	datawizard/node_ops.h					\
 	datawizard/footprint.h					\
 	datawizard/datawizard.h					\
@@ -249,6 +250,7 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 		\
 	datawizard/interfaces/block_interface.c			\
 	datawizard/interfaces/tensor_interface.c		\
 	datawizard/interfaces/vector_interface.c		\
+	datawizard/interfaces/tensor_filters.c		    \
 	datawizard/interfaces/bcsr_filters.c			\
 	datawizard/interfaces/csr_filters.c			\
 	datawizard/interfaces/vector_filters.c			\

+ 41 - 4
src/common/fxt.c

@@ -20,6 +20,7 @@
 #include <core/simgrid.h>
 #include <starpu_util.h>
 #include <starpu_profiling.h>
+#include <core/workers.h>
 
 /* we need to identify each task to generate the DAG. */
 unsigned long _starpu_job_cnt = 0;
@@ -52,6 +53,11 @@ static int _starpu_written = 0;
 
 static int _starpu_id;
 
+/* If we use several MPI processes, we can't use STARPU_GENERATE_TRACE=1,
+ * because each MPI process will handle its own trace file, so store the world
+ * size to warn the user if needed and avoid processing partial traces. */
+static int _starpu_mpi_worldsize = 1;
+
 static unsigned int initial_key_mask = FUT_KEYMASKALL;
 
 #ifdef STARPU_SIMGRID
@@ -127,6 +133,28 @@ void starpu_profiling_set_id(int new_id)
 #endif
 }
 
+void _starpu_profiling_set_mpi_worldsize(int worldsize)
+{
+	STARPU_ASSERT(worldsize >= 1);
+	_starpu_mpi_worldsize = worldsize;
+
+	int generate_trace = starpu_get_env_number("STARPU_GENERATE_TRACE");
+	if (generate_trace == 1 && _starpu_mpi_worldsize > 1)
+	{
+		/** TODO: make it work !
+		 * The problem is that when STARPU_GENERATE_TRACE is used, each MPI
+		 * process will generate the trace corresponding to its own execution
+		 * (which makes no sense in MPI execution with several processes).
+		 * Although letting only one StarPU process generate the trace by
+		 * using the trace files of all MPI processes is not the most
+		 * complicated thing to do, one case is not easy to deal with: what to
+		 * do when each process stored its trace file in the local memory of
+		 * the node (e.g. /tmp/) ?
+		 */
+		_STARPU_MSG("You can't use STARPU_GENERATE_TRACE=1 with several MPI processes. Use starpu_fxt_tool after application execution.\n");
+	}
+}
+
 void starpu_fxt_autostart_profiling(int autostart)
 {
 	if (autostart)
@@ -182,6 +210,7 @@ void _starpu_fxt_init_profiling(uint64_t trace_buffer_size)
 	_starpu_written = 0;
 	_starpu_profile_set_tracefile();
 
+
 #ifdef HAVE_FUT_SET_FILENAME
 	fut_set_filename(_starpu_prof_file_user);
 #endif
@@ -212,7 +241,7 @@ void _starpu_fxt_init_profiling(uint64_t trace_buffer_size)
 	return;
 }
 
-static void _starpu_generate_paje_trace_read_option(const char *option, struct starpu_fxt_options *options)
+int _starpu_generate_paje_trace_read_option(const char *option, struct starpu_fxt_options *options)
 {
 	if (strcmp(option, "-c") == 0)
 	{
@@ -254,10 +283,15 @@ static void _starpu_generate_paje_trace_read_option(const char *option, struct s
 	{
 		options->label_deps = 1;
 	}
+	else if (strcmp(option, "-number-events") == 0)
+	{
+		options->number_events_path = strdup("number_events.data");
+	}
 	else
 	{
-		_STARPU_MSG("Option <%s> is not a valid option for starpu_fxt_tool\n", option);
+		return 1;
 	}
+	return 0;
 }
 
 static void _starpu_generate_paje_trace(char *input_fxt_filename, char *output_paje_filename, char *dirname)
@@ -272,7 +306,9 @@ static void _starpu_generate_paje_trace(char *input_fxt_filename, char *output_p
 		char *option = strtok(trace_options, " ");
 		while (option)
 		{
-			_starpu_generate_paje_trace_read_option(option, &options);
+			int ret = _starpu_generate_paje_trace_read_option(option, &options);
+			if (ret == 1)
+				_STARPU_MSG("Option <%s> is not a valid option for starpu_fxt_tool\n", option);
 			option = strtok(NULL, " ");
 		}
 	}
@@ -316,8 +352,9 @@ void _starpu_stop_fxt_profiling(void)
 
 		/* Should we generate a Paje trace directly ? */
 		int generate_trace = starpu_get_env_number("STARPU_GENERATE_TRACE");
-		if (generate_trace == 1)
+		if (_starpu_mpi_worldsize == 1 && generate_trace == 1)
 		{
+			_starpu_set_catch_signals(0);
 			char *fxt_prefix = starpu_getenv("STARPU_FXT_PREFIX");
 			_starpu_generate_paje_trace(_starpu_prof_file_user, "paje.trace", fxt_prefix);
 		}

+ 18 - 0
src/common/fxt.h

@@ -242,6 +242,7 @@
 #define _STARPU_FUT_PAPI_TASK_EVENT_VALUE   0x5186
 #define _STARPU_FUT_TASK_EXCLUDE_FROM_DAG   0x5187
 
+#define _STARPU_FUT_TASK_END_DEP	0x5188
 
 /* Predefined FUT key masks */
 #define _STARPU_FUT_KEYMASK_META           FUT_KEYMASK0
@@ -315,12 +316,17 @@ static inline unsigned long _starpu_fxt_get_submit_order(void)
 
 long _starpu_gettid(void) STARPU_ATTRIBUTE_VISIBILITY_DEFAULT;
 
+int _starpu_generate_paje_trace_read_option(const char *option, struct starpu_fxt_options *options) STARPU_ATTRIBUTE_VISIBILITY_DEFAULT;
+
 /** Initialize the FxT library. */
 void _starpu_fxt_init_profiling(uint64_t trace_buffer_size);
 
 /** Stop the FxT library, and generate the trace file. */
 void _starpu_stop_fxt_profiling(void);
 
+/** In case we use MPI, tell the profiling system how many processes are used. */
+void _starpu_profiling_set_mpi_worldsize(int worldsize) STARPU_ATTRIBUTE_VISIBILITY_DEFAULT;
+
 /** Generate the trace file. Used when catching signals SIGINT and SIGSEGV */
 void _starpu_fxt_dump_file(void);
 
@@ -856,6 +862,9 @@ do {									\
 #define _STARPU_TRACE_TASK_DEPS(job_prev, job_succ)	\
 	_STARPU_FUT_FULL_PROBE4STR(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_TASK_DEPS, (job_prev)->job_id, (job_succ)->job_id, (job_succ)->task->type, 1, "task")
 
+#define _STARPU_TRACE_TASK_END_DEP(job_prev, job_succ) \
+	FUT_DO_PROBE2(_STARPU_FUT_TASK_END_DEP, (job_prev)->job_id, (job_succ)->job_id)
+
 #define _STARPU_TRACE_GHOST_TASK_DEPS(ghost_prev_id, job_succ)		\
 	_STARPU_FUT_FULL_PROBE4STR(_STARPU_FUT_KEYMASK_TASK_VERBOSE, _STARPU_FUT_TASK_DEPS, (ghost_prev_id), (job_succ)->job_id, (job_succ)->task->type, 1, "ghost")
 
@@ -865,6 +874,13 @@ do {									\
 	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_TASK_EXCLUDE_FROM_DAG, (job)->job_id, (long unsigned)exclude_from_dag); \
 } while(0)
 
+#define _STARPU_TRACE_TASK_NAME_LINE_COLOR(job)				\
+	do {								\
+		_STARPU_TRACE_TASK_NAME(job);				\
+		_STARPU_TRACE_TASK_LINE(job);				\
+		_STARPU_TRACE_TASK_COLOR(job);				\
+	} while(0)
+
 #define _STARPU_TRACE_TASK_LINE(job)					\
 	do {								\
 		if ((job)->task->file)					\
@@ -1364,8 +1380,10 @@ do {										\
 #define _STARPU_TRACE_TAG(tag, job)		do {(void)(tag); (void)(job);} while(0)
 #define _STARPU_TRACE_TAG_DEPS(a, b)		do {(void)(a); (void)(b);} while(0)
 #define _STARPU_TRACE_TASK_DEPS(a, b)		do {(void)(a); (void)(b);} while(0)
+#define _STARPU_TRACE_TASK_END_DEP(a, b)	do {(void)(a); (void)(b);} while(0)
 #define _STARPU_TRACE_GHOST_TASK_DEPS(a, b)	do {(void)(a); (void)(b);} while(0)
 #define _STARPU_TRACE_TASK_EXCLUDE_FROM_DAG(a)	do {(void)(a);} while(0)
+#define _STARPU_TRACE_TASK_NAME_LINE_COLOR(a)   do {(void)(a);} while(0)
 #define _STARPU_TRACE_TASK_NAME(a)		do {(void)(a);} while(0)
 #define _STARPU_TRACE_TASK_LINE(a)		do {(void)(a);} while(0)
 #define _STARPU_TRACE_TASK_COLOR(a)		do {(void)(a);} while(0)

+ 314 - 314
src/common/rbtree.c

@@ -38,15 +38,15 @@
 static inline int starpu_rbtree_index(const struct starpu_rbtree_node *node,
                                const struct starpu_rbtree_node *parent)
 {
-    assert(parent != NULL);
-    assert((node == NULL) || (starpu_rbtree_parent(node) == parent));
+	assert(parent != NULL);
+	assert((node == NULL) || (starpu_rbtree_parent(node) == parent));
 
-    if (parent->children[STARPU_RBTREE_LEFT] == node)
-        return STARPU_RBTREE_LEFT;
+	if (parent->children[STARPU_RBTREE_LEFT] == node)
+		return STARPU_RBTREE_LEFT;
 
-    assert(parent->children[STARPU_RBTREE_RIGHT] == node);
+	assert(parent->children[STARPU_RBTREE_RIGHT] == node);
 
-    return STARPU_RBTREE_RIGHT;
+	return STARPU_RBTREE_RIGHT;
 }
 
 /*
@@ -54,7 +54,7 @@ static inline int starpu_rbtree_index(const struct starpu_rbtree_node *node,
  */
 static inline int starpu_rbtree_color(const struct starpu_rbtree_node *node)
 {
-    return node->parent & STARPU_RBTREE_COLOR_MASK;
+	return node->parent & STARPU_RBTREE_COLOR_MASK;
 }
 
 /*
@@ -62,7 +62,7 @@ static inline int starpu_rbtree_color(const struct starpu_rbtree_node *node)
  */
 static inline int starpu_rbtree_is_red(const struct starpu_rbtree_node *node)
 {
-    return starpu_rbtree_color(node) == STARPU_RBTREE_COLOR_RED;
+	return starpu_rbtree_color(node) == STARPU_RBTREE_COLOR_RED;
 }
 
 /*
@@ -70,7 +70,7 @@ static inline int starpu_rbtree_is_red(const struct starpu_rbtree_node *node)
  */
 static inline int starpu_rbtree_is_black(const struct starpu_rbtree_node *node)
 {
-    return starpu_rbtree_color(node) == STARPU_RBTREE_COLOR_BLACK;
+	return starpu_rbtree_color(node) == STARPU_RBTREE_COLOR_BLACK;
 }
 
 /*
@@ -79,10 +79,10 @@ static inline int starpu_rbtree_is_black(const struct starpu_rbtree_node *node)
 static inline void starpu_rbtree_set_parent(struct starpu_rbtree_node *node,
                                      struct starpu_rbtree_node *parent)
 {
-    assert(starpu_rbtree_check_alignment(node));
-    assert(starpu_rbtree_check_alignment(parent));
+	assert(starpu_rbtree_check_alignment(node));
+	assert(starpu_rbtree_check_alignment(parent));
 
-    node->parent = (uintptr_t)parent | (node->parent & STARPU_RBTREE_COLOR_MASK);
+	node->parent = (uintptr_t)parent | (node->parent & STARPU_RBTREE_COLOR_MASK);
 }
 
 /*
@@ -90,8 +90,8 @@ static inline void starpu_rbtree_set_parent(struct starpu_rbtree_node *node,
  */
 static inline void starpu_rbtree_set_color(struct starpu_rbtree_node *node, int color)
 {
-    assert((color & ~STARPU_RBTREE_COLOR_MASK) == 0);
-    node->parent = (node->parent & STARPU_RBTREE_PARENT_MASK) | color;
+	assert((color & ~STARPU_RBTREE_COLOR_MASK) == 0);
+	node->parent = (node->parent & STARPU_RBTREE_PARENT_MASK) | color;
 }
 
 /*
@@ -99,7 +99,7 @@ static inline void starpu_rbtree_set_color(struct starpu_rbtree_node *node, int
  */
 static inline void starpu_rbtree_set_red(struct starpu_rbtree_node *node)
 {
-    starpu_rbtree_set_color(node, STARPU_RBTREE_COLOR_RED);
+	starpu_rbtree_set_color(node, STARPU_RBTREE_COLOR_RED);
 }
 
 /*
@@ -107,7 +107,7 @@ static inline void starpu_rbtree_set_red(struct starpu_rbtree_node *node)
  */
 static inline void starpu_rbtree_set_black(struct starpu_rbtree_node *node)
 {
-    starpu_rbtree_set_color(node, STARPU_RBTREE_COLOR_BLACK);
+	starpu_rbtree_set_color(node, STARPU_RBTREE_COLOR_BLACK);
 }
 
 /*
@@ -118,323 +118,323 @@ static inline void starpu_rbtree_set_black(struct starpu_rbtree_node *node)
  */
 static void starpu_rbtree_rotate(struct starpu_rbtree *tree, struct starpu_rbtree_node *node, int direction)
 {
-    struct starpu_rbtree_node *parent, *rnode;
-    int left, right;
+	struct starpu_rbtree_node *parent, *rnode;
+	int left, right;
 
-    left = direction;
-    right = 1 - left;
-    parent = starpu_rbtree_parent(node);
-    rnode = node->children[right];
+	left = direction;
+	right = 1 - left;
+	parent = starpu_rbtree_parent(node);
+	rnode = node->children[right];
 
-    node->children[right] = rnode->children[left];
+	node->children[right] = rnode->children[left];
 
-    if (rnode->children[left] != NULL)
-        starpu_rbtree_set_parent(rnode->children[left], node);
+	if (rnode->children[left] != NULL)
+		starpu_rbtree_set_parent(rnode->children[left], node);
 
-    rnode->children[left] = node;
-    starpu_rbtree_set_parent(rnode, parent);
+	rnode->children[left] = node;
+	starpu_rbtree_set_parent(rnode, parent);
 
-    if (unlikely(parent == NULL))
-        tree->root = rnode;
-    else
-        parent->children[starpu_rbtree_index(node, parent)] = rnode;
+	if (unlikely(parent == NULL))
+		tree->root = rnode;
+	else
+		parent->children[starpu_rbtree_index(node, parent)] = rnode;
 
-    starpu_rbtree_set_parent(node, rnode);
+	starpu_rbtree_set_parent(node, rnode);
 }
 
 void starpu_rbtree_insert_rebalance(struct starpu_rbtree *tree, struct starpu_rbtree_node *parent,
-                             int index, struct starpu_rbtree_node *node)
+				    int index, struct starpu_rbtree_node *node)
 {
-    struct starpu_rbtree_node *grand_parent, *tmp;
-
-    assert(starpu_rbtree_check_alignment(parent));
-    assert(starpu_rbtree_check_alignment(node));
-
-    node->parent = (uintptr_t)parent | STARPU_RBTREE_COLOR_RED;
-    node->children[STARPU_RBTREE_LEFT] = NULL;
-    node->children[STARPU_RBTREE_RIGHT] = NULL;
-
-    if (unlikely(parent == NULL))
-        tree->root = node;
-    else
-        parent->children[index] = node;
-
-    for (;;)
-    {
-	struct starpu_rbtree_node *uncle;
-	int left, right;
+	struct starpu_rbtree_node *grand_parent, *tmp;
 
-	if (parent == NULL)
-	{
-            starpu_rbtree_set_black(node);
-            break;
-        }
-
-        if (starpu_rbtree_is_black(parent))
-            break;
+	assert(starpu_rbtree_check_alignment(parent));
+	assert(starpu_rbtree_check_alignment(node));
 
-        grand_parent = starpu_rbtree_parent(parent);
-        assert(grand_parent != NULL);
+	node->parent = (uintptr_t)parent | STARPU_RBTREE_COLOR_RED;
+	node->children[STARPU_RBTREE_LEFT] = NULL;
+	node->children[STARPU_RBTREE_RIGHT] = NULL;
 
-        left = starpu_rbtree_index(parent, grand_parent);
-        right = 1 - left;
+	if (unlikely(parent == NULL))
+		tree->root = node;
+	else
+		parent->children[index] = node;
 
-        uncle = grand_parent->children[right];
-
-        /*
-         * Uncle is red. Flip colors and repeat at grand parent.
-         */
-        if ((uncle != NULL) && starpu_rbtree_is_red(uncle))
-	{
-            starpu_rbtree_set_black(uncle);
-            starpu_rbtree_set_black(parent);
-            starpu_rbtree_set_red(grand_parent);
-            node = grand_parent;
-            parent = starpu_rbtree_parent(node);
-            continue;
-        }
-
-        /*
-         * Node is the right child of its parent. Rotate left at parent.
-         */
-        if (parent->children[right] == node)
+	for (;;)
 	{
-            starpu_rbtree_rotate(tree, parent, left);
-            tmp = node;
-            node = parent;
-            parent = tmp;
-        }
-
-        /*
-         * Node is the left child of its parent. Handle colors, rotate right
-         * at grand parent, and leave.
-         */
-        starpu_rbtree_set_black(parent);
-        starpu_rbtree_set_red(grand_parent);
-        starpu_rbtree_rotate(tree, grand_parent, right);
-        break;
-    }
-
-    assert(starpu_rbtree_is_black(tree->root));
+		struct starpu_rbtree_node *uncle;
+		int left, right;
+
+		if (parent == NULL)
+		{
+			starpu_rbtree_set_black(node);
+			break;
+		}
+
+		if (starpu_rbtree_is_black(parent))
+			break;
+
+		grand_parent = starpu_rbtree_parent(parent);
+		assert(grand_parent != NULL);
+
+		left = starpu_rbtree_index(parent, grand_parent);
+		right = 1 - left;
+
+		uncle = grand_parent->children[right];
+
+		/*
+		 * Uncle is red. Flip colors and repeat at grand parent.
+		 */
+		if ((uncle != NULL) && starpu_rbtree_is_red(uncle))
+		{
+			starpu_rbtree_set_black(uncle);
+			starpu_rbtree_set_black(parent);
+			starpu_rbtree_set_red(grand_parent);
+			node = grand_parent;
+			parent = starpu_rbtree_parent(node);
+			continue;
+		}
+
+		/*
+		 * Node is the right child of its parent. Rotate left at parent.
+		 */
+		if (parent->children[right] == node)
+		{
+			starpu_rbtree_rotate(tree, parent, left);
+			tmp = node;
+			node = parent;
+			parent = tmp;
+		}
+
+		/*
+		 * Node is the left child of its parent. Handle colors, rotate right
+		 * at grand parent, and leave.
+		 */
+		starpu_rbtree_set_black(parent);
+		starpu_rbtree_set_red(grand_parent);
+		starpu_rbtree_rotate(tree, grand_parent, right);
+		break;
+	}
+
+	assert(starpu_rbtree_is_black(tree->root));
 }
 
 void starpu_rbtree_remove(struct starpu_rbtree *tree, struct starpu_rbtree_node *node)
 {
-    struct starpu_rbtree_node *child, *parent, *brother;
-    int color, left, right;
-
-    if (node->children[STARPU_RBTREE_LEFT] == NULL)
-        child = node->children[STARPU_RBTREE_RIGHT];
-    else if (node->children[STARPU_RBTREE_RIGHT] == NULL)
-        child = node->children[STARPU_RBTREE_LEFT];
-    else
-    {
-        struct starpu_rbtree_node *successor;
-
-        /*
-         * Two-children case: replace the node with its successor.
-         */
-
-        successor = node->children[STARPU_RBTREE_RIGHT];
-
-        while (successor->children[STARPU_RBTREE_LEFT] != NULL)
-            successor = successor->children[STARPU_RBTREE_LEFT];
-
-        color = starpu_rbtree_color(successor);
-        child = successor->children[STARPU_RBTREE_RIGHT];
-        parent = starpu_rbtree_parent(node);
-
-        if (unlikely(parent == NULL))
-            tree->root = successor;
-        else
-            parent->children[starpu_rbtree_index(node, parent)] = successor;
-
-        parent = starpu_rbtree_parent(successor);
-
-        /*
-         * Set parent directly to keep the original color.
-         */
-        successor->parent = node->parent;
-        successor->children[STARPU_RBTREE_LEFT] = node->children[STARPU_RBTREE_LEFT];
-        starpu_rbtree_set_parent(successor->children[STARPU_RBTREE_LEFT], successor);
-
-        if (node == parent)
-            parent = successor;
-        else
-	{
-            successor->children[STARPU_RBTREE_RIGHT] = node->children[STARPU_RBTREE_RIGHT];
-            starpu_rbtree_set_parent(successor->children[STARPU_RBTREE_RIGHT], successor);
-            parent->children[STARPU_RBTREE_LEFT] = child;
-
-            if (child != NULL)
-                starpu_rbtree_set_parent(child, parent);
-        }
-
-        goto update_color;
-    }
-
-    /*
-     * Node has at most one child.
-     */
-
-    color = starpu_rbtree_color(node);
-    parent = starpu_rbtree_parent(node);
-
-    if (child != NULL)
-        starpu_rbtree_set_parent(child, parent);
-
-    if (unlikely(parent == NULL))
-        tree->root = child;
-    else
-        parent->children[starpu_rbtree_index(node, parent)] = child;
-
-    /*
-     * The node has been removed, update the colors. The child pointer can
-     * be null, in which case it is considered a black leaf.
-     */
-update_color:
-    if (color == STARPU_RBTREE_COLOR_RED)
-        return;
-
-    for (;;)
-    {
-        if ((child != NULL) && starpu_rbtree_is_red(child))
+	struct starpu_rbtree_node *child, *parent, *brother;
+	int color, left, right;
+
+	if (node->children[STARPU_RBTREE_LEFT] == NULL)
+		child = node->children[STARPU_RBTREE_RIGHT];
+	else if (node->children[STARPU_RBTREE_RIGHT] == NULL)
+		child = node->children[STARPU_RBTREE_LEFT];
+	else
 	{
-            starpu_rbtree_set_black(child);
-            break;
-        }
-
-        if (parent == NULL)
-            break;
-
-        left = starpu_rbtree_index(child, parent);
-        right = 1 - left;
-
-        brother = parent->children[right];
-
-        /*
-         * Brother is red. Recolor and rotate left at parent so that brother
-         * becomes black.
-         */
-        if (starpu_rbtree_is_red(brother))
-	{
-            starpu_rbtree_set_black(brother);
-            starpu_rbtree_set_red(parent);
-            starpu_rbtree_rotate(tree, parent, left);
-            brother = parent->children[right];
-        }
-
-        /*
-         * Brother has no red child. Recolor and repeat at parent.
-         */
-        if (((brother->children[STARPU_RBTREE_LEFT] == NULL)
-             || starpu_rbtree_is_black(brother->children[STARPU_RBTREE_LEFT]))
-            && ((brother->children[STARPU_RBTREE_RIGHT] == NULL)
-                || starpu_rbtree_is_black(brother->children[STARPU_RBTREE_RIGHT])))
-	{
-            starpu_rbtree_set_red(brother);
-            child = parent;
-            parent = starpu_rbtree_parent(child);
-            continue;
-        }
-
-        /*
-         * Brother's right child is black. Recolor and rotate right at brother.
-         */
-        if ((brother->children[right] == NULL)
-            || starpu_rbtree_is_black(brother->children[right]))
+		struct starpu_rbtree_node *successor;
+
+		/*
+		 * Two-children case: replace the node with its successor.
+		 */
+
+		successor = node->children[STARPU_RBTREE_RIGHT];
+
+		while (successor->children[STARPU_RBTREE_LEFT] != NULL)
+			successor = successor->children[STARPU_RBTREE_LEFT];
+
+		color = starpu_rbtree_color(successor);
+		child = successor->children[STARPU_RBTREE_RIGHT];
+		parent = starpu_rbtree_parent(node);
+
+		if (unlikely(parent == NULL))
+			tree->root = successor;
+		else
+			parent->children[starpu_rbtree_index(node, parent)] = successor;
+
+		parent = starpu_rbtree_parent(successor);
+
+		/*
+		 * Set parent directly to keep the original color.
+		 */
+		successor->parent = node->parent;
+		successor->children[STARPU_RBTREE_LEFT] = node->children[STARPU_RBTREE_LEFT];
+		starpu_rbtree_set_parent(successor->children[STARPU_RBTREE_LEFT], successor);
+
+		if (node == parent)
+			parent = successor;
+		else
+		{
+			successor->children[STARPU_RBTREE_RIGHT] = node->children[STARPU_RBTREE_RIGHT];
+			starpu_rbtree_set_parent(successor->children[STARPU_RBTREE_RIGHT], successor);
+			parent->children[STARPU_RBTREE_LEFT] = child;
+
+			if (child != NULL)
+				starpu_rbtree_set_parent(child, parent);
+		}
+
+		goto update_color;
+	}
+
+	/*
+	 * Node has at most one child.
+	 */
+
+	color = starpu_rbtree_color(node);
+	parent = starpu_rbtree_parent(node);
+
+	if (child != NULL)
+		starpu_rbtree_set_parent(child, parent);
+
+	if (unlikely(parent == NULL))
+		tree->root = child;
+	else
+		parent->children[starpu_rbtree_index(node, parent)] = child;
+
+	/*
+	 * The node has been removed, update the colors. The child pointer can
+	 * be null, in which case it is considered a black leaf.
+	 */
+ update_color:
+	if (color == STARPU_RBTREE_COLOR_RED)
+		return;
+
+	for (;;)
 	{
-            starpu_rbtree_set_black(brother->children[left]);
-            starpu_rbtree_set_red(brother);
-            starpu_rbtree_rotate(tree, brother, right);
-            brother = parent->children[right];
-        }
-
-        /*
-         * Brother's left child is black. Exchange parent and brother colors
-         * (we already know brother is black), set brother's right child black,
-         * rotate left at parent and leave.
-         */
-        starpu_rbtree_set_color(brother, starpu_rbtree_color(parent));
-        starpu_rbtree_set_black(parent);
-        starpu_rbtree_set_black(brother->children[right]);
-        starpu_rbtree_rotate(tree, parent, left);
-        break;
-    }
-
-    assert((tree->root == NULL) || starpu_rbtree_is_black(tree->root));
+		if ((child != NULL) && starpu_rbtree_is_red(child))
+		{
+			starpu_rbtree_set_black(child);
+			break;
+		}
+
+		if (parent == NULL)
+			break;
+
+		left = starpu_rbtree_index(child, parent);
+		right = 1 - left;
+
+		brother = parent->children[right];
+
+		/*
+		 * Brother is red. Recolor and rotate left at parent so that brother
+		 * becomes black.
+		 */
+		if (starpu_rbtree_is_red(brother))
+		{
+			starpu_rbtree_set_black(brother);
+			starpu_rbtree_set_red(parent);
+			starpu_rbtree_rotate(tree, parent, left);
+			brother = parent->children[right];
+		}
+
+		/*
+		 * Brother has no red child. Recolor and repeat at parent.
+		 */
+		if (((brother->children[STARPU_RBTREE_LEFT] == NULL)
+		     || starpu_rbtree_is_black(brother->children[STARPU_RBTREE_LEFT]))
+		    && ((brother->children[STARPU_RBTREE_RIGHT] == NULL)
+			|| starpu_rbtree_is_black(brother->children[STARPU_RBTREE_RIGHT])))
+		{
+			starpu_rbtree_set_red(brother);
+			child = parent;
+			parent = starpu_rbtree_parent(child);
+			continue;
+		}
+
+		/*
+		 * Brother's right child is black. Recolor and rotate right at brother.
+		 */
+		if ((brother->children[right] == NULL)
+		    || starpu_rbtree_is_black(brother->children[right]))
+		{
+			starpu_rbtree_set_black(brother->children[left]);
+			starpu_rbtree_set_red(brother);
+			starpu_rbtree_rotate(tree, brother, right);
+			brother = parent->children[right];
+		}
+
+		/*
+		 * Brother's left child is black. Exchange parent and brother colors
+		 * (we already know brother is black), set brother's right child black,
+		 * rotate left at parent and leave.
+		 */
+		starpu_rbtree_set_color(brother, starpu_rbtree_color(parent));
+		starpu_rbtree_set_black(parent);
+		starpu_rbtree_set_black(brother->children[right]);
+		starpu_rbtree_rotate(tree, parent, left);
+		break;
+	}
+
+	assert((tree->root == NULL) || starpu_rbtree_is_black(tree->root));
 }
 
 struct starpu_rbtree_node * starpu_rbtree_nearest(struct starpu_rbtree_node *parent, int index,
-                                    int direction)
+						  int direction)
 {
-    assert(starpu_rbtree_check_index(direction));
+	assert(starpu_rbtree_check_index(direction));
 
-    if (parent == NULL)
-        return NULL;
+	if (parent == NULL)
+		return NULL;
 
-    assert(starpu_rbtree_check_index(index));
+	assert(starpu_rbtree_check_index(index));
 
-    if (index != direction)
-        return parent;
+	if (index != direction)
+		return parent;
 
-    return starpu_rbtree_walk(parent, direction);
+	return starpu_rbtree_walk(parent, direction);
 }
 
 struct starpu_rbtree_node * starpu_rbtree_firstlast(const struct starpu_rbtree *tree, int direction)
 {
-    struct starpu_rbtree_node *prev, *cur;
+	struct starpu_rbtree_node *prev, *cur;
 
-    assert(starpu_rbtree_check_index(direction));
+	assert(starpu_rbtree_check_index(direction));
 
-    prev = NULL;
+	prev = NULL;
 
-    for (cur = tree->root; cur != NULL; cur = cur->children[direction])
-        prev = cur;
+	for (cur = tree->root; cur != NULL; cur = cur->children[direction])
+		prev = cur;
 
-    return prev;
+	return prev;
 }
 
 struct starpu_rbtree_node * starpu_rbtree_walk(struct starpu_rbtree_node *node, int direction)
 {
-    int left, right;
+	int left, right;
 
-    assert(starpu_rbtree_check_index(direction));
+	assert(starpu_rbtree_check_index(direction));
 
-    left = direction;
-    right = 1 - left;
+	left = direction;
+	right = 1 - left;
 
-    if (node == NULL)
-        return NULL;
+	if (node == NULL)
+		return NULL;
 
-    if (node->children[left] != NULL)
-    {
-        node = node->children[left];
+	if (node->children[left] != NULL)
+	{
+		node = node->children[left];
 
-        while (node->children[right] != NULL)
-            node = node->children[right];
-    }
-    else
-    {
-        for (;;)
+		while (node->children[right] != NULL)
+			node = node->children[right];
+	}
+	else
 	{
-            struct starpu_rbtree_node *parent;
-	    int index;
+		for (;;)
+		{
+			struct starpu_rbtree_node *parent;
+			int index;
 
-            parent = starpu_rbtree_parent(node);
+			parent = starpu_rbtree_parent(node);
 
-            if (parent == NULL)
-                return NULL;
+			if (parent == NULL)
+				return NULL;
 
-            index = starpu_rbtree_index(node, parent);
-            node = parent;
+			index = starpu_rbtree_index(node, parent);
+			node = parent;
 
-            if (index == right)
-                break;
-        }
-    }
+			if (index == right)
+				break;
+		}
+	}
 
-    return node;
+	return node;
 }
 
 /*
@@ -442,59 +442,59 @@ struct starpu_rbtree_node * starpu_rbtree_walk(struct starpu_rbtree_node *node,
  */
 static struct starpu_rbtree_node * starpu_rbtree_find_deepest(struct starpu_rbtree_node *node)
 {
-    struct starpu_rbtree_node *parent;
+	struct starpu_rbtree_node *parent;
 
-    assert(node != NULL);
+	assert(node != NULL);
 
-    for (;;)
-    {
-        parent = node;
-        node = node->children[STARPU_RBTREE_LEFT];
-
-        if (node == NULL)
+	for (;;)
 	{
-            node = parent->children[STARPU_RBTREE_RIGHT];
+		parent = node;
+		node = node->children[STARPU_RBTREE_LEFT];
+
+		if (node == NULL)
+		{
+		node = parent->children[STARPU_RBTREE_RIGHT];
 
-            if (node == NULL)
-                return parent;
-        }
-    }
+		if (node == NULL)
+		    return parent;
+		}
+	}
 }
 
 struct starpu_rbtree_node * starpu_rbtree_postwalk_deepest(const struct starpu_rbtree *tree)
 {
-    struct starpu_rbtree_node *node;
+	struct starpu_rbtree_node *node;
 
-    node = tree->root;
+	node = tree->root;
 
-    if (node == NULL)
-        return NULL;
+	if (node == NULL)
+		return NULL;
 
-    return starpu_rbtree_find_deepest(node);
+	return starpu_rbtree_find_deepest(node);
 }
 
 struct starpu_rbtree_node * starpu_rbtree_postwalk_unlink(struct starpu_rbtree_node *node)
 {
-    struct starpu_rbtree_node *parent;
-    int index;
+	struct starpu_rbtree_node *parent;
+	int index;
 
-    if (node == NULL)
-        return NULL;
+	if (node == NULL)
+		return NULL;
 
-    assert(node->children[STARPU_RBTREE_LEFT] == NULL);
-    assert(node->children[STARPU_RBTREE_RIGHT] == NULL);
+	assert(node->children[STARPU_RBTREE_LEFT] == NULL);
+	assert(node->children[STARPU_RBTREE_RIGHT] == NULL);
 
-    parent = starpu_rbtree_parent(node);
+	parent = starpu_rbtree_parent(node);
 
-    if (parent == NULL)
-        return NULL;
+	if (parent == NULL)
+		return NULL;
 
-    index = starpu_rbtree_index(node, parent);
-    parent->children[index] = NULL;
-    node = parent->children[STARPU_RBTREE_RIGHT];
+	index = starpu_rbtree_index(node, parent);
+	parent->children[index] = NULL;
+	node = parent->children[STARPU_RBTREE_RIGHT];
 
-    if (node == NULL)
-        return parent;
+	if (node == NULL)
+		return parent;
 
-    return starpu_rbtree_find_deepest(node);
+	return starpu_rbtree_find_deepest(node);
 }

+ 1 - 1
src/core/combined_workers.c

@@ -81,7 +81,7 @@ int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[])
 	 * safe because this method should only be called when the scheduler
 	 * is being initialized. */
 	new_workerid = basic_worker_count + combined_worker_id;
-	STARPU_ASSERT_MSG(new_workerid < STARPU_NMAXWORKERS, "Too many combined workers for parallel task execution. Please use configure option --enable-maxcpus to increase it beyond the current value %d", STARPU_MAXCPUS);
+	STARPU_ASSERT_MSG(new_workerid < STARPU_NMAXWORKERS, "Too many combined workers (%d) for parallel task execution. Please use configure option --enable-maxcpus to increase it beyond the current value %d", new_workerid, STARPU_MAXCPUS);
 	config->topology.ncombinedworkers++;
 
 //	fprintf(stderr, "COMBINED WORKERS ");

+ 1 - 0
src/core/dependencies/data_arbiter_concurrency.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2021       Federal University of Rio Grande do Sul (UFRGS)
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 0
src/core/dependencies/data_concurrency.c

@@ -1,6 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ * Copyright (C) 2021       Federal University of Rio Grande do Sul (UFRGS)
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 1 - 1
src/core/disk_ops/unistd/disk_unistd_global.c

@@ -536,7 +536,7 @@ int starpu_unistd_global_full_write(void *base STARPU_ATTRIBUTE_UNUSED, void *ob
 	return starpu_unistd_global_write(base, obj, ptr, 0, size);
 }
 
-#if HAVE_AIO_H
+#if defined(HAVE_AIO_H)
 void * starpu_unistd_global_async_full_read (void * base, void * obj, void ** ptr, size_t * size, unsigned dst_node)
 {
         struct starpu_unistd_global_obj *tmp = (struct starpu_unistd_global_obj *) obj;

+ 10 - 0
src/core/jobs.c

@@ -303,6 +303,16 @@ void _starpu_handle_job_submission(struct _starpu_job *j)
 void starpu_task_end_dep_release(struct starpu_task *t)
 {
 	struct _starpu_job *j = _starpu_get_job_associated_to_task(t);
+
+#ifdef STARPU_USE_FXT
+	struct starpu_task *current = starpu_task_get_current();
+	if (current)
+	{
+		struct _starpu_job *jcurrent = _starpu_get_job_associated_to_task(current);
+		_STARPU_TRACE_TASK_END_DEP(jcurrent, j);
+	}
+#endif
+
 	_starpu_handle_job_termination(j);
 }
 

+ 6 - 0
src/core/perfmodel/perfmodel.c

@@ -541,6 +541,12 @@ void _starpu_set_perf_model_dirs()
 	snprintf(_perf_model_dir_debug, _PERF_MODEL_DIR_MAXLEN, "%s/debug/", _perf_model_dir);
 }
 
+char *_starpu_get_perf_model_dir()
+{
+	_starpu_create_sampling_directory_if_needed();
+	return _perf_model_dir;
+}
+
 char *_starpu_get_perf_model_dir_codelet()
 {
 	_starpu_create_sampling_directory_if_needed();

+ 1 - 0
src/core/perfmodel/perfmodel.h

@@ -65,6 +65,7 @@ struct starpu_perfmodel_arch;
 
 extern unsigned _starpu_calibration_minimum;
 
+char *_starpu_get_perf_model_dir();
 char *_starpu_get_perf_model_dir_codelet() STARPU_ATTRIBUTE_VISIBILITY_DEFAULT;
 char *_starpu_get_perf_model_dir_bus();
 char *_starpu_get_perf_model_dir_debug();

+ 2 - 2
src/core/perfmodel/perfmodel_bus.c

@@ -769,7 +769,7 @@ static void benchmark_all_gpu_devices(void)
 #ifdef STARPU_HAVE_HWLOC
 	hwloc_bitmap_t former_cpuset = hwloc_bitmap_alloc();
 	hwloc_get_cpubind(hwtopology, former_cpuset, HWLOC_CPUBIND_THREAD);
-#elif __linux__
+#elif defined(__linux__)
 	/* Save the current cpu binding */
 	cpu_set_t former_process_affinity;
 	int ret;
@@ -833,7 +833,7 @@ static void benchmark_all_gpu_devices(void)
 #ifdef STARPU_HAVE_HWLOC
 	hwloc_set_cpubind(hwtopology, former_cpuset, HWLOC_CPUBIND_THREAD);
 	hwloc_bitmap_free(former_cpuset);
-#elif __linux__
+#elif defined(__linux__)
 	/* Restore the former affinity */
 	ret = sched_setaffinity(0, sizeof(former_process_affinity), &former_process_affinity);
 	if (ret)

+ 10 - 6
src/core/sched_ctx.h

@@ -216,19 +216,23 @@ int _starpu_get_index_in_ctx_of_workerid(unsigned sched_ctx, unsigned workerid);
 /** Get the mutex corresponding to the global workerid */
 starpu_pthread_mutex_t *_starpu_get_sched_mutex(struct _starpu_sched_ctx *sched_ctx, int worker);
 
-/** Get workers belonging to a certain context, it returns the number of workers
- take care: no mutex taken, the list of workers might not be updated */
+/** Get the workers belonging to a certain context; returns the number
+ * of workers. Take care: no mutex is taken, so the list of workers
+ * might not be up to date.
+ */
 int _starpu_get_workers_of_sched_ctx(unsigned sched_ctx_id, int *pus, enum starpu_worker_archtype arch);
 
-/** Let the worker know it does not belong to the context and that
-   it should stop poping from it */
+/** Let the worker know it does not belong to the context and that it
+ * should stop popping from it
+ */
 void _starpu_worker_gets_out_of_ctx(unsigned sched_ctx_id, struct _starpu_worker *worker);
 
 /** Check if the worker belongs to another sched_ctx */
 unsigned _starpu_worker_belongs_to_a_sched_ctx(int workerid, unsigned sched_ctx_id);
 
-/** indicates wheather this worker should go to sleep or not
-   (if it is the last one awake in a context he should better keep awake) */
+/** Indicates whether this worker should go to sleep or not (if it is
+ * the last one awake in a context, it had better stay awake)
+ */
 unsigned _starpu_sched_ctx_last_worker_awake(struct _starpu_worker *worker);
 
 /** If starpu_sched_ctx_set_context() has been called, returns the context

+ 2 - 1
src/core/sched_policy.c

@@ -494,6 +494,7 @@ int _starpu_repush_task(struct _starpu_job *j)
 	 * corresponding dependencies */
 	if (task->cl == NULL || task->where == STARPU_NOWHERE)
 	{
+		_STARPU_TRACE_TASK_NAME_LINE_COLOR(j);
 		if (!_starpu_perf_counter_paused() && !j->internal)
 		{
 			(void)STARPU_ATOMIC_ADD64(& _starpu_task__g_current_ready__value, -1);
@@ -1164,7 +1165,7 @@ void _starpu_print_idle_time()
 	}
 }
 
-void starpu_sched_task_break(struct starpu_task *task)
+void starpu_sched_task_break(struct starpu_task *task STARPU_ATTRIBUTE_UNUSED)
 {
 	_STARPU_TASK_BREAK_ON(task, sched);
 }

+ 18 - 4
src/core/task.c

@@ -824,8 +824,7 @@ static int _starpu_task_submit_head(struct starpu_task *task)
 				_STARPU_TASK_SET_INTERFACE(task, starpu_data_get_interface_on_node(handle, handle->home_node), i);
 			if (!(task->cl->flags & STARPU_CODELET_NOPLANS) &&
 			    ((handle->nplans && !handle->nchildren) || handle->siblings)
-			    && handle->partition_automatic_disabled == 0
-			    )
+			    && !(mode & STARPU_NOPLAN))
 				/* This handle is involved with asynchronous
 				 * partitioning as a parent or a child, make
 				 * sure the right plan is active, submit
@@ -941,8 +940,6 @@ int _starpu_task_submit(struct starpu_task *task, int nodeps)
 		_STARPU_TRACE_TASK_SUBMIT(j,
 			_starpu_get_sched_ctx_struct(task->sched_ctx)->iterations[0],
 			_starpu_get_sched_ctx_struct(task->sched_ctx)->iterations[1]);
-		_STARPU_TRACE_TASK_NAME(j);
-		_STARPU_TRACE_TASK_LINE(j);
 	}
 
 	/* If this is a continuation, we don't modify the implicit data dependencies detected earlier. */
@@ -1726,3 +1723,20 @@ void starpu_task_ft_success(struct starpu_task *meta_task)
 {
 	starpu_task_end_dep_release(meta_task);
 }
+
+char *starpu_task_status_get_as_string(enum starpu_task_status status)
+{
+	switch(status)
+	{
+	case(STARPU_TASK_INIT) : return "STARPU_TASK_INIT";
+	case(STARPU_TASK_BLOCKED): return "STARPU_TASK_BLOCKED";
+	case(STARPU_TASK_READY): return "STARPU_TASK_READY";
+	case(STARPU_TASK_RUNNING): return "STARPU_TASK_RUNNING";
+	case(STARPU_TASK_FINISHED): return "STARPU_TASK_FINISHED";
+	case(STARPU_TASK_BLOCKED_ON_TAG): return "STARPU_TASK_BLOCKED_ON_TAG";
+	case(STARPU_TASK_BLOCKED_ON_TASK): return "STARPU_TASK_BLOCKED_ON_TASK";
+	case(STARPU_TASK_BLOCKED_ON_DATA): return "STARPU_TASK_BLOCKED_ON_DATA";
+	case(STARPU_TASK_STOPPED): return "STARPU_TASK_STOPPED";
+	default: return "STARPU_TASK_unknown_status";
+	}
+}

+ 59 - 0
src/core/workers.c

@@ -4,6 +4,7 @@
  * Copyright (C) 2011       Télécom-SudParis
  * Copyright (C) 2013       Thibaut Lambert
  * Copyright (C) 2016       Uppsala University
+ * Copyright (C) 2021       Federal University of Rio Grande do Sul (UFRGS)
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,6 +20,7 @@
 
 #include <stdlib.h>
 #include <stdio.h>
+#include <strings.h>
 #ifdef __linux__
 #include <sys/utsname.h>
 #endif
@@ -57,6 +59,14 @@
 #include <windows.h>
 #endif
 
+#if defined(_WIN32) /* fallback definition of ffs() for Windows builds */
+#ifdef __GNUC__
+#define ffs(arg) __builtin_ffs(arg)
+#else /* _bit_scan_forward() is 0-based; ffs() is 1-based and gives 0 when no bit is set (arg is evaluated twice) */
+#define ffs(arg) ((arg) ? _bit_scan_forward(arg) + 1 : 0)
+#endif
+#endif
+
 
 /* global knobs */
 static int __g_calibrate_knob;
@@ -210,6 +220,7 @@ struct _starpu_driver_info starpu_driver_info[STARPU_NARCH];
 
 void _starpu_driver_info_register(enum starpu_worker_archtype archtype, const struct _starpu_driver_info *info)
 {
+	STARPU_ASSERT(archtype >= 0 && archtype < STARPU_NARCH);
 	starpu_driver_info[archtype] = *info;
 }
 
@@ -1321,7 +1332,10 @@ static void _starpu_build_tree(void)
 static starpu_pthread_mutex_t sig_handlers_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
 static void (*act_sigint)(int);
 static void (*act_sigsegv)(int);
+static void (*act_sigabrt)(int);
+#ifdef SIGTRAP
 static void (*act_sigtrap)(int);
+#endif
 
 void _starpu_handler(int sig)
 {
@@ -1345,6 +1359,13 @@ void _starpu_handler(int sig)
 			sig_act = SIG_DFL;
 		signal(SIGSEGV, sig_act);
 	}
+	if (sig == SIGABRT)
+	{
+		void (*sig_act)(int) = act_sigabrt;
+		if (sig_act == NULL)
+			sig_act = SIG_DFL;
+		signal(SIGABRT, sig_act);
+	}
 #ifdef SIGTRAP
 	if (sig == SIGTRAP)
 	{
@@ -1372,6 +1393,11 @@ void _starpu_catch_signals(void)
 		old_sig_act = signal(SIGSEGV, _starpu_handler);
 		if (old_sig_act != _starpu_handler)
 			act_sigsegv  = old_sig_act;
+
+		old_sig_act = signal(SIGABRT, _starpu_handler);
+		if (old_sig_act != _starpu_handler)
+			act_sigabrt  = old_sig_act;
+
 #ifdef SIGTRAP
 		old_sig_act = signal(SIGTRAP, _starpu_handler);
 		if (old_sig_act != _starpu_handler)
@@ -1391,6 +1417,13 @@ void _starpu_catch_signals(void)
 			signal(SIGSEGV, act_sigsegv);
 			act_sigsegv = NULL;
 		}
+
+		if (act_sigabrt != NULL)
+		{
+			signal(SIGABRT, act_sigabrt); /* restore the SIGABRT handler saved when ours was installed */
+			act_sigabrt = NULL;
+		}
+
 #ifdef SIGTRAP
 		if (act_sigtrap != NULL)
 		{
@@ -2061,6 +2094,24 @@ void starpu_shutdown(void)
 #endif
 }
 
+unsigned starpu_worker_archtype_is_valid(enum starpu_worker_archtype type)
+{
+	return (type >= 0 && type < STARPU_NARCH)
+		&& (starpu_driver_info[type].name_upper != NULL);
+}
+
+enum starpu_worker_archtype starpu_arch_mask_to_worker_archtype(unsigned mask)
+{
+	STARPU_ASSERT(mask && !(mask & (mask-1))); // exactly one bit of the mask must be set
+
+	enum starpu_worker_archtype worker_type = ffs(mask)-2; // ffs(mask) is the 1-based index of the lowest set bit
+
+	STARPU_ASSERT(worker_type >= 0 && worker_type < STARPU_NARCH); // worker_type is non-negative and below the number of architectures
+	STARPU_ASSERT(starpu_worker_archtype_is_valid(worker_type)); // worker_type is a valid worker architecture
+
+	return worker_type;
+}
+
 #undef starpu_worker_get_count
 unsigned starpu_worker_get_count(void)
 {
@@ -2420,6 +2471,11 @@ int starpu_worker_get_devids(enum starpu_worker_archtype type, int *devids, int
 	return ndevids;
 }
 
+unsigned starpu_worker_type_can_execute_task(enum starpu_worker_archtype worker_type, const struct starpu_task *task)
+{
+	return (STARPU_WORKER_TO_MASK(worker_type) & task->where) != 0;
+}
+
 void starpu_worker_get_name(int id, char *dst, size_t maxlen)
 {
 	char *name = _starpu_config.workers[id].name;
@@ -2611,6 +2667,7 @@ unsigned starpu_worker_get_sched_ctx_list(int workerid, unsigned **sched_ctxs)
 
 const char *starpu_worker_get_type_as_string(enum starpu_worker_archtype type)
 {
+	STARPU_ASSERT(type >= 0 && type < STARPU_NARCH);
 	const char *ret = starpu_driver_info[type].name_upper;
 	if (!ret)
 		ret = "unknown";
@@ -2619,6 +2676,7 @@ const char *starpu_worker_get_type_as_string(enum starpu_worker_archtype type)
 
 const char *starpu_worker_get_type_as_env_var(enum starpu_worker_archtype type)
 {
+	STARPU_ASSERT(type >= 0 && type < STARPU_NARCH);
 	const char *ret = starpu_driver_info[type].name_var;
 	if (!ret)
 		ret = "UNKNOWN";
@@ -2832,6 +2890,7 @@ void starpu_worker_set_waking_up_callback(void (*callback)(unsigned workerid))
 
 enum starpu_node_kind starpu_worker_get_memory_node_kind(enum starpu_worker_archtype type)
 {
+	STARPU_ASSERT(type >= 0 && type < STARPU_NARCH);
 	enum starpu_node_kind kind = starpu_driver_info[type].memory_kind;
 	STARPU_ASSERT_MSG(kind != (enum starpu_node_kind) -1, "no memory for archtype %d", type);
 	return kind;

+ 2 - 0
src/core/workers.h

@@ -445,6 +445,8 @@ extern int _starpu_keys_initialized;
 extern starpu_pthread_key_t _starpu_worker_key;
 extern starpu_pthread_key_t _starpu_worker_set_key;
 
+void _starpu_set_catch_signals(int do_catch_signal);
+
 /** Three functions to manage argv, argc */
 void _starpu_set_argc_argv(int *argc, char ***argv);
 int *_starpu_get_argc();

+ 1 - 1
src/datawizard/coherency.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2008-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
- * Copyright (C) 2018       Federal University of Rio Grande do Sul (UFRGS)
+ * Copyright (C) 2018,2021  Federal University of Rio Grande do Sul (UFRGS)
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by

+ 0 - 0
src/datawizard/coherency.h


部分文件因为文件数量过多而无法显示