Browse Source

Merge branch 'master' into fpga

Nathalie Furmento 5 years ago
parent
commit
c82fd431f7
100 changed files with 11315 additions and 1136 deletions
  1. 1 0
      .gitignore
  2. 110 52
      ChangeLog
  3. 42 10
      configure.ac
  4. 8 4
      doc/doxygen/chapters/210_check_list_performance.doxy
  5. 5 1
      doc/doxygen/chapters/301_tasks.doxy
  6. 80 3
      doc/doxygen/chapters/310_data_management.doxy
  7. 2 2
      doc/doxygen/chapters/320_scheduling.doxy
  8. 7 17
      doc/doxygen/chapters/370_online_performance_tools.doxy
  9. 82 31
      doc/doxygen/chapters/380_offline_performance_tools.doxy
  10. 7 5
      doc/doxygen/chapters/401_out_of_core.doxy
  11. 2 1
      doc/doxygen/chapters/410_mpi_support.doxy
  12. 1 1
      doc/doxygen/chapters/470_simgrid.doxy
  13. 47 13
      doc/doxygen/chapters/501_environment_variables.doxy
  14. 3 2
      doc/doxygen/chapters/510_configure_options.doxy
  15. 4274 0
      doc/doxygen/chapters/images/trace_recv_use.eps
  16. BIN
      doc/doxygen/chapters/images/trace_recv_use.pdf
  17. BIN
      doc/doxygen/chapters/images/trace_recv_use.png
  18. 4019 0
      doc/doxygen/chapters/images/trace_send_use.eps
  19. BIN
      doc/doxygen/chapters/images/trace_send_use.pdf
  20. BIN
      doc/doxygen/chapters/images/trace_send_use.png
  21. 3 3
      doc/doxygen/refman.tex
  22. 3 3
      doc/doxygen_dev/refman.tex
  23. 45 40
      examples/cholesky/cholesky_compil.c
  24. 116 77
      examples/cholesky/cholesky_grain_tag.c
  25. 65 60
      examples/cholesky/cholesky_implicit.c
  26. 1 1
      examples/cholesky/cholesky_kernels.c
  27. 149 102
      examples/cholesky/cholesky_tag.c
  28. 55 52
      examples/cholesky/cholesky_tile_tag.c
  29. 5 5
      examples/interface/complex_interface.c
  30. 2 0
      examples/lu/lu_example.c
  31. 3 1
      examples/matvecmult/matvecmult.c
  32. 3 3
      examples/matvecmult/matvecmult_kernel.cl
  33. 10 0
      examples/native_fortran/nf_matrix.f90
  34. 13 7
      examples/perf_monitoring/perf_counters_02.c
  35. 6 2
      examples/pipeline/pipeline.c
  36. 1 1
      examples/sched_ctx/axpy_partition_gpu.h
  37. 29 43
      examples/spmv/spmv.c
  38. 103 0
      include/fstarpu_mod.f90
  39. 6 0
      include/starpu.h
  40. 1 0
      include/starpu_config.h.in
  41. 40 2
      include/starpu_cuda.h
  42. 12 1
      include/starpu_data_filters.h
  43. 274 4
      include/starpu_data_interfaces.h
  44. 8 0
      include/starpu_fxt.h
  45. 9 0
      include/starpu_perf_monitoring.h
  46. 13 0
      include/starpu_profiling.h
  47. 14 12
      include/starpu_task.h
  48. 1 0
      include/starpu_thread.h
  49. 46 26
      include/starpu_util.h
  50. 66 14
      m4/acinclude.m4
  51. 5 0
      min-dgels/Makefile.in
  52. 6 4
      mpi/examples/matrix_decomposition/mpi_cholesky.c
  53. 61 56
      mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
  54. 2 2
      mpi/examples/matrix_decomposition/mpi_cholesky_codelets.h
  55. 11 2
      mpi/examples/matrix_decomposition/mpi_cholesky_distributed.c
  56. 1 1
      mpi/examples/matrix_decomposition/mpi_cholesky_kernels.c
  57. 21 18
      mpi/examples/matrix_decomposition/mpi_decomposition_matrix.c
  58. 2 2
      mpi/examples/matrix_decomposition/mpi_decomposition_matrix.h
  59. 8 2
      mpi/examples/matrix_decomposition/mpi_decomposition_params.c
  60. 2 1
      mpi/examples/matrix_decomposition/mpi_decomposition_params.h
  61. 2 0
      mpi/src/Makefile.am
  62. 5 1
      mpi/src/mpi/starpu_mpi_mpi.c
  63. 27 94
      mpi/src/nmad/starpu_mpi_nmad.c
  64. 1 0
      mpi/src/nmad/starpu_mpi_nmad_backend.c
  65. 5 1
      mpi/src/nmad/starpu_mpi_nmad_backend.h
  66. 169 0
      mpi/src/nmad/starpu_mpi_nmad_unknown_datatype.c
  67. 43 0
      mpi/src/nmad/starpu_mpi_nmad_unknown_datatype.h
  68. 1 0
      mpi/src/starpu_mpi.c
  69. 46 1
      mpi/src/starpu_mpi_datatype.c
  70. 31 31
      mpi/src/starpu_mpi_fxt.h
  71. 6 1
      mpi/src/starpu_mpi_init.c
  72. 6 2
      mpi/tests/Makefile.am
  73. 38 0
      mpi/tests/mpi_barrier.c
  74. 120 14
      mpi/tests/pingpong.c
  75. 180 0
      mpi/tests/sendrecv_bench.c
  76. 1 1
      sc_hypervisor/examples/cholesky/cholesky_kernels.c
  77. 1 0
      src/Makefile.am
  78. 10 10
      src/common/fxt.c
  79. 291 176
      src/common/fxt.h
  80. 29 1
      src/common/knobs.c
  81. 1 1
      src/common/knobs.h
  82. 21 1
      src/common/thread.c
  83. 4 1
      src/common/utils.c
  84. 1 0
      src/core/dependencies/implicit_data_deps.c
  85. 3 0
      src/core/disk.c
  86. 13 9
      src/core/perfmodel/perfmodel_bus.c
  87. 3 2
      src/core/perfmodel/regression.c
  88. 9 6
      src/core/sched_ctx.c
  89. 2 2
      src/core/sched_ctx.h
  90. 13 14
      src/core/sched_policy.c
  91. 57 32
      src/core/simgrid.c
  92. 19 6
      src/core/task.c
  93. 16 15
      src/core/topology.c
  94. 5 2
      src/core/tree.c
  95. 29 15
      src/core/workers.c
  96. 10 0
      src/core/workers.h
  97. 140 7
      src/datawizard/copy_driver.c
  98. 5 0
      src/datawizard/filters.c
  99. 41 1
      src/datawizard/interfaces/bcsr_filters.c
  100. 0 0
      src/datawizard/interfaces/block_interface.c

+ 1 - 0
.gitignore

@@ -11,6 +11,7 @@
 /GRTAGS
 /GTAGS
 /tags
+/TAGS
 /config.cache
 /doc/starpu.info
 *~

+ 110 - 52
ChangeLog

@@ -1,8 +1,8 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2012-2014,2016-2018                      Inria
-# Copyright (C) 2009-2019                                Université de Bordeaux
-# Copyright (C) 2010-2019                                CNRS
+# Copyright (C) 2009-2020                                Université de Bordeaux
+# Copyright (C) 2010-2020                                CNRS
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -16,19 +16,37 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 #
 
-StarPU 1.4.0 (svn revision xxxx)
+StarPU 1.4.0 (git revision xxxx)
 ==============================================
 New features:
   * Fault tolerance support with starpu_task_ft_failed().
   * Add get_max_size method to data interfaces for applications using data with
     variable size to express their maximal potential size.
+  * New offline tool to draw a graph showing the elapsed time between
+    data being sent or received and its use by tasks
+  * Add 4D tensor data interface.
+  * New sched_tasks.rec trace file which monitors task scheduling push/pop actions
 
 Small changes:
   * Use the S4U interface of Simgrid instead of xbt and MSG.
 
-StarPU 1.3.3 (git revision xxx)
+StarPU 1.3.4 (git revision xxx)
 ==============================================
 
+Small features:
+  * New environment variables STARPU_BUS_STATS_FILE and
+    STARPU_WORKER_STATS_FILE to specify files in which to display
+    statistics about data transfers and workers.
+  * Add starpu_bcsr_filter_vertical_block filtering function.
+  * Add starpu_interface_copy2d, 3d, and 4d to easily request data copies from
+    data interfaces.
+  * Move optimized cuda 2d copy from interfaces to new
+    starpu_cuda_copy2d_async_sync and starpu_cuda_copy3d_async_sync, and use
+    them from starpu_interface_copy2d and 3d.
+
+StarPU 1.3.3 (git revision 11afc5b007fe1ab1c729b55b47a5a98ef7f3cfad)
+====================================================================
+
 New features:
   * New semantic for starpu_task_insert() and alike parameters
     STARPU_CALLBACK_ARG, STARPU_PROLOGUE_CALLBACK_ARG, and
@@ -76,7 +94,7 @@ Small changes:
      idle time instead of overhead.
 
 StarPU 1.3.2 (git revision af22a20fc00a37addf3cc6506305f89feed940b0)
-==============================================
+====================================================================
 
 Small changes:
   * Improve OpenMP support to detect the environment is valid before
@@ -89,7 +107,7 @@ Small changes:
     communications progress.
 
 StarPU 1.3.1 (git revision 01949488b4f8e6fe26d2c200293b8aae5876b038)
-==============================================
+====================================================================
 
 Small features:
   * Add starpu_filter_nparts_compute_chunk_size_and_offset helper.
@@ -100,7 +118,7 @@ Small changes:
     library is available, also check the compiled code can be run.
 
 StarPU 1.3.0 (git revision 24ca83c6dbb102e1cfc41db3bb21c49662067062)
-==============================================
+====================================================================
 
 New features:
   * New scheduler 'heteroprio' with heterogeneous priorities
@@ -248,33 +266,41 @@ Small changes:
   * STARPU_COMM_STATS also displays the bandwidth
   * Update data interfaces implementations to only use public API
 
-StarPU 1.2.9 (git revision xxx)
-==============================================
+StarPU 1.2.9 (git revision 3aca8da3138a99e93d7f93905d2543bd6f1ea1df)
+====================================================================
+
+Small changes:
   * Add STARPU_SIMGRID_TRANSFER_COST environment variable to easily disable
     data transfer costs.
+  * New dmdap "data-aware performance model (priority)" scheduler
+  * Modification in the Native Fortran interface of the functions
+    fstarpu_mpi_task_insert, fstarpu_mpi_task_build and
+    fstarpu_mpi_task_post_build to only take 1 parameter being the MPI
+    communicator, the codelet and the various parameters for the task.
 
 StarPU 1.2.8 (git revision f66374c9ad39aefb7cf5dfc31f9ab3d756bcdc3c)
-==============================================
+====================================================================
 
 Small features:
   * Minor fixes
 
 StarPU 1.2.7 (git revision 07cb7533c22958a76351bec002955f0e2818c530)
-==============================================
+====================================================================
 
 Small features:
   * Add STARPU_HWLOC_INPUT environment variable to save initialization time.
   * Add starpu_data_set/get_ooc_flag.
+  * Use starpu_mpi_tag_t (int64_t) for MPI communication tag
 
 StarPU 1.2.6 (git revision 23049adea01837479f309a75c002dacd16eb34ad)
-==============================================
+====================================================================
 
 Small changes:
   * Fix crash for lws scheduler
   * Avoid making hwloc load PCI topology when CUDA is not enabled
 
 StarPU 1.2.5 (git revision 22f32916916d158e3420033aa160854d1dd341bd)
-==============================================
+====================================================================
 
 Small features:
   * Add a new value STARPU_TASK_COLOR to be used in
@@ -285,7 +311,7 @@ Changes:
   * Do not export -lcuda -lcudart -lOpenCL in *starpu*.pc.
 
 StarPU 1.2.4 (git revision 255cf98175ef462749780f30bfed21452b74b594)
-==============================================
+====================================================================
 
 Small features:
    * Catch of signals SIGINT and SIGSEGV to dump fxt trace files.
@@ -302,8 +328,8 @@ Small features:
    * Add a way to choose the dag.dot colors.
 
 
-StarPU 1.2.3 (svn revision 22444)
-==============================================
+StarPU 1.2.3 (git revision 586ba6452a8eef99f275c891ce08933ae542c6c2)
+====================================================================
 
 New features:
   * Add per-node MPI data.
@@ -335,8 +361,8 @@ Small changes:
     reduction methods are provided, and make sure a handle is
     initialized before trying to read it.
 
-StarPU 1.2.2 (svn revision 21308)
-==============================================
+StarPU 1.2.2 (git revision a0b01437b7b91f33fb3ca36bdea35271cad34464)
+===================================================================
 
 New features:
   * Add starpu_data_acquire_try and starpu_data_acquire_on_node_try.
@@ -377,8 +403,8 @@ Small changes:
   * Fix odd ordering of CPU workers on CPUs due to GPUs stealing some
     cores
 
-StarPU 1.2.1 (svn revision 20299)
-==============================================
+StarPU 1.2.1 (git revision 473acaec8a1fb4f4c73d8b868e4f044b736b41ea)
+====================================================================
 
 New features:
   * Add starpu_fxt_trace_user_event_string.
@@ -414,8 +440,8 @@ Small changes:
     temporary data
   * Fix compatibility with simgrid 3.14
 
-StarPU 1.2.0 (svn revision 18521)
-==============================================
+StarPU 1.2.0 (git revision 5a86e9b61cd01b7797e18956283cc6ea22adfe11)
+====================================================================
 
 New features:
   * MIC Xeon Phi support
@@ -616,19 +642,51 @@ Small changes:
     STARPU_NMIC will be the number of devices, and STARPU_NMICCORES
     will be the number of cores per device.
 
-StarPU 1.1.5 (svn revision xxx)
-==============================================
+StarPU 1.1.8 (git revision f7b7abe9f86361cbc96f2b51c6ad7336b7d1d628)
+====================================================================
 The scheduling context release
 
+Small changes:
+  * Fix compatibility with simgrid 3.14
+  * Fix lock ordering for memory reclaiming
+
+StarPU 1.1.7 (git revision 341044b67809892cf4a388e482766beb50256907)
+====================================================================
+The scheduling context release
+
+Small changes:
+  * Fix type of data home node to allow users to pass -1 to define
+    temporary data
+
+StarPU 1.1.6 (git revision cdffbd5f5447e4d076d659232b3deb14f3c20da6)
+====================================================================
+The scheduling context release
+
+Small features:
+  * Add starpu_task_get_task_succs to get the list of children of a given
+    task.
+  * Ranges can be provided in STARPU_WORKERS_CPUID
+
+Small changes:
+  * Various fixes for MacOS and windows systems
+
+StarPU 1.1.5 (git revision 20469c6f3e7ecd6c0568c8e4e4b5b652598308d8)
+====================================================================
+The scheduling context release
+
+New features:
   * Add starpu_memory_pin and starpu_memory_unpin to pin memory allocated
     another way than starpu_malloc.
   * Add starpu_task_wait_for_n_submitted() and
     STARPU_LIMIT_MAX_NSUBMITTED_TASKS/STARPU_LIMIT_MIN_NSUBMITTED_TASKS to
     easily control the number of submitted tasks by making task submission
     block.
+  * Add STARPU_NOWHERE to create synchronization tasks with data.
+  * Document how to switch between different views of the same data.
+  * Add Fortran 90 module and example using it
 
-StarPU 1.1.4 (svn revision 14856)
-==============================================
+StarPU 1.1.4 (git revision 2a3d30b28d6d099d271134a786335acdbb3931a3)
+====================================================================
 The scheduling context release
 
 New features:
@@ -662,8 +720,8 @@ Small features:
 Changes:
   * Fix complexity of implicit task/data dependency, from quadratic to linear.
 
-StarPU 1.1.3 (svn revision 13450)
-==============================================
+StarPU 1.1.3 (git revision 11afc5b007fe1ab1c729b55b47a5a98ef7f3cfad)
+====================================================================
 The scheduling context release
 
 New features:
@@ -678,8 +736,8 @@ Small changes:
     issues on parallel launches, MPI runs notably.
   * Lots of build fixes for icc on Windows.
 
-StarPU 1.1.2 (svn revision 13011)
-==============================================
+StarPU 1.1.2 (git revision d14c550798630bbc4f3da2b07d793c47e3018f02)
+====================================================================
 The scheduling context release
 
 New features:
@@ -692,8 +750,8 @@ New features:
   * Add STARPU_TRACE_BUFFER_SIZE environment variable to specify the size of
     the trace buffer.
 
-StarPU 1.1.1 (svn revision 12638)
-==============================================
+StarPU 1.1.1 (git revision dab2e51117fac5bef767f3a6b7677abb2147d2f2)
+====================================================================
 The scheduling context release
 
 New features:
@@ -737,8 +795,8 @@ Small changes:
     configure option --enable-starpufft-examples needs to be specified
     to change this behaviour.
 
-StarPU 1.1.0 (svn revision 11960)
-==============================================
+StarPU 1.1.0 (git revision 3c4bc72ccef30e767680cad3d749c4e9010d4476)
+====================================================================
 The scheduling context release
 
 New features:
@@ -975,8 +1033,8 @@ Small changes:
   * Fix performance regression: dmda queues were inadvertently made
     LIFOs in r9611.
 
-StarPU 1.0.3 (svn revision 7379)
-==============================================
+StarPU 1.0.3 (git revision 25f8b3a7b13050e99bf1725ca6f52cfd62e7a861)
+====================================================================
 
 Changes:
   * Several bug fixes in the build system
@@ -984,8 +1042,8 @@ Changes:
   * Fix generating FXT traces bigger than 64MiB.
   * Improve ENODEV error detections in StarPU FFT
 
-StarPU 1.0.2 (svn revision 7210)
-==============================================
+StarPU 1.0.2 (git revision 6f95de279d6d796a39debe8d6c5493b3bdbe0c37)
+====================================================================
 
 Changes:
   * Add starpu_block_shadow_filter_func_vector and an example.
@@ -994,8 +1052,8 @@ Changes:
   * Fix parallel tasks CPU binding and combined worker generation.
   * Fix generating FXT traces bigger than 64MiB.
 
-StarPU 1.0.1 (svn revision 6659)
-==============================================
+StarPU 1.0.1 (git revision 97ea6e15a273e23e4ddabf491b0f9481373ca01a)
+====================================================================
 
 Changes:
   * hwloc support. Warn users when hwloc is not found on the system and
@@ -1009,8 +1067,8 @@ Changes:
   * Update SOCL to use new API
   * Documentation improvement.
 
-StarPU 1.0.0 (svn revision 6306)
-==============================================
+StarPU 1.0.0 (git revision d3ad9ca318ec9acfeaf8eb7d8a018b09e4722292)
+====================================================================
 The extensions-again release
 
 New features:
@@ -1096,8 +1154,8 @@ Small changes:
   * Documentation improvement.
 
 
-StarPU 0.9 (svn revision 3721)
-==============================================
+StarPU 0.9 (git revision 12bba8528fc0d85367d885cddc383ba54efca464)
+==================================================================
 The extensions release
 
   * Provide the STARPU_REDUX data access mode
@@ -1119,8 +1177,8 @@ The extensions release
   * Add stencil MPI example
   * Initial support for CUDA4
 
-StarPU 0.4 (svn revision 2535)
-==============================================
+StarPU 0.4 (git revision ad8d8be3619f211f228c141282d7d504646fc2a6)
+==================================================================
 The API strengthening release
 
   * Major API improvements
@@ -1145,8 +1203,8 @@ The API strengthening release
     - Add OpenCL support
     - Add support for Windows
 
-StarPU 0.2.901 aka 0.3-rc1 (svn revision 1236)
-==============================================
+StarPU 0.2.901 aka 0.3-rc1 (git revision 991f2abb772c17c3d45bbcf27f46197652e6a3ef)
+==================================================================================
 The asynchronous heterogeneous multi-accelerator release
 
   * Many API changes and code cleanups
@@ -1174,8 +1232,8 @@ The asynchronous heterogeneous multi-accelerator release
     specify where to bind the workers
   * Use the hwloc library to detect the actual number of cores
 
-StarPU 0.2.0 (svn revision 1013)
-==============================================
+StarPU 0.2.0 (git revision 73e989f0783e10815aff394f80242760c4ed098c)
+====================================================================
 The Stabilizing-the-Basics release
 
   * Various API cleanups
@@ -1189,8 +1247,8 @@ The Stabilizing-the-Basics release
   * More examples are supplied
 
 
-StarPU 0.1.0 (svn revision 794)
-==============================================
+StarPU 0.1.0 (git revision 911869a96b40c74eb92b30a43d3e08bf445d8078)
+====================================================================
 First release.
 
 Status:

+ 42 - 10
configure.ac

@@ -1,11 +1,11 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
 # Copyright (C) 2011-2018                                Inria
-# Copyright (C) 2009-2019                                Université de Bordeaux
+# Copyright (C) 2009-2020                                Université de Bordeaux
 # Copyright (C) 2017                                     Guillaume Beauchamp
 # Copyright (C) 2018                                     Federal University of Rio Grande do Sul (UFRGS)
 # Copyright (C) 2018                                     Umeà University
-# Copyright (C) 2010-2019                                CNRS
+# Copyright (C) 2010-2020                                CNRS
 # Copyright (C) 2013                                     Thibaut Lambert
 # Copyright (C) 2011                                     Télécom-SudParis
 #
@@ -20,7 +20,7 @@
 #
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 #
-AC_INIT([StarPU], [1.3.99], [starpu-devel@lists.gforge.inria.fr], [starpu], [http://runtime.bordeaux.inria.fr/StarPU/])
+AC_INIT([StarPU], [1.3.99], [starpu-devel@lists.gforge.inria.fr], [starpu], [http://starpu.gforge.inria.fr/])
 AC_CONFIG_SRCDIR(include/starpu.h)
 AC_CONFIG_AUX_DIR([build-aux])
 
@@ -279,7 +279,7 @@ if test x$enable_simgrid = xyes ; then
 	AC_CHECK_TYPES([smx_actor_t], [AC_DEFINE([STARPU_HAVE_SMX_ACTOR_T], [1], [Define to 1 if you have the smx_actor_t type.])], [], [[#include <simgrid/simix.h>]])
 
 	# Latest functions
-	AC_CHECK_FUNCS([MSG_process_attach MSG_zone_get_hosts MSG_process_self_name MSG_process_userdata_init])
+	AC_CHECK_FUNCS([MSG_process_attach sg_actor_attach sg_actor_init MSG_zone_get_hosts MSG_process_self_name MSG_process_userdata_init sg_actor_data])
 	AC_CHECK_FUNCS([xbt_mutex_try_acquire smpi_process_set_user_data sg_zone_get_by_name sg_link_name sg_host_route sg_host_self sg_host_speed simcall_process_create sg_config_continue_after_help])
 	AC_CHECK_FUNCS([xbt_barrier_init], [AC_DEFINE([STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT], [1], [Define to 1 if you have the `xbt_barrier_init' function.])])
 	AC_CHECK_DECLS([smpi_process_set_user_data], [], [], [[#include <smpi/smpi.h>]])
@@ -499,7 +499,7 @@ fi
 AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
 
 AC_ARG_ENABLE(mpi-pedantic-isend, [AS_HELP_STRING([--enable-mpi-pedantic-isend],
-				   [Enable StarPU MPI pedantic isend])],
+				   [Prevent StarPU MPI from reading buffers while being sent over MPI])],
 				   enable_mpi_pedantic_isend=$enableval, enable_mpi_pedantic_isend=no)
 if test x$enable_mpi_pedantic_isend = xyes; then
 	AC_DEFINE(STARPU_MPI_PEDANTIC_ISEND, [1], [enable StarPU MPI pedantic isend])
@@ -697,9 +697,6 @@ if test x$enable_simgrid = xno ; then
 fi
 
 AM_CONDITIONAL(STARPU_MPI_CHECK, test x$running_mpi_check = xyes)
-if test x$running_mpi_check = xyes -a x$enable_simgrid = xyes -a x$enable_shared = xyes ; then
-    AC_MSG_ERROR([MPI with simgrid can not work with shared libraries, use --disable-shared to fix this])
-fi
 if test x$use_mpi = xyes ; then
     AC_MSG_CHECKING(whether MPI tests should be run)
     AC_MSG_RESULT($running_mpi_check)
@@ -727,6 +724,19 @@ else
 	running_mpi_check=no
 fi
 
+if test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes ; then
+    if test x$enable_simgrid = xyes ; then
+        if test x$enable_shared = xyes ; then
+	    AC_MSG_ERROR([MPI with simgrid can not work with shared libraries, use --disable-shared to fix this])
+        else
+	    CFLAGS="$CFLAGS -fPIC"
+	    CXXFLAGS="$CXXFLAGS -fPIC"
+	    NVCCFLAGS="$NVCCFLAGS --compiler-options -fPIC"
+	    FFLAGS="$FFLAGS -fPIC"
+        fi
+    fi
+fi
+
 AM_CONDITIONAL(STARPU_USE_MPI_MPI, test x$build_mpi_lib = xyes)
 AM_CONDITIONAL(STARPU_USE_MPI_NMAD, test x$build_nmad_lib = xyes)
 AM_CONDITIONAL(STARPU_USE_MPI, test x$build_nmad_lib = xyes -o x$build_mpi_lib = xyes)
@@ -1076,6 +1086,21 @@ STARPU_CHECK_SYNC_FETCH_AND_OR
 # This defines HAVE_SYNC_LOCK_TEST_AND_SET
 STARPU_CHECK_SYNC_LOCK_TEST_AND_SET
 
+# This defines HAVE_ATOMIC_COMPARE_EXCHANGE_N
+STARPU_CHECK_ATOMIC_COMPARE_EXCHANGE_N
+
+# This defines HAVE_ATOMIC_EXCHANGE_N
+STARPU_CHECK_ATOMIC_EXCHANGE_N
+
+# This defines HAVE_ATOMIC_FETCH_ADD
+STARPU_CHECK_ATOMIC_FETCH_ADD
+
+# This defines HAVE_ATOMIC_FETCH_OR
+STARPU_CHECK_ATOMIC_FETCH_OR
+
+# This defines HAVE_ATOMIC_TEST_AND_SET
+STARPU_CHECK_ATOMIC_TEST_AND_SET
+
 # This defines HAVE_SYNC_SYNCHRONIZE
 STARPU_CHECK_SYNC_SYNCHRONIZE
 
@@ -2190,6 +2215,10 @@ if test x$use_fxt = xyes; then
 		FXT_LIBS="$(pkg-config --variable=libdir fxt)/libfxt.a -Wl,--as-needed $(pkg-config --libs --static fxt) -Wl,--no-as-needed"
 	fi
 
+	AC_CHECK_LIB([papi], [PAPI_library_init],
+		     [AC_DEFINE([STARPU_PAPI], [1], [Define to 1 if you have the libpapi library])
+		      PAPI_LIBS=-lpapi])
+
 	##########################################
 	# Poti is a library to generate paje trace files
 	##########################################
@@ -3082,6 +3111,9 @@ if test x$enable_mlr = xyes -a "$starpu_windows" != "yes" ; then
 	install_min_dgels=no
 	support_mlr=yes
    	STARPU_SEARCH_LIBS(LAPACK,[dgels_],[lapack],use_system_lapack=yes,,)
+	if test x$blas_lib = xnone ; then
+	   use_system_lapack=no
+	fi
 	if test x$use_system_lapack = xyes; then
 	   	AC_DEFINE(STARPU_MLR_MODEL, [1], [use reflapack library])
 		LDFLAGS="-llapack $LDFLAGS"
@@ -3205,7 +3237,7 @@ AS_IF([test "$use_hwloc" != "no"],
 AM_CONDITIONAL(STARPU_HAVE_HWLOC, test "x$have_valid_hwloc" = "xyes")
 # in case hwloc was explicitely required, but is not available, this is an error
 AS_IF([test "$use_hwloc" = "yes" -a "$have_valid_hwloc" = "no"],
-      [AC_MSG_ERROR([cannot find hwloc])]
+      [AC_MSG_ERROR([cannot find hwloc or pkg-config])]
      )
 # in case hwloc is not available but was not explicitly disabled, this is an error
 AS_IF([test "$have_valid_hwloc" = "no" -a "$use_hwloc" != "no"],
@@ -3498,7 +3530,7 @@ AM_CONDITIONAL(AVAILABLE_DOC, [test x$available_doc != xno])
 ###############################################################################
 
 # these are the flags needed for linking libstarpu (and thus also for static linking)
-LIBSTARPU_LDFLAGS="$STARPU_OPENCL_LDFLAGS $STARPU_CUDA_LDFLAGS $HWLOC_LIBS $FXT_LIBS $STARPU_COI_LDFLAGS $STARPU_SCIF_LDFLAGS $STARPU_RCCE_LDFLAGS $STARPU_LEVELDB_LDFLAGS $STARPU_GLPK_LDFLAGS $STARPU_LEVELDB_LDFLAGS $SIMGRID_LIBS $STARPU_BLAS_LDFLAGS $STARPU_OMP_LDFLAGS $DGELS_LIBS"
+LIBSTARPU_LDFLAGS="$STARPU_OPENCL_LDFLAGS $STARPU_CUDA_LDFLAGS $HWLOC_LIBS $FXT_LIBS $PAPI_LIBS $STARPU_COI_LDFLAGS $STARPU_SCIF_LDFLAGS $STARPU_RCCE_LDFLAGS $STARPU_LEVELDB_LDFLAGS $STARPU_GLPK_LDFLAGS $STARPU_LEVELDB_LDFLAGS $SIMGRID_LIBS $STARPU_BLAS_LDFLAGS $STARPU_OMP_LDFLAGS $DGELS_LIBS"
 AC_SUBST([LIBSTARPU_LDFLAGS])
 
 LIBSTARPU_LINK=libstarpu-$STARPU_EFFECTIVE_VERSION.la

+ 8 - 4
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2013,2015,2017                      Inria
- * Copyright (C) 2010-2019                                CNRS
+ * Copyright (C) 2010-2020                                CNRS
  * Copyright (C) 2009-2011,2013-2019                      Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -102,7 +102,7 @@ to use a version that takes the a stream parameter.
 Unfortunately, some CUDA libraries do not have stream variants of
 kernels. This will seriously lower the potential for overlapping.
 If some CUDA calls are made without specifying this local stream,
-synchronization needs to be explicited with cudaThreadSynchronize() around these
+synchronization needs to be made explicit with cudaDeviceSynchronize() around these
 calls, to make sure that they get properly synchronized with the calls using
 the local stream. Notably, \c cudaMemcpy() and \c cudaMemset() are actually
 asynchronous and need such explicit synchronization! Use \c cudaMemcpyAsync() and
@@ -440,10 +440,14 @@ and in Joules for the energy consumption models.
 A quick view of how many tasks each worker has executed can be obtained by setting
 <c>export STARPU_WORKER_STATS=1</c> (\ref STARPU_WORKER_STATS). This is a convenient way to check that
 execution did happen on accelerators, without penalizing performance with
-the profiling overhead.
+the profiling overhead. \ref STARPU_WORKER_STATS_FILE can be defined
+to specify the name of a file in which to write the statistics; by default,
+statistics are printed on the standard error stream.
 
 A quick view of how much data transfers have been issued can be obtained by setting
-<c>export STARPU_BUS_STATS=1</c> (\ref STARPU_BUS_STATS).
+<c>export STARPU_BUS_STATS=1</c> (\ref STARPU_BUS_STATS). \ref
+STARPU_BUS_STATS_FILE can be defined to specify the name of a file in which to
+write the statistics; by default, statistics are printed on the standard error stream.
 
 More detailed profiling information can be enabled by using <c>export STARPU_PROFILING=1</c> (\ref STARPU_PROFILING)
 or by

+ 5 - 1
doc/doxygen/chapters/301_tasks.doxy

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2010-2019                                CNRS
  * Copyright (C) 2011,2012,2018                           Inria
- * Copyright (C) 2009-2011,2014-2016,2018                 Université de Bordeaux
+ * Copyright (C) 2009-2011,2014-2016,2018-2019            Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -31,6 +31,8 @@ checked if bad performance are observed. To get a grasp at the scalability
 possibility according to task size, one can run
 <c>tests/microbenchs/tasks_size_overhead.sh</c> which draws curves of the
 speedup of independent tasks of very small sizes.
+To determine what task size your application is actually using, one can use
+<c>starpu_fxt_data_trace</c>, see \ref DataTrace .
 
 The choice of scheduler also has impact over the overhead: for instance, the
  scheduler <c>dmda</c> takes time to make a decision, while <c>eager</c> does
@@ -564,6 +566,8 @@ worker sizes (making several measurements for each worker size) and
 thus be able to avoid choosing a large combined worker if the codelet
 does not actually scale so much.
 
+This is however for now only a proof of concept, and has not really been optimized yet.
+
 \subsection CombinedWorkers Combined Workers
 
 By default, StarPU creates combined workers according to the architecture

+ 80 - 3
doc/doxygen/chapters/310_data_management.doxy

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2019                                CNRS
- * Copyright (C) 2009-2011,2014-2019                      Université de Bordeaux
+ * Copyright (C) 2009-2011,2014-2020                      Université de Bordeaux
  * Copyright (C) 2011,2012                                Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -101,6 +101,21 @@ can also be partitioned with some overlapping by using
 starpu_block_filter_block_shadow(), starpu_block_filter_vertical_block_shadow(),
 or starpu_block_filter_depth_block_shadow().
 
+\subsection TensorDataInterface Tensor Data Interface
+
+To register 4-D matrices with potential paddings on Y, Z, and T dimensions,
+one can use the tensor data interface. Here is an example of how to
+register tensor data with StarPU by using starpu_tensor_data_register().
+
+\code{.c}
+float *block;
+starpu_data_handle_t block_handle;
+block = (float*)malloc(nx*ny*nz*nt*sizeof(float));
+starpu_tensor_data_register(&block_handle, STARPU_MAIN_RAM, (uintptr_t)block, nx, nx*ny, nx*ny*nz, nx, ny, nz, nt, sizeof(float));
+\endcode
+
+Partitioning filters are not implemented yet.
+
 \subsection BCSRDataInterface BCSR Data Interface
 
 BCSR (Blocked Compressed Sparse Row Representation) sparse matrix data
@@ -169,7 +184,9 @@ StarPU provides an example on how to deal with such matrices in
 <c>examples/spmv</c>.
 
 BCSR data handles can be partitioned into its dense matrix blocks by using
-starpu_bcsr_filter_canonical_block().
+starpu_bcsr_filter_canonical_block(), or split into other BCSR data handles by
+using starpu_bcsr_filter_vertical_block() (but only splitting along the leading dimension is
+supported, i.e. along adjacent nnz blocks).
 
 \subsection CSRDataInterface CSR Data Interface
 
@@ -202,7 +219,39 @@ that the StarPU core knows the new data layout. The starpu_data_interface_ops
 structure however then needs to have the starpu_data_interface_ops::dontcache
 field set to 1, to prevent StarPU from trying to perform any cached allocation,
 since the allocated size will vary. An example is available in
-<c>tests/datawizard/variable_size.c</c>
+<c>tests/datawizard/variable_size.c</c>. The example uses its own data
+interface so as to contain some simulation information for data growth, but the
+principle can be applied for any data interface.
+
+The principle is to use <c>starpu_malloc_on_node_flags</c> to make the new
+allocation, and use <c>starpu_free_on_node_flags</c> to release any previous
+allocation. The flags have to be precisely like in the example:
+
+\code{.c}
+unsigned workerid = starpu_worker_get_id_check();
+unsigned dst_node = starpu_worker_get_memory_node(workerid);
+interface->ptr = starpu_malloc_on_node_flags(dst_node, size + increase, STARPU_MALLOC_PINNED | STARPU_MALLOC_COUNT | STARPU_MEMORY_OVERFLOW);
+starpu_free_on_node_flags(dst_node, old, size, STARPU_MALLOC_PINNED | STARPU_MALLOC_COUNT | STARPU_MEMORY_OVERFLOW);
+interface->size += increase;
+\endcode
+
+so that the allocated area has the expected properties and the allocation is accounted for properly.
+
+Depending on the interface (vector, CSR, etc.) you may have to fix several
+members of the data interface: e.g. both <c>nx</c> and <c>allocsize</c> for
+vectors, and store the pointer both in <c>ptr</c> and <c>dev_handle</c>.
+
+Some interfaces make a distinction between the actual number of elements
+stored in the data and the actually allocated buffer. For instance, the vector
+interface uses the <c>nx</c> field for the former, and the <c>allocsize</c> for
+the latter. This allows for lazy reallocation to avoid reallocating the buffer
+every time to exactly match the actual number of elements. Computations and data
+transfers will use the <c>nx</c> field, while allocation functions will use the
+<c>allocsize</c> field. One just has to make sure that <c>allocsize</c> is always
+greater than or equal to <c>nx</c>.
+
+Important note: one cannot change the size of partitioned data.
+
 
 \section DataManagement Data Management
 
@@ -508,6 +557,34 @@ starpu_data_invalidate_submit(handle);
 
 And now we can start using vertical slices, etc.
 
+\section DataPointers Handles data buffer pointers
+
+A simple understanding of a StarPU handle is that it is a collection of buffers on
+each memory node of the machine, which contain the same data.  The picture is
+however made more complex with the OpenCL support and with partitioning.
+
+When partitioning a handle, the data buffers of the subhandles will indeed
+be inside the data buffers of the main handle (to save transferring data
+back and forth between the main handle and the subhandles). But in OpenCL,
+a <c>cl_mem</c> is not a pointer, but an opaque value on which pointer
+arithmetic can not be used. That is why data interfaces contain three members:
+<c>dev_handle</c>, <c>offset</c>, and <c>ptr</c>. The <c>dev_handle</c> member
+is what the allocation function returned, and one can not do arithmetic on
+it. The <c>offset</c> member is the offset inside the allocated area, most often
+it will be 0 because data start at the beginning of the allocated area, but
+when the handle is partitioned, the subhandles will have varying <c>offset</c>
+values, for each subpiece. The <c>ptr</c> member, in the non-OpenCL case, i.e.
+when pointer arithmetic can be used on <c>dev_handle</c>, is just the sum of
+<c>dev_handle</c> and <c>offset</c>, provided for convenience.
+
+This means that:
+<ul>
+<li>computation kernels can use <c>ptr</c> in non-OpenCL implementations.</li>
+<li>computation kernels have to use <c>dev_handle</c> and <c>offset</c> in the OpenCL implementation.</li>
+<li>allocation methods of data interfaces have to store the value returned by starpu_malloc_on_node in <c>dev_handle</c> and <c>ptr</c>, and set <c>offset</c> to 0.</li>
+<li>partitioning filters have to copy over <c>dev_handle</c> without modifying it, set in the child different values of <c>offset</c>, and set <c>ptr</c> accordingly as the sum of <c>dev_handle</c> and <c>offset</c>.</li>
+</ul>
+
 \section DefiningANewDataFilter Defining A New Data Filter
 
 StarPU provides a series of predefined filters in \ref API_Data_Partition, but

+ 2 - 2
doc/doxygen/chapters/320_scheduling.doxy

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2019                                CNRS
+ * Copyright (C) 2010-2020                                CNRS
  * Copyright (C) 2011,2012,2016                           Inria
  * Copyright (C) 2009-2011,2014-2019                      Université de Bordeaux
  *
@@ -168,7 +168,7 @@ processing units), the idle power of the machine should be given by setting
 be obtained from the machine power supplier.
 
 The energy actually consumed by the total execution can be displayed by setting
-<c>export STARPU_PROFILING=1 STARPU_WORKER_STATS=1</c> .
+<c>export STARPU_PROFILING=1 STARPU_WORKER_STATS=1</c> (\ref STARPU_PROFILING and \ref STARPU_WORKER_STATS).
 
 For OpenCL devices, on-line task consumption measurement is currently supported through the
 <c>CL_PROFILING_POWER_CONSUMED</c> OpenCL extension, implemented in the MoviSim

+ 7 - 17
doc/doxygen/chapters/370_online_performance_tools.doxy

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011,2012,2016                           Inria
- * Copyright (C) 2010-2019                                CNRS
+ * Copyright (C) 2010-2020                                CNRS
  * Copyright (C) 2009-2011,2014,2016,2018-2019            Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -94,7 +94,9 @@ information associated to a worker.
 
 To easily display all this information, the environment variable
 \ref STARPU_WORKER_STATS can be set to <c>1</c> (in addition to setting
-\ref STARPU_PROFILING to 1). A summary will then be displayed at program termination:
+\ref STARPU_PROFILING to 1). A summary will then be displayed at
+program termination.  To display the summary in a file instead of the
+standard error stream, use the environment variable \ref STARPU_WORKER_STATS_FILE.
 
 \verbatim
 Worker stats:
@@ -150,7 +152,9 @@ CUDA 2  4534.229519     2417.069025     2417.060863     0.000000
 
 Statistics about the data transfers which were performed and temporal average
 of bandwidth usage can be obtained by setting the environment variable
-\ref STARPU_BUS_STATS to <c>1</c>; a summary will then be displayed at program termination:
+\ref STARPU_BUS_STATS to <c>1</c>; a summary will then be displayed at
+program termination. To display the summary in a file instead of the
+standard error stream, use the environment variable \ref STARPU_BUS_STATS_FILE.
 
 \verbatim
 Data transfer stats:
@@ -463,20 +467,6 @@ The application can also request an on-the-fly XML report of the performance
 model, by calling starpu_perfmodel_dump_xml() to print the report to a
 <c>FILE*</c>.
 
-\section DataTrace Data trace and tasks length
-It is possible to get statistics about tasks length and data size by using :
-\verbatim
-$ starpu_fxt_data_trace filename [codelet1 codelet2 ... codeletn]
-\endverbatim
-Where filename is the FxT trace file and codeletX the names of the codelets you
-want to profile (if no names are specified, <c>starpu_fxt_data_trace</c> will profile them all).
-This will create a file, <c>data_trace.gp</c> which
-can be executed to get a <c>.eps</c> image of these results. On the image, each point represents a
-task, and each color corresponds to a codelet.
-
-\image html data_trace.png
-\image latex data_trace.eps "" width=\textwidth
-
 \section Performance Monitoring Counters
 
 This section presents the StarPU performance monitoring framework. It summarizes the objectives of the framework. It then introduces the entities involved in the framework. It presents the API of the framework, as well as some implementation details. It exposes the typical sequence of operations to plug an external tool to monitor a performance counter of StarPU.

+ 82 - 31
doc/doxygen/chapters/380_offline_performance_tools.doxy

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2011,2012,2015-2017                      Inria
  * Copyright (C) 2010-2019                                CNRS
- * Copyright (C) 2009-2011,2014-2017,2019                 Université de Bordeaux
+ * Copyright (C) 2009-2011,2014-2017,2019-2020            Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -160,6 +160,13 @@ starpu_task::color. Colors are expressed with the following format
 <c>basic_examples/task_insert_color</c> for examples on how to assign
 colors.
 
+To get statistics on the time spent in runtime overhead, one can use the
+statistics plugin of ViTE. In Preferences, select Plugins. In "States Type",
+select "Worker State". Then click on "Reload" to update the histogram. The red
+"Idle" percentages are due to lack of parallelism, while the brown "Overhead"
+and "Scheduling" percentages are due to the overhead of the runtime and of the
+scheduler.
+
 To identify tasks precisely, the application can also set the field
 starpu_task::tag_id or setting \ref STARPU_TAG_ONLY when calling
 starpu_task_insert(). The value of the tag will then show up in the
@@ -202,7 +209,7 @@ $ dot -Tpdf dag.dot -o output.pdf
 
 Another generated trace file gives details on the executed tasks. The
 file, created in the current directory, is named <c>tasks.rec</c>. This file
-is in the recutils format, i.e. <c>Field: value</c> lines, and empty lines to
+is in the recutils format, i.e. <c>Field: value</c> lines, and empty lines are used to
 separate each task.  This can be used as a convenient input for various ad-hoc
 analysis tools. By default it only contains information about the actual
 execution. Performance models can be obtained by running
@@ -224,6 +231,14 @@ Another possibility is to obtain the performance models as an auxiliary <c>perfm
 $ starpu_perfmodel_recdump tasks.rec -o perfmodel.rec
 \endverbatim
 
+\subsubsection TraceSchedTaskDetails Getting Scheduling Task Details
+
+The file, <c>sched_tasks.rec</c>, created in the current directory,
+and in the recutils format, gives information about the tasks
+scheduling, and lists the push and pop actions of the scheduler. For
+each action, it gives the timestamp, the job priority and the job id.
+Each action is separated from the next one by empty lines.
+
 \subsubsection MonitoringActivity Monitoring Activity
 
 Another generated trace file is an activity trace. The file, created
@@ -259,6 +274,20 @@ and whose name start with "modular-"), the call to
 which can be viewed in a javascript-enabled web browser. It shows the
 flow of tasks between the components of the modular scheduler.
 
+\subsubsection TimeBetweenSendRecvDataUse Analyzing Time Between MPI Data Transfer and Use by Tasks
+
+<c>starpu_fxt_tool</c> produces a file called <c>comms.rec</c> which describes all 
+MPI communications. The script <c>starpu_send_recv_data_use.py</c> uses this file 
+and <c>tasks.rec</c> in order to produce two graphs: the first one shows durations 
+between the reception of data and their usage by a task and the second one plots the 
+same graph but with elapsed time between send and usage of a data by the sender.
+
+\image html trace_recv_use.png
+\image latex trace_recv_use.eps "" width=\textwidth
+
+\image html trace_send_use.png
+\image latex trace_send_use.eps "" width=\textwidth
+
 \subsection LimitingScopeTrace Limiting The Scope Of The Trace
 
 For computing statistics, it is useful to limit the trace to a given portion of
@@ -434,6 +463,21 @@ histogram of the codelet execution time distribution.
 \image html distrib_data_histo.png
 \image latex distrib_data_histo.eps "" width=\textwidth
 
+\section DataTrace Data trace and tasks length
+
+It is possible to get statistics about tasks length and data size by using :
+\verbatim
+$ starpu_fxt_data_trace filename [codelet1 codelet2 ... codeletn]
+\endverbatim
+Where filename is the FxT trace file and codeletX the names of the codelets you
+want to profile (if no names are specified, <c>starpu_fxt_data_trace</c> will profile them all).
+This will create a file, <c>data_trace.gp</c> which
+can be executed to get a <c>.eps</c> image of these results. On the image, each point represents a
+task, and each color corresponds to a codelet.
+
+\image html data_trace.png
+\image latex data_trace.eps "" width=\textwidth
+
 \section TraceStatistics Trace Statistics
 
 More than just codelet performance, it is interesting to get statistics over all
@@ -526,6 +570,17 @@ more efficient):
 $ starpu_paje_sort paje.trace
 \endverbatim
 
+\section PapiCounters PAPI counters
+
+Performance counter values can be obtained from the PAPI framework if
+<c>./configure</c> detected libpapi. One has to set the \ref STARPU_PROFILING
+environment variable to 1 and then specify which counters to record with the
+\ref STARPU_PROF_PAPI_EVENTS environment variable. For instance:
+
+\verbatim
+export STARPU_PROFILING=1 STARPU_PROF_PAPI_EVENTS="PAPI_TOT_INS PAPI_TOT_CYC"
+\endverbatim
+
 \section TheoreticalLowerBoundOnExecutionTime Theoretical Lower Bound On Execution Time
 
 StarPU can record a trace of what tasks are needed to complete the
@@ -601,58 +656,50 @@ Moreover, statistics will be displayed at the end of the execution on
 data handles which have not been cleared out. This can be disabled by
 setting the environment variable \ref STARPU_MEMORY_STATS to <c>0</c>.
 
-For example, if you do not unregister data at the end of the complex
-example, you will get something similar to:
-
-\verbatim
-$ STARPU_MEMORY_STATS=0 ./examples/interface/complex
-Complex[0] = 45.00 + 12.00 i
-Complex[0] = 78.00 + 78.00 i
-Complex[0] = 45.00 + 12.00 i
-Complex[0] = 45.00 + 12.00 i
-\endverbatim
+For example, by adding a call to the function
+starpu_data_display_memory_stats() in the fblock example before
+unpartitioning the data, one will get something
+similar to:
 
 \verbatim
-$ STARPU_MEMORY_STATS=1 ./examples/interface/complex
-Complex[0] = 45.00 + 12.00 i
-Complex[0] = 78.00 + 78.00 i
-Complex[0] = 45.00 + 12.00 i
-Complex[0] = 45.00 + 12.00 i
-
+$ STARPU_MEMORY_STATS=1 ./examples/filters/fblock
+...
 #---------------------
-Memory stats:
+Memory stats :
 #-------
-Data on Node #3
+Data on Node #2
 #-----
-Data : 0x553ff40
-Size : 16
+Data : 0x5562074e8670
+Size : 144
 
 #--
 Data access stats
 /!\ Work Underway
 Node #0
-	Direct access : 4
+	Direct access : 0
 	Loaded (Owner) : 0
 	Loaded (Shared) : 0
-	Invalidated (was Owner) : 0
+	Invalidated (was Owner) : 1
 
-Node #3
+Node #2
 	Direct access : 0
-	Loaded (Owner) : 0
-	Loaded (Shared) : 1
+	Loaded (Owner) : 1
+	Loaded (Shared) : 0
 	Invalidated (was Owner) : 0
 
+#-------
+Data on Node #3
 #-----
-Data : 0x5544710
-Size : 16
+Data : 0x5562074e9338
+Size : 96
 
 #--
 Data access stats
 /!\ Work Underway
 Node #0
-	Direct access : 2
+	Direct access : 0
 	Loaded (Owner) : 0
-	Loaded (Shared) : 1
+	Loaded (Shared) : 0
 	Invalidated (was Owner) : 1
 
 Node #3
@@ -660,6 +707,10 @@ Node #3
 	Loaded (Owner) : 1
 	Loaded (Shared) : 0
 	Invalidated (was Owner) : 0
+
+
+#---------------------
+...
 \endverbatim
 
 \section DataStatistics Data Statistics

+ 7 - 5
doc/doxygen/chapters/401_out_of_core.doxy

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2013,2014,2016-2019                      CNRS
+ * Copyright (C) 2013,2014,2016-2020                      CNRS
  * Copyright (C) 2013,2014,2017,2018-2019                 Université de Bordeaux
  * Copyright (C) 2013                                     Inria
  * Copyright (C) 2013                                     Corentin Salingue
@@ -91,7 +91,7 @@ system daemons, and application data).
 When the register call is made, StarPU will benchmark the disk. This can
 take some time.
 
-<strong>Warning: the size thus has to be at least \ref STARPU_DISK_SIZE_MIN bytes ! </strong> 
+<strong>Warning: the size thus has to be at least \ref STARPU_DISK_SIZE_MIN bytes ! </strong>
 
 StarPU will then automatically try to evict unused data to this new disk. One
 can also use the standard StarPU memory node API to prefetch data etc., see the
@@ -127,7 +127,7 @@ value right after this call, and thus the very first task using the handle needs
 to use the ::STARPU_W mode like above, ::STARPU_R or ::STARPU_RW would not make
 sense.
 
-By default, StarPU will try to push any data handle to the disk. 
+By default, StarPU will try to push any data handle to the disk.
 To specify whether a given handle should be pushed to the disk,
 starpu_data_set_ooc_flag() should be used.
 
@@ -174,7 +174,9 @@ work on this area in the coming future.
 
 Beyond pure performance feedback, some figures are interesting to have a look at.
 
-Using <c>export STARPU_BUS_STATS=1</c> gives an overview of the data
+Using <c>export STARPU_BUS_STATS=1</c> (\ref STARPU_BUS_STATS and \ref STARPU_BUS_STATS_FILE
+to define a filename in which to display statistics, by default the
+standard error stream is used) gives an overview of the data
 transfers which were needed. The values can also be obtained at runtime
 by using starpu_bus_get_profiling_info(). An example can be read in
 <c>src/profiling/profiling_helpers.c</c>.
@@ -188,7 +190,7 @@ Data transfer speed for /tmp/sthibault-disk-DJzhAj (node 1):
 1 -> 0: 23858 µs
 
 #---------------------
-TEST DISK MEMORY 
+TEST DISK MEMORY
 
 #---------------------
 Data transfer stats:

+ 2 - 1
doc/doxygen/chapters/410_mpi_support.doxy

@@ -236,7 +236,8 @@ For send communications, data is acquired with the mode ::STARPU_R.
 When using the \c configure option
 \ref enable-mpi-pedantic-isend "--enable-mpi-pedantic-isend", the mode
 ::STARPU_RW is used to make sure there is no more than 1 concurrent
-\c MPI_Isend() call accessing a data.
+\c MPI_Isend() call accessing a data
+and that StarPU tasks do not read from it during the communication.
 
 Internally, all communication are divided in 2 communications, a first
 message is used to exchange an envelope describing the data (i.e its

+ 1 - 1
doc/doxygen/chapters/470_simgrid.doxy

@@ -23,7 +23,7 @@
 /*! \page SimGridSupport SimGrid Support
 
 StarPU can use Simgrid in order to simulate execution on an arbitrary
-platform. This was tested with SimGrid from 3.11 to 3.16, and 3.18 to 3.23.
+platform. This was tested with SimGrid from 3.11 to 3.16, and 3.18 to 3.24.
 Other versions may have compatibility issues. 3.17 notably does not build at
 all. MPI simulation does not work with version 3.22.
 

+ 47 - 13
doc/doxygen/chapters/501_environment_variables.doxy

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2013,2015-2017                      Inria
- * Copyright (C) 2010-2019                                CNRS
- * Copyright (C) 2009-2011,2013-2019                      Université de Bordeaux
+ * Copyright (C) 2010-2020                                CNRS
+ * Copyright (C) 2009-2011,2013-2020                      Université de Bordeaux
  * Copyright (C) 2016                                     Uppsala University
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -535,6 +535,13 @@ Define the idle power of the machine (\ref Energy-basedScheduling).
 Enable on-line performance monitoring (\ref EnablingOn-linePerformanceMonitoring).
 </dd>
 
+<dt>STARPU_PROF_PAPI_EVENTS</dt>
+<dd>
+\anchor STARPU_PROF_PAPI_EVENTS
+\addindex __env__STARPU_PROF_PAPI_EVENTS
+Specify which PAPI events should be recorded in the trace (\ref PapiCounters).
+</dd>
+
 </dl>
 
 \section Extensions Extensions
@@ -607,6 +614,22 @@ When set to 0, the use of priorities to order MPI communications is disabled
 (\ref MPISupport).
 </dd>
 
+<dt>STARPU_MPI_NDETACHED_SEND</dt>
+<dd>
+\anchor STARPU_MPI_NDETACHED_SEND
+\addindex __env__STARPU_MPI_NDETACHED_SEND
+This sets the number of send requests that StarPU-MPI will emit concurrently. The default is 10.
+</dd>
+
+<dt>STARPU_MPI_NREADY_PROCESS</dt>
+<dd>
+\anchor STARPU_MPI_NREADY_PROCESS
+\addindex __env__STARPU_MPI_NREADY_PROCESS
+This sets the number of requests that StarPU-MPI will submit to MPI before
+polling for termination of existing requests. The default is 10.
+</dd>
+
+
 <dt>STARPU_MPI_FAKE_SIZE</dt>
 <dd>
 \anchor STARPU_MPI_FAKE_SIZE
@@ -1070,7 +1093,17 @@ StarPU for internal data structures during execution.
 \anchor STARPU_BUS_STATS
 \addindex __env__STARPU_BUS_STATS
 When defined, statistics about data transfers will be displayed when calling
-starpu_shutdown() (\ref Profiling).
+starpu_shutdown() (\ref Profiling). By default, statistics are printed
+on the standard error stream; use the environment variable \ref
+STARPU_BUS_STATS_FILE to define another filename.
+</dd>
+
+<dt>STARPU_BUS_STATS_FILE</dt>
+<dd>
+\anchor STARPU_BUS_STATS_FILE
+\addindex __env__STARPU_BUS_STATS_FILE
+Define the name of the file in which to display data transfer
+statistics, see \ref STARPU_BUS_STATS.
 </dd>
 
 <dt>STARPU_WORKER_STATS</dt>
@@ -1080,7 +1113,17 @@ starpu_shutdown() (\ref Profiling).
 When defined, statistics about the workers will be displayed when calling
 starpu_shutdown() (\ref Profiling). When combined with the
 environment variable \ref STARPU_PROFILING, it displays the energy
-consumption (\ref Energy-basedScheduling).
+consumption (\ref Energy-basedScheduling).  By default, statistics are
+printed on the standard error stream; use the environment variable
+\ref STARPU_WORKER_STATS_FILE to define another filename.
+</dd>
+
+<dt>STARPU_WORKER_STATS_FILE</dt>
+<dd>
+\anchor STARPU_WORKER_STATS_FILE
+\addindex __env__STARPU_WORKER_STATS_FILE
+Define the name of the file in which to display worker statistics, see
+\ref STARPU_WORKER_STATS.
 </dd>
 
 <dt>STARPU_STATS</dt>
@@ -1187,15 +1230,6 @@ the current time() (unless SimGrid mode is enabled, in which case it is always
 0). \ref STARPU_RAND_SEED allows to set the seed to a specific value.
 </dd>
 
-<dt>STARPU_IDLE_TIME</dt>
-<dd>
-\anchor STARPU_IDLE_TIME
-\addindex __env__STARPU_IDLE_TIME
-When set to a value being a valid filename, a corresponding file
-will be created when shutting down StarPU. The file will contain the
-sum of all the workers' idle time.
-</dd>
-
 <dt>STARPU_GLOBAL_ARBITER</dt>
 <dd>
 \anchor STARPU_GLOBAL_ARBITER

+ 3 - 2
doc/doxygen/chapters/510_configure_options.doxy

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2011-2013,2015-2017                      Inria
  * Copyright (C) 2010-2017, 2019                          CNRS
- * Copyright (C) 2009-2011,2013-2018                      Université de Bordeaux
+ * Copyright (C) 2009-2011,2013-2019                      Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -389,7 +389,8 @@ to be available in the main memory of the node submitting the request.
 For send communications, data is acquired with the mode ::STARPU_R.
 When enabling the pedantic mode, data are instead acquired with the
 ::STARPU_RW which thus ensures that there is not more than 1
-concurrent MPI_Isend calls accessing the data.
+concurrent MPI_Isend calls accessing the data
+and that StarPU tasks do not read from it during the communication.
 </dd>
 
 <dt>--enable-mpi-master-slave</dt>

File diff suppressed because it is too large
+ 4274 - 0
doc/doxygen/chapters/images/trace_recv_use.eps


BIN
doc/doxygen/chapters/images/trace_recv_use.pdf


BIN
doc/doxygen/chapters/images/trace_recv_use.png


File diff suppressed because it is too large
+ 4019 - 0
doc/doxygen/chapters/images/trace_send_use.eps


BIN
doc/doxygen/chapters/images/trace_send_use.pdf


BIN
doc/doxygen/chapters/images/trace_send_use.png


+ 3 - 3
doc/doxygen/refman.tex

@@ -37,11 +37,11 @@ Generated by Doxygen.
 This manual documents the usage of StarPU version \STARPUVERSION. Its contents
 was last updated on \STARPUUPDATED.\\
 
-Copyright © 2009–2018 Université de Bordeaux
+Copyright © 2009–2019 Université de Bordeaux
 
-Copyright © 2010-2018 CNRS
+Copyright © 2010-2019 CNRS
 
-Copyright © 2011-2018 Inria
+Copyright © 2011-2019 Inria
 
 \medskip
 

+ 3 - 3
doc/doxygen_dev/refman.tex

@@ -34,11 +34,11 @@ Generated by Doxygen.
 This manual documents the internal usage of StarPU version \STARPUVERSION. Its contents
 was last updated on \STARPUUPDATED.\\
 
-Copyright © 2009–2018 Université de Bordeaux
+Copyright © 2009–2019 Université de Bordeaux
 
-Copyright © 2010-2018 CNRS
+Copyright © 2010-2019 CNRS
 
-Copyright © 2011-2018 Inria
+Copyright © 2011-2019 Inria
 
 \medskip
 

+ 45 - 40
examples/cholesky/cholesky_compil.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2013,2015                           Inria
- * Copyright (C) 2009-2017,2019                           Université de Bordeaux
+ * Copyright (C) 2009-2017,2019-2020                           Université de Bordeaux
  * Copyright (C) 2010-2013,2015-2017                      CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2010                                     Mehdi Juhoor
@@ -24,6 +24,9 @@
  * compiler-side optimizations.
  */
 
+/* Note: this is using Fortran ordering, i.e. column-major ordering, i.e.
+ * elements with consecutive row numbers are consecutive in memory */
+
 #include "cholesky.h"
 #include "../sched_ctx_utils/sched_ctx_utils.h"
 #include <math.h>
@@ -50,8 +53,8 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
 	unsigned long nelems = starpu_matrix_get_nx(dataA);
 	unsigned long nn = nelems/nblocks;
-	int N = nblocks;
 	int M = nblocks;
+	int N = nblocks;
 
 	int lambda_b = starpu_get_env_float_default("CHOLESKY_LAMBDA_B", nblocks);
 	int lambda_o_u = starpu_get_env_float_default("CHOLESKY_LAMBDA_O_U", 0);
@@ -70,7 +73,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 #define ceild(n,d)  ceil(((double)(n))/((double)(d)))
 #define floord(n,d) floor(((double)(n))/((double)(d)))
 
-#define A(i,j) starpu_data_get_sub_data(dataA, 2, j, i)
+#define A(i,j) starpu_data_get_sub_data(dataA, 2, i, j)
 
 #define _POTRF(cl, A, prio, name) do { \
 		int ret = starpu_task_insert(cl, \
@@ -204,31 +207,34 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 static int cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 {
 	starpu_data_handle_t dataA;
-	unsigned x, y;
+	unsigned m, n;
 
 	/* monitor and partition the A matrix into blocks :
-	 * one block is now determined by 2 unsigned (i,j) */
+	 * one block is now determined by 2 unsigned (m,n) */
 	starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(float));
 
+	/* Split into blocks of complete rows first */
 	struct starpu_data_filter f =
 	{
-		.filter_func = starpu_matrix_filter_vertical_block,
+		.filter_func = starpu_matrix_filter_block,
 		.nchildren = nblocks
 	};
 
+	/* Then split rows into tiles */
 	struct starpu_data_filter f2 =
 	{
-		.filter_func = starpu_matrix_filter_block,
+		/* Note: here "vertical" is for row-major, we are here using column-major. */
+		.filter_func = starpu_matrix_filter_vertical_block,
 		.nchildren = nblocks
 	};
 
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 
-	for (x = 0; x < nblocks; x++)
-		for (y = 0; y < nblocks; y++)
+	for (m = 0; m < nblocks; m++)
+		for (n = 0; n < nblocks; n++)
 		{
-			starpu_data_handle_t data = starpu_data_get_sub_data(dataA, 2, x, y);
-			starpu_data_set_coordinates(data, 2, x, y);
+			starpu_data_handle_t data = starpu_data_get_sub_data(dataA, 2, m, n);
+			starpu_data_set_coordinates(data, 2, m, n);
 		}
 
 	int ret = _cholesky(dataA, nblocks);
@@ -244,14 +250,14 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 	float *mat = NULL;
 
 #ifndef STARPU_SIMGRID
-	unsigned i,j;
+	unsigned m,n;
 	starpu_malloc_flags((void **)&mat, (size_t)size*size*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
-	for (i = 0; i < size; i++)
+	for (n = 0; n < size; n++)
 	{
-		for (j = 0; j < size; j++)
+		for (m = 0; m < size; m++)
 		{
-			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
+			mat[m +n*size] = (1.0f/(1.0f+m+n)) + ((m == n)?1.0f*size:0.0f);
+			/* mat[m +n*size] = ((m == n)?1.0f*size:0.0f); */
 		}
 	}
 
@@ -259,13 +265,13 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 #ifdef PRINT_OUTPUT
 	FPRINTF(stdout, "Input :\n");
 
-	for (j = 0; j < size; j++)
+	for (m = 0; m < size; m++)
 	{
-		for (i = 0; i < size; i++)
+		for (n = 0; n < size; n++)
 		{
-			if (i <= j)
+			if (n <= m)
 			{
-				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+				FPRINTF(stdout, "%2.2f\t", mat[m +n*size]);
 			}
 			else
 			{
@@ -282,18 +288,17 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 #ifndef STARPU_SIMGRID
 #ifdef PRINT_OUTPUT
 	FPRINTF(stdout, "Results :\n");
-	for (j = 0; j < size; j++)
+	for (m = 0; m < size; m++)
 	{
-		for (i = 0; i < size; i++)
+		for (n = 0; n < size; n++)
 		{
-			if (i <= j)
+			if (n <= m)
 			{
-				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+				FPRINTF(stdout, "%2.2f\t", mat[m +n*size]);
 			}
 			else
 			{
 				FPRINTF(stdout, ".\t");
-				mat[j+i*size] = 0.0f; /* debug */
 			}
 		}
 		FPRINTF(stdout, "\n");
@@ -303,13 +308,13 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 	if (check_p)
 	{
 		FPRINTF(stderr, "compute explicit LLt ...\n");
-		for (j = 0; j < size; j++)
+		for (m = 0; m < size; m++)
 		{
-			for (i = 0; i < size; i++)
+			for (n = 0; n < size; n++)
 			{
-				if (i > j)
+				if (n > m)
 				{
-					mat[j+i*size] = 0.0f; /* debug */
+					mat[m+n*size] = 0.0f; /* debug */
 				}
 			}
 		}
@@ -321,13 +326,13 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 
 		FPRINTF(stderr, "comparing results ...\n");
 #ifdef PRINT_OUTPUT
-		for (j = 0; j < size; j++)
+		for (m = 0; m < size; m++)
 		{
-			for (i = 0; i < size; i++)
+			for (n = 0; n < size; n++)
 			{
-				if (i <= j)
+				if (n <= m)
 				{
-					FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
+					FPRINTF(stdout, "%2.2f\t", test_mat[m +n*size]);
 				}
 				else
 				{
@@ -338,17 +343,17 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 		}
 #endif
 
-		for (j = 0; j < size; j++)
+		for (m = 0; m < size; m++)
 		{
-			for (i = 0; i < size; i++)
+			for (n = 0; n < size; n++)
 			{
-				if (i <= j)
+				if (n <= m)
 				{
-	                                float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-	                                float err = fabsf(test_mat[j +i*size] - orig) / orig;
-	                                if (err > 0.00001)
+	                                float orig = (1.0f/(1.0f+m+n)) + ((m == n)?1.0f*size:0.0f);
+	                                float err = fabsf(test_mat[m +n*size] - orig) / orig;
+	                                if (err > 0.0001)
 					{
-	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.6f != %2.6f (err %2.6f)\n", i, j, test_mat[j +i*size], orig, err);
+	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.6f != %2.6f (err %2.6f)\n", m, n, test_mat[m +n*size], orig, err);
 	                                        assert(0);
 	                                }
 	                        }

+ 116 - 77
examples/cholesky/cholesky_grain_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2008-2017                                Université de Bordeaux
+ * Copyright (C) 2008-2017,2020                           Université de Bordeaux
  * Copyright (C) 2012,2013                                Inria
  * Copyright (C) 2010-2013,2015,2017                      CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
@@ -28,6 +28,9 @@
  * remainder of the matrix with a smaller granularity.
  */
 
+/* Note: this uses Fortran ordering, i.e. column-major ordering: elements with
+ * consecutive row numbers are consecutive in memory. */
+
 #include "cholesky.h"
 
 #if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_MAGMA)
@@ -64,7 +67,8 @@ static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned
 	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
 
 	/* this is an important task */
-	task->priority = STARPU_MAX_PRIO;
+	if (!noprio_p)
+		task->priority = STARPU_MAX_PRIO;
 
 	/* enforce dependencies ... */
 	if (k > 0)
@@ -78,19 +82,19 @@ static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned
 	return task;
 }
 
-static int create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j, unsigned reclevel)
+static int create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned m, unsigned reclevel)
 {
 	int ret;
 
-	struct starpu_task *task = create_task(TAG21_AUX(k, j, reclevel));
+	struct starpu_task *task = create_task(TAG21_AUX(k, m, reclevel));
 
 	task->cl = &cl21;
 
 	/* which sub-data is manipulated ? */
 	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
-	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
+	task->handles[1] = starpu_data_get_sub_data(dataA, 2, m, k);
 
-	if (j == k+1)
+	if (!noprio_p && (m == k+1))
 	{
 		task->priority = STARPU_MAX_PRIO;
 	}
@@ -98,37 +102,37 @@ static int create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j, un
 	/* enforce dependencies ... */
 	if (k > 0)
 	{
-		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 2, TAG11_AUX(k, reclevel), TAG22_AUX(k-1, k, j, reclevel));
+		starpu_tag_declare_deps(TAG21_AUX(k, m, reclevel), 2, TAG11_AUX(k, reclevel), TAG22_AUX(k-1, m, k, reclevel));
 	}
 	else
 	{
-		starpu_tag_declare_deps(TAG21_AUX(k, j, reclevel), 1, TAG11_AUX(k, reclevel));
+		starpu_tag_declare_deps(TAG21_AUX(k, m, reclevel), 1, TAG11_AUX(k, reclevel));
 	}
 
-	int n = starpu_matrix_get_nx(task->handles[0]);
-	task->flops = FLOPS_STRSM(n, n);
+	int nx = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_STRSM(nx, nx);
 
 	ret = starpu_task_submit(task);
 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	return ret;
 }
 
-static int create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, unsigned j, unsigned reclevel)
+static int create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned m, unsigned n, unsigned reclevel)
 {
 	int ret;
 
-/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22_AUX(k,i,j)); */
+/*	FPRINTF(stdout, "task 22 k,m,n = %d,%d,%d TAG = %llx\n", k,m,n, TAG22_AUX(k,m,n)); */
 
-	struct starpu_task *task = create_task(TAG22_AUX(k, i, j, reclevel));
+	struct starpu_task *task = create_task(TAG22_AUX(k, m, n, reclevel));
 
 	task->cl = &cl22;
 
 	/* which sub-data is manipulated ? */
-	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, i);
-	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
-	task->handles[2] = starpu_data_get_sub_data(dataA, 2, i, j);
+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, n, k);
+	task->handles[1] = starpu_data_get_sub_data(dataA, 2, m, k);
+	task->handles[2] = starpu_data_get_sub_data(dataA, 2, m, n);
 
-	if ( (i == k + 1) && (j == k +1) )
+	if ( (n == k + 1) && (m == k +1) )
 	{
 		task->priority = STARPU_MAX_PRIO;
 	}
@@ -136,15 +140,15 @@ static int create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, un
 	/* enforce dependencies ... */
 	if (k > 0)
 	{
-		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 3, TAG22_AUX(k-1, i, j, reclevel), TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
+		starpu_tag_declare_deps(TAG22_AUX(k, m, n, reclevel), 3, TAG22_AUX(k-1, m, n, reclevel), TAG21_AUX(k, n, reclevel), TAG21_AUX(k, m, reclevel));
 	}
 	else
 	{
-		starpu_tag_declare_deps(TAG22_AUX(k, i, j, reclevel), 2, TAG21_AUX(k, i, reclevel), TAG21_AUX(k, j, reclevel));
+		starpu_tag_declare_deps(TAG22_AUX(k, m, n, reclevel), 2, TAG21_AUX(k, n, reclevel), TAG21_AUX(k, m, reclevel));
 	}
 
-	int n = starpu_matrix_get_nx(task->handles[0]);
-	task->flops = FLOPS_SGEMM(n, n, n);
+	int nx = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_SGEMM(nx, nx, nx);
 
 	ret = starpu_task_submit(task);
 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
@@ -166,7 +170,7 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 	struct starpu_task *entry_task = NULL;
 
 	/* create all the DAG nodes */
-	unsigned i,j,k;
+	unsigned k, m, n;
 
 	starpu_data_handle_t dataA;
 
@@ -176,15 +180,18 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 
 	starpu_data_set_sequential_consistency_flag(dataA, 0);
 
+	/* Split into blocks of complete rows first */
 	struct starpu_data_filter f =
 	{
-		.filter_func = starpu_matrix_filter_vertical_block,
+		.filter_func = starpu_matrix_filter_block,
 		.nchildren = nblocks
 	};
 
+	/* Then split rows into tiles */
 	struct starpu_data_filter f2 =
 	{
-		.filter_func = starpu_matrix_filter_block,
+		/* Note: the "vertical" in the filter name assumes row-major storage, whereas we use column-major here. */
+		.filter_func = starpu_matrix_filter_vertical_block,
 		.nchildren = nblocks
 	};
 
@@ -206,16 +213,16 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		}
 
-		for (j = k+1; j<nblocks; j++)
+		for (m = k+1; m<nblocks; m++)
 		{
-		     	ret = create_task_21(dataA, k, j, reclevel);
+		     	ret = create_task_21(dataA, k, m, reclevel);
 			if (ret == -ENODEV) return 77;
 
-			for (i = k+1; i<nblocks; i++)
+			for (n = k+1; n<nblocks; n++)
 			{
-				if (i <= j)
+				if (n <= m)
 				{
-				     ret = create_task_22(dataA, k, i, j, reclevel);
+				     ret = create_task_22(dataA, k, m, n, reclevel);
 				     if (ret == -ENODEV) return 77;
 				}
 			}
@@ -225,11 +232,8 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 
 	/* schedule the codelet */
 	ret = starpu_task_submit(entry_task);
-	if (STARPU_UNLIKELY(ret == -ENODEV))
-	{
-		FPRINTF(stderr, "No worker may execute this task\n");
-		return 77;
-	}
+	if (ret == -ENODEV) return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 	if (nblocks == nbigblocks)
 	{
@@ -248,11 +252,11 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 		STARPU_ASSERT(tag_array);
 
 		unsigned ind = 0;
-		for (i = nbigblocks; i < nblocks; i++)
-		for (j = nbigblocks; j < nblocks; j++)
+		for (n = nbigblocks; n < nblocks; n++)
+		for (m = nbigblocks; m < nblocks; m++)
 		{
-			if (i <= j)
-				tag_array[ind++] = TAG22_AUX(nbigblocks - 1, i, j, reclevel);
+			if (n <= m)
+				tag_array[ind++] = TAG22_AUX(nbigblocks - 1, m, n, reclevel);
 		}
 
 		starpu_tag_wait_array(ind, tag_array);
@@ -268,7 +272,7 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 	}
 }
 
-static void initialize_system(int argc, char **argv, float **A, unsigned pinned)
+static int initialize_system(int argc, char **argv, float **A, unsigned pinned)
 {
 	int ret;
 	int flags = STARPU_MALLOC_SIMULATION_FOLDED;
@@ -279,7 +283,7 @@ static void initialize_system(int argc, char **argv, float **A, unsigned pinned)
 
 	ret = starpu_init(NULL);
 	if (ret == -ENODEV)
-		exit(77);
+		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	init_sizes();
@@ -301,6 +305,8 @@ static void initialize_system(int argc, char **argv, float **A, unsigned pinned)
 	if (pinned)
 		flags |= STARPU_MALLOC_PINNED;
 	starpu_malloc_flags((void **)A, size_p*size_p*sizeof(float), flags);
+
+	return 0;
 }
 
 int cholesky_grain(float *matA, unsigned size, unsigned ld, unsigned nblocks, unsigned nbigblocks)
@@ -344,34 +350,33 @@ int main(int argc, char **argv)
 	 *	Hilbert matrix : h(i,j) = 1/(i+j+1)
 	 * */
 
-     	int ret;
-
 	float *mat = NULL;
-	initialize_system(argc, argv, &mat, pinned_p);
+	int ret = initialize_system(argc, argv, &mat, pinned_p);
+	if (ret) return ret;
 
 #ifndef STARPU_SIMGRID
-	unsigned i,j;
-	for (i = 0; i < size_p; i++)
+	unsigned m,n;
+
+	for (n = 0; n < size_p; n++)
 	{
-		for (j = 0; j < size_p; j++)
+		for (m = 0; m < size_p; m++)
 		{
-			mat[j +i*size_p] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size_p:0.0f);
-			/* mat[j +i*size_p] = ((i == j)?1.0f*size_p:0.0f); */
+			mat[m +n*size_p] = (1.0f/(1.0f+n+m)) + ((n == m)?1.0f*size_p:0.0f);
+			/* mat[m +n*size_p] = ((n == m)?1.0f*size_p:0.0f); */
 		}
 	}
-#endif
 
-
-#ifdef CHECK_OUTPUT
+/* #define PRINT_OUTPUT */
+#ifdef PRINT_OUTPUT
 	FPRINTF(stdout, "Input :\n");
 
-	for (j = 0; j < size_p; j++)
+	for (m = 0; m < size_p; m++)
 	{
-		for (i = 0; i < size_p; i++)
+		for (n = 0; n < size_p; n++)
 		{
-			if (i <= j)
+			if (n <= m)
 			{
-				FPRINTF(stdout, "%2.2f\t", mat[j +i*size_p]);
+				FPRINTF(stdout, "%2.2f\t", mat[m +n*size_p]);
 			}
 			else
 			{
@@ -381,53 +386,87 @@ int main(int argc, char **argv)
 		FPRINTF(stdout, "\n");
 	}
 #endif
+#endif
 
 	ret = cholesky_grain(mat, size_p, size_p, nblocks_p, nbigblocks_p);
 
-#ifdef CHECK_OUTPUT
+#ifndef STARPU_SIMGRID
+#ifdef PRINT_OUTPUT
 	FPRINTF(stdout, "Results :\n");
 
-	for (j = 0; j < size_p; j++)
+	for (m = 0; m < size_p; m++)
 	{
-		for (i = 0; i < size_p; i++)
+		for (n = 0; n < size_p; n++)
 		{
-			if (i <= j)
+			if (n <= m)
 			{
-				FPRINTF(stdout, "%2.2f\t", mat[j +i*size_p]);
+				FPRINTF(stdout, "%2.2f\t", mat[m +n*size_p]);
 			}
 			else
 			{
 				FPRINTF(stdout, ".\t");
-				mat[j+i*size_p] = 0.0f; /* debug */
 			}
 		}
 		FPRINTF(stdout, "\n");
 	}
+#endif
 
-	FPRINTF(stderr, "compute explicit LLt ...\n");
-	float *test_mat = malloc(size_p*size_p*sizeof(float));
-	STARPU_ASSERT(test_mat);
-
-	STARPU_SSYRK("L", "N", size_p, size_p, 1.0f,
-		     mat, size_p, 0.0f, test_mat, size_p);
-
-	FPRINTF(stderr, "comparing results ...\n");
-	for (j = 0; j < size_p; j++)
+	if (check_p)
 	{
-		for (i = 0; i < size_p; i++)
+		FPRINTF(stderr, "compute explicit LLt ...\n");
+		for (m = 0; m < size_p; m++)
 		{
-			if (i <= j)
+			for (n = 0; n < size_p; n++)
 			{
-                                FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size_p]);
+				if (n > m)
+				{
+					mat[m+n*size_p] = 0.0f; /* debug */
+				}
 			}
-			else
+		}
+		float *test_mat = malloc(size_p*size_p*sizeof(float));
+		STARPU_ASSERT(test_mat);
+
+		STARPU_SSYRK("L", "N", size_p, size_p, 1.0f,
+			     mat, size_p, 0.0f, test_mat, size_p);
+
+		FPRINTF(stderr, "comparing results ...\n");
+#ifdef PRINT_OUTPUT
+		for (m = 0; m < size_p; m++)
+		{
+			for (n = 0; n < size_p; n++)
 			{
-				FPRINTF(stdout, ".\t");
+				if (n <= m)
+				{
+					FPRINTF(stdout, "%2.2f\t", test_mat[m +n*size_p]);
+				}
+				else
+				{
+					FPRINTF(stdout, ".\t");
+				}
 			}
+			FPRINTF(stdout, "\n");
 		}
-		FPRINTF(stdout, "\n");
+#endif
+
+		for (m = 0; m < size_p; m++)
+		{
+			for (n = 0; n < size_p; n++)
+			{
+				if (n <= m)
+				{
+	                                float orig = (1.0f/(1.0f+m+n)) + ((m == n)?1.0f*size_p:0.0f);
+	                                float err = fabsf(test_mat[m +n*size_p] - orig) / orig;
+	                                if (err > 0.0001)
+					{
+	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.6f != %2.6f (err %2.6f)\n", m, n, test_mat[m +n*size_p], orig, err);
+	                                        assert(0);
+	                                }
+	                        }
+			}
+	        }
+		free(test_mat);
 	}
-	free(test_mat);
 #endif
 
 	shutdown_system(&mat, size_p, pinned_p);

+ 65 - 60
examples/cholesky/cholesky_implicit.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2013,2015                           Inria
- * Copyright (C) 2009-2017                                Université de Bordeaux
+ * Copyright (C) 2009-2017,2020                           Université de Bordeaux
  * Copyright (C) 2010-2013,2015-2017                      CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2010                                     Mehdi Juhoor
@@ -23,6 +23,9 @@
  * The whole algorithm thus appears clearly in the task submission loop in _cholesky().
  */
 
+/* Note: this uses Fortran ordering, i.e. column-major ordering: elements with
+ * consecutive row numbers are consecutive in memory. */
+
 #include "cholesky.h"
 #include "../sched_ctx_utils/sched_ctx_utils.h"
 
@@ -46,9 +49,9 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 	double start;
 	double end;
 
-	unsigned i,j,k;
-	unsigned long n = starpu_matrix_get_nx(dataA);
-	unsigned long nn = n/nblocks;
+	unsigned k,m,n;
+	unsigned long nx = starpu_matrix_get_nx(dataA);
+	unsigned long nn = nx/nblocks;
 
 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
 
@@ -75,45 +78,45 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 		if (ret == -ENODEV) return 77;
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
-		for (j = k+1; j<nblocks; j++)
+		for (m = k+1; m<nblocks; m++)
 		{
-                        starpu_data_handle_t sdatakj = starpu_data_get_sub_data(dataA, 2, k, j);
+                        starpu_data_handle_t sdatamk = starpu_data_get_sub_data(dataA, 2, m, k);
 
                         ret = starpu_task_insert(&cl21,
-						 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - j) : (j == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 						 STARPU_R, sdatakk,
-						 STARPU_RW, sdatakj,
+						 STARPU_RW, sdatamk,
 						 STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
-						 STARPU_TAG_ONLY, TAG21(k,j),
+						 STARPU_TAG_ONLY, TAG21(m,k),
 						 0);
 			if (ret == -ENODEV) return 77;
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 		}
 		starpu_data_wont_use(sdatakk);
 
-		for (j = k+1; j<nblocks; j++)
+		for (m = k+1; m<nblocks; m++)
 		{
-                        starpu_data_handle_t sdatakj = starpu_data_get_sub_data(dataA, 2, k, j);
-			for (i = k+1; i<nblocks; i++)
+                        starpu_data_handle_t sdatamk = starpu_data_get_sub_data(dataA, 2, m, k);
+			for (n = k+1; n<nblocks; n++)
 			{
-				if (i <= j)
+				if (n <= m)
                                 {
-					starpu_data_handle_t sdataki = starpu_data_get_sub_data(dataA, 2, k, i);
-					starpu_data_handle_t sdataij = starpu_data_get_sub_data(dataA, 2, i, j);
+					starpu_data_handle_t sdatank = starpu_data_get_sub_data(dataA, 2, n, k);
+					starpu_data_handle_t sdatamn = starpu_data_get_sub_data(dataA, 2, m, n);
 
 					ret = starpu_task_insert(&cl22,
-								 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - j - i) : ((i == k+1) && (j == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
-								 STARPU_R, sdataki,
-								 STARPU_R, sdatakj,
-								 cl22.modes[2], sdataij,
+								 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+								 STARPU_R, sdatamk,
+								 STARPU_R, sdatank,
+								 cl22.modes[2], sdatamn,
 								 STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
-								 STARPU_TAG_ONLY, TAG22(k,i,j),
+								 STARPU_TAG_ONLY, TAG22(k,m,n),
 								 0);
 					if (ret == -ENODEV) return 77;
 					STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
                                 }
 			}
-			starpu_data_wont_use(sdatakj);
+			starpu_data_wont_use(sdatamk);
 		}
 		starpu_iteration_pop();
 	}
@@ -128,7 +131,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
 	double timing = end - start;
 
-	double flop = FLOPS_SPOTRF(n);
+	double flop = FLOPS_SPOTRF(nx);
 
 	if(with_ctxs_p || with_noctxs_p || chole1_p || chole2_p)
 		update_sched_ctx_timing_results((flop/timing/1000.0f), (timing/1000000.0f));
@@ -139,7 +142,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 			PRINTF("\tTms\tTGFlops");
 		PRINTF("\n");
 
-		PRINTF("%lu\t%.0f\t%.1f", n, timing/1000, (flop/timing/1000.0f));
+		PRINTF("%lu\t%.0f\t%.1f", nx, timing/1000, (flop/timing/1000.0f));
 		if (bound_lp_p)
 		{
 			FILE *f = fopen("cholesky.lp", "w");
@@ -166,31 +169,34 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 static int cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 {
 	starpu_data_handle_t dataA;
-	unsigned x, y;
+	unsigned m, n;
 
 	/* monitor and partition the A matrix into blocks :
-	 * one block is now determined by 2 unsigned (i,j) */
+	 * one block is now determined by 2 unsigned (m,n) */
 	starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(float));
 
+	/* Split into blocks of complete rows first */
 	struct starpu_data_filter f =
 	{
-		.filter_func = starpu_matrix_filter_vertical_block,
+		.filter_func = starpu_matrix_filter_block,
 		.nchildren = nblocks
 	};
 
+	/* Then split rows into tiles */
 	struct starpu_data_filter f2 =
 	{
-		.filter_func = starpu_matrix_filter_block,
+		/* Note: the "vertical" in the filter name assumes row-major storage, whereas we use column-major here. */
+		.filter_func = starpu_matrix_filter_vertical_block,
 		.nchildren = nblocks
 	};
 
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 
-	for (x = 0; x < nblocks; x++)
-		for (y = 0; y < nblocks; y++)
+	for (m = 0; m < nblocks; m++)
+		for (n = 0; n < nblocks; n++)
 		{
-			starpu_data_handle_t data = starpu_data_get_sub_data(dataA, 2, x, y);
-			starpu_data_set_coordinates(data, 2, x, y);
+			starpu_data_handle_t data = starpu_data_get_sub_data(dataA, 2, m, n);
+			starpu_data_set_coordinates(data, 2, m, n);
 		}
 
 	int ret = _cholesky(dataA, nblocks);
@@ -206,14 +212,14 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 	float *mat = NULL;
 
 #ifndef STARPU_SIMGRID
-	unsigned i,j;
+	unsigned m,n;
 	starpu_malloc_flags((void **)&mat, (size_t)size*size*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
-	for (i = 0; i < size; i++)
+	for (n = 0; n < size; n++)
 	{
-		for (j = 0; j < size; j++)
+		for (m = 0; m < size; m++)
 		{
-			mat[j +i*size] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-			/* mat[j +i*size] = ((i == j)?1.0f*size:0.0f); */
+			mat[m +n*size] = (1.0f/(1.0f+m+n)) + ((m == n)?1.0f*size:0.0f);
+			/* mat[m +n*size] = ((m == n)?1.0f*size:0.0f); */
 		}
 	}
 
@@ -221,13 +227,13 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 #ifdef PRINT_OUTPUT
 	FPRINTF(stdout, "Input :\n");
 
-	for (j = 0; j < size; j++)
+	for (m = 0; m < size; m++)
 	{
-		for (i = 0; i < size; i++)
+		for (n = 0; n < size; n++)
 		{
-			if (i <= j)
+			if (n <= m)
 			{
-				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+				FPRINTF(stdout, "%2.2f\t", mat[m +n*size]);
 			}
 			else
 			{
@@ -244,18 +250,17 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 #ifndef STARPU_SIMGRID
 #ifdef PRINT_OUTPUT
 	FPRINTF(stdout, "Results :\n");
-	for (j = 0; j < size; j++)
+	for (m = 0; m < size; m++)
 	{
-		for (i = 0; i < size; i++)
+		for (n = 0; n < size; n++)
 		{
-			if (i <= j)
+			if (n <= m)
 			{
-				FPRINTF(stdout, "%2.2f\t", mat[j +i*size]);
+				FPRINTF(stdout, "%2.2f\t", mat[m +n*size]);
 			}
 			else
 			{
 				FPRINTF(stdout, ".\t");
-				mat[j+i*size] = 0.0f; /* debug */
 			}
 		}
 		FPRINTF(stdout, "\n");
@@ -265,13 +270,13 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 	if (check_p)
 	{
 		FPRINTF(stderr, "compute explicit LLt ...\n");
-		for (j = 0; j < size; j++)
+		for (m = 0; m < size; m++)
 		{
-			for (i = 0; i < size; i++)
+			for (n = 0; n < size; n++)
 			{
-				if (i > j)
+				if (n > m)
 				{
-					mat[j+i*size] = 0.0f; /* debug */
+					mat[m+n*size] = 0.0f; /* debug */
 				}
 			}
 		}
@@ -283,13 +288,13 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 
 		FPRINTF(stderr, "comparing results ...\n");
 #ifdef PRINT_OUTPUT
-		for (j = 0; j < size; j++)
+		for (m = 0; m < size; m++)
 		{
-			for (i = 0; i < size; i++)
+			for (n = 0; n < size; n++)
 			{
-				if (i <= j)
+				if (n <= m)
 				{
-					FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size]);
+					FPRINTF(stdout, "%2.2f\t", test_mat[m +n*size]);
 				}
 				else
 				{
@@ -300,17 +305,17 @@ static void execute_cholesky(unsigned size, unsigned nblocks)
 		}
 #endif
 
-		for (j = 0; j < size; j++)
+		for (m = 0; m < size; m++)
 		{
-			for (i = 0; i < size; i++)
+			for (n = 0; n < size; n++)
 			{
-				if (i <= j)
+				if (n <= m)
 				{
-	                                float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-	                                float err = fabsf(test_mat[j +i*size] - orig) / orig;
-	                                if (err > 0.00001)
+	                                float orig = (1.0f/(1.0f+m+n)) + ((m == n)?1.0f*size:0.0f);
+	                                float err = fabsf(test_mat[m +n*size] - orig) / orig;
+	                                if (err > 0.0001)
 					{
-	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.6f != %2.6f (err %2.6f)\n", i, j, test_mat[j +i*size], orig, err);
+	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.6f != %2.6f (err %2.6f)\n", m, n, test_mat[m +n*size], orig, err);
 	                                        assert(0);
 	                                }
 	                        }

+ 1 - 1
examples/cholesky/cholesky_kernels.c

@@ -235,7 +235,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, void *_a
 #if (MAGMA_VERSION_MAJOR > 1) || (MAGMA_VERSION_MAJOR == 1 && MAGMA_VERSION_MINOR >= 4)
 			cudaError_t cures = cudaStreamSynchronize(stream);
 #else
-			cudaError_t cures = cudaThreadSynchronize();
+			cudaError_t cures = cudaDeviceSynchronize();
 #endif
 			STARPU_ASSERT(!cures);
 			}

+ 149 - 102
examples/cholesky/cholesky_tag.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2008-2017                                Université de Bordeaux
+ * Copyright (C) 2008-2017,2020                           Université de Bordeaux
  * Copyright (C) 2012,2013                                Inria
- * Copyright (C) 2010-2013,2015,2017                      CNRS
+ * Copyright (C) 2010-2013,2015,2017,2020                 CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2010                                     Mehdi Juhoor
  *
@@ -24,6 +24,9 @@
  * It also uses data partitioning to split the matrix into submatrices
  */
 
+/* Note: this uses Fortran ordering, i.e. column-major ordering: elements with
+ * consecutive row numbers are consecutive in memory. */
+
 #include "cholesky.h"
 #include <starpu_perfmodel.h>
 
@@ -76,17 +79,19 @@ static struct starpu_task * create_task_11(starpu_data_handle_t dataA, unsigned
 	return task;
 }
 
-static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j)
+static int create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned m)
 {
-	struct starpu_task *task = create_task(TAG21(k, j));
+	int ret;
+
+	struct starpu_task *task = create_task(TAG21(k, m));
 
 	task->cl = &cl21;
 
 	/* which sub-data is manipulated ? */
 	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, k);
-	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
+	task->handles[1] = starpu_data_get_sub_data(dataA, 2, m, k);
 
-	if (!noprio_p && (j == k+1))
+	if (!noprio_p && (m == k+1))
 	{
 		task->priority = STARPU_MAX_PRIO;
 	}
@@ -94,39 +99,37 @@ static void create_task_21(starpu_data_handle_t dataA, unsigned k, unsigned j)
 	/* enforce dependencies ... */
 	if (k > 0)
 	{
-		starpu_tag_declare_deps(TAG21(k, j), 2, TAG11(k), TAG22(k-1, k, j));
+		starpu_tag_declare_deps(TAG21(k, m), 2, TAG11(k), TAG22(k-1, m, k));
 	}
 	else
 	{
-		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
+		starpu_tag_declare_deps(TAG21(k, m), 1, TAG11(k));
 	}
 
-	int n = starpu_matrix_get_nx(task->handles[0]);
-	task->flops = FLOPS_STRSM(n, n);
-
-	int ret = starpu_task_submit(task);
-        if (STARPU_UNLIKELY(ret == -ENODEV))
-	{
-                FPRINTF(stderr, "No worker may execute this task\n");
-                exit(0);
-        }
+	int nx = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_STRSM(nx, nx);
 
+	ret = starpu_task_submit(task);
+	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	return ret;
 }
 
-static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, unsigned j)
+static int create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned m, unsigned n)
 {
-/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
+	int ret;
 
-	struct starpu_task *task = create_task(TAG22(k, i, j));
+/*	FPRINTF(stdout, "task 22 k,m,n = %d,%d,%d TAG = %llx\n", k,m,n, TAG22(k,m,n)); */
+
+	struct starpu_task *task = create_task(TAG22(k, m, n));
 
 	task->cl = &cl22;
 
 	/* which sub-data is manipulated ? */
-	task->handles[0] = starpu_data_get_sub_data(dataA, 2, k, i);
-	task->handles[1] = starpu_data_get_sub_data(dataA, 2, k, j);
-	task->handles[2] = starpu_data_get_sub_data(dataA, 2, i, j);
+	task->handles[0] = starpu_data_get_sub_data(dataA, 2, n, k);
+	task->handles[1] = starpu_data_get_sub_data(dataA, 2, m, k);
+	task->handles[2] = starpu_data_get_sub_data(dataA, 2, m, n);
 
-	if (!noprio_p && (i == k + 1) && (j == k +1) )
+	if (!noprio_p && (n == k + 1) && (m == k +1) )
 	{
 		task->priority = STARPU_MAX_PRIO;
 	}
@@ -134,40 +137,37 @@ static void create_task_22(starpu_data_handle_t dataA, unsigned k, unsigned i, u
 	/* enforce dependencies ... */
 	if (k > 0)
 	{
-		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG21(k, i), TAG21(k, j));
+		starpu_tag_declare_deps(TAG22(k, m, n), 3, TAG22(k-1, m, n), TAG21(k, n), TAG21(k, m));
 	}
 	else
 	{
-		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
+		starpu_tag_declare_deps(TAG22(k, m, n), 2, TAG21(k, n), TAG21(k, m));
 	}
 
-	int n = starpu_matrix_get_nx(task->handles[0]);
-	task->flops = FLOPS_SGEMM(n, n, n);
+	int nx = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_SGEMM(nx, nx, nx);
 
-	int ret = starpu_task_submit(task);
-        if (STARPU_UNLIKELY(ret == -ENODEV))
-	{
-                FPRINTF(stderr, "No worker may execute this task\n");
-                exit(0);
-        }
+	ret = starpu_task_submit(task);
+	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	return ret;
 }
 
-
-
 /*
  *	code to bootstrap the factorization
  *	and construct the DAG
  */
 
-static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
+static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 {
+	int ret;
+
 	double start;
 	double end;
 
 	struct starpu_task *entry_task = NULL;
 
 	/* create all the DAG nodes */
-	unsigned i,j,k;
+	unsigned k, m, n;
 
 	start = starpu_timing_now();
 
@@ -182,36 +182,44 @@ static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 		}
 		else
 		{
-			int ret = starpu_task_submit(task);
-                        if (STARPU_UNLIKELY(ret == -ENODEV))
+			ret = starpu_task_submit(task);
+			if (ret == -ENODEV)
 			{
-                                FPRINTF(stderr, "No worker may execute this task\n");
-                                exit(0);
-                        }
-
+				starpu_data_unpartition(dataA, STARPU_MAIN_RAM);
+				return 77;
+			}
+			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		}
 
-		for (j = k+1; j<nblocks; j++)
+		for (m = k+1; m<nblocks; m++)
 		{
-			create_task_21(dataA, k, j);
+			ret = create_task_21(dataA, k, m);
+			if (ret == -ENODEV)
+			{
+				starpu_data_unpartition(dataA, STARPU_MAIN_RAM);
+				return 77;
+			}
 
-			for (i = k+1; i<nblocks; i++)
+			for (n = k+1; n<nblocks; n++)
 			{
-				if (i <= j)
-					create_task_22(dataA, k, i, j);
+				if (n <= m)
+				{
+					ret = create_task_22(dataA, k, m, n);
+					if (ret == -ENODEV)
+					{
+						starpu_data_unpartition(dataA, STARPU_MAIN_RAM);
+						return 77;
+					}
+				}
 			}
 		}
 		starpu_iteration_pop();
 	}
 
 	/* schedule the codelet */
-	int ret = starpu_task_submit(entry_task);
-        if (STARPU_UNLIKELY(ret == -ENODEV))
-	{
-                FPRINTF(stderr, "No worker may execute this task\n");
-                exit(0);
-        }
-
+	ret = starpu_task_submit(entry_task);
+	if (ret == -ENODEV) return 77;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 
 	/* stall the application until the end of computations */
 	starpu_tag_wait(TAG11(nblocks-1));
@@ -220,15 +228,16 @@ static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
 	end = starpu_timing_now();
 
-
 	double timing = end - start;
 
-	unsigned n = starpu_matrix_get_nx(dataA);
+	unsigned nx = starpu_matrix_get_nx(dataA);
 
-	double flop = (1.0f*n*n*n)/3.0f;
+	double flop = (1.0f*nx*nx*nx)/3.0f;
 
 	PRINTF("# size\tms\tGFlops\n");
-	PRINTF("%u\t%.0f\t%.1f\n", n, timing/1000, (flop/timing/1000.0f));
+	PRINTF("%u\t%.0f\t%.1f\n", nx, timing/1000, (flop/timing/1000.0f));
+
+	return 0;
 }
 
 static int initialize_system(int argc, char **argv, float **A, unsigned pinned)
@@ -268,33 +277,38 @@ static int initialize_system(int argc, char **argv, float **A, unsigned pinned)
 	return 0;
 }
 
-static void cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
+static int cholesky(float *matA, unsigned size, unsigned ld, unsigned nblocks)
 {
 	starpu_data_handle_t dataA;
+	int ret;
 
 	/* monitor and partition the A matrix into blocks :
-	 * one block is now determined by 2 unsigned (i,j) */
+	 * one block is now determined by 2 unsigned (m,n) */
 	starpu_matrix_data_register(&dataA, STARPU_MAIN_RAM, (uintptr_t)matA, ld, size, size, sizeof(float));
 
 	starpu_data_set_sequential_consistency_flag(dataA, 0);
 
+	/* Split into blocks of complete rows first */
 	struct starpu_data_filter f =
 	{
-		.filter_func = starpu_matrix_filter_vertical_block,
+		.filter_func = starpu_matrix_filter_block,
 		.nchildren = nblocks
 	};
 
+	/* Then split rows into tiles */
 	struct starpu_data_filter f2 =
 	{
-		.filter_func = starpu_matrix_filter_block,
+		/* Note: the "vertical" in the filter name assumes row-major storage, whereas we use column-major here. */
+		.filter_func = starpu_matrix_filter_vertical_block,
 		.nchildren = nblocks
 	};
 
 	starpu_data_map_filters(dataA, 2, &f, &f2);
 
-	_cholesky(dataA, nblocks);
+	ret = _cholesky(dataA, nblocks);
 
 	starpu_data_unregister(dataA);
+	return ret;
 }
 
 static void shutdown_system(float **matA, unsigned dim, unsigned pinned)
@@ -321,28 +335,28 @@ int main(int argc, char **argv)
 	if (ret) return ret;
 
 #ifndef STARPU_SIMGRID
-	unsigned i,j;
-	for (i = 0; i < size_p; i++)
+	unsigned m,n;
+
+	for (n = 0; n < size_p; n++)
 	{
-		for (j = 0; j < size_p; j++)
+		for (m = 0; m < size_p; m++)
 		{
-			mat[j +i*size_p] = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size_p:0.0f);
-			/* mat[j +i*size_p] = ((i == j)?1.0f*size_p:0.0f); */
+			mat[m +n*size_p] = (1.0f/(1.0f+n+m)) + ((n == m)?1.0f*size_p:0.0f);
+			/* mat[m +n*size_p] = ((n == m)?1.0f*size_p:0.0f); */
 		}
 	}
-#endif
-
 
-#ifdef CHECK_OUTPUT
+/* #define PRINT_OUTPUT */
+#ifdef PRINT_OUTPUT
 	FPRINTF(stdout, "Input :\n");
 
-	for (j = 0; j < size_p; j++)
+	for (m = 0; m < size_p; m++)
 	{
-		for (i = 0; i < size_p; i++)
+		for (n = 0; n < size_p; n++)
 		{
-			if (i <= j)
+			if (n <= m)
 			{
-				FPRINTF(stdout, "%2.2f\t", mat[j +i*size_p]);
+				FPRINTF(stdout, "%2.2f\t", mat[m +n*size_p]);
 			}
 			else
 			{
@@ -352,56 +366,89 @@ int main(int argc, char **argv)
 		FPRINTF(stdout, "\n");
 	}
 #endif
+#endif
 
+	ret = cholesky(mat, size_p, size_p, nblocks_p);
 
-	cholesky(mat, size_p, size_p, nblocks_p);
-
-#ifdef CHECK_OUTPUT
+#ifndef STARPU_SIMGRID
+#ifdef PRINT_OUTPUT
 	FPRINTF(stdout, "Results :\n");
 
-	for (j = 0; j < size_p; j++)
+	for (m = 0; m < size_p; m++)
 	{
-		for (i = 0; i < size_p; i++)
+		for (n = 0; n < size_p; n++)
 		{
-			if (i <= j)
+			if (n <= m)
 			{
-				FPRINTF(stdout, "%2.2f\t", mat[j +i*size_p]);
+				FPRINTF(stdout, "%2.2f\t", mat[m +n*size_p]);
 			}
 			else
 			{
 				FPRINTF(stdout, ".\t");
-				mat[j+i*size_p] = 0.0f; /* debug */
 			}
 		}
 		FPRINTF(stdout, "\n");
 	}
+#endif
 
-	FPRINTF(stderr, "compute explicit LLt ...\n");
-	float *test_mat = malloc(size_p*size_p*sizeof(float));
-	STARPU_ASSERT(test_mat);
-
-	STARPU_SSYRK("L", "N", size_p, size_p, 1.0f,
-		     mat, size_p, 0.0f, test_mat, size_p);
-
-	FPRINTF(stderr, "comparing results ...\n");
-	for (j = 0; j < size_p; j++)
+	if (check_p)
 	{
-		for (i = 0; i < size_p; i++)
+		FPRINTF(stderr, "compute explicit LLt ...\n");
+		for (m = 0; m < size_p; m++)
 		{
-			if (i <= j)
+			for (n = 0; n < size_p; n++)
 			{
-				FPRINTF(stdout, "%2.2f\t", test_mat[j +i*size_p]);
+				if (n > m)
+				{
+					mat[m+n*size_p] = 0.0f; /* debug */
+				}
 			}
-			else
+		}
+		float *test_mat = malloc(size_p*size_p*sizeof(float));
+		STARPU_ASSERT(test_mat);
+
+		STARPU_SSYRK("L", "N", size_p, size_p, 1.0f,
+			     mat, size_p, 0.0f, test_mat, size_p);
+
+		FPRINTF(stderr, "comparing results ...\n");
+#ifdef PRINT_OUTPUT
+		for (m = 0; m < size_p; m++)
+		{
+			for (n = 0; n < size_p; n++)
 			{
-				FPRINTF(stdout, ".\t");
+				if (n <= m)
+				{
+					FPRINTF(stdout, "%2.2f\t", test_mat[m +n*size_p]);
+				}
+				else
+				{
+					FPRINTF(stdout, ".\t");
+				}
 			}
+			FPRINTF(stdout, "\n");
 		}
-		FPRINTF(stdout, "\n");
+#endif
+
+		for (m = 0; m < size_p; m++)
+		{
+			for (n = 0; n < size_p; n++)
+			{
+				if (n <= m)
+				{
+	                                float orig = (1.0f/(1.0f+m+n)) + ((m == n)?1.0f*size_p:0.0f);
+	                                float err = fabsf(test_mat[m +n*size_p] - orig) / orig;
+	                                if (err > 0.0001)
+					{
+	                                        FPRINTF(stderr, "Error[%u, %u] --> %2.6f != %2.6f (err %2.6f)\n", m, n, test_mat[m +n*size_p], orig, err);
+	                                        assert(0);
+	                                }
+	                        }
+			}
+	        }
+		free(test_mat);
 	}
-	free(test_mat);
 #endif
 
 	shutdown_system(&mat, size_p, pinned_p);
-	return 0;
+	return ret;
 }

+ 55 - 52
examples/cholesky/cholesky_tile_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2017                                Université de Bordeaux
+ * Copyright (C) 2009-2017,2020                           Université de Bordeaux
  * Copyright (C) 2012,2013                                Inria
  * Copyright (C) 2010-2013,2015-2017                      CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
@@ -23,13 +23,16 @@
  * It also directly registers matrix tiles instead of using partitioning.
  */
 
+/* Note: this is using fortran ordering, i.e. column-major ordering, i.e.
+ * elements with consecutive row number are consecutive in memory */
+
 #include "cholesky.h"
 
 #if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_MAGMA)
 #include "magma.h"
 #endif
 
-/* A [ y ] [ x ] */
+/* A [ m ] [ n ] */
 float *A[NMAXBLOCKS][NMAXBLOCKS];
 starpu_data_handle_t A_state[NMAXBLOCKS][NMAXBLOCKS];
 
@@ -78,19 +81,19 @@ static struct starpu_task * create_task_11(unsigned k, unsigned nblocks)
 	return task;
 }
 
-static int create_task_21(unsigned k, unsigned j)
+static int create_task_21(unsigned k, unsigned m)
 {
 	int ret;
 
-	struct starpu_task *task = create_task(TAG21(k, j));
+	struct starpu_task *task = create_task(TAG21(m, k));
 
 	task->cl = &cl21;
 
 	/* which sub-data is manipulated ? */
 	task->handles[0] = A_state[k][k];
-	task->handles[1] = A_state[j][k];
+	task->handles[1] = A_state[m][k];
 
-	if (j == k+1)
+	if (m == k+1)
 	{
 		task->priority = STARPU_MAX_PRIO;
 	}
@@ -98,11 +101,11 @@ static int create_task_21(unsigned k, unsigned j)
 	/* enforce dependencies ... */
 	if (k > 0)
 	{
-		starpu_tag_declare_deps(TAG21(k, j), 2, TAG11(k), TAG22(k-1, k, j));
+		starpu_tag_declare_deps(TAG21(m, k), 2, TAG11(k), TAG22(k-1, m, k));
 	}
 	else
 	{
-		starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
+		starpu_tag_declare_deps(TAG21(m, k), 1, TAG11(k));
 	}
 
 	int n = starpu_matrix_get_nx(task->handles[0]);
@@ -113,22 +116,22 @@ static int create_task_21(unsigned k, unsigned j)
 	return ret;
 }
 
-static int create_task_22(unsigned k, unsigned i, unsigned j)
+static int create_task_22(unsigned k, unsigned m, unsigned n)
 {
 	int ret;
 
-/*	FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
+/*	FPRINTF(stdout, "task 22 k,m,n = %d,%d,%d TAG = %llx\n", k,m,n, TAG22(k,m,n)); */
 
-	struct starpu_task *task = create_task(TAG22(k, i, j));
+	struct starpu_task *task = create_task(TAG22(k, m, n));
 
 	task->cl = &cl22;
 
 	/* which sub-data is manipulated ? */
-	task->handles[0] = A_state[i][k];
-	task->handles[1] = A_state[j][k];
-	task->handles[2] = A_state[j][i];
+	task->handles[0] = A_state[n][k];
+	task->handles[1] = A_state[m][k];
+	task->handles[2] = A_state[m][n];
 
-	if ( (i == k + 1) && (j == k +1) )
+	if (!noprio_p && (n == k + 1) && (m == k +1) )
 	{
 		task->priority = STARPU_MAX_PRIO;
 	}
@@ -136,15 +139,15 @@ static int create_task_22(unsigned k, unsigned i, unsigned j)
 	/* enforce dependencies ... */
 	if (k > 0)
 	{
-		starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG21(k, i), TAG21(k, j));
+		starpu_tag_declare_deps(TAG22(k, m, n), 3, TAG22(k-1, m, n), TAG21(n, k), TAG21(m, k));
 	}
 	else
 	{
-		starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
+		starpu_tag_declare_deps(TAG22(k, m, n), 2, TAG21(n, k), TAG21(m, k));
 	}
 
-	int n = starpu_matrix_get_nx(task->handles[0]);
-	task->flops = FLOPS_SGEMM(n, n, n);
+	int nx = starpu_matrix_get_nx(task->handles[0]);
+	task->flops = FLOPS_SGEMM(nx, nx, nx);
 
 	ret = starpu_task_submit(task);
 	if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
@@ -166,7 +169,7 @@ static int cholesky_no_stride(void)
 	struct starpu_task *entry_task = NULL;
 
 	/* create all the DAG nodes */
-	unsigned i,j,k;
+	unsigned k, m, n;
 
 	for (k = 0; k < nblocks_p; k++)
 	{
@@ -183,17 +186,17 @@ static int cholesky_no_stride(void)
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		}
 
-		for (j = k+1; j<nblocks_p; j++)
+		for (m = k+1; m<nblocks_p; m++)
 		{
-			ret = create_task_21(k, j);
+			ret = create_task_21(k, m);
 			if (ret == -ENODEV) return 77;
 
-			for (i = k+1; i<nblocks_p; i++)
+			for (n = k+1; n<nblocks_p; n++)
 			{
-				if (i <= j)
+				if (n <= m)
 				{
-				     ret = create_task_22(k, i, j);
-				     if (ret == -ENODEV) return 77;
+					ret = create_task_22(k, m, n);
+					if (ret == -ENODEV) return 77;
 				}
 			}
 		}
@@ -222,7 +225,7 @@ static int cholesky_no_stride(void)
 
 int main(int argc, char **argv)
 {
-	unsigned x, y;
+	unsigned n, m;
 	int ret;
 
 #ifdef STARPU_HAVE_MAGMA
@@ -256,13 +259,13 @@ int main(int argc, char **argv)
 
 	starpu_cublas_init();
 
-	for (y = 0; y < nblocks_p; y++)
-	for (x = 0; x < nblocks_p; x++)
+	for (m = 0; m < nblocks_p; m++)
+	for (n = 0; n < nblocks_p; n++)
 	{
-		if (x <= y)
+		if (n <= m)
 		{
-			starpu_malloc_flags((void **)&A[y][x], BLOCKSIZE*BLOCKSIZE*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
-			assert(A[y][x]);
+			starpu_malloc_flags((void **)&A[m][n], BLOCKSIZE*BLOCKSIZE*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+			assert(A[m][n]);
 		}
 	}
 
@@ -271,44 +274,44 @@ int main(int argc, char **argv)
 	 *
 	 *	Hilbert matrix : h(i,j) = 1/(i+j+1) ( + n In to make it stable )
 	 * */
-	for (y = 0; y < nblocks_p; y++)
-	for (x = 0; x < nblocks_p; x++)
-	if (x <= y)
+	for (m = 0; m < nblocks_p; m++)
+	for (n = 0; n < nblocks_p; n++)
+	if (n <= m)
 	{
-		unsigned i, j;
-		for (i = 0; i < BLOCKSIZE; i++)
-		for (j = 0; j < BLOCKSIZE; j++)
+		unsigned mm, nn;
+		for (mm = 0; mm < BLOCKSIZE; mm++)
+		for (nn = 0; nn < BLOCKSIZE; nn++)
 		{
-			A[y][x][i*BLOCKSIZE + j] =
-				(float)(1.0f/((float) (1.0+(x*BLOCKSIZE+i)+(y*BLOCKSIZE+j))));
+			A[m][n][mm*BLOCKSIZE + nn] =
+				(float)(1.0f/((float) (1.0+(n*BLOCKSIZE+mm)+(m*BLOCKSIZE+nn))));
 
 			/* make it a little more numerically stable ... ;) */
-			if ((x == y) && (i == j))
-				A[y][x][i*BLOCKSIZE + j] += (float)(2*size_p);
+			if ((n == m) && (mm == nn))
+				A[m][n][mm*BLOCKSIZE + nn] += (float)(2*size_p);
 		}
 	}
 #endif
 
-	for (y = 0; y < nblocks_p; y++)
-	for (x = 0; x < nblocks_p; x++)
+	for (m = 0; m < nblocks_p; m++)
+	for (n = 0; n < nblocks_p; n++)
 	{
-		if (x <= y)
+		if (n <= m)
 		{
-			starpu_matrix_data_register(&A_state[y][x], STARPU_MAIN_RAM, (uintptr_t)A[y][x],
+			starpu_matrix_data_register(&A_state[m][n], STARPU_MAIN_RAM, (uintptr_t)A[m][n],
 						    BLOCKSIZE, BLOCKSIZE, BLOCKSIZE, sizeof(float));
-			starpu_data_set_coordinates(A_state[y][x], 2, x, y);
+			starpu_data_set_coordinates(A_state[m][n], 2, n, m);
 		}
 	}
 
 	ret = cholesky_no_stride();
 
-	for (y = 0; y < nblocks_p; y++)
-	for (x = 0; x < nblocks_p; x++)
+	for (m = 0; m < nblocks_p; m++)
+	for (n = 0; n < nblocks_p; n++)
 	{
-		if (x <= y)
+		if (n <= m)
 		{
-			starpu_data_unregister(A_state[y][x]);
-			starpu_free_flags(A[y][x], BLOCKSIZE*BLOCKSIZE*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+			starpu_data_unregister(A_state[m][n]);
+			starpu_free_flags(A[m][n], BLOCKSIZE*BLOCKSIZE*sizeof(float), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
 		}
 	}
 

+ 5 - 5
examples/interface/complex_interface.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2012,2013                                Inria
  * Copyright (C) 2012-2015,2017,2018,2019                 CNRS
- * Copyright (C) 2012-2015,2018                           Université de Bordeaux
+ * Copyright (C) 2012-2015,2018-2019                      Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -73,8 +73,8 @@ static void complex_register_data_handle(starpu_data_handle_t handle, unsigned h
 		}
 		else
 		{
-			local_interface->real = 0;
-			local_interface->imaginary = 0;
+			local_interface->real = NULL;
+			local_interface->imaginary = NULL;
 		}
 	}
 }
@@ -83,8 +83,8 @@ static starpu_ssize_t complex_allocate_data_on_node(void *data_interface, unsign
 {
 	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) data_interface;
 
-	double *addr_real = 0;
-	double *addr_imaginary = 0;
+	double *addr_real = NULL;
+	double *addr_imaginary = NULL;
 	starpu_ssize_t requested_memory = complex_interface->nx * sizeof(complex_interface->real[0]);
 
 	addr_real = (double*) starpu_malloc_on_node(node, requested_memory);

+ 2 - 0
examples/lu/lu_example.c

@@ -444,7 +444,9 @@ int main(int argc, char **argv)
 		free(ipiv);
 #endif
 
+#ifndef STARPU_SIMGRID
 	starpu_free_flags(A, (size_t)size*size*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
+#endif
 
 	starpu_cublas_shutdown();
 

+ 3 - 1
examples/matvecmult/matvecmult.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2015                                Université de Bordeaux
+ * Copyright (C) 2010-2015, 2020                          Université de Bordeaux
  * Copyright (C) 2010-2017, 2019                          CNRS
  * Copyright (C) 2013                                     Inria
  *
@@ -34,6 +34,7 @@ void opencl_codelet(void *descr[], void *_args)
 	cl_mem mult = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(descr[2]);
 	int nx = STARPU_MATRIX_GET_NX(descr[0]);
 	int ny = STARPU_MATRIX_GET_NY(descr[0]);
+	int ld = STARPU_MATRIX_GET_LD(descr[0]);
 
         id = starpu_worker_get_id_check();
         devid = starpu_worker_get_devid(id);
@@ -47,6 +48,7 @@ void opencl_codelet(void *descr[], void *_args)
         err |= clSetKernelArg(kernel, n++, sizeof(nx), (void*)&nx);
         err |= clSetKernelArg(kernel, n++, sizeof(ny), (void*)&ny);
 	err |= clSetKernelArg(kernel, n++, sizeof(mult), &mult);
+	err |= clSetKernelArg(kernel, n++, sizeof(ld), (void*)&ld);
         if (err) STARPU_OPENCL_REPORT_ERROR(err);
 
 	{

+ 3 - 3
examples/matvecmult/matvecmult_kernel.cl

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010,2011,2017                           CNRS
- * Copyright (C) 2014                                     Université de Bordeaux
+ * Copyright (C) 2014,2020                                Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -15,7 +15,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-__kernel void matVecMult(const __global float *A, const __global float *X, int n, int m, __global float *Y)
+__kernel void matVecMult(const __global float *A, const __global float *X, int n, int m, __global float *Y, int ld)
 {
 	const int i = get_global_id(0);
 	if (i < m)
@@ -24,7 +24,7 @@ __kernel void matVecMult(const __global float *A, const __global float *X, int n
 		int j;
 
 		for (j = 0; j < n; j++)
-		       val += A[i*n+j] * X[j];
+		       val += A[i*ld+j] * X[j];
 
 		Y[i] = val;
 	}

+ 10 - 0
examples/native_fortran/nf_matrix.f90

@@ -29,6 +29,8 @@ program nf_matrix
         type(c_ptr) :: dh_mb    ! a pointer for the 'mb' vector data handle
         integer(c_int) :: err   ! return status for fstarpu_init
         integer(c_int) :: ncpu  ! number of cpus workers
+        real(c_double) :: start_time ! start clock in usec
+        real(c_double) :: end_time   ! end clock in usec
 
         allocate(ma(5,6))
         do i=1,5
@@ -57,6 +59,9 @@ program nf_matrix
                 stop 77
         end if
 
+        ! collect the start clock time
+        start_time = fstarpu_timing_now()
+
         ! allocate an empty codelet structure
         cl_mat = fstarpu_codelet_allocate()
 
@@ -102,10 +107,15 @@ program nf_matrix
         ! free codelet structure
         call fstarpu_codelet_free(cl_mat)
 
+        ! collect the end clock time
+        end_time = fstarpu_timing_now()
+
         ! shut StarPU down
         call fstarpu_shutdown()
 
         deallocate(mb)
         deallocate(ma)
 
+        print "(es 10.3)", end_time - start_time
+
 end program nf_matrix

+ 13 - 7
examples/perf_monitoring/perf_counters_02.c

@@ -16,6 +16,7 @@
 
 #include <starpu.h>
 #include <assert.h>
+#include <inttypes.h>
 
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
 
@@ -42,7 +43,7 @@ void g_listener_cb(struct starpu_perf_counter_listener *listener, struct starpu_
 	int64_t g_total_submitted = starpu_perf_counter_sample_get_int64_value(sample, id_g_total_submitted);
 	int64_t g_peak_submitted = starpu_perf_counter_sample_get_int64_value(sample, id_g_peak_submitted);
 	int64_t g_peak_ready = starpu_perf_counter_sample_get_int64_value(sample, id_g_peak_ready);
-	printf("global: g_total_submitted = %ld, g_peak_submitted = %ld, g_peak_ready = %ld\n", g_total_submitted, g_peak_submitted, g_peak_ready);
+	printf("global: g_total_submitted = %"PRId64", g_peak_submitted = %"PRId64", g_peak_ready = %"PRId64"\n", g_total_submitted, g_peak_submitted, g_peak_ready);
 }
 
 void w_listener_cb(struct starpu_perf_counter_listener *listener, struct starpu_perf_counter_sample *sample, void *context)
@@ -53,7 +54,7 @@ void w_listener_cb(struct starpu_perf_counter_listener *listener, struct starpu_
 	int64_t w_total_executed = starpu_perf_counter_sample_get_int64_value(sample, id_w_total_executed);
 	double w_cumul_execution_time = starpu_perf_counter_sample_get_double_value(sample, id_w_cumul_execution_time);
 
-	printf("worker[%d]: w_total_executed = %ld, w_cumul_execution_time = %lf\n", workerid, w_total_executed, w_cumul_execution_time);
+	printf("worker[%d]: w_total_executed = %"PRId64", w_cumul_execution_time = %lf\n", workerid, w_total_executed, w_cumul_execution_time);
 }
 
 void c_listener_cb(struct starpu_perf_counter_listener *listener, struct starpu_perf_counter_sample *sample, void *context)
@@ -65,13 +66,13 @@ void c_listener_cb(struct starpu_perf_counter_listener *listener, struct starpu_
 	int64_t c_peak_ready = starpu_perf_counter_sample_get_int64_value(sample, id_c_peak_ready);
 	int64_t c_total_executed = starpu_perf_counter_sample_get_int64_value(sample, id_c_total_executed);
 	double c_cumul_execution_time = starpu_perf_counter_sample_get_double_value(sample, id_c_cumul_execution_time);
-	if (cl->name == NULL)
+	if (cl->name != NULL)
 	{
-		printf("codelet[%s]: c_total_submitted = %ld, c_peak_submitted = %ld, c_peak_ready = %ld, c_total_executed = %ld, c_cumul_execution_time = %lf\n", cl->name, c_total_submitted, c_peak_submitted, c_peak_ready, c_total_executed, c_cumul_execution_time);
+		printf("codelet[%s]: c_total_submitted = %"PRId64", c_peak_submitted = %"PRId64", c_peak_ready = %"PRId64", c_total_executed = %"PRId64", c_cumul_execution_time = %lf\n", cl->name, c_total_submitted, c_peak_submitted, c_peak_ready, c_total_executed, c_cumul_execution_time);
 	}
 	else
 	{
-		printf("codelet[%p]: c_total_submitted = %ld, c_peak_submitted = %ld, c_peak_ready = %ld, c_total_executed = %ld, c_cumul_execution_time = %lf\n", cl, c_total_submitted, c_peak_submitted, c_peak_ready, c_total_executed, c_cumul_execution_time);
+		printf("codelet[%p]: c_total_submitted = %"PRId64", c_peak_submitted = %"PRId64", c_peak_ready = %"PRId64", c_total_executed = %"PRId64", c_cumul_execution_time = %lf\n", cl, c_total_submitted, c_peak_submitted, c_peak_ready, c_total_executed, c_cumul_execution_time);
 	}
 }
 
@@ -107,9 +108,14 @@ const enum starpu_perf_counter_scope c_scope = starpu_perf_counter_scope_per_cod
 
 int main(int argc, char **argv)
 {
-	int ret;
+	struct starpu_conf conf;
+	starpu_conf_init(&conf);
+	
+	/* Start collecting performance counters right after initialization */
+	conf.start_perf_counter_collection = 1;
 
-	ret = starpu_init(NULL);
+	int ret;
+	ret = starpu_init(&conf);
 	if (ret == -ENODEV)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

+ 6 - 2
examples/pipeline/pipeline.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2012-2015,2017,2019                      CNRS
- * Copyright (C) 2012,2014-2017                           Université de Bordeaux
+ * Copyright (C) 2012,2014-2017,2019                      Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -177,6 +177,10 @@ static struct starpu_codelet pipeline_codelet_sum =
 	.model = &pipeline_model_sum
 };
 
+static void release_sem(void *arg) {
+	sem_post(arg);
+};
+
 int main(void)
 {
 	int ret = 0;
@@ -243,7 +247,7 @@ int main(void)
 
 		ret = starpu_task_insert(&pipeline_codelet_sum,
 				STARPU_R, buffersY[l%K],
-				STARPU_CALLBACK_WITH_ARG_NFREE, (void (*)(void*))sem_post, &sems[l%C],
+				STARPU_CALLBACK_WITH_ARG_NFREE, release_sem, &sems[l%C],
 				STARPU_TAG_ONLY, (starpu_tag_t) l,
 				0);
 		if (ret == -ENODEV) goto enodev;

+ 1 - 1
examples/sched_ctx/axpy_partition_gpu.h

@@ -126,7 +126,7 @@ static void buildPartitionedBlockMapping(F cudaFun, int threads, int shmem, int
 
   cudaMemcpyAsync((void*)block_assignment_d,block_assignment,sizeof(block_assignment),cudaMemcpyHostToDevice, current_stream);
   //cudaMemcpy((void*)block_assignment_d,block_assignment,sizeof(block_assignment),cudaMemcpyHostToDevice);
-  //cudaThreadSynchronize();
+  //cudaDeviceSynchronize();
 }
 
 

+ 29 - 43
examples/spmv/spmv.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2011-2013                                Inria
  * Copyright (C) 2010-2013,2015,2017                      CNRS
- * Copyright (C) 2009-2011,2013-2015                      Université de Bordeaux
+ * Copyright (C) 2009-2011,2013-2015,2020                 Université de Bordeaux
  * Copyright (C) 2010                                     Mehdi Juhoor
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -48,46 +48,10 @@ static void parse_args(int argc, char **argv)
 	}
 }
 
-/* This filter function takes a CSR matrix, and divides it into nparts with the
- * same number of rows. */
-static void csr_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts)
-{
-	(void)f;
-	struct starpu_csr_interface *csr_father = (struct starpu_csr_interface *) father_interface;
-	struct starpu_csr_interface *csr_child = (struct starpu_csr_interface *) child_interface;
-
-	uint32_t nrow = csr_father->nrow;
-	size_t elemsize = csr_father->elemsize;
-	uint32_t firstentry = csr_father->firstentry;
-
-	/* Every sub-parts should contain the same number of non-zero entries */
-	uint32_t chunk_size = (nrow + nparts - 1)/nparts;
-	uint32_t *rowptr = csr_father->rowptr;
-
-	uint32_t first_index = id*chunk_size - firstentry;
-	uint32_t local_firstentry = rowptr[first_index];
-
-	uint32_t child_nrow = STARPU_MIN(chunk_size, nrow - id*chunk_size);
-	uint32_t local_nnz = rowptr[first_index + child_nrow] - rowptr[first_index];
-
-	csr_child->id = csr_father->id;
-	csr_child->nnz = local_nnz;
-	csr_child->nrow = child_nrow;
-	csr_child->firstentry = local_firstentry;
-	csr_child->elemsize = elemsize;
-
-	if (csr_father->nzval)
-	{
-		csr_child->rowptr = &csr_father->rowptr[first_index];
-		csr_child->colind = &csr_father->colind[local_firstentry];
-		csr_child->nzval = csr_father->nzval + local_firstentry * elemsize;
-	}
-}
-
 /* partition the CSR matrix along a block distribution */
 static struct starpu_data_filter csr_f =
 {
-	.filter_func = csr_filter_func,
+	.filter_func = starpu_csr_filter_vertical_block,
 	/* This value is defined later on */
 	.nchildren = -1,
 	/* the children also use a csr interface */
@@ -136,6 +100,7 @@ int main(int argc, char **argv)
 	/* Input and Output vectors */
 	float *vector_in_ptr;
 	float *vector_out_ptr;
+	float *vector_exp_out_ptr;
 
 	/*
 	 *	Parse command-line arguments
@@ -159,6 +124,9 @@ int main(int argc, char **argv)
 	starpu_malloc((void **)&rowptr, (size+1)*sizeof(uint32_t));
 	assert(nzval && colind && rowptr);
 
+#define UPPER_BAND 1.
+#define MIDDLE_BAND 5.
+#define LOWER_BAND 1.
 	/* fill the matrix */
 	for (row = 0, pos = 0; row < size; row++)
 	{
@@ -166,18 +134,18 @@ int main(int argc, char **argv)
 
 		if (row > 0)
 		{
-			nzval[pos] = 1.0f;
+			nzval[pos] = LOWER_BAND;
 			colind[pos] = row-1;
 			pos++;
 		}
 		
-		nzval[pos] = 5.0f;
+		nzval[pos] = MIDDLE_BAND;
 		colind[pos] = row;
 		pos++;
 
 		if (row < size - 1)
 		{
-			nzval[pos] = 1.0f;
+			nzval[pos] = UPPER_BAND;
 			colind[pos] = row+1;
 			pos++;
 		}
@@ -190,12 +158,13 @@ int main(int argc, char **argv)
 	/* initiate the 2 vectors */
 	starpu_malloc((void **)&vector_in_ptr, size*sizeof(float));
 	starpu_malloc((void **)&vector_out_ptr, size*sizeof(float));
-	assert(vector_in_ptr && vector_out_ptr);
+	starpu_malloc((void **)&vector_exp_out_ptr, size*sizeof(float));
+	assert(vector_in_ptr && vector_out_ptr && vector_exp_out_ptr);
 
 	/* fill them */
 	for (ind = 0; ind < size; ind++)
 	{
-		vector_in_ptr[ind] = 2.0f;
+		vector_in_ptr[ind] = ind % 100;
 		vector_out_ptr[ind] = 0.0f;
 	}
 
@@ -267,11 +236,28 @@ int main(int argc, char **argv)
                 FPRINTF(stdout, "%2.2f\t%2.2f\n", vector_in_ptr[row], vector_out_ptr[row]);
 	}
 
+	/* Check the result */
+	memset(vector_exp_out_ptr, 0, sizeof(vector_exp_out_ptr[0])*size);
+	for (row = 0; row < size; row++)
+	{
+		if (row > 0)
+			vector_exp_out_ptr[row] += LOWER_BAND * vector_in_ptr[row-1];
+		vector_exp_out_ptr[row] += MIDDLE_BAND * vector_in_ptr[row];
+		if (row < size-1)
+			vector_exp_out_ptr[row] += UPPER_BAND * vector_in_ptr[row+1];
+	}
+	for (row = 0; row < size; row++)
+		if (vector_out_ptr[row] != vector_exp_out_ptr[row]) {
+			FPRINTF(stderr, "check failed at %u: %f vs expected %f\n", row, vector_out_ptr[row], vector_exp_out_ptr[row]);
+			exit(EXIT_FAILURE);
+		}
+
 	starpu_free(nzval);
 	starpu_free(colind);
 	starpu_free(rowptr);
 	starpu_free(vector_in_ptr);
 	starpu_free(vector_out_ptr);
+	starpu_free(vector_exp_out_ptr);
 
 	/*
 	 *	Stop StarPU

+ 103 - 0
include/fstarpu_mod.f90

@@ -784,6 +784,95 @@ module fstarpu_mod
 
                 ! void *starpu_data_get_interface_on_node(starpu_data_handle_t handle, unsigned memory_node);
 
+                ! == starpu_data_interface.h: tensor ==
+
+                ! void starpu_tensor_data_register(starpu_data_handle_t *handle, unsigned home_node, uintptr_t ptr, uint32_t ldy, uint32_t ldz, uint32_t ldt, uint32_t nx, uint32_t ny, uint32_t nz, uint32_t nt, size_t elemsize);
+                subroutine fstarpu_tensor_data_register(dh, home_node, ptr, ldy, ldz, ldt, nx, ny, nz, nt, elt_size) &
+                                bind(C,name="starpu_tensor_data_register")
+                        use iso_c_binding, only: c_ptr, c_int, c_size_t
+                        type(c_ptr), intent(out) :: dh
+                        integer(c_int), value, intent(in) :: home_node
+                        type(c_ptr), value, intent(in) :: ptr
+                        integer(c_int), value, intent(in) :: ldy
+                        integer(c_int), value, intent(in) :: ldz
+                        integer(c_int), value, intent(in) :: ldt
+                        integer(c_int), value, intent(in) :: nx
+                        integer(c_int), value, intent(in) :: ny
+                        integer(c_int), value, intent(in) :: nz
+                        integer(c_int), value, intent(in) :: nt
+                        integer(c_size_t), value, intent(in) :: elt_size
+                end subroutine fstarpu_tensor_data_register
+
+                ! void starpu_tensor_ptr_register(starpu_data_handle_t handle, unsigned node, uintptr_t ptr, uintptr_t dev_handle, size_t offset, uint32_t ldy, uint32_t ldz, uint32_t ldt);
+                subroutine fstarpu_tensor_ptr_register(dh, node, ptr, dev_handle, offset, ldy, ldz, ldt) &
+                                bind(C,name="starpu_tensor_ptr_register")
+                        use iso_c_binding, only: c_ptr, c_int, c_size_t
+                        type(c_ptr), intent(out) :: dh
+                        integer(c_int), value, intent(in) :: node
+                        type(c_ptr), value, intent(in) :: ptr
+                        type(c_ptr), value, intent(in) :: dev_handle
+                        integer(c_size_t), value, intent(in) :: offset
+                        integer(c_int), value, intent(in) :: ldy
+                        integer(c_int), value, intent(in) :: ldz
+                        integer(c_int), value, intent(in) :: ldt
+                end subroutine fstarpu_tensor_ptr_register
+
+                function fstarpu_tensor_get_ptr(buffers, i) bind(C)
+                        use iso_c_binding, only: c_ptr, c_int
+                        type(c_ptr) :: fstarpu_tensor_get_ptr
+                        type(c_ptr), value, intent(in) :: buffers
+                        integer(c_int), value, intent(in) :: i
+                end function fstarpu_tensor_get_ptr
+
+                function fstarpu_tensor_get_ldy(buffers, i) bind(C)
+                        use iso_c_binding, only: c_ptr, c_int
+                        integer(c_int) :: fstarpu_tensor_get_ldy
+                        type(c_ptr), value, intent(in) :: buffers
+                        integer(c_int), value, intent(in) :: i
+                end function fstarpu_tensor_get_ldy
+
+                function fstarpu_tensor_get_ldz(buffers, i) bind(C)
+                        use iso_c_binding, only: c_ptr, c_int
+                        integer(c_int) :: fstarpu_tensor_get_ldz
+                        type(c_ptr), value, intent(in) :: buffers
+                        integer(c_int), value, intent(in) :: i
+                end function fstarpu_tensor_get_ldz
+
+                function fstarpu_tensor_get_ldt(buffers, i) bind(C)
+                        use iso_c_binding, only: c_ptr, c_int
+                        integer(c_int) :: fstarpu_tensor_get_ldt
+                        type(c_ptr), value, intent(in) :: buffers
+                        integer(c_int), value, intent(in) :: i
+                end function fstarpu_tensor_get_ldt
+
+                function fstarpu_tensor_get_nx(buffers, i) bind(C)
+                        use iso_c_binding, only: c_ptr, c_int
+                        integer(c_int) :: fstarpu_tensor_get_nx
+                        type(c_ptr), value, intent(in) :: buffers
+                        integer(c_int), value, intent(in) :: i
+                end function fstarpu_tensor_get_nx
+
+                function fstarpu_tensor_get_ny(buffers, i) bind(C)
+                        use iso_c_binding, only: c_ptr, c_int
+                        integer(c_int) :: fstarpu_tensor_get_ny
+                        type(c_ptr), value, intent(in) :: buffers
+                        integer(c_int), value, intent(in) :: i
+                end function fstarpu_tensor_get_ny
+
+                function fstarpu_tensor_get_nz(buffers, i) bind(C)
+                        use iso_c_binding, only: c_ptr, c_int
+                        integer(c_int) :: fstarpu_tensor_get_nz
+                        type(c_ptr), value, intent(in) :: buffers
+                        integer(c_int), value, intent(in) :: i
+                end function fstarpu_tensor_get_nz
+
+                function fstarpu_tensor_get_nt(buffers, i) bind(C)
+                        use iso_c_binding, only: c_ptr, c_int
+                        integer(c_int) :: fstarpu_tensor_get_nt
+                        type(c_ptr), value, intent(in) :: buffers
+                        integer(c_int), value, intent(in) :: i
+                end function fstarpu_tensor_get_nt
+
                 ! == starpu_data_interface.h: block ==
 
                 ! void starpu_block_data_register(starpu_data_handle_t *handle, unsigned home_node, uintptr_t ptr, uint32_t ldy, uint32_t ldz, uint32_t nx, uint32_t ny, uint32_t nz, size_t elemsize);
@@ -2161,6 +2250,13 @@ module fstarpu_mod
                         use iso_c_binding, only: c_long
                         integer(c_long), value, intent(in) :: code
                 end subroutine fstarpu_trace_user_event
+
+                ! double starpu_timing_now(void)
+                function fstarpu_timing_now () bind(C,name="starpu_timing_now")
+                        use iso_c_binding, only: c_double
+                        real(c_double) :: fstarpu_timing_now
+                end function fstarpu_timing_now
+
         end interface
 
         contains
@@ -2377,4 +2473,11 @@ module fstarpu_mod
                         integer :: i
                         fstarpu_int_to_cptr = transfer(int(i,kind=c_intptr_t),C_NULL_PTR)
                 end function fstarpu_int_to_cptr
+
+                ! Note: do not add binding declarations here in 'CONTAINS'
+                ! section, because the compiler generates empty functions for
+                ! them.
+                ! Instead, put binding declarations in the 'INTERFACE' section
+                ! above.
+
 end module fstarpu_mod

+ 6 - 0
include/starpu.h

@@ -435,6 +435,12 @@ struct starpu_conf
 	   \ref STARPU_CATCH_SIGNALS
 	 */
 	int catch_signals;
+
+	/**
+	   Specify whether StarPU should automatically start to collect
+	   performance counters after initialization
+	 */
+	unsigned start_perf_counter_collection;
 };
 
 /**

+ 1 - 0
include/starpu_config.h.in

@@ -78,6 +78,7 @@
 #undef STARPU_OPENGL_RENDER
 #undef STARPU_USE_GTK
 #undef STARPU_HAVE_X11
+#undef STARPU_PAPI
 
 #undef STARPU_HAVE_POSIX_MEMALIGN
 

+ 40 - 2
include/starpu_cuda.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012,2014                           Université de Bordeaux
+ * Copyright (C) 2010-2012,2014,2020                      Université de Bordeaux
  * Copyright (C) 2011                                     Inria
  * Copyright (C) 2010-2013,2015,2017,2019                 CNRS
  *
@@ -64,7 +64,7 @@ void starpu_cuda_report_error(const char *func, const char *file, int line, cuda
    stream by hand. Note that the application is not forced to use the
    stream provided by starpu_cuda_get_local_stream() and may also
    create its own streams. Synchronizing with
-   <c>cudaThreadSynchronize()</c> is allowed, but will reduce the
+   <c>cudaDeviceSynchronize()</c> is allowed, but will reduce the
    likelihood of having all transfers overlapped.
 */
 cudaStream_t starpu_cuda_get_local_stream(void);
@@ -87,6 +87,44 @@ const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid
 int starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node, void *dst_ptr, unsigned dst_node, size_t ssize, cudaStream_t stream, enum cudaMemcpyKind kind);
 
 /**
+   Copy \p numblocks blocks of \p blocksize bytes from the pointer \p src_ptr on
+   \p src_node to the pointer \p dst_ptr on \p dst_node.
+
+   The blocks start at addresses which are \p ld_src (resp. \p ld_dst) bytes
+   apart in the source (resp. destination) interface.
+
+   The function first tries to copy the data asynchronously (unless \p stream
+   is <c>NULL</c>). If the asynchronous copy fails or if \p stream is
+   <c>NULL</c>, it copies the data synchronously. The function returns
+   <c>-EAGAIN</c> if the asynchronous launch was successful. It returns 0 if
+   the synchronous copy was successful, or fails otherwise.
+*/
+int starpu_cuda_copy2d_async_sync(void *src_ptr, unsigned src_node, void *dst_ptr, unsigned dst_node,
+				  size_t blocksize,
+				  size_t numblocks, size_t ld_src, size_t ld_dst,
+				  cudaStream_t stream, enum cudaMemcpyKind kind);
+
+/**
+   Copy \p numblocks_1 * \p numblocks_2 blocks of \p blocksize bytes from the
+   pointer \p src_ptr on \p src_node to the pointer \p dst_ptr on \p dst_node.
+
+   The blocks are grouped by \p numblocks_1 blocks whose start addresses are
+   \p ld1_src (resp. \p ld1_dst) bytes apart in the source (resp. destination)
+   interface.
+
+   Such groups are grouped by \p numblocks_2 groups whose start addresses are
+   \p ld2_src (resp. \p ld2_dst) bytes apart in the source (resp. destination)
+   interface.
+
+   The function first tries to copy the data asynchronously (unless \p stream
+   is <c>NULL</c>). If the asynchronous copy fails or if \p stream is
+   <c>NULL</c>, it copies the data synchronously. The function returns
+   <c>-EAGAIN</c> if the asynchronous launch was successful. It returns 0 if
+   the synchronous copy was successful, or fails otherwise.
+*/
+int starpu_cuda_copy3d_async_sync(void *src_ptr, unsigned src_node, void *dst_ptr, unsigned dst_node,
+				  size_t blocksize,
+				  size_t numblocks_1, size_t ld1_src, size_t ld1_dst,
+				  size_t numblocks_2, size_t ld2_src, size_t ld2_dst,
+				  cudaStream_t stream, enum cudaMemcpyKind kind);
+
+/**
    Call <c>cudaSetDevice(\p devid)</c> or <c>cudaGLSetGLDevice(\p devid)</c>,
    according to whether \p devid is among the field
    starpu_conf::cuda_opengl_interoperability.

+ 12 - 1
include/starpu_data_filters.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012,2014,2015,2017,2019            Université de Bordeaux
+ * Copyright (C) 2009-2012,2014,2015,2017,2019-2020       Université de Bordeaux
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010-2013,2015,2017,2018,2019            CNRS
  * Copyright (C) 2011                                     Inria
@@ -323,6 +323,13 @@ void starpu_bcsr_filter_canonical_block(void *father_interface, void *child_inte
 */
 struct starpu_data_interface_ops *starpu_bcsr_filter_canonical_block_child_ops(struct starpu_data_filter *f, unsigned child);
 
+/**
+   Partition a block-sparse matrix into block-sparse matrices.
+
+   The split is done along the leading dimension, i.e. along adjacent nnz blocks.
+*/
+void starpu_bcsr_filter_vertical_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);
+
 /** @} */
 
 /**
@@ -344,6 +351,8 @@ void starpu_csr_filter_vertical_block(void *father_interface, void *child_interf
    Predefined partitioning functions for matrix
    data. Examples on how to use them are shown in \ref
    PartitioningData.
+   Note: this is using the C element order which is row-major, i.e. elements
+   with consecutive x coordinates are consecutive in memory.
    @{
 */
 
@@ -450,6 +459,8 @@ void starpu_vector_filter_divide_in_2(void *father_interface, void *child_interf
    Predefined partitioning functions for block data. Examples on how
    to use them are shown in \ref PartitioningData. An example is
    available in \c examples/filters/shadow3d.c
+   Note: this is using the C element order which is row-major, i.e. elements
+   with consecutive x coordinates are consecutive in memory.
    @{
 */
 

+ 274 - 4
include/starpu_data_interfaces.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2019                                Université de Bordeaux
+ * Copyright (C) 2009-2020                                Université de Bordeaux
  * Copyright (C) 2011-2014,2016,2017                      Inria
  * Copyright (C) 2010-2015,2017,2019                           CNRS
  *
@@ -369,7 +369,8 @@ enum starpu_data_interface_id
 	STARPU_VOID_INTERFACE_ID=6, /**< Identifier for the void data interface*/
 	STARPU_MULTIFORMAT_INTERFACE_ID=7, /**< Identifier for the multiformat data interface*/
 	STARPU_COO_INTERFACE_ID=8, /**< Identifier for the COO data interface*/
-	STARPU_MAX_INTERFACE_ID=9 /**< Maximum number of data interfaces */
+	STARPU_TENSOR_INTERFACE_ID=9, /**< Identifier for the tensor data interface*/
+	STARPU_MAX_INTERFACE_ID=10 /**< Maximum number of data interfaces */
 };
 
 /**
@@ -699,7 +700,90 @@ int starpu_data_interface_get_next_id(void);
    be passed to starpu_interface_copy(). this returns <c>-EAGAIN</c> if the
    transfer is still ongoing, or 0 if the transfer is already completed.
 */
-int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, void *async_data);
+int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node,
+			  uintptr_t dst, size_t dst_offset, unsigned dst_node,
+			  size_t size, void *async_data);
+
+/**
+   Copy \p numblocks blocks of \p blocksize bytes from byte offset \p src_offset
+   of \p src on \p src_node to byte offset \p dst_offset of \p dst on \p
+   dst_node.
+
+   The blocks start at addresses which are \p ld_src (resp. \p ld_dst) bytes
+   apart in the source (resp. destination) interface.
+
+   If \p blocksize == \p ld_src == \p ld_dst, the transfer is optimized into a
+   single starpu_interface_copy() call.
+
+   This is to be used in the starpu_data_copy_methods::any_to_any copy
+   method for 2D data, which is provided with \p async_data to be passed to
+   starpu_interface_copy(). This returns <c>-EAGAIN</c> if the transfer is still
+   ongoing, or 0 if the transfer is already completed.
+*/
+int starpu_interface_copy2d(uintptr_t src, size_t src_offset, unsigned src_node,
+			    uintptr_t dst, size_t dst_offset, unsigned dst_node,
+			    size_t blocksize,
+			    size_t numblocks, size_t ld_src, size_t ld_dst,
+			    void *async_data);
+
+/**
+   Copy \p numblocks1 * \p numblocks2 blocks of \p blocksize bytes from byte
+   offset \p src_offset of \p src on \p src_node to byte offset \p dst_offset of
+   \p dst on \p dst_node.
+
+   The blocks are grouped by \p numblocks1 blocks whose start addresses are
+   \p ld1_src (resp. \p ld1_dst) bytes apart in the source (resp. destination)
+   interface.
+
+   Such groups are grouped by \p numblocks2 groups whose start addresses are
+   \p ld2_src (resp. \p ld2_dst) bytes apart in the source (resp. destination)
+   interface.
+
+   If the blocks are contiguous, the transfers will be optimized.
+
+   This is to be used in the starpu_data_copy_methods::any_to_any copy
+   method for 3D data, which is provided with \p async_data to be passed to
+   starpu_interface_copy(). This returns <c>-EAGAIN</c> if the transfer is still
+   ongoing, or 0 if the transfer is already completed.
+*/
+int starpu_interface_copy3d(uintptr_t src, size_t src_offset, unsigned src_node,
+			    uintptr_t dst, size_t dst_offset, unsigned dst_node,
+			    size_t blocksize,
+			    size_t numblocks1, size_t ld1_src, size_t ld1_dst,
+			    size_t numblocks2, size_t ld2_src, size_t ld2_dst,
+			    void *async_data);
+
+/**
+   Copy \p numblocks1 * \p numblocks2 * \p numblocks3 blocks of \p blocksize
+   bytes from byte offset \p src_offset of \p src on \p src_node to byte offset
+   \p dst_offset of \p dst on \p dst_node.
+
+   The blocks are grouped by \p numblocks1 blocks whose start addresses are
+   \p ld1_src (resp. \p ld1_dst) bytes apart in the source (resp. destination)
+   interface.
+
+   Such groups are grouped by \p numblocks2 groups whose start addresses are
+   \p ld2_src (resp. \p ld2_dst) bytes apart in the source (resp. destination)
+   interface.
+
+   Such groups are grouped by \p numblocks3 groups whose start addresses are
+   \p ld3_src (resp. \p ld3_dst) bytes apart in the source (resp. destination)
+   interface.
+
+   If the blocks are contiguous, the transfers will be optimized.
+
+   This is to be used in the starpu_data_copy_methods::any_to_any copy
+   method for 4D data, which is provided with \p async_data to be passed to
+   starpu_interface_copy(). This returns <c>-EAGAIN</c> if the transfer is still
+   ongoing, or 0 if the transfer is already completed.
+*/
+int starpu_interface_copy4d(uintptr_t src, size_t src_offset, unsigned src_node,
+			    uintptr_t dst, size_t dst_offset, unsigned dst_node,
+			    size_t blocksize,
+			    size_t numblocks1, size_t ld1_src, size_t ld1_dst,
+			    size_t numblocks2, size_t ld2_src, size_t ld2_dst,
+			    size_t numblocks3, size_t ld3_src, size_t ld3_dst,
+			    void *async_data);
 
 /**
    When an asynchronous implementation of the data transfer is implemented, the call
@@ -1176,6 +1260,187 @@ designated by \p interface.
 /** @} */
 
 /**
+   @name Tensor Data Interface
+   @{
+*/
+
+extern struct starpu_data_interface_ops starpu_interface_tensor_ops;
+
+/* TODO: rename to 4dtensor? */
+/* TODO: add allocsize support */
+/**
+   Tensor interface for 4D dense tensors
+*/
+struct starpu_tensor_interface
+{
+	enum starpu_data_interface_id id; /**< identifier of the interface */
+
+	uintptr_t ptr;                    /**< local pointer of the tensor */
+	uintptr_t dev_handle;             /**< device handle of the tensor. */
+	size_t offset;                    /**< offset in the tensor. */
+	uint32_t nx;                      /**< number of elements on the x-axis of the tensor. */
+	uint32_t ny;                      /**< number of elements on the y-axis of the tensor. */
+	uint32_t nz;                      /**< number of elements on the z-axis of the tensor. */
+	uint32_t nt;                      /**< number of elements on the t-axis of the tensor. */
+	uint32_t ldy;                     /**< number of elements between two lines */
+	uint32_t ldz;                     /**< number of elements between two planes */
+	uint32_t ldt;                     /**< number of elements between two cubes */
+	size_t elemsize;                  /**< size of the elements of the tensor. */
+};
+
+/**
+   Register the \p nx x \p ny x \p nz x \p nt 4D tensor of \p elemsize byte elements
+   pointed by \p ptr and initialize \p handle to represent it. Again, \p ldy,
+   \p ldz, and \p ldt specify the number of elements between rows, between z planes and between t cubes.
+
+   Here an example of how to use the function.
+   \code{.c}
+   float *tensor;
+   starpu_data_handle_t tensor_handle;
+   tensor = (float*)malloc(nx*ny*nz*nt*sizeof(float));
+   starpu_tensor_data_register(&tensor_handle, STARPU_MAIN_RAM, (uintptr_t)tensor, nx, nx*ny, nx*ny*nz, nx, ny, nz, nt, sizeof(float));
+   \endcode
+*/
+void starpu_tensor_data_register(starpu_data_handle_t *handle, int home_node, uintptr_t ptr, uint32_t ldy, uint32_t ldz, uint32_t ldt, uint32_t nx, uint32_t ny, uint32_t nz, uint32_t nt, size_t elemsize);
+
+/**
+   Register into the \p handle that to store data on node \p node it should use the
+   buffer located at \p ptr, or device handle \p dev_handle and offset \p offset
+   (for OpenCL, notably), with \p ldy elements between rows, and \p ldz
+   elements between z planes, and \p ldt elements between t cubes.
+*/
+void starpu_tensor_ptr_register(starpu_data_handle_t handle, unsigned node, uintptr_t ptr, uintptr_t dev_handle, size_t offset, uint32_t ldy, uint32_t ldz, uint32_t ldt);
+
+/**
+   Return the number of elements on the x-axis of the tensor
+   designated by \p handle.
+ */
+uint32_t starpu_tensor_get_nx(starpu_data_handle_t handle);
+
+/**
+   Return the number of elements on the y-axis of the tensor
+   designated by \p handle.
+ */
+uint32_t starpu_tensor_get_ny(starpu_data_handle_t handle);
+
+/**
+   Return the number of elements on the z-axis of the tensor
+   designated by \p handle.
+ */
+uint32_t starpu_tensor_get_nz(starpu_data_handle_t handle);
+
+/**
+   Return the number of elements on the t-axis of the tensor
+   designated by \p handle.
+ */
+uint32_t starpu_tensor_get_nt(starpu_data_handle_t handle);
+
+/**
+   Return the number of elements between each row of the tensor
+   designated by \p handle, in the format of the current memory node.
+*/
+uint32_t starpu_tensor_get_local_ldy(starpu_data_handle_t handle);
+
+/**
+   Return the number of elements between each z plane of the tensor
+   designated by \p handle, in the format of the current memory node.
+ */
+uint32_t starpu_tensor_get_local_ldz(starpu_data_handle_t handle);
+
+/**
+   Return the number of elements between each t cube of the tensor
+   designated by \p handle, in the format of the current memory node.
+ */
+uint32_t starpu_tensor_get_local_ldt(starpu_data_handle_t handle);
+
+/**
+   Return the local pointer associated with \p handle.
+ */
+uintptr_t starpu_tensor_get_local_ptr(starpu_data_handle_t handle);
+
+/**
+   Return the size of the elements of the tensor designated by
+   \p handle.
+ */
+size_t starpu_tensor_get_elemsize(starpu_data_handle_t handle);
+
+#if defined(STARPU_HAVE_STATEMENT_EXPRESSIONS) && defined(STARPU_DEBUG)
+#define STARPU_TENSOR_CHECK(interface)           STARPU_ASSERT_MSG((((struct starpu_tensor_interface *)(interface))->id) == STARPU_TENSOR_INTERFACE_ID, "Error. The given data is not a tensor.")
+#define STARPU_TENSOR_GET_PTR(interface)	        ({ STARPU_TENSOR_CHECK(interface); (((struct starpu_tensor_interface *)(interface))->ptr) ; })
+#define STARPU_TENSOR_GET_DEV_HANDLE(interface)	({ STARPU_TENSOR_CHECK(interface); (((struct starpu_tensor_interface *)(interface))->dev_handle) ; })
+#define STARPU_TENSOR_GET_OFFSET(interface)	({ STARPU_TENSOR_CHECK(interface); (((struct starpu_tensor_interface *)(interface))->offset) ; })
+#define STARPU_TENSOR_GET_NX(interface)	        ({ STARPU_TENSOR_CHECK(interface); (((struct starpu_tensor_interface *)(interface))->nx) ; })
+#define STARPU_TENSOR_GET_NY(interface)	        ({ STARPU_TENSOR_CHECK(interface); (((struct starpu_tensor_interface *)(interface))->ny) ; })
+#define STARPU_TENSOR_GET_NZ(interface)	        ({ STARPU_TENSOR_CHECK(interface); (((struct starpu_tensor_interface *)(interface))->nz) ; })
+#define STARPU_TENSOR_GET_NT(interface)	        ({ STARPU_TENSOR_CHECK(interface); (((struct starpu_tensor_interface *)(interface))->nt) ; })
+#define STARPU_TENSOR_GET_LDY(interface)	        ({ STARPU_TENSOR_CHECK(interface); (((struct starpu_tensor_interface *)(interface))->ldy) ; })
+#define STARPU_TENSOR_GET_LDZ(interface)	        ({ STARPU_TENSOR_CHECK(interface); (((struct starpu_tensor_interface *)(interface))->ldz) ; })
+#define STARPU_TENSOR_GET_LDT(interface)	        ({ STARPU_TENSOR_CHECK(interface); (((struct starpu_tensor_interface *)(interface))->ldt) ; })
+#define STARPU_TENSOR_GET_ELEMSIZE(interface)	({ STARPU_TENSOR_CHECK(interface); (((struct starpu_tensor_interface *)(interface))->elemsize) ; })
+#else
+/**
+   Return a pointer to the tensor designated by \p interface.
+ */
+#define STARPU_TENSOR_GET_PTR(interface)	        (((struct starpu_tensor_interface *)(interface))->ptr)
+/**
+   Return a device handle for the tensor designated by \p interface,
+   to be used on OpenCL. The offset returned by
+   ::STARPU_TENSOR_GET_OFFSET has to be used in
+   addition to this.
+ */
+#define STARPU_TENSOR_GET_DEV_HANDLE(interface)	(((struct starpu_tensor_interface *)(interface))->dev_handle)
+/**
+   Return the offset in the tensor designated by \p interface, to be
+   used with the device handle.
+ */
+#define STARPU_TENSOR_GET_OFFSET(interface)	(((struct starpu_tensor_interface *)(interface))->offset)
+/**
+   Return the number of elements on the x-axis of the tensor
+   designated by \p interface.
+ */
+#define STARPU_TENSOR_GET_NX(interface)	        (((struct starpu_tensor_interface *)(interface))->nx)
+/**
+   Return the number of elements on the y-axis of the tensor
+   designated by \p interface.
+ */
+#define STARPU_TENSOR_GET_NY(interface)	        (((struct starpu_tensor_interface *)(interface))->ny)
+/**
+Return the number of elements on the z-axis of the tensor
+designated by \p interface.
+ */
+#define STARPU_TENSOR_GET_NZ(interface)	        (((struct starpu_tensor_interface *)(interface))->nz)
+/**
+Return the number of elements on the t-axis of the tensor
+designated by \p interface.
+ */
+#define STARPU_TENSOR_GET_NT(interface)	        (((struct starpu_tensor_interface *)(interface))->nt)
+/**
+   Return the number of elements between each row of the tensor
+   designated by \p interface. May be equal to nx when there is no padding.
+ */
+#define STARPU_TENSOR_GET_LDY(interface)	        (((struct starpu_tensor_interface *)(interface))->ldy)
+/**
+   Return the number of elements between each z plane of the tensor
+   designated by \p interface. May be equal to nx*ny when there is no
+   padding.
+ */
+#define STARPU_TENSOR_GET_LDZ(interface)	        (((struct starpu_tensor_interface *)(interface))->ldz)
+/**
+   Return the number of elements between each t cube of the tensor
+   designated by \p interface. May be equal to nx*ny*nz when there is no
+   padding.
+ */
+#define STARPU_TENSOR_GET_LDT(interface)	        (((struct starpu_tensor_interface *)(interface))->ldt)
+/**
+   Return the size of the elements of the tensor designated by
+   \p interface.
+ */
+#define STARPU_TENSOR_GET_ELEMSIZE(interface)	(((struct starpu_tensor_interface *)(interface))->elemsize)
+#endif
+
+/** @} */
+
+/**
    @name Vector Data Interface
    @{
 */
@@ -1547,6 +1812,11 @@ extern struct starpu_data_interface_ops starpu_interface_bcsr_ops;
 /**
    BCSR interface for sparse matrices (blocked compressed sparse
    row representation)
+
+   Note: when a BCSR matrix is partitioned, nzval, colind, and rowptr point into
+   the corresponding father arrays. The rowptr content is thus the same as the
+   father's. Firstentry is used to offset this so it becomes valid for the child
+   arrays.
 */
 struct starpu_bcsr_interface
 {
@@ -1555,7 +1825,7 @@ struct starpu_bcsr_interface
 	uint32_t nnz;                     /**< number of non-zero BLOCKS */
 	uint32_t nrow;                    /**< number of rows (in terms of BLOCKS) */
 
-	uintptr_t nzval;                  /**< non-zero values */
+	uintptr_t nzval;                  /**< non-zero values: nnz blocks of r*c elements */
 	uint32_t *colind;                 /**< array of nnz elements, colind[i] is the block-column index for block i in nzval */
 	uint32_t *rowptr;                 /**< array of nrow+1
 					   * elements, rowptr[i] is

+ 8 - 0
include/starpu_fxt.h

@@ -62,9 +62,12 @@ struct starpu_fxt_options
 	char *out_paje_path;
 	char *distrib_time_path;
 	char *activity_path;
+	char *sched_tasks_path;
 	char *dag_path;
 	char *tasks_path;
 	char *data_path;
+	char *papi_path;
+	char *comms_path;
 	char *anim_path;
 	char *states_path;
 
@@ -136,6 +139,11 @@ void starpu_fxt_stop_profiling(void);
 void starpu_fxt_write_data_trace(char *filename_in);
 
 /**
+   Wrapper to get the value of the environment variable STARPU_FXT_TRACE.
+*/
+int starpu_fxt_is_enabled(void);
+
+/**
    Add an event in the execution trace if FxT is enabled.
 */
 void starpu_fxt_trace_user_event(unsigned long code);

+ 9 - 0
include/starpu_perf_monitoring.h

@@ -63,6 +63,15 @@ struct starpu_perf_counter_sample;
 struct starpu_perf_counter_set;
 
 /**
+  Start collecting performance counter values.
+  */
+void starpu_perf_counter_collection_start(void);
+/**
+  Stop collecting performance counter values.
+  */
+void starpu_perf_counter_collection_stop(void);
+
+/**
   Translate scope name constant string to scope id.
   */
 int starpu_perf_counter_scope_name_to_id(const char *name);

+ 13 - 0
include/starpu_profiling.h

@@ -23,6 +23,12 @@
 #include <errno.h>
 #include <time.h>
 
+#include <starpu_config.h>
+
+#ifdef STARPU_PAPI
+#include <papi.h>
+#endif
+
 #ifdef __cplusplus
 extern "C"
 {
@@ -91,6 +97,13 @@ struct starpu_profiling_task_info
 	uint64_t stall_cycles;
 	/** Energy consumed by the task, in Joules */
 	double energy_consumed;
+
+#ifdef STARPU_PAPI
+	/** PAPI Events **/
+	long long int papi_values[PAPI_MAX_HWCTRS];
+	int papi_event_set;
+#endif
+
 };
 
 /**

+ 14 - 12
include/starpu_task.h

@@ -249,32 +249,29 @@ typedef starpu_mpi_ms_kernel_t (*starpu_mpi_ms_func_t)(void);
 #define STARPU_SPECIFIC_NODE_LOCAL (-1)
 
 /**
-    Value to be set in the starpu_codelet::nodes field to request
-    StarPU to put the data in CPU-accessible memory (and let StarPU
-    choose the NUMA node).
+   Value to be set in the starpu_codelet::nodes field to request
+   StarPU to put the data in CPU-accessible memory (and let StarPU
+   choose the NUMA node).
 */
-
 #define STARPU_SPECIFIC_NODE_CPU (-2)
 
 /**
-    Value to be set in the starpu_codelet::nodes field to request
-    StarPU to put the data in some slow memory.
+   Value to be set in the starpu_codelet::nodes field to request
+   StarPU to put the data in some slow memory.
 */
-
 #define STARPU_SPECIFIC_NODE_SLOW (-3)
+
 /**
    Value to be set in the starpu_codelet::nodes field to request
    StarPU to put the data in some fast memory.
 */
-
 #define STARPU_SPECIFIC_NODE_FAST (-4)
 
 /**
-    Value to be set in the starpu_codelet::nodes field to let StarPU decide
-    whether to put the data in the local memory of the worker running the task,
-    or in CPU-accessible memory (and let StarPU choose the NUMA node).
+   Value to be set in the starpu_codelet::nodes field to let StarPU decide
+   whether to put the data in the local memory of the worker running the task,
+   or in CPU-accessible memory (and let StarPU choose the NUMA node).
 */
-
 #define STARPU_SPECIFIC_NODE_LOCAL_OR_CPU (-5)
 
 struct starpu_task;
@@ -592,6 +589,11 @@ struct starpu_codelet
 
 	struct starpu_perf_counter_sample *perf_counter_sample;
 	struct starpu_perf_counter_sample_cl_values *perf_counter_values;
+
+	/**
+	   Whether _starpu_codelet_check_deprecated_fields was already done or not.
+	 */
+	int checked;
 };
 
 /**

+ 1 - 0
include/starpu_thread.h

@@ -83,6 +83,7 @@ int starpu_pthread_equal(starpu_pthread_t t1, starpu_pthread_t t2);
 starpu_pthread_t starpu_pthread_self(void);
 int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg, starpu_sg_host_t host);
 int starpu_pthread_create(starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg);
+starpu_pthread_t _starpu_simgrid_actor_create(const char *name, xbt_main_func_t code, starpu_sg_host_t host, int argc, char *argv[]);
 int starpu_pthread_join(starpu_pthread_t thread, void **retval);
 int starpu_pthread_exit(void *retval) STARPU_ATTRIBUTE_NORETURN;
 int starpu_pthread_attr_init(starpu_pthread_attr_t *attr);

+ 46 - 26
include/starpu_util.h

@@ -322,20 +322,20 @@ extern "C"
 			STARPU_ABORT(); }}
 #endif
 
-/* Note: do not use starpu_cmpxchg / starpu_xchg / starpu_cmpxchgl /
- * starpu_xchgl / starpu_cmpxchg64 / starpu_xchg64, which only
+/* Note: do not use _starpu_cmpxchg / _starpu_xchg / _starpu_cmpxchgl /
+ * _starpu_xchgl / _starpu_cmpxchg64 / _starpu_xchg64, which are only
  * assembly-hand-written fallbacks used when building with an old gcc.
- * Rather use STARPU_VAL_COMPARE_AND_SWAP available on all platforms with a
- * recent-enough gcc */
+ * Rather use STARPU_VAL_COMPARE_AND_SWAP and STARPU_VAL_EXCHANGE available on
+ * all platforms with a recent-enough gcc */
 
 #if defined(__i386__) || defined(__x86_64__)
-static __starpu_inline unsigned starpu_cmpxchg(unsigned *ptr, unsigned old, unsigned next)
+static __starpu_inline unsigned _starpu_cmpxchg(unsigned *ptr, unsigned old, unsigned next)
 {
 	__asm__ __volatile__("lock cmpxchgl %2,%1": "+a" (old), "+m" (*ptr) : "q" (next) : "memory");
 	return old;
 }
 #define STARPU_HAVE_CMPXCHG
-static __starpu_inline unsigned starpu_xchg(unsigned *ptr, unsigned next)
+static __starpu_inline unsigned _starpu_xchg(unsigned *ptr, unsigned next)
 {
 	/* Note: xchg is always locked already */
 	__asm__ __volatile__("xchgl %1,%0": "+m" (*ptr), "+q" (next) : : "memory");
@@ -343,13 +343,13 @@ static __starpu_inline unsigned starpu_xchg(unsigned *ptr, unsigned next)
 }
 #define STARPU_HAVE_XCHG
 
-static __starpu_inline uint32_t starpu_cmpxchg32(uint32_t *ptr, uint32_t old, uint32_t next)
+static __starpu_inline uint32_t _starpu_cmpxchg32(uint32_t *ptr, uint32_t old, uint32_t next)
 {
 	__asm__ __volatile__("lock cmpxchgl %2,%1": "+a" (old), "+m" (*ptr) : "q" (next) : "memory");
 	return old;
 }
 #define STARPU_HAVE_CMPXCHG32
-static __starpu_inline uint32_t starpu_xchg32(uint32_t *ptr, uint32_t next)
+static __starpu_inline uint32_t _starpu_xchg32(uint32_t *ptr, uint32_t next)
 {
 	/* Note: xchg is always locked already */
 	__asm__ __volatile__("xchgl %1,%0": "+m" (*ptr), "+q" (next) : : "memory");
@@ -358,13 +358,13 @@ static __starpu_inline uint32_t starpu_xchg32(uint32_t *ptr, uint32_t next)
 #define STARPU_HAVE_XCHG32
 
 #if defined(__i386__)
-static __starpu_inline unsigned long starpu_cmpxchgl(unsigned long *ptr, unsigned long old, unsigned long next)
+static __starpu_inline unsigned long _starpu_cmpxchgl(unsigned long *ptr, unsigned long old, unsigned long next)
 {
 	__asm__ __volatile__("lock cmpxchgl %2,%1": "+a" (old), "+m" (*ptr) : "q" (next) : "memory");
 	return old;
 }
 #define STARPU_HAVE_CMPXCHGL
-static __starpu_inline unsigned long starpu_xchgl(unsigned long *ptr, unsigned long next)
+static __starpu_inline unsigned long _starpu_xchgl(unsigned long *ptr, unsigned long next)
 {
 	/* Note: xchg is always locked already */
 	__asm__ __volatile__("xchgl %1,%0": "+m" (*ptr), "+q" (next) : : "memory");
@@ -374,13 +374,13 @@ static __starpu_inline unsigned long starpu_xchgl(unsigned long *ptr, unsigned l
 #endif
 
 #if defined(__x86_64__)
-static __starpu_inline unsigned long starpu_cmpxchgl(unsigned long *ptr, unsigned long old, unsigned long next)
+static __starpu_inline unsigned long _starpu_cmpxchgl(unsigned long *ptr, unsigned long old, unsigned long next)
 {
 	__asm__ __volatile__("lock cmpxchgq %2,%1": "+a" (old), "+m" (*ptr) : "q" (next) : "memory");
 	return old;
 }
 #define STARPU_HAVE_CMPXCHGL
-static __starpu_inline unsigned long starpu_xchgl(unsigned long *ptr, unsigned long next)
+static __starpu_inline unsigned long _starpu_xchgl(unsigned long *ptr, unsigned long next)
 {
 	/* Note: xchg is always locked already */
 	__asm__ __volatile__("xchgq %1,%0": "+m" (*ptr), "+q" (next) : : "memory");
@@ -390,7 +390,7 @@ static __starpu_inline unsigned long starpu_xchgl(unsigned long *ptr, unsigned l
 #endif
 
 #if defined(__i386__)
-static __starpu_inline uint64_t starpu_cmpxchg64(uint64_t *ptr, uint64_t old, uint64_t next)
+static __starpu_inline uint64_t _starpu_cmpxchg64(uint64_t *ptr, uint64_t old, uint64_t next)
 {
 	uint32_t next_hi = next >> 32;
 	uint32_t next_lo = next & 0xfffffffful;
@@ -401,13 +401,13 @@ static __starpu_inline uint64_t starpu_cmpxchg64(uint64_t *ptr, uint64_t old, ui
 #endif
 
 #if defined(__x86_64__)
-static __starpu_inline uint64_t starpu_cmpxchg64(uint64_t *ptr, uint64_t old, uint64_t next)
+static __starpu_inline uint64_t _starpu_cmpxchg64(uint64_t *ptr, uint64_t old, uint64_t next)
 {
 	__asm__ __volatile__("lock cmpxchgq %2,%1": "+a" (old), "+m" (*ptr) : "q" (next) : "memory");
 	return old;
 }
 #define STARPU_HAVE_CMPXCHG64
-static __starpu_inline uint64_t starpu_xchg64(uint64_t *ptr, uint64_t next)
+static __starpu_inline uint64_t _starpu_xchg64(uint64_t *ptr, uint64_t next)
 {
 	/* Note: xchg is always locked already */
 	__asm__ __volatile__("xchgq %1,%0": "+m" (*ptr), "+q" (next) : : "memory");
@@ -426,7 +426,7 @@ static __starpu_inline unsigned starpu_atomic_##name(unsigned *ptr, unsigned val
 	{ \
 		old = *ptr; \
 		next = expr; \
-		if (starpu_cmpxchg(ptr, old, next) == old) \
+		if (_starpu_cmpxchg(ptr, old, next) == old) \
 			break; \
 	}; \
 	return expr; \
@@ -439,7 +439,7 @@ static __starpu_inline unsigned long starpu_atomic_##name##l(unsigned long *ptr,
 	{ \
 		old = *ptr; \
 		next = expr; \
-		if (starpu_cmpxchgl(ptr, old, next) == old) \
+		if (_starpu_cmpxchgl(ptr, old, next) == old) \
 			break; \
 	}; \
 	return expr; \
@@ -452,7 +452,7 @@ static __starpu_inline uint64_t starpu_atomic_##name##64(uint64_t *ptr, uint64_t
 	{ \
 		old = *ptr; \
 		next = expr; \
-		if (starpu_cmpxchg64(ptr, old, next) == old) \
+		if (_starpu_cmpxchg64(ptr, old, next) == old) \
 			break; \
 	}; \
 	return expr; \
@@ -503,13 +503,13 @@ STARPU_ATOMIC_SOMETHING64(or, old | value)
 #define STARPU_BOOL_COMPARE_AND_SWAP64(ptr, old, value) STARPU_BOOL_COMPARE_AND_SWAP(ptr, old, value)
 #else
 #ifdef STARPU_HAVE_CMPXCHG
-#define STARPU_BOOL_COMPARE_AND_SWAP(ptr, old, value) (starpu_cmpxchg((ptr), (old), (value)) == (old))
+#define STARPU_BOOL_COMPARE_AND_SWAP(ptr, old, value) (_starpu_cmpxchg((ptr), (old), (value)) == (old))
 #endif
 #ifdef STARPU_HAVE_CMPXCHG32
-#define STARPU_BOOL_COMPARE_AND_SWAP32(ptr, old, value) (starpu_cmpxchg32((ptr), (old), (value)) == (old))
+#define STARPU_BOOL_COMPARE_AND_SWAP32(ptr, old, value) (_starpu_cmpxchg32((ptr), (old), (value)) == (old))
 #endif
 #ifdef STARPU_HAVE_CMPXCHG64
-#define STARPU_BOOL_COMPARE_AND_SWAP64(ptr, old, value) (starpu_cmpxchg64((ptr), (old), (value)) == (old))
+#define STARPU_BOOL_COMPARE_AND_SWAP64(ptr, old, value) (_starpu_cmpxchg64((ptr), (old), (value)) == (old))
 #endif
 #endif
 
@@ -519,13 +519,33 @@ STARPU_ATOMIC_SOMETHING64(or, old | value)
 #define STARPU_VAL_COMPARE_AND_SWAP64(ptr, old, value) STARPU_VAL_COMPARE_AND_SWAP(ptr, old, value)
 #else
 #ifdef STARPU_HAVE_CMPXCHG
-#define STARPU_VAL_COMPARE_AND_SWAP(ptr, old, value) (starpu_cmpxchg((ptr), (old), (value)))
+#define STARPU_VAL_COMPARE_AND_SWAP(ptr, old, value) (_starpu_cmpxchg((ptr), (old), (value)))
 #endif
 #ifdef STARPU_HAVE_CMPXCHG32
-#define STARPU_VAL_COMPARE_AND_SWAP32(ptr, old, value) (starpu_cmpxchg32((ptr), (old), (value)))
+#define STARPU_VAL_COMPARE_AND_SWAP32(ptr, old, value) (_starpu_cmpxchg32((ptr), (old), (value)))
 #endif
 #ifdef STARPU_HAVE_CMPXCHG64
-#define STARPU_VAL_COMPARE_AND_SWAP64(ptr, old, value) (starpu_cmpxchg64((ptr), (old), (value)))
+#define STARPU_VAL_COMPARE_AND_SWAP64(ptr, old, value) (_starpu_cmpxchg64((ptr), (old), (value)))
+#endif
+#endif
+
+#ifdef STARPU_HAVE_ATOMIC_EXCHANGE_N
+#define STARPU_VAL_EXCHANGE(ptr, value) (__atomic_exchange_n((ptr), (value), __ATOMIC_SEQ_CST))
+#define STARPU_VAL_EXCHANGEL(ptr, value) STARPU_VAL_EXCHANGE((ptr), (value))
+#define STARPU_VAL_EXCHANGE32(ptr, value) STARPU_VAL_EXCHANGE((ptr), (value))
+#define STARPU_VAL_EXCHANGE64(ptr, value) STARPU_VAL_EXCHANGE((ptr), (value))
+#else
+#ifdef STARPU_HAVE_XCHG
+#define STARPU_VAL_EXCHANGE(ptr, value) (_starpu_xchg((ptr), (value)))
+#endif
+#ifdef STARPU_HAVE_XCHGL
+#define STARPU_VAL_EXCHANGEL(ptr, value) (_starpu_xchgl((ptr), (value)))
+#endif
+#ifdef STARPU_HAVE_XCHG32
+#define STARPU_VAL_EXCHANGE32(ptr, value) (_starpu_xchg32((ptr), (value)))
+#endif
+#ifdef STARPU_HAVE_XCHG64
+#define STARPU_VAL_EXCHANGE64(ptr, value) (_starpu_xchg64((ptr), (value)))
 #endif
 #endif
 
@@ -534,8 +554,8 @@ STARPU_ATOMIC_SOMETHING64(or, old | value)
 #define STARPU_TEST_AND_SET(ptr, value) (__sync_lock_test_and_set ((ptr), (value)))
 #define STARPU_RELEASE(ptr) (__sync_lock_release ((ptr)))
 #elif defined(STARPU_HAVE_XCHG)
-#define STARPU_TEST_AND_SET(ptr, value) (starpu_xchg((ptr), (value)))
-#define STARPU_RELEASE(ptr) (starpu_xchg((ptr), 0))
+#define STARPU_TEST_AND_SET(ptr, value) (_starpu_xchg((ptr), (value)))
+#define STARPU_RELEASE(ptr) (_starpu_xchg((ptr), 0))
 #endif
 
 #ifdef STARPU_HAVE_SYNC_SYNCHRONIZE

+ 66 - 14
m4/acinclude.m4

@@ -2,7 +2,7 @@
 #
 # Copyright (C) 2012                                     Inria
 # Copyright (C) 2012,2017                                CNRS
-# Copyright (C) 2014                                     Université de Bordeaux
+# Copyright (C) 2014,2019                                Université de Bordeaux
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -42,19 +42,6 @@ AC_DEFUN([STARPU_CHECK_SYNC_BOOL_COMPARE_AND_SWAP], [
 	      [Define to 1 if the target supports __sync_bool_compare_and_swap])
   fi])
 
-# Check whether the target supports __sync_val_compare_and_swap.
-AC_DEFUN([STARPU_CHECK_SYNC_VAL_COMPARE_AND_SWAP], [
-  AC_CACHE_CHECK([whether the target supports __sync_val_compare_and_swap],
-		 ac_cv_have_sync_val_compare_and_swap, [
-  AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo, bar;],
-			[bar = __sync_val_compare_and_swap(&foo, 0, 1);])],
-			[ac_cv_have_sync_val_compare_and_swap=yes],
-			[ac_cv_have_sync_val_compare_and_swap=no])])
-  if test $ac_cv_have_sync_val_compare_and_swap = yes; then
-    AC_DEFINE(STARPU_HAVE_SYNC_VAL_COMPARE_AND_SWAP, 1,
-	      [Define to 1 if the target supports __sync_val_compare_and_swap])
-  fi])
-
 # Check whether the target supports __sync_fetch_and_add.
 AC_DEFUN([STARPU_CHECK_SYNC_FETCH_AND_ADD], [
   AC_CACHE_CHECK([whether the target supports __sync_fetch_and_add],
@@ -94,6 +81,71 @@ AC_DEFUN([STARPU_CHECK_SYNC_LOCK_TEST_AND_SET], [
 	      [Define to 1 if the target supports __sync_lock_test_and_set])
   fi])
 
+# Check whether the target supports __atomic_compare_exchange_n.
+AC_DEFUN([STARPU_CHECK_ATOMIC_COMPARE_EXCHANGE_N], [
+  AC_CACHE_CHECK([whether the target supports __atomic_compare_exchange_n],
+		 ac_cv_have_atomic_compare_exchange_n, [
+  AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo, bar, baz;],
+			[baz = __atomic_compare_exchange_n(&foo, &bar, 1, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);])],
+			[ac_cv_have_atomic_compare_exchange_n=yes],
+			[ac_cv_have_atomic_compare_exchange_n=no])])
+  if test $ac_cv_have_atomic_compare_exchange_n = yes; then
+    AC_DEFINE(STARPU_HAVE_ATOMIC_COMPARE_EXCHANGE_N, 1,
+	      [Define to 1 if the target supports __atomic_compare_exchange_n])
+  fi])
+
+# Check whether the target supports __atomic_exchange_n.
+AC_DEFUN([STARPU_CHECK_ATOMIC_EXCHANGE_N], [
+  AC_CACHE_CHECK([whether the target supports __atomic_exchange_n],
+		 ac_cv_have_atomic_exchange_n, [
+  AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo, bar;],
+			[bar = __atomic_exchange_n(&foo, 1, __ATOMIC_SEQ_CST);])],
+			[ac_cv_have_atomic_exchange_n=yes],
+			[ac_cv_have_atomic_exchange_n=no])])
+  if test $ac_cv_have_atomic_exchange_n = yes; then
+    AC_DEFINE(STARPU_HAVE_ATOMIC_EXCHANGE_N, 1,
+	      [Define to 1 if the target supports __atomic_exchange_n])
+  fi])
+
+# Check whether the target supports __atomic_fetch_add.
+AC_DEFUN([STARPU_CHECK_ATOMIC_FETCH_ADD], [
+  AC_CACHE_CHECK([whether the target supports __atomic_fetch_add],
+		 ac_cv_have_atomic_fetch_add, [
+  AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo, bar;],
+			[bar = __atomic_fetch_add(&foo, 1, __ATOMIC_SEQ_CST);])],
+			[ac_cv_have_atomic_fetch_add=yes],
+			[ac_cv_have_atomic_fetch_add=no])])
+  if test $ac_cv_have_atomic_fetch_add = yes; then
+    AC_DEFINE(STARPU_HAVE_ATOMIC_FETCH_ADD, 1,
+	      [Define to 1 if the target supports __atomic_fetch_add])
+  fi])
+
+# Check whether the target supports __atomic_fetch_or.
+AC_DEFUN([STARPU_CHECK_ATOMIC_FETCH_OR], [
+  AC_CACHE_CHECK([whether the target supports __atomic_fetch_or],
+		 ac_cv_have_atomic_fetch_or, [
+  AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo, bar;],
+			[bar = __atomic_fetch_or(&foo, 1, __ATOMIC_SEQ_CST);])],
+			[ac_cv_have_atomic_fetch_or=yes],
+			[ac_cv_have_atomic_fetch_or=no])])
+  if test $ac_cv_have_atomic_fetch_or = yes; then
+    AC_DEFINE(STARPU_HAVE_ATOMIC_FETCH_OR, 1,
+	      [Define to 1 if the target supports __atomic_fetch_or])
+  fi])
+
+# Check whether the target supports __atomic_test_and_set.
+AC_DEFUN([STARPU_CHECK_ATOMIC_TEST_AND_SET], [
+  AC_CACHE_CHECK([whether the target supports __atomic_test_and_set],
+		 ac_cv_have_atomic_test_and_set, [
+  AC_LINK_IFELSE([AC_LANG_PROGRAM([int foo, bar;],
+			[bar = __atomic_test_and_set(&foo, __ATOMIC_SEQ_CST);])],
+			[ac_cv_have_atomic_test_and_set=yes],
+			[ac_cv_have_atomic_test_and_set=no])])
+  if test $ac_cv_have_atomic_test_and_set = yes; then
+    AC_DEFINE(STARPU_HAVE_ATOMIC_TEST_AND_SET, 1,
+	      [Define to 1 if the target supports __atomic_test_and_set])
+  fi])
+
 # Check whether the target supports __sync_synchronize.
 AC_DEFUN([STARPU_CHECK_SYNC_SYNCHRONIZE], [
   AC_CACHE_CHECK([whether the target supports __sync_synchronize],

+ 5 - 0
min-dgels/Makefile.in

@@ -1,13 +1,16 @@
 CC = @CC@
 LD = @LD@
+srcdir = @srcdir@
 
 CLAPACK=base
 ADDITIONAL=additional
 
 all:
 	mkdir -p build
+	[ -d "$(CLAPACK)" ] || cp -a $(srcdir)/$(CLAPACK) .
 	cd $(CLAPACK) && $(MAKE) blaslib CC="$(CC)" LD="$(LD)"
 	cd $(CLAPACK) && $(MAKE) f2clib CC="$(CC)" LD="$(LD)"
+	[ -d "$(ADDITIONAL)" ] || cp -a $(srcdir)/$(ADDITIONAL) .
 	cd $(ADDITIONAL) && $(CC) -c -fPIC *.c && ar cr ../build/minlibdgels.a *.o && ranlib ../build/minlibdgels.a
 
 install:
@@ -33,3 +36,5 @@ check:
 showcheck: check
 
 showsuite: check
+
+recheck: check

+ 6 - 4
mpi/examples/matrix_decomposition/mpi_cholesky.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013,2015-2017                      CNRS
- * Copyright (C) 2009-2012,2014,2015,2018                 Université de Bordeaux
+ * Copyright (C) 2010-2013,2015-2017,2020                 CNRS
+ * Copyright (C) 2009-2012,2014,2015,2018,2020            Université de Bordeaux
  * Copyright (C) 2012                                     Inria
  * Copyright (C) 2010                                     Mehdi Juhoor
  *
@@ -63,13 +63,15 @@ int main(int argc, char **argv)
 #ifndef STARPU_SIMGRID
 	matrix_display(bmat, rank);
 
-	dw_cholesky_check_computation(bmat, rank, nodes, &correctness, &flops);
+	if (check)
+		dw_cholesky_check_computation(bmat, rank, nodes, &correctness, &flops, 0.001);
 #endif
 
 	matrix_free(&bmat, rank, nodes, 1);
 
 #ifndef STARPU_SIMGRID
-	assert(correctness);
+	if (check)
+		assert(correctness);
 #endif
 
 	if (rank == 0)

+ 61 - 56
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2015,2017,2018                      CNRS
- * Copyright (C) 2009,2010,2014,2015,2017,2018            Université de Bordeaux
+ * Copyright (C) 2010-2015,2017,2018,2020                 CNRS
+ * Copyright (C) 2009,2010,2014,2015,2017,2018,2020       Université de Bordeaux
  * Copyright (C) 2013                                     Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -20,6 +20,7 @@
 #include <common/blas.h>
 #include <sys/time.h>
 #include <limits.h>
+#include <math.h>
 
 /*
  *	Create the codelets
@@ -78,24 +79,24 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 	double start;
 	double end;
 	starpu_data_handle_t **data_handles;
-	unsigned x,y,i,j,k;
+	unsigned k, m, n;
 
 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
 
 	/* create all the DAG nodes */
 
 	data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
-	for(x=0 ; x<nblocks ; x++) data_handles[x] = malloc(nblocks*sizeof(starpu_data_handle_t));
+	for(m=0 ; m<nblocks ; m++) data_handles[m] = malloc(nblocks*sizeof(starpu_data_handle_t));
 
-	for(x = 0; x < nblocks ; x++)
+	for (m = 0; m < nblocks; m++)
 	{
-		for (y = 0; y < nblocks; y++)
+		for(n = 0; n < nblocks ; n++)
 		{
-			int mpi_rank = my_distrib(x, y, nodes);
+			int mpi_rank = my_distrib(m, n, nodes);
 			if (mpi_rank == rank)
 			{
-				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, x, y);
-				starpu_matrix_data_register(&data_handles[x][y], STARPU_MAIN_RAM, (uintptr_t)matA[x][y],
+				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, n, m);
+				starpu_matrix_data_register(&data_handles[m][n], STARPU_MAIN_RAM, (uintptr_t)matA[m][n],
 						ld, size/nblocks, size/nblocks, sizeof(float));
 			}
 #ifdef STARPU_DEVEL
@@ -104,14 +105,14 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 			else
 			{
 				/* I don't own this index, but will need it for my computations */
-				//fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, x, y);
-				starpu_matrix_data_register(&data_handles[x][y], -1, (uintptr_t)NULL,
+				//fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, n, m);
+				starpu_matrix_data_register(&data_handles[m][n], -1, (uintptr_t)NULL,
 						ld, size/nblocks, size/nblocks, sizeof(float));
 			}
-			if (data_handles[x][y])
+			if (data_handles[m][n])
 			{
-				starpu_data_set_coordinates(data_handles[x][y], 2, x, y);
-				starpu_mpi_data_register(data_handles[x][y], (y*nblocks)+x, mpi_rank);
+				starpu_data_set_coordinates(data_handles[m][n], 2, n, m);
+				starpu_mpi_data_register(data_handles[m][n], (m*nblocks)+n, mpi_rank);
 			}
 		}
 	}
@@ -128,34 +129,34 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 				       STARPU_RW, data_handles[k][k],
 				       0);
 
-		for (j = k+1; j<nblocks; j++)
+		for (m = k+1; m<nblocks; m++)
 		{
 			starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
-					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - j) : (j == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
 					       STARPU_R, data_handles[k][k],
-					       STARPU_RW, data_handles[k][j],
+					       STARPU_RW, data_handles[m][k],
 					       0);
 
 			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
 			if (my_distrib(k, k, nodes) == rank)
 				starpu_data_wont_use(data_handles[k][k]);
 
-			for (i = k+1; i<nblocks; i++)
+			for (n = k+1; n<nblocks; n++)
 			{
-				if (i <= j)
+				if (n <= m)
 				{
 					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
-							       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - j - i) : ((i == k+1) && (j == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
-							       STARPU_R, data_handles[k][i],
-							       STARPU_R, data_handles[k][j],
-							       STARPU_RW | STARPU_COMMUTE, data_handles[i][j],
+							       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+							       STARPU_R, data_handles[n][k],
+							       STARPU_R, data_handles[m][k],
+							       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
 							       0);
 				}
 			}
 
-			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][j]);
-			if (my_distrib(k, j, nodes) == rank)
-				starpu_data_wont_use(data_handles[k][j]);
+			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
+			if (my_distrib(m, k, nodes) == rank)
+				starpu_data_wont_use(data_handles[m][k]);
 		}
 		starpu_iteration_pop();
 	}
@@ -165,14 +166,18 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 	end = starpu_timing_now();
 
-	for(x = 0; x < nblocks ; x++)
+	for (m = 0; m < nblocks; m++)
 	{
-		for (y = 0; y < nblocks; y++)
+		for(n = 0; n < nblocks ; n++)
 		{
-			if (data_handles[x][y])
-				starpu_data_unregister(data_handles[x][y]);
+			/* Get back data on node 0 for the check */
+			if (check)
+				starpu_mpi_get_data_on_node(MPI_COMM_WORLD, data_handles[m][n], 0);
+
+			if (data_handles[m][n])
+				starpu_data_unregister(data_handles[m][n]);
 		}
-		free(data_handles[x]);
+		free(data_handles[m]);
 	}
 	free(data_handles);
 
@@ -183,33 +188,33 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 	}
 }
 
-void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *correctness, double *flops)
+void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *correctness, double *flops, double epsilon)
 {
-	unsigned i,j,x,y;
+	unsigned nn,mm,n,m;
 	float *rmat = malloc(size*size*sizeof(float));
 
-	for(x=0 ; x<nblocks ; x++)
+	for(n=0 ; n<nblocks ; n++)
 	{
-		for(y=0 ; y<nblocks ; y++)
+		for(m=0 ; m<nblocks ; m++)
 		{
-			for (i = 0; i < BLOCKSIZE; i++)
+			for (nn = 0; nn < BLOCKSIZE; nn++)
 			{
-				for (j = 0; j < BLOCKSIZE; j++)
+				for (mm = 0; mm < BLOCKSIZE; mm++)
 				{
-					rmat[j+(y*BLOCKSIZE)+(i+(x*BLOCKSIZE))*size] = matA[x][y][j +i*BLOCKSIZE];
+					rmat[mm+(m*BLOCKSIZE)+(nn+(n*BLOCKSIZE))*size] = matA[m][n][mm +nn*BLOCKSIZE];
 				}
 			}
 		}
 	}
 
 	FPRINTF(stderr, "[%d] compute explicit LLt ...\n", rank);
-	for (j = 0; j < size; j++)
+	for (mm = 0; mm < size; mm++)
 	{
-		for (i = 0; i < size; i++)
+		for (nn = 0; nn < size; nn++)
 		{
-			if (i > j)
+			if (nn > mm)
 			{
-				rmat[j+i*size] = 0.0f; // debug
+				rmat[mm+nn*size] = 0.0f; // debug
 			}
 		}
 	}
@@ -222,13 +227,13 @@ void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *corr
 	FPRINTF(stderr, "[%d] comparing results ...\n", rank);
 	if (display)
 	{
-		for (j = 0; j < size; j++)
+		for (mm = 0; mm < size; mm++)
 		{
-			for (i = 0; i < size; i++)
+			for (nn = 0; nn < size; nn++)
 			{
-				if (i <= j)
+				if (nn <= mm)
 				{
-					printf("%2.2f\t", test_mat[j +i*size]);
+					printf("%2.2f\t", test_mat[mm +nn*size]);
 				}
 				else
 				{
@@ -240,24 +245,24 @@ void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *corr
 	}
 
 	*correctness = 1;
-	for(x = 0; x < nblocks ; x++)
+	for(n = 0; n < nblocks ; n++)
 	{
-		for (y = 0; y < nblocks; y++)
+		for (m = 0; m < nblocks; m++)
 		{
-			int mpi_rank = my_distrib(x, y, nodes);
+			int mpi_rank = my_distrib(m, n, nodes);
 			if (mpi_rank == rank)
 			{
-				for (i = (size/nblocks)*x ; i < (size/nblocks)*x+(size/nblocks); i++)
+				for (nn = (size/nblocks)*n ; nn < (size/nblocks)*n+(size/nblocks); nn++)
 				{
-					for (j = (size/nblocks)*y ; j < (size/nblocks)*y+(size/nblocks); j++)
+					for (mm = (size/nblocks)*m ; mm < (size/nblocks)*m+(size/nblocks); mm++)
 					{
-						if (i <= j)
+						if (nn <= mm)
 						{
-							float orig = (1.0f/(1.0f+i+j)) + ((i == j)?1.0f*size:0.0f);
-							float err = abs(test_mat[j +i*size] - orig);
-							if (err > 0.00001)
+							float orig = (1.0f/(1.0f+nn+mm)) + ((nn == mm)?1.0f*size:0.0f);
+							float err = fabsf(test_mat[mm +nn*size] - orig) / orig;
+							if (err > epsilon)
 							{
-								FPRINTF(stderr, "[%d] Error[%u, %u] --> %2.2f != %2.2f (err %2.2f)\n", rank, i, j, test_mat[j +i*size], orig, err);
+								FPRINTF(stderr, "[%d] Error[%u, %u] --> %2.20f != %2.20f (err %2.20f)\n", rank, nn, mm, test_mat[mm +nn*size], orig, err);
 								*correctness = 0;
 								*flops = 0;
 								break;

+ 2 - 2
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013,2015,2017                      CNRS
+ * Copyright (C) 2010-2013,2015,2017,2020                 CNRS
  * Copyright (C) 2009,2010,2014                           Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -25,6 +25,6 @@
  */
 void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing, double *flops);
 
-void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *correctness, double *flops);
+void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *correctness, double *flops, double epsilon);
 
 #endif /* __MPI_CHOLESKY_CODELETS_H__ */

+ 11 - 2
mpi/examples/matrix_decomposition/mpi_cholesky_distributed.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2013,2015,2017                      CNRS
- * Copyright (C) 2009-2011,2014,2015,2017,2018            Université de Bordeaux
+ * Copyright (C) 2010-2013,2015,2017,2020                 CNRS
+ * Copyright (C) 2009-2011,2014,2015,2017,2018,2020       Université de Bordeaux
  * Copyright (C) 2012                                     Inria
  * Copyright (C) 2010                                     Mehdi Juhoor
  *
@@ -33,6 +33,9 @@ int main(int argc, char **argv)
 	float ***bmat;
 	int rank, nodes, ret;
 	double timing, flops;
+#ifndef STARPU_SIMGRID
+	int correctness=1;
+#endif
 
 	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
@@ -42,6 +45,12 @@ int main(int argc, char **argv)
 
 	parse_args(argc, argv, nodes);
 
+	if (check)
+	{
+		fprintf(stderr,"can't check in distributed mode\n");
+		check = 0;
+	}
+
 	matrix_init(&bmat, rank, nodes, 0);
 
 	dw_cholesky(bmat, size/nblocks, rank, nodes, &timing, &flops);

+ 1 - 1
mpi/examples/matrix_decomposition/mpi_cholesky_kernels.c

@@ -216,7 +216,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, void *_a
 #if (MAGMA_VERSION_MAJOR > 1) || (MAGMA_VERSION_MAJOR == 1 && MAGMA_VERSION_MINOR >= 4)
 			cudaError_t cures = cudaStreamSynchronize(stream);
 #else
-				cudaError_t cures = cudaThreadSynchronize();
+				cudaError_t cures = cudaDeviceSynchronize();
 #endif
 				STARPU_ASSERT(!cures);
 			}

+ 21 - 18
mpi/examples/matrix_decomposition/mpi_decomposition_matrix.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2013,2015-2017                      CNRS
- * Copyright (C) 2009-2012,2014,2015                      Université de Bordeaux
+ * Copyright (C) 2009-2012,2014,2015,2020                 Université de Bordeaux
  * Copyright (C) 2010                                     Mehdi Juhoor
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -19,7 +19,7 @@
 #include "mpi_cholesky.h"
 
 /* Returns the MPI node number where data indexes index is */
-int my_distrib(int x, int y, int nb_nodes)
+int my_distrib(int y, int x, int nb_nodes)
 {
 	(void)nb_nodes;
 	//return (x+y) % nb_nodes;
@@ -62,27 +62,30 @@ void matrix_display(float ***bmat, int rank)
 	}
 }
 
+/* Note: bmat is indexed by bmat[m][n][mm+nn*BLOCKSIZE],
+ * i.e. the content of the tiles is column-major, but the array of tiles is
+ * row-major to keep the m,n notation everywhere */
 void matrix_init(float ****bmat, int rank, int nodes, int alloc_everywhere)
 {
-	unsigned i,j,x,y;
+	unsigned nn,mm,m,n;
 
 	*bmat = malloc(nblocks * sizeof(float **));
-	for(x=0 ; x<nblocks ; x++)
+	for(m=0 ; m<nblocks ; m++)
 	{
-		(*bmat)[x] = malloc(nblocks * sizeof(float *));
-		for(y=0 ; y<nblocks ; y++)
+		(*bmat)[m] = malloc(nblocks * sizeof(float *));
+		for(n=0 ; n<nblocks ; n++)
 		{
-			int mpi_rank = my_distrib(x, y, nodes);
+			int mpi_rank = my_distrib(m, n, nodes);
 			if (alloc_everywhere || (mpi_rank == rank))
 			{
-				starpu_malloc((void **)&(*bmat)[x][y], BLOCKSIZE*BLOCKSIZE*sizeof(float));
-				for (i = 0; i < BLOCKSIZE; i++)
+				starpu_malloc((void **)&(*bmat)[m][n], BLOCKSIZE*BLOCKSIZE*sizeof(float));
+				for (nn = 0; nn < BLOCKSIZE; nn++)
 				{
-					for (j = 0; j < BLOCKSIZE; j++)
+					for (mm = 0; mm < BLOCKSIZE; mm++)
 					{
 #ifndef STARPU_SIMGRID
-						(*bmat)[x][y][j +i*BLOCKSIZE] = (1.0f/(1.0f+(i+(x*BLOCKSIZE)+j+(y*BLOCKSIZE)))) + ((i+(x*BLOCKSIZE) == j+(y*BLOCKSIZE))?1.0f*size:0.0f);
-						//mat[j +i*size] = ((i == j)?1.0f*size:0.0f);
+						(*bmat)[m][n][mm +nn*BLOCKSIZE] = (1.0f/(1.0f+(nn+(m*BLOCKSIZE)+mm+(n*BLOCKSIZE)))) + ((nn+(m*BLOCKSIZE) == mm+(n*BLOCKSIZE))?1.0f*size:0.0f);
+						//mat[mm +nn*size] = ((nn == mm)?1.0f*size:0.0f);
 #endif
 					}
 				}
@@ -93,19 +96,19 @@ void matrix_init(float ****bmat, int rank, int nodes, int alloc_everywhere)
 
 void matrix_free(float ****bmat, int rank, int nodes, int alloc_everywhere)
 {
-	unsigned x, y;
+	unsigned m, n;
 
-	for(x=0 ; x<nblocks ; x++)
+	for(m=0 ; m<nblocks ; m++)
 	{
-		for(y=0 ; y<nblocks ; y++)
+		for(n=0 ; n<nblocks ; n++)
 		{
-			int mpi_rank = my_distrib(x, y, nodes);
+			int mpi_rank = my_distrib(m, n, nodes);
 			if (alloc_everywhere || (mpi_rank == rank))
 			{
-				starpu_free((void *)(*bmat)[x][y]);
+				starpu_free((void *)(*bmat)[m][n]);
 			}
 		}
-		free((*bmat)[x]);
+		free((*bmat)[m]);
 	}
 	free(*bmat);
 }

+ 2 - 2
mpi/examples/matrix_decomposition/mpi_decomposition_matrix.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2013,2015,2017                      CNRS
- * Copyright (C) 2009-2012,2014                           Université de Bordeaux
+ * Copyright (C) 2009-2012,2014,2020                      Université de Bordeaux
  * Copyright (C) 2010                                     Mehdi Juhoor
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -20,7 +20,7 @@
 #define __MPI_CHOLESKY_MATRIX_H__
 
 /* Returns the MPI node number where data indexes index is */
-int my_distrib(int x, int y, int nb_nodes);
+int my_distrib(int y, int x, int nb_nodes);
 
 void matrix_display(float ***bmat, int rank);
 void matrix_init(float ****bmat, int rank, int nodes, int alloc_everywhere);

+ 8 - 2
mpi/examples/matrix_decomposition/mpi_decomposition_params.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2013,2015-2017                      CNRS
- * Copyright (C) 2009,2010,2014-2017                      Université de Bordeaux
+ * Copyright (C) 2009,2010,2014-2017,2020                 Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -35,6 +35,7 @@ unsigned nblocks = 16;
 unsigned nbigblocks = 2;
 #endif
 unsigned noprio = 0;
+unsigned check = 0;
 unsigned display = 0;
 int dblockx = -1;
 int dblocky = -1;
@@ -79,6 +80,11 @@ void parse_args(int argc, char **argv, int nodes)
                         noprio = 1;
                 }
 
+                if (strcmp(argv[i], "-check") == 0)
+                {
+                        check = 1;
+                }
+
                 if (strcmp(argv[i], "-display") == 0)
                 {
                         display = 1;
@@ -86,7 +92,7 @@ void parse_args(int argc, char **argv, int nodes)
 
                 if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
                 {
-			printf("usage : %s [-size size] [-nblocks nblocks] [-no-prio] [-display]\n", argv[0]);
+			printf("usage : %s [-size size] [-nblocks nblocks] [-no-prio] [-display] [-check]\n", argv[0]);
                 }
         }
 

+ 2 - 1
mpi/examples/matrix_decomposition/mpi_decomposition_params.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2013,2015,2017                      CNRS
- * Copyright (C) 2009,2010,2014                           Université de Bordeaux
+ * Copyright (C) 2009,2010,2014,2020                      Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -24,6 +24,7 @@ extern unsigned size;
 extern unsigned nblocks;
 extern unsigned nbigblocks;
 extern unsigned noprio;
+extern unsigned check;
 extern unsigned display;
 extern int dblockx;
 extern int dblocky;

+ 2 - 0
mpi/src/Makefile.am

@@ -75,6 +75,7 @@ noinst_HEADERS =					\
 	mpi/starpu_mpi_driver.h				\
 	mpi/starpu_mpi_mpi_backend.h			\
 	nmad/starpu_mpi_nmad_backend.h			\
+	nmad/starpu_mpi_nmad_unknown_datatype.h		\
 	load_balancer/policy/data_movements_interface.h	\
 	load_balancer/policy/load_data_interface.h	\
 	load_balancer/policy/load_balancer_policy.h
@@ -95,6 +96,7 @@ libstarpumpi_@STARPU_EFFECTIVE_VERSION@_la_SOURCES =	\
 	starpu_mpi_fortran.c				\
 	starpu_mpi_task_insert_fortran.c		\
 	starpu_mpi_init.c				\
+	nmad/starpu_mpi_nmad_unknown_datatype.c		\
 	nmad/starpu_mpi_nmad.c				\
 	nmad/starpu_mpi_nmad_backend.c			\
 	mpi/starpu_mpi_mpi.c				\

+ 5 - 1
mpi/src/mpi/starpu_mpi_mpi.c

@@ -382,7 +382,7 @@ static void _starpu_mpi_isend_data_func(struct _starpu_mpi_req *req)
 	_starpu_mpi_simgrid_wait_req(&req->backend->data_request, &req->status_store, &req->queue, &req->done);
 #endif
 
-	_STARPU_MPI_TRACE_ISEND_SUBMIT_END(req->node_tag.node.rank, req->node_tag.data_tag, starpu_data_get_size(req->data_handle), req->pre_sync_jobid);
+	_STARPU_MPI_TRACE_ISEND_SUBMIT_END(req->node_tag.node.rank, req->node_tag.data_tag, starpu_data_get_size(req->data_handle), req->pre_sync_jobid, req->data_handle);
 
 	/* somebody is perhaps waiting for the MPI request to be posted */
 	STARPU_PTHREAD_MUTEX_LOCK(&req->backend->req_mutex);
@@ -1150,6 +1150,9 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	int i;
 	for (i = 0; i < *(argc_argv->argc); i++)
 		argv_cpy[i] = strdup((*(argc_argv->argv))[i]);
+#ifdef HAVE_SG_ACTOR_DATA
+	_starpu_simgrid_actor_create("main", smpi_simulated_main_, _starpu_simgrid_get_host_by_name("MAIN"), *(argc_argv->argc), argv_cpy);
+#else
 	MSG_process_create_with_arguments("main", smpi_simulated_main_, NULL, _starpu_simgrid_get_host_by_name("MAIN"), *(argc_argv->argc), argv_cpy);
 	/* And set TSD for us */
 	void **tsd;
@@ -1159,6 +1162,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		_STARPU_ERROR("Your version of simgrid does not provide smpi_process_set_user_data, we can not continue without it\n");
 	}
 	smpi_process_set_user_data(tsd);
+#endif
         /* And wait for StarPU to get initialized, to come back to the same
          * situation as native execution where that's always the case. */
 	starpu_wait_initialized();

+ 27 - 94
mpi/src/nmad/starpu_mpi_nmad.c

@@ -39,13 +39,14 @@
 #include <nm_sendrecv_interface.h>
 #include <nm_mpi_nmad.h>
 #include "starpu_mpi_nmad_backend.h"
+#include "starpu_mpi_nmad_unknown_datatype.h"
 
-static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req,nm_sr_event_t event);
+void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req,nm_sr_event_t event);
 #ifdef STARPU_VERBOSE
 static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type);
 #endif
 
-static void _starpu_mpi_handle_pending_request(struct _starpu_mpi_req *req);
+void _starpu_mpi_handle_pending_request(struct _starpu_mpi_req *req);
 static void _starpu_mpi_add_sync_point_in_fxt(void);
 
 /* Condition to wake up waiting for all current MPI requests to finish */
@@ -88,6 +89,8 @@ static void _starpu_mpi_isend_data_func(struct _starpu_mpi_req *req)
 {
 	_STARPU_MPI_LOG_IN();
 
+	STARPU_ASSERT_MSG(req->registered_datatype == 1, "Datatype is not registered, it cannot be sent through this way !");
+
 	_STARPU_MPI_DEBUG(30, "post NM isend request %p type %s tag %ld src %d data %p datasize %ld ptr %p datatype '%s' count %d registered_datatype %d sync %d\n", req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, starpu_data_get_size(req->data_handle), req->ptr, req->datatype_name, (int)req->count, req->registered_datatype, req->sync);
 
 	_starpu_mpi_comm_amounts_inc(req->node_tag.node.comm, req->node_tag.node.rank, req->datatype, req->count);
@@ -111,7 +114,7 @@ static void _starpu_mpi_isend_data_func(struct _starpu_mpi_req *req)
 		STARPU_ASSERT_MSG(req->ret == NM_ESUCCESS, "MPI_Issend returning %d", req->ret);
 	}
 
-	_STARPU_MPI_TRACE_ISEND_SUBMIT_END(req->node_tag.node.rank, req->node_tag.data_tag, starpu_data_get_size(req->data_handle), req->pre_sync_jobid);
+	_STARPU_MPI_TRACE_ISEND_SUBMIT_END(req->node_tag.node.rank, req->node_tag.data_tag, starpu_data_get_size(req->data_handle), req->pre_sync_jobid, req->data_handle);
 
 	_starpu_mpi_handle_pending_request(req);
 
@@ -124,49 +127,15 @@ void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
 
 	if (req->registered_datatype == 1)
 	{
-		req->backend->waited = 1;
 		req->count = 1;
 		req->ptr = starpu_data_handle_to_pointer(req->data_handle, STARPU_MAIN_RAM);
+
+		_starpu_mpi_isend_data_func(req);
 	}
 	else
 	{
-		starpu_ssize_t psize = -1;
-		int ret;
-		req->backend->waited =2;
-
-		// Do not pack the data, just try to find out the size
-		starpu_data_pack(req->data_handle, NULL, &psize);
-
-		if (psize != -1)
-		{
-			// We already know the size of the data, let's send it to overlap with the packing of the data
-			_STARPU_MPI_DEBUG(20, "Sending size %ld (%ld %s) to node %d (first call to pack)\n", psize, sizeof(req->count), "MPI_BYTE", req->node_tag.node.rank);
-			req->count = psize;
-			//ret = nm_sr_isend(nm_mpi_communicator_get_session(p_req->p_comm),nm_mpi_communicator_get_gate(p_comm,req->srcdst), req->mpi_tag,&req->count, sizeof(req->count), &req->backend->size_req);
-			ret = nm_sr_isend(req->backend->session,req->backend->gate, req->node_tag.data_tag,&req->count, sizeof(req->count), &req->backend->size_req);
-
-			//	ret = MPI_Isend(&req->count, sizeof(req->count), MPI_BYTE, req->srcdst, req->mpi_tag, req->comm, &req->backend->size_req);
-			STARPU_ASSERT_MSG(ret == NM_ESUCCESS, "when sending size, nm_sr_isend returning %d", ret);
-		}
-
-		// Pack the data
-		starpu_data_pack(req->data_handle, &req->ptr, &req->count);
-		if (psize == -1)
-		{
-			// We know the size now, let's send it
-			_STARPU_MPI_DEBUG(1, "Sending size %ld (%ld %s) with tag %ld to node %d (second call to pack)\n", req->count, sizeof(req->count), "MPI_BYTE", req->node_tag.data_tag, req->node_tag.node.rank);
-			ret = nm_sr_isend(req->backend->session,req->backend->gate, req->node_tag.data_tag,&req->count, sizeof(req->count), &req->backend->size_req);
-			STARPU_ASSERT_MSG(ret == NM_ESUCCESS, "when sending size, nm_sr_isend returning %d", ret);
-		}
-		else
-		{
-			// We check the size returned with the 2 calls to pack is the same
-			STARPU_ASSERT_MSG(req->count == psize, "Calls to pack_data returned different sizes %ld != %ld", req->count, psize);
-		}
-
-		// We can send the data now
+		_starpu_mpi_isend_unknown_datatype(req);
 	}
-	_starpu_mpi_isend_data_func(req);
 }
 
 /********************************************************/
@@ -179,11 +148,12 @@ static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 {
 	_STARPU_MPI_LOG_IN();
 
+	STARPU_ASSERT_MSG(req->registered_datatype == 1, "Datatype is not registered, it cannot be received through this way !");
+
 	_STARPU_MPI_DEBUG(20, "post NM irecv request %p type %s tag %ld src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
 
 	_STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(req->node_tag.node.rank, req->node_tag.data_tag);
 
-	//req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &req->request);
 	struct nm_data_s data;
 	nm_mpi_nmad_data_get(&data, (void*)req->ptr, req->datatype, req->count);
 	nm_sr_recv_init(req->backend->session, &(req->backend->data_request));
@@ -197,23 +167,6 @@ static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 	_STARPU_MPI_LOG_OUT();
 }
 
-struct _starpu_mpi_irecv_size_callback
-{
-	starpu_data_handle_t handle;
-	struct _starpu_mpi_req *req;
-};
-
-static void _starpu_mpi_irecv_size_callback(void *arg)
-{
-	struct _starpu_mpi_irecv_size_callback *callback = (struct _starpu_mpi_irecv_size_callback *)arg;
-
-	starpu_data_unregister(callback->handle);
-	callback->req->ptr = malloc(callback->req->count);
-	STARPU_ASSERT_MSG(callback->req->ptr, "cannot allocate message of size %ld", callback->req->count);
-	_starpu_mpi_irecv_data_func(callback->req);
-	free(callback);
-}
-
 void _starpu_mpi_irecv_size_func(struct _starpu_mpi_req *req)
 {
 	_STARPU_MPI_LOG_IN();
@@ -227,11 +180,7 @@ void _starpu_mpi_irecv_size_func(struct _starpu_mpi_req *req)
 	}
 	else
 	{
-		struct _starpu_mpi_irecv_size_callback *callback = malloc(sizeof(struct _starpu_mpi_irecv_size_callback));
-		callback->req = req;
-		starpu_variable_data_register(&callback->handle, 0, (uintptr_t)&(callback->req->count), sizeof(callback->req->count));
-		_STARPU_MPI_DEBUG(4, "Receiving size with tag %ld from node %d\n", req->node_tag.data_tag, req->node_tag.node.rank);
-		_starpu_mpi_irecv_common(callback->handle, req->node_tag.node.rank, req->node_tag.data_tag, req->node_tag.node.comm, 1, 0, _starpu_mpi_irecv_size_callback, callback,1,0,0);
+		_starpu_mpi_irecv_unknown_datatype(req);
 	}
 
 }
@@ -347,7 +296,7 @@ static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type
 }
 #endif
 
-static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req,nm_sr_event_t event)
+void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req,nm_sr_event_t event)
 {
 	_STARPU_MPI_LOG_IN();
 
@@ -356,22 +305,10 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req,n
 
 	if (req->request_type == RECV_REQ || req->request_type == SEND_REQ)
 	{
+		nm_mpi_nmad_data_release(req->datatype);
+
 		if (req->registered_datatype == 0)
 		{
-			if(req->backend->waited == 1)
-			        nm_mpi_nmad_data_release(req->datatype);
-			if (req->request_type == SEND_REQ)
-			{
-				req->backend->waited--;
-				// We need to make sure the communication for sending the size
-				// has completed, as MPI can re-order messages, let's count
-				// recerived message.
-				// FIXME concurent access.
-				STARPU_ASSERT_MSG(event == NM_SR_EVENT_FINALIZED, "Callback with event %d", event);
-				if(req->backend->waited>0)
-					return;
-
-			}
 			if (req->request_type == RECV_REQ)
 				// req->ptr is freed by starpu_data_unpack
 				starpu_data_unpack(req->data_handle, req->ptr, req->count);
@@ -380,7 +317,6 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req,n
 		}
 		else
 		{
-		        nm_mpi_nmad_data_release(req->datatype);
 			_starpu_mpi_datatype_free(req->data_handle, &req->datatype);
 		}
 	}
@@ -425,17 +361,9 @@ void _starpu_mpi_handle_request_termination_callback(nm_sr_event_t event, const
 	_starpu_mpi_handle_request_termination(ref,event);
 }
 
-static void _starpu_mpi_handle_pending_request(struct _starpu_mpi_req *req)
+void _starpu_mpi_handle_pending_request(struct _starpu_mpi_req *req)
 {
-	if(req->request_type == SEND_REQ && req->backend->waited>1)
-	{
-		nm_sr_request_set_ref(&(req->backend->size_req), req);
-		nm_sr_request_monitor(req->backend->session, &(req->backend->size_req), NM_SR_EVENT_FINALIZED,_starpu_mpi_handle_request_termination_callback);
-	}
-	/* the if must be before, because the first callback can directly free
-	* a detached request (the second callback free if req->backend->waited>1). */
 	nm_sr_request_set_ref(&(req->backend->data_request), req);
-
 	nm_sr_request_monitor(req->backend->session, &(req->backend->data_request), NM_SR_EVENT_FINALIZED,_starpu_mpi_handle_request_termination_callback);
 }
 
@@ -503,6 +431,9 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	int i;
 	for (i = 0; i < *(argc_argv->argc); i++)
 		argv_cpy[i] = strdup((*(argc_argv->argv))[i]);
+#ifdef HAVE_SG_ACTOR_DATA
+	_starpu_simgrid_actor_create("main", smpi_simulated_main_, _starpu_simgrid_get_host_by_name("MAIN"), *(argc_argv->argc), argv_cpy);
+#else
 	MSG_process_create_with_arguments("main", smpi_simulated_main_, NULL, _starpu_simgrid_get_host_by_name("MAIN"), *(argc_argv->argc), argv_cpy);
 	/* And set TSD for us */
 	void **tsd;
@@ -513,6 +444,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	}
 	smpi_process_set_user_data(tsd);
 #endif
+#endif
 
 	_starpu_mpi_comm_amounts_init(argc_argv->comm);
 	_starpu_mpi_cache_init(argc_argv->comm);
@@ -520,10 +452,13 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	_starpu_mpi_datatype_init();
 
 #ifdef STARPU_USE_FXT
-	_starpu_fxt_wait_initialisation();
-	/* We need to record our ID in the trace before the main thread makes any MPI call */
-	_STARPU_MPI_TRACE_START(argc_argv->rank, argc_argv->world_size);
-	starpu_profiling_set_id(argc_argv->rank);
+	if (_starpu_fxt_wait_initialisation())
+	{
+		/* We need to record our ID in the trace before the main thread makes any MPI call */
+		_STARPU_MPI_TRACE_START(argc_argv->rank, argc_argv->world_size);
+		starpu_profiling_set_id(argc_argv->rank);
+		_starpu_mpi_add_sync_point_in_fxt();
+	}
 #endif //STARPU_USE_FXT
 
 	/* notify the main thread that the progression thread is ready */
@@ -532,8 +467,6 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	STARPU_PTHREAD_COND_SIGNAL(&progress_cond);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 
-	_starpu_mpi_add_sync_point_in_fxt();
-
 	while (1)
 	{
 		struct callback_lfstack_cell_s* c = callback_lfstack_pop(&callback_stack);

+ 1 - 0
mpi/src/nmad/starpu_mpi_nmad_backend.c

@@ -46,6 +46,7 @@ void _starpu_mpi_nmad_backend_request_init(struct _starpu_mpi_req *req)
 
 void _starpu_mpi_nmad_backend_request_fill(struct _starpu_mpi_req *req, MPI_Comm comm, int is_internal_req)
 {
+	/* this function gives session and gate: */
 	nm_mpi_nmad_dest(&req->backend->session, &req->backend->gate, comm, req->node_tag.node.rank);
 }
 

+ 5 - 1
mpi/src/nmad/starpu_mpi_nmad_backend.h

@@ -37,9 +37,13 @@ struct _starpu_mpi_req_backend
 	nm_gate_t gate;
 	nm_session_t session;
 	nm_sr_request_t data_request;
-	int waited;
 	piom_cond_t req_cond;
 	nm_sr_request_t size_req;
+
+	// When datatype is unknown:
+	struct nm_data_s unknown_datatype_body;
+	struct nm_data_s unknown_datatype_data;
+	struct nm_data_s unknown_datatype_size;
 };
 
 #endif // STARPU_USE_MPI_NMAD

+ 169 - 0
mpi/src/nmad/starpu_mpi_nmad_unknown_datatype.c

@@ -0,0 +1,169 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+*
+* Copyright (C) 2019                                     Inria
+*
+* StarPU is free software; you can redistribute it and/or modify
+* it under the terms of the GNU Lesser General Public License as published by
+* the Free Software Foundation; either version 2.1 of the License, or (at
+* your option) any later version.
+*
+* StarPU is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+*
+* See the GNU Lesser General Public License in COPYING.LGPL for more details.
+*/
+
+
+#include <common/config.h>
+
+#ifdef STARPU_USE_MPI_NMAD
+#include <starpu_mpi_private.h>
+#include <starpu_mpi_stats.h>
+#include <starpu_mpi_datatype.h>
+#include <nm_sendrecv_interface.h>
+#include <nm_mpi_nmad.h>
+#include "starpu_mpi_nmad_backend.h"
+#include "starpu_mpi_nmad_unknown_datatype.h"
+
+#if defined(STARPU_VERBOSE) || defined(STARPU_MPI_VERBOSE)
+extern char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type);
+#endif
+
+extern void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req,nm_sr_event_t event);
+extern void _starpu_mpi_handle_pending_request(struct _starpu_mpi_req *req);
+
+struct starpu_nm_datatype_unknown /* content walked by starpu_nm_datatype_unknown_traversal(): a size field, then a payload */
+{
+	starpu_ssize_t* count; /* pointer to the size field, traversed first as sizeof(starpu_ssize_t) bytes */
+	const struct nm_data_s* body; /* description of the payload, traversed after the size field */
+};
+
+static void starpu_nm_datatype_unknown_traversal(const void* _content, nm_data_apply_t apply, void* _context); /* forward declaration, needed by the ops table just below */
+const struct nm_data_ops_s starpu_nm_datatype_unknown_ops =
+{
+	.p_traversal = &starpu_nm_datatype_unknown_traversal
+};
+
+NM_DATA_TYPE(datatype_unknown, struct starpu_nm_datatype_unknown, &starpu_nm_datatype_unknown_ops); /* presumably generates nm_data_datatype_unknown_set() used by starpu_nm_datatype_unknown_build() -- confirm against the nmad headers */
+
+/* Traversal callback for struct starpu_nm_datatype_unknown: the size field is
+ * handed to `apply` first, then the payload description is walked, so the
+ * size always comes before the payload. */
+static void starpu_nm_datatype_unknown_traversal(const void* _content, nm_data_apply_t apply, void* _context)
+{
+	const struct starpu_nm_datatype_unknown* const unknown = _content;
+	starpu_ssize_t* const size_field = unknown->count;
+
+	/* size header */
+	apply(size_field, sizeof(*size_field), _context);
+	/* payload */
+	nm_data_traversal_apply(unknown->body, apply, _context);
+}
+
+/* Fill `datatype_unknown_data` with a size-then-payload descriptor.
+ * The descriptor holds the `count` and `body` pointers themselves, not copies
+ * of what they point to: both must stay valid for as long as it is used. */
+void starpu_nm_datatype_unknown_build(struct nm_data_s* datatype_unknown_data, starpu_ssize_t* count, const struct nm_data_s* body)
+{
+	const struct starpu_nm_datatype_unknown descriptor =
+	{
+		.count = count,
+		.body = body
+	};
+
+	nm_data_datatype_unknown_set(datatype_unknown_data, descriptor);
+}
+
+/**********************************************
+* Send
+**********************************************/
+
+/* Post the send of a handle whose datatype is not registered.
+ *
+ * The handle is packed by starpu_data_pack(), which fills req->ptr and
+ * req->count, and is then sent as a single nmad message made of the size
+ * followed by the packed bytes. The first sizeof(starpu_ssize_t) bytes are
+ * declared as the message header. req->sync selects nm_sr_send_issend()
+ * instead of nm_sr_send_isend().
+ */
+void _starpu_mpi_isend_unknown_datatype(struct _starpu_mpi_req *req)
+{
+	_STARPU_MPI_LOG_IN();
+
+	STARPU_ASSERT_MSG(req->registered_datatype != 1, "Datatype is registered, no need to send it through this way !");
+
+	_STARPU_MPI_DEBUG(30, "post NM isend (unknown datatype) request %p type %s tag %ld src %d data %p datasize %ld ptr %p datatype '%s' count %d registered_datatype %d sync %d\n", req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, starpu_data_get_size(req->data_handle), req->ptr, req->datatype_name, (int)req->count, req->registered_datatype, req->sync);
+
+	_STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(req->node_tag.node.rank, req->node_tag.data_tag, 0);
+
+	starpu_data_pack(req->data_handle, &req->ptr, &req->count);
+
+	/* req->count is only meaningful once the data has been packed, so the
+	 * communication statistics are updated here rather than before the pack */
+	_starpu_mpi_comm_amounts_inc(req->node_tag.node.comm, req->node_tag.node.rank, req->datatype, req->count);
+
+	nm_mpi_nmad_data_get(&(req->backend->unknown_datatype_body), (void*)req->ptr, req->datatype, req->count);
+
+	/* the descriptor keeps these two pointers: they live in req / req->backend
+	 * so that they remain valid after this function returns */
+	starpu_nm_datatype_unknown_build(&(req->backend->unknown_datatype_data), &(req->count), &(req->backend->unknown_datatype_body));
+
+	nm_sr_send_init(req->backend->session, &(req->backend->data_request));
+	nm_sr_send_pack_data(req->backend->session, &(req->backend->data_request), &(req->backend->unknown_datatype_data));
+	nm_sr_send_set_priority(req->backend->session, &(req->backend->data_request), req->prio);
+	/* the size field is the header of the message */
+	nm_sr_send_header(req->backend->session, &(req->backend->data_request), sizeof(starpu_ssize_t));
+
+	if (req->sync == 0)
+	{
+		req->ret = nm_sr_send_isend(req->backend->session, &(req->backend->data_request), req->backend->gate, req->node_tag.data_tag);
+		STARPU_ASSERT_MSG(req->ret == NM_ESUCCESS, "nm_sr_send_isend returning %d", req->ret);
+	}
+	else
+	{
+		req->ret = nm_sr_send_issend(req->backend->session, &(req->backend->data_request), req->backend->gate, req->node_tag.data_tag);
+		STARPU_ASSERT_MSG(req->ret == NM_ESUCCESS, "nm_sr_send_issend returning %d", req->ret);
+	}
+
+	_STARPU_MPI_TRACE_ISEND_SUBMIT_END(req->node_tag.node.rank, req->node_tag.data_tag, starpu_data_get_size(req->data_handle), req->pre_sync_jobid, req->data_handle);
+
+	_starpu_mpi_handle_pending_request(req);
+
+	_STARPU_MPI_LOG_OUT();
+}
+
+
+/**********************************************
+ * Receive
+ **********************************************/
+
+/* Monitor callback of a receive posted by _starpu_mpi_irecv_unknown_datatype().
+ * `ref` is the struct _starpu_mpi_req attached to the nmad request.
+ *
+ * - NM_SR_EVENT_RECV_DATA: the size is peeked into req->count, a buffer of
+ *   that size is allocated in req->ptr, and the whole message (size, then
+ *   payload) is unpacked into req->count / req->ptr.
+ * - NM_SR_EVENT_FINALIZED: the request is handed over to
+ *   _starpu_mpi_handle_request_termination().
+ */
+static void _starpu_mpi_unknown_datatype_recv_callback(nm_sr_event_t event, const nm_sr_event_info_t* p_info, void* ref)
+{
+	STARPU_ASSERT_MSG(!((event & NM_SR_EVENT_FINALIZED) && (event & NM_SR_EVENT_RECV_DATA)), "Both events can't be triggered at the same time !");
+
+	struct _starpu_mpi_req* req = (struct _starpu_mpi_req*) ref;
+
+	if (event & NM_SR_EVENT_RECV_DATA)
+	{
+		/* The sender emits the size as a starpu_ssize_t (and declares a header
+		 * of sizeof(starpu_ssize_t) bytes), so exactly that many bytes have to
+		 * be peeked: reading only sizeof(int) bytes would leave part of
+		 * req->count unwritten. */
+		nm_data_contiguous_build(&(req->backend->unknown_datatype_size), &(req->count), sizeof(starpu_ssize_t));
+
+		int ret = nm_sr_recv_peek(req->backend->session, &(req->backend->data_request), &(req->backend->unknown_datatype_size));
+		STARPU_ASSERT_MSG(ret == NM_ESUCCESS, "nm_sr_recv_peek returned %d", ret);
+
+		req->ptr = malloc(req->count);
+		STARPU_ASSERT_MSG(req->ptr, "cannot allocate message of size %ld", req->count);
+
+		nm_mpi_nmad_data_get(&(req->backend->unknown_datatype_body), (void*) req->ptr, req->datatype, req->count);
+
+		/* the descriptor keeps these two pointers: they live in req / req->backend
+		 * so that they remain valid after this callback returns */
+		starpu_nm_datatype_unknown_build(&(req->backend->unknown_datatype_data), &(req->count), &(req->backend->unknown_datatype_body));
+		nm_sr_recv_unpack_data(req->backend->session, &(req->backend->data_request), &(req->backend->unknown_datatype_data));
+	}
+	else if (event & NM_SR_EVENT_FINALIZED)
+	{
+		_starpu_mpi_handle_request_termination(req, event);
+	}
+}
+
+/* Post the receive of a handle whose datatype is not registered.
+ * No buffer is provided here: the request is monitored for both
+ * NM_SR_EVENT_RECV_DATA and NM_SR_EVENT_FINALIZED, and
+ * _starpu_mpi_unknown_datatype_recv_callback() handles these events. */
+void _starpu_mpi_irecv_unknown_datatype(struct _starpu_mpi_req *req)
+{
+	_STARPU_MPI_LOG_IN();
+
+	STARPU_ASSERT_MSG(req->registered_datatype != 1, "Datatype is registered, no need to receive it through this way !");
+
+	_STARPU_MPI_DEBUG(20, "post NM irecv (datatype unknown) request %p type %s tag %ld src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
+
+	_STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(req->node_tag.node.rank, req->node_tag.data_tag);
+
+	nm_session_t session = req->backend->session;
+	nm_sr_request_t *request = &(req->backend->data_request);
+
+	nm_sr_recv_init(session, request);
+	/* the callback gets `req` back through the request reference */
+	nm_sr_request_set_ref(request, req);
+	nm_sr_request_monitor(session, request, NM_SR_EVENT_FINALIZED | NM_SR_EVENT_RECV_DATA,
+				&_starpu_mpi_unknown_datatype_recv_callback);
+	nm_sr_recv_irecv(session, request, req->backend->gate, req->node_tag.data_tag, NM_TAG_MASK_FULL);
+
+	_STARPU_MPI_TRACE_IRECV_SUBMIT_END(req->node_tag.node.rank, req->node_tag.data_tag);
+
+	_STARPU_MPI_LOG_OUT();
+}
+
+#endif //  STARPU_USE_MPI_NMAD

+ 43 - 0
mpi/src/nmad/starpu_mpi_nmad_unknown_datatype.h

@@ -0,0 +1,43 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2019                                     Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_NMAD_UNKNOWN_DATATYPE_H__
+#define __STARPU_MPI_NMAD_UNKNOWN_DATATYPE_H__
+
+#include <common/config.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#ifdef STARPU_USE_MPI_NMAD
+
+#include <nm_sendrecv_interface.h>
+#include <nm_mpi_nmad.h>
+
+
+void _starpu_mpi_isend_unknown_datatype(struct _starpu_mpi_req *req); /* send a handle whose datatype is not registered: packed, then sent as size + payload */
+void _starpu_mpi_irecv_unknown_datatype(struct _starpu_mpi_req *req); /* post the receive of a handle whose datatype is not registered */
+
+
+#endif // STARPU_USE_MPI_NMAD
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __STARPU_MPI_NMAD_UNKNOWN_DATATYPE_H__

+ 1 - 0
mpi/src/starpu_mpi.c

@@ -279,6 +279,7 @@ void starpu_mpi_data_register_comm(starpu_data_handle_t data_handle, starpu_mpi_
 	{
 		_mpi_backend._starpu_mpi_backend_data_register(data_handle, data_tag);
 		mpi_data->node_tag.data_tag = data_tag;
+		_STARPU_MPI_TRACE_DATA_SET_TAG(data_handle, data_tag);
 	}
 	if (rank != -1)
 	{

+ 46 - 1
mpi/src/starpu_mpi_datatype.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2010-2017,2019                           CNRS
  * Copyright (C) 2011,2012,2015                           Inria
- * Copyright (C) 2009-2011,2014,2015,2018                 Université de Bordeaux
+ * Copyright (C) 2009-2011,2014,2015,2018,2020            Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -91,6 +91,44 @@ static void handle_to_datatype_block(starpu_data_handle_t data_handle, MPI_Datat
 }
 
 /*
+ * 	Tensor
+ */
+
+/* Build and commit in `*datatype` an MPI datatype describing the tensor
+ * registered in `data_handle`: nt blocks of nz layers of ny rows of nx
+ * elements, using the local leading dimensions ldy, ldz and ldt.
+ * All lengths and strides are expressed in bytes (base type MPI_BYTE).
+ */
+static void handle_to_datatype_tensor(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+{
+	int ret;
+
+	unsigned nx = starpu_tensor_get_nx(data_handle);
+	unsigned ny = starpu_tensor_get_ny(data_handle);
+	unsigned nz = starpu_tensor_get_nz(data_handle);
+	unsigned nt = starpu_tensor_get_nt(data_handle);
+	unsigned ldy = starpu_tensor_get_local_ldy(data_handle);
+	unsigned ldz = starpu_tensor_get_local_ldz(data_handle);
+	unsigned ldt = starpu_tensor_get_local_ldt(data_handle);
+	/* the handle is a tensor: the tensor getter must be used, not the block one */
+	size_t elemsize = starpu_tensor_get_elemsize(data_handle);
+
+	/* one 2D layer: ny rows of nx elements, consecutive rows ldy elements apart */
+	MPI_Datatype datatype_2dlayer;
+	ret = MPI_Type_vector(ny, nx*elemsize, ldy*elemsize, MPI_BYTE, &datatype_2dlayer);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_vector failed");
+
+	ret = MPI_Type_commit(&datatype_2dlayer);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_commit failed");
+
+	/* one 3D block: nz 2D layers, consecutive layers ldz elements apart */
+	MPI_Datatype datatype_3dlayer;
+	ret = MPI_Type_create_hvector(nz, 1, ldz*elemsize, datatype_2dlayer, &datatype_3dlayer);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_hvector failed");
+
+	ret = MPI_Type_commit(&datatype_3dlayer);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_commit failed");
+
+	/* the whole tensor: nt 3D blocks, consecutive blocks ldt elements apart */
+	ret = MPI_Type_create_hvector(nt, 1, ldt*elemsize, datatype_3dlayer, datatype);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_hvector failed");
+
+	ret = MPI_Type_commit(datatype);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_commit failed");
+}
+
+/*
  * 	Vector
  */
 
@@ -147,8 +185,12 @@ static void handle_to_datatype_void(starpu_data_handle_t data_handle, MPI_Dataty
 
 static starpu_mpi_datatype_allocate_func_t handle_to_datatype_funcs[STARPU_MAX_INTERFACE_ID] =
 {
+//#define DYNAMIC_MATRICES
+#ifndef DYNAMIC_MATRICES
 	[STARPU_MATRIX_INTERFACE_ID]	= handle_to_datatype_matrix,
+#endif
 	[STARPU_BLOCK_INTERFACE_ID]	= handle_to_datatype_block,
+	[STARPU_TENSOR_INTERFACE_ID]	= handle_to_datatype_tensor,
 	[STARPU_VECTOR_INTERFACE_ID]	= handle_to_datatype_vector,
 	[STARPU_CSR_INTERFACE_ID]	= NULL, /* Sent through pack/unpack operations */
 	[STARPU_BCSR_INTERFACE_ID]	= NULL, /* Sent through pack/unpack operations */
@@ -243,8 +285,11 @@ static void _starpu_mpi_handle_free_complex_datatype(MPI_Datatype *datatype)
 
 static starpu_mpi_datatype_free_func_t handle_free_datatype_funcs[STARPU_MAX_INTERFACE_ID] =
 {
+#ifndef DYNAMIC_MATRICES
 	[STARPU_MATRIX_INTERFACE_ID]	= _starpu_mpi_handle_free_simple_datatype,
+#endif
 	[STARPU_BLOCK_INTERFACE_ID]	= _starpu_mpi_handle_free_complex_datatype,
+	[STARPU_TENSOR_INTERFACE_ID]	= _starpu_mpi_handle_free_complex_datatype,
 	[STARPU_VECTOR_INTERFACE_ID]	= _starpu_mpi_handle_free_simple_datatype,
 	[STARPU_CSR_INTERFACE_ID]	= NULL,  /* Sent through pack/unpack operations */
 	[STARPU_BCSR_INTERFACE_ID]	= NULL,  /* Sent through pack/unpack operations */

+ 31 - 31
mpi/src/starpu_mpi_fxt.h

@@ -58,58 +58,61 @@ extern "C"
 #define _STARPU_MPI_FUT_POLLING_END			0x5215
 #define _STARPU_MPI_FUT_DRIVER_RUN_BEGIN		0x5216
 #define _STARPU_MPI_FUT_DRIVER_RUN_END			0x5217
+#define _STARPU_MPI_FUT_DATA_SET_TAG			0x5218
 
 #ifdef STARPU_USE_FXT
 
 #define _STARPU_MPI_TRACE_START(rank, worldsize)	\
-	FUT_DO_PROBE3(_STARPU_MPI_FUT_START, (rank), (worldsize), _starpu_gettid());
+	FUT_DO_ALWAYS_PROBE3(_STARPU_MPI_FUT_START, (rank), (worldsize), _starpu_gettid());
 #define _STARPU_MPI_TRACE_STOP(rank, worldsize)	\
-	FUT_DO_PROBE3(_STARPU_MPI_FUT_STOP, (rank), (worldsize), _starpu_gettid());
+	FUT_DO_ALWAYS_PROBE3(_STARPU_MPI_FUT_STOP, (rank), (worldsize), _starpu_gettid());
 #define _STARPU_MPI_TRACE_BARRIER(rank, worldsize, key)	do {\
 	if (_starpu_fxt_started) \
 	FUT_DO_ALWAYS_PROBE4(_STARPU_MPI_FUT_BARRIER, (rank), (worldsize), (key), _starpu_gettid()); \
 } while (0)
 #define _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(dest, data_tag, size)	\
-	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN, (dest), (data_tag), (size), _starpu_gettid());
-#define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(dest, data_tag, size, jobid)	\
-	FUT_DO_PROBE5(_STARPU_MPI_FUT_ISEND_SUBMIT_END, (dest), (data_tag), (size), (jobid), _starpu_gettid());
+	FUT_FULL_PROBE4(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_ISEND_SUBMIT_BEGIN, (dest), (data_tag), (size), _starpu_gettid());
+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(dest, data_tag, size, jobid, handle)	\
+	FUT_FULL_PROBE6(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_ISEND_SUBMIT_END, (dest), (data_tag), (size), (jobid), _starpu_gettid(), (handle));
 #define _STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(src, data_tag)	\
-	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN, (src), (data_tag), _starpu_gettid());
+	FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_IRECV_SUBMIT_BEGIN, (src), (data_tag), _starpu_gettid());
 #define _STARPU_MPI_TRACE_IRECV_SUBMIT_END(src, data_tag)	\
-	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_SUBMIT_END, (src), (data_tag), _starpu_gettid());
+	FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_IRECV_SUBMIT_END, (src), (data_tag), _starpu_gettid());
 #define _STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(dest, data_tag, size)	\
-	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_COMPLETE_BEGIN, (dest), (data_tag), (size), _starpu_gettid());
+	FUT_FULL_PROBE4(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_ISEND_COMPLETE_BEGIN, (dest), (data_tag), (size), _starpu_gettid());
 #define _STARPU_MPI_TRACE_COMPLETE_BEGIN(type, rank, data_tag)		\
 	if (type == RECV_REQ) { _STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN((rank), (data_tag)); } else if (type == SEND_REQ) { _STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN((rank), (data_tag), 0); }
 #define _STARPU_MPI_TRACE_ISEND_COMPLETE_END(dest, data_tag, size)	\
-	FUT_DO_PROBE4(_STARPU_MPI_FUT_ISEND_COMPLETE_END, (dest), (data_tag), (size), _starpu_gettid());
+	FUT_FULL_PROBE4(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_ISEND_COMPLETE_END, (dest), (data_tag), (size), _starpu_gettid());
 #define _STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(src, data_tag)	\
-	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_COMPLETE_BEGIN, (src), (data_tag), _starpu_gettid());
+	FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_IRECV_COMPLETE_BEGIN, (src), (data_tag), _starpu_gettid());
 #define _STARPU_MPI_TRACE_IRECV_COMPLETE_END(src, data_tag)	\
-	FUT_DO_PROBE3(_STARPU_MPI_FUT_IRECV_COMPLETE_END, (src), (data_tag), _starpu_gettid());
+	FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_IRECV_COMPLETE_END, (src), (data_tag), _starpu_gettid());
 #define _STARPU_MPI_TRACE_COMPLETE_END(type, rank, data_tag)		\
 	if (type == RECV_REQ) { _STARPU_MPI_TRACE_IRECV_COMPLETE_END((rank), (data_tag)); } else if (type == SEND_REQ) { _STARPU_MPI_TRACE_ISEND_COMPLETE_END((rank), (data_tag), 0); }
 #define _STARPU_MPI_TRACE_TERMINATED(req, rank, data_tag)		\
-	if ((req)->request_type == RECV_REQ) FUT_DO_PROBE4(_STARPU_MPI_FUT_IRECV_TERMINATED, (rank), (data_tag), (req)->post_sync_jobid, _starpu_gettid()); else \
-	if ((req)->request_type == SEND_REQ) FUT_DO_PROBE3(_STARPU_MPI_FUT_ISEND_TERMINATED, (rank), (data_tag), _starpu_gettid());
+	if ((req)->request_type == RECV_REQ) FUT_FULL_PROBE5(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_IRECV_TERMINATED, (rank), (data_tag), (req)->post_sync_jobid, _starpu_gettid(), (req)->data_handle); else \
+	if ((req)->request_type == SEND_REQ) FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_ISEND_TERMINATED, (rank), (data_tag), _starpu_gettid());
 #define _STARPU_MPI_TRACE_SLEEP_BEGIN()	\
-	FUT_DO_PROBE1(_STARPU_MPI_FUT_SLEEP_BEGIN, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_SLEEP_BEGIN, _starpu_gettid());
 #define _STARPU_MPI_TRACE_SLEEP_END()	\
-	FUT_DO_PROBE1(_STARPU_MPI_FUT_SLEEP_END, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_SLEEP_END, _starpu_gettid());
 #define _STARPU_MPI_TRACE_DTESTING_BEGIN()	\
-	FUT_DO_PROBE1(_STARPU_MPI_FUT_DTESTING_BEGIN,  _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_DTESTING_BEGIN,  _starpu_gettid());
 #define _STARPU_MPI_TRACE_DTESTING_END()	\
-	FUT_DO_PROBE1(_STARPU_MPI_FUT_DTESTING_END, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_DTESTING_END, _starpu_gettid());
 #define _STARPU_MPI_TRACE_UTESTING_BEGIN(src, data_tag)	\
-	FUT_DO_PROBE3(_STARPU_MPI_FUT_UTESTING_BEGIN, (src), (data_tag),  _starpu_gettid());
+	FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_UTESTING_BEGIN, (src), (data_tag),  _starpu_gettid());
 #define _STARPU_MPI_TRACE_UTESTING_END(src, data_tag)	\
-	FUT_DO_PROBE3(_STARPU_MPI_FUT_UTESTING_END, (src), (data_tag), _starpu_gettid());
+	FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_UTESTING_END, (src), (data_tag), _starpu_gettid());
 #define _STARPU_MPI_TRACE_UWAIT_BEGIN(src, data_tag)	\
-	FUT_DO_PROBE3(_STARPU_MPI_FUT_UWAIT_BEGIN, (src), (data_tag),  _starpu_gettid());
+	FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_UWAIT_BEGIN, (src), (data_tag),  _starpu_gettid());
 #define _STARPU_MPI_TRACE_UWAIT_END(src, data_tag)	\
-	FUT_DO_PROBE3(_STARPU_MPI_FUT_UWAIT_END, (src), (data_tag), _starpu_gettid());
+	FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_UWAIT_END, (src), (data_tag), _starpu_gettid());
 #define _STARPU_MPI_TRACE_DATA_SET_RANK(handle, rank)	\
-	FUT_DO_PROBE3(_STARPU_MPI_FUT_DATA_SET_RANK, (handle), (rank), _starpu_gettid());
+	FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_DATA_SET_RANK, (handle), (rank), _starpu_gettid());
+#define _STARPU_MPI_TRACE_DATA_SET_TAG(handle, data_tag)	\
+	FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_DATA_SET_TAG, (handle), (data_tag), _starpu_gettid());
 #if 0
 /* This is very expensive in the trace, only enable for debugging */
 #define _STARPU_MPI_TRACE_TESTING_DETACHED_BEGIN()	\
@@ -129,28 +132,24 @@ extern "C"
 #define _STARPU_MPI_TRACE_POLLING_BEGIN()					\
 	if(!trace_loop) {						\
 		trace_loop = 1;							\
-		FUT_DO_PROBE1(_STARPU_MPI_FUT_POLLING_BEGIN, _starpu_gettid()); \
+		FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_POLLING_BEGIN, _starpu_gettid()); \
 	}
 #define _STARPU_MPI_TRACE_POLLING_END()	\
 	if(trace_loop) {							\
 		trace_loop = 0;							\
-		FUT_DO_PROBE1(_STARPU_MPI_FUT_POLLING_END, _starpu_gettid());	\
+		FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_POLLING_END, _starpu_gettid());	\
 	}
 #define _STARPU_MPI_TRACE_DRIVER_RUN_BEGIN()	\
-	FUT_DO_PROBE1(_STARPU_MPI_FUT_DRIVER_RUN_BEGIN,  _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_DRIVER_RUN_BEGIN,  _starpu_gettid());
 #define _STARPU_MPI_TRACE_DRIVER_RUN_END()	\
-	FUT_DO_PROBE1(_STARPU_MPI_FUT_DRIVER_RUN_END, _starpu_gettid());
-#define _STARPU_MPI_TRACE_DRIVER_RUN_BEGIN()	\
-	FUT_DO_PROBE1(_STARPU_MPI_FUT_DRIVER_RUN_BEGIN,  _starpu_gettid());
-#define _STARPU_MPI_TRACE_DRIVER_RUN_END()	\
-	FUT_DO_PROBE1(_STARPU_MPI_FUT_DRIVER_RUN_END, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_MPI, _STARPU_MPI_FUT_DRIVER_RUN_END, _starpu_gettid());
 #define TRACE
 #else
 #define _STARPU_MPI_TRACE_START(a, b)				do {} while(0);
 #define _STARPU_MPI_TRACE_STOP(a, b)				do {} while(0);
 #define _STARPU_MPI_TRACE_BARRIER(a, b, c)			do {} while(0);
 #define _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(a, b, c)		do {} while(0);
-#define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(a, b, c, d)		do {} while(0);
+#define _STARPU_MPI_TRACE_ISEND_SUBMIT_END(a, b, c, d, e)	do {} while(0);
 #define _STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(a, b)		do {} while(0);
 #define _STARPU_MPI_TRACE_IRECV_SUBMIT_END(a, b)		do {} while(0);
 #define _STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(a, b, c)		do {} while(0);
@@ -169,6 +168,7 @@ extern "C"
 #define _STARPU_MPI_TRACE_UWAIT_BEGIN(a, b)			do {} while(0);
 #define _STARPU_MPI_TRACE_UWAIT_END(a, b)			do {} while(0);
 #define _STARPU_MPI_TRACE_DATA_SET_RANK(a, b)			do {} while(0);
+#define _STARPU_MPI_TRACE_DATA_SET_TAG(a, b)			do {} while(0);
 #define _STARPU_MPI_TRACE_TESTING_DETACHED_BEGIN()		do {} while(0)
 #define _STARPU_MPI_TRACE_TESTING_DETACHED_END()		do {} while(0)
 #define _STARPU_MPI_TRACE_TEST_BEGIN(peer, data_tag)		do {} while(0)

+ 6 - 1
mpi/src/starpu_mpi_init.c

@@ -210,7 +210,12 @@ int starpu_mpi_shutdown(void)
 	/* kill the progression thread */
 	_starpu_mpi_progress_shutdown(&value);
 
-	_STARPU_MPI_TRACE_STOP(rank, world_size);
+#ifdef STARPU_USE_FXT
+	if (starpu_fxt_is_enabled())
+	{
+		_STARPU_MPI_TRACE_STOP(rank, world_size);
+	}
+#endif // STARPU_USE_FXT
 
 	_starpu_mpi_comm_amounts_display(stderr, rank);
 	_starpu_mpi_comm_amounts_shutdown();

+ 6 - 2
mpi/tests/Makefile.am

@@ -132,6 +132,7 @@ starpu_mpi_TESTS +=				\
 	mpi_earlyrecv2				\
 	mpi_earlyrecv2_sync			\
 	mpi_irecv				\
+	mpi_barrier				\
 	mpi_redux				\
 	ring					\
 	ring_sync				\
@@ -151,7 +152,8 @@ starpu_mpi_TESTS +=				\
 	sync					\
 	gather					\
 	gather2					\
-	driver
+	driver					\
+	sendrecv_bench
 
 if STARPU_USE_MPI_MPI
 starpu_mpi_TESTS +=				\
@@ -174,6 +176,7 @@ noinst_PROGRAMS =				\
 	mpi_earlyrecv2				\
 	mpi_earlyrecv2_sync			\
 	mpi_irecv				\
+	mpi_barrier				\
 	mpi_isend_detached			\
 	mpi_irecv_detached			\
 	mpi_detached_tag			\
@@ -223,7 +226,8 @@ noinst_PROGRAMS =				\
 	early_request				\
 	starpu_redefine				\
 	load_balancer				\
-	driver
+	driver					\
+	sendrecv_bench
 
 XFAIL_TESTS=					\
 	policy_register_toomany			\

+ 38 - 0
mpi/tests/mpi_barrier.c

@@ -0,0 +1,38 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2019                                     Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "helper.h"
+
+
+/* Smoke test: initialise StarPU-MPI, perform a single starpu_mpi_barrier() on
+ * MPI_COMM_WORLD, then shut everything down. Returns 0. */
+int main(int argc, char **argv)
+{
+	int ret, mpi_init;
+
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
+
+	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	starpu_mpi_shutdown();
+	/* MPI is finalized here only when mpi_init is zero */
+	if (!mpi_init)
+		MPI_Finalize();
+
+	return 0;
+}

+ 120 - 14
mpi/tests/pingpong.c

@@ -17,15 +17,26 @@
  */
 
 #include <starpu_mpi.h>
+#include <unistd.h>
 #include "helper.h"
 
 #ifdef STARPU_QUICK_CHECK
-#  define NITER	16
+#  define DEFAULT_NITER 16
 #else
-#  define NITER	2048
+#  define DEFAULT_NITER 2048
 #endif
 
-#define SIZE	16
+#define DEFAULT_DATA_SIZE 16
+#define DEFAULT_SLEEP_TIME 0
+#define DEFAULT_METHOD 0 // ping pongs
+/* Print the supported command-line options and their defaults on stderr. */
+void usage(void)
+{
+	fprintf(stderr, "-n [number of iteration] (default: %d)\n", DEFAULT_NITER);
+	fprintf(stderr, "-s [number of floats to exchange] (default: %d)\n", DEFAULT_DATA_SIZE);
+	fprintf(stderr, "-S [time in millisecond of sleep between exchange, less than 1 second] (default: %d)\n", DEFAULT_SLEEP_TIME);
+	fprintf(stderr, "-b : broadcasts instead of simple pair-wise ping-pongs (default: %s)\n", DEFAULT_METHOD ? "broadcast" : "ping pongs");
+}
 
 float *tab;
 starpu_data_handle_t tab_handle;
@@ -35,6 +46,59 @@ int main(int argc, char **argv)
 	int ret, rank, size;
 	int mpi_init;
 
+	int niter = DEFAULT_NITER;
+	int data_size = DEFAULT_DATA_SIZE;
+	int sleep_time = DEFAULT_SLEEP_TIME;
+	int method = DEFAULT_METHOD;
+
+	/* Parse options; argv[i+1] is only read when it exists (argv[argc] is NULL), a missing value is an illegal argument. */
+	for (int i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-n") == 0)
+		{
+			niter = (i+1 < argc) ? atoi(argv[i+1]) : 0;
+			if (niter <= 0)
+			{
+				fprintf(stderr, "%s: illegal argument %s\n", argv[0], argv[i]);
+				usage();
+				exit(0);
+			}
+			i++;
+		}
+		else if (strcmp(argv[i], "-s") == 0)
+		{
+			data_size = (i+1 < argc) ? atoi(argv[i+1]) : 0;
+			if (data_size <= 0)
+			{
+				fprintf(stderr, "%s: illegal argument %s\n", argv[0], argv[i]);
+				usage();
+				exit(0);
+			}
+			i++;
+		}
+		else if(strcmp(argv[i], "-S") == 0)
+		{
+			sleep_time = (i+1 < argc) ? atoi(argv[i+1]) : 0;
+			if (sleep_time <= 0 || sleep_time >= 1000)
+			{
+				fprintf(stderr, "%s: illegal argument %s\n", argv[0], argv[i]);
+				usage();
+				exit(0);
+			}
+			i++;
+		}
+		else if(strcmp(argv[i], "-b") == 0)
+		{
+			method = 1; // broadcasts
+		}
+		else
+		{
+			fprintf(stderr, "%s: illegal argument %s\n", argv[0], argv[i]);
+			usage();
+			exit(0);
+		}
+	}
+
 	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
 
 	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
@@ -54,26 +118,68 @@ int main(int argc, char **argv)
 		return STARPU_TEST_SKIPPED;
 	}
 
-	tab = calloc(SIZE, sizeof(float));
+	if (rank == 0)
+	{
+		FPRINTF(stdout, "Number of iterations: %d\n", niter);
+		FPRINTF(stdout, "Number of floats to exchange: %d\n", data_size);
+		FPRINTF(stdout, "Sleep time between exchanges: %d milliseconds\n", sleep_time);
+		if (method == 0)
+			FPRINTF(stdout, "Method: ping pongs\n");
+		else
+			FPRINTF(stdout, "Method: broadcasts\n");
+	}
+
+	tab = calloc(data_size, sizeof(float));
 
-	starpu_vector_data_register(&tab_handle, STARPU_MAIN_RAM, (uintptr_t)tab, SIZE, sizeof(float));
+	starpu_vector_data_register(&tab_handle, STARPU_MAIN_RAM, (uintptr_t)tab, data_size, sizeof(float));
 
-	int nloops = NITER;
 	int loop;
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
+	int sender;
 
-	for (loop = 0; loop < nloops; loop++)
+	if (method == 0) // ping pongs
 	{
-		if ((loop % 2) == (rank%2))
+		for (loop = 0; loop < niter; loop++)
 		{
-			//FPRINTF_MPI(stderr, "Sending to %d\n", other_rank);
-			starpu_mpi_send(tab_handle, other_rank, loop, MPI_COMM_WORLD);
+			if ((loop % 2) == (rank%2))
+			{
+				//FPRINTF_MPI(stderr, "Sending to %d\n", other_rank);
+				starpu_mpi_send(tab_handle, other_rank, loop, MPI_COMM_WORLD);
+			}
+			else
+			{
+				MPI_Status status;
+				//FPRINTF_MPI(stderr, "Receiving from %d\n", other_rank);
+				starpu_mpi_recv(tab_handle, other_rank, loop, MPI_COMM_WORLD, &status);
+			}
+
+			usleep(sleep_time * 1000);
 		}
-		else
+	}
+	else // broadcasts
+	{
+		for (loop = 0; loop < niter; loop++)
 		{
-			MPI_Status status;
-			//FPRINTF_MPI(stderr, "Receiving from %d\n", other_rank);
-			starpu_mpi_recv(tab_handle, other_rank, loop, MPI_COMM_WORLD, &status);
+			sender = loop % size;
+			if (sender == rank)
+			{
+				for (int r = 0; r < size; r++)
+				{
+					if (r != rank)
+					{
+						starpu_mpi_send(tab_handle, r, (r * niter) + loop, MPI_COMM_WORLD);
+						usleep(sleep_time * 1000);
+					}
+				}
+			}
+			else
+			{
+				MPI_Status status;
+				starpu_mpi_recv(tab_handle, sender, (rank * niter) + loop, MPI_COMM_WORLD, &status);
+
+				for (int r = 0; r < (size-1); r++)
+					usleep(sleep_time * 1000);
+			}
 		}
 	}
 

+ 180 - 0
mpi/tests/sendrecv_bench.c

@@ -0,0 +1,180 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2019                                     Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ *
+ * Basic send/receive benchmark.
+ * Heavily inspired by NewMadeleine's examples/benchmarks/nm_bench_sendrecv.c
+ */
+
+#include <math.h>
+#include <starpu_mpi.h>
+#include "helper.h"
+
+#define NX_MAX (512 * 1024 * 1024) // largest message size, in bytes
+#define NX_MIN 0 // smallest message size, in bytes
+#define MULT_DEFAULT 2 // factor applied to the message size between two steps
+#define INCR_DEFAULT 0 // bytes added to the message size between two steps
+#define NX_STEP 1.4 // multiplication factor; not referenced in this file
+#define LOOPS_DEFAULT 10000 // initial number of round trips per size, and capacity of lats[] in main()
+
+int times_nb_nodes; // not referenced in this file
+int times_size; // not referenced in this file
+int worldsize; // filled by starpu_mpi_comm_size() in main()
+
+static int comp_double(const void*_a, const void*_b)
+{
+	/* qsort() comparator ordering two doubles ascending.
+	 * Returns -1, 1 or 0; the operands are compared rather than
+	 * subtracted, so nothing is lost by converting to int. */
+	const double lhs = *(const double *)_a;
+	const double rhs = *(const double *)_b;
+
+	if (lhs < rhs)
+		return -1;
+	return (lhs > rhs) ? 1 : 0;
+}
+
+static inline uint64_t _next(uint64_t len, double multiplier, uint64_t increment)
+{
+	/* Next size of the sweep: len * multiplier + increment, converted
+	 * back to an integer. */
+	const uint64_t grown = len * multiplier + increment;
+
+	/* Bump by one when the formula did not move past len (e.g. len == 0). */
+	return (grown > len) ? grown : grown + 1;
+}
+
+
+static inline uint64_t _iterations(int iterations, uint64_t len)
+{
+	/* Limit the number of round trips for large messages: once
+	 * iterations * len would exceed 512 MiB, only as many iterations
+	 * as fit in that budget are kept, but never fewer than 2.
+	 * A zero-length message is accounted as one byte. */
+	const uint64_t budget = 512 * 1024 * 1024;
+	const uint64_t bytes = (len <= 0) ? 1 : len;
+
+	/* Within budget: the requested count is used unchanged. */
+	if ((uint64_t)iterations * bytes <= budget)
+		return iterations;
+
+	/* Over budget: scale down, keeping at least two samples. */
+	iterations = budget / bytes;
+
+	return (iterations < 2) ? 2 : iterations;
+}
+
+int main(int argc, char **argv)
+{
+	/* Ping-pong benchmark between exactly two ranks: for each message size,
+	 * rank 0 times send+recv round trips and prints latency statistics. */
+	int ret, rank;
+	starpu_data_handle_t handle_send, handle_recv;
+	int mpi_init;
+	float *vector_send = NULL, *vector_recv = NULL;
+	double* lats = malloc(sizeof(double) * LOOPS_DEFAULT);
+	uint64_t iterations = LOOPS_DEFAULT;
+	double multiplier = MULT_DEFAULT;
+	uint64_t increment = INCR_DEFAULT;
+
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
+	ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
+
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &worldsize);
+
+	STARPU_ASSERT_MSG(worldsize == 2, "We need two prcesses.");
+
+	if (rank == 0)
+	{
+		printf("Times in us\n");
+		printf("# size  (Bytes)\t|  latency \t| 10^6 B/s \t| MB/s   \t| d1    \t|median  \t| avg    \t| d9    \t| max\n");
+	}
+
+	for (uint64_t s = NX_MIN; s <= NX_MAX; s = _next(s, multiplier, increment))
+	{
+		vector_send = malloc(s);
+		vector_recv = malloc(s);
+		memset(vector_send, 0, s);
+		memset(vector_recv, 0, s);
+
+		starpu_vector_data_register(&handle_send, STARPU_MAIN_RAM, (uintptr_t) vector_send, s, 1);
+		starpu_vector_data_register(&handle_recv, STARPU_MAIN_RAM, (uintptr_t) vector_recv, s, 1);
+
+		/* _iterations() only lowers the count (floor of 2), so it never exceeds the LOOPS_DEFAULT entries of lats[]. */
+		iterations = _iterations(iterations, s);
+
+		starpu_mpi_barrier(MPI_COMM_WORLD);
+
+		for (uint64_t j = 0; j < iterations; j++)
+		{
+			if (rank == 0)
+			{
+				const double t1 = starpu_timing_now();
+				starpu_mpi_send(handle_send, 1, 0, MPI_COMM_WORLD);
+				starpu_mpi_recv(handle_recv, 1, 1, MPI_COMM_WORLD, NULL);
+				const double t2 = starpu_timing_now();
+
+				/* One-way latency: half of the measured round trip. */
+				lats[j] = (t2 - t1) / 2;
+			}
+			else
+			{
+				starpu_mpi_recv(handle_recv, 0, 0, MPI_COMM_WORLD, NULL);
+				starpu_mpi_send(handle_send, 0, 1, MPI_COMM_WORLD);
+			}
+
+			starpu_mpi_barrier(MPI_COMM_WORLD);
+		}
+
+		if (rank == 0)
+		{
+			qsort(lats, iterations, sizeof(double), &comp_double);
+
+			const double min_lat = lats[0];
+			const double max_lat = lats[iterations - 1];
+			const double med_lat = lats[(iterations - 1) / 2];
+			const double d1_lat = lats[(iterations - 1) / 10];
+			const double d9_lat = lats[9 * (iterations - 1) / 10];
+			double avg_lat = 0.0;
+
+			for (uint64_t k = 0; k < iterations; k++)
+			{
+				avg_lat += lats[k];
+			}
+
+			avg_lat /= iterations;
+			const double bw_million_byte = s / min_lat;
+			const double bw_mbyte        = bw_million_byte / 1.048576;
+
+			printf("%9lld\t%9.3lf\t%9.3f\t%9.3f\t%9.3lf\t%9.3lf\t%9.3lf\t%9.3lf\t%9.3lf\n",
+				(long long)s, min_lat, bw_million_byte, bw_mbyte, d1_lat, med_lat, avg_lat, d9_lat, max_lat);
+			fflush(stdout);
+		}
+		starpu_data_unregister(handle_recv);
+		starpu_data_unregister(handle_send);
+
+		free(vector_send);
+		free(vector_recv);
+	}
+
+	starpu_mpi_shutdown();
+	/* Same teardown as mpi_barrier.c; presumably !mpi_init means MPI_INIT_THREAD initialised MPI itself (confirm in helper.h). */
+	if (!mpi_init)
+		MPI_Finalize();
+	free(lats);
+
+	return 0;
+}

+ 1 - 1
sc_hypervisor/examples/cholesky/cholesky_kernels.c

@@ -197,7 +197,7 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, void *_a
 				fprintf(stderr, "Error in Magma: %d\n", ret);
 				STARPU_ABORT();
 			}
-			cudaError_t cures = cudaThreadSynchronize();
+			cudaError_t cures = cudaDeviceSynchronize();
 			STARPU_ASSERT(!cures);
 			}
 #else

+ 1 - 0
src/Makefile.am

@@ -246,6 +246,7 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 		\
 	datawizard/interfaces/matrix_interface.c		\
 	datawizard/interfaces/block_filters.c			\
 	datawizard/interfaces/block_interface.c			\
+	datawizard/interfaces/tensor_interface.c		\
 	datawizard/interfaces/vector_interface.c		\
 	datawizard/interfaces/bcsr_filters.c			\
 	datawizard/interfaces/csr_filters.c			\

+ 10 - 10
src/common/fxt.c

@@ -123,21 +123,26 @@ void starpu_fxt_autostart_profiling(int autostart)
 	if (autostart)
 		initial_key_mask = FUT_KEYMASKALL;
 	else
-		initial_key_mask = 0;
+		initial_key_mask = _STARPU_FUT_KEYMASK_META;
 }
 
 void starpu_fxt_start_profiling()
 {
 	unsigned threadid = _starpu_gettid();
 	fut_keychange(FUT_ENABLE, FUT_KEYMASKALL, threadid);
-	_STARPU_TRACE_EVENT("start_profiling");
+	_STARPU_TRACE_META("start_profiling");
 }
 
 void starpu_fxt_stop_profiling()
 {
 	unsigned threadid = _starpu_gettid();
-	_STARPU_TRACE_EVENT("stop_profiling");
-	fut_keychange(FUT_DISABLE, FUT_KEYMASKALL, threadid);
+	_STARPU_TRACE_META("stop_profiling");
+	fut_keychange(FUT_SETMASK, _STARPU_FUT_KEYMASK_META, threadid);
+}
+/* Query STARPU_FXT_TRACE through starpu_get_env_number_default(), passing 1 as the default; _starpu_fxt_init_profiling() stores the result in _starpu_fxt_willstart. */
+int starpu_fxt_is_enabled()
+{
+	return starpu_get_env_number_default("STARPU_FXT_TRACE", 1);
+}
 
 void _starpu_fxt_init_profiling(unsigned trace_buffer_size)
@@ -145,7 +150,7 @@ void _starpu_fxt_init_profiling(unsigned trace_buffer_size)
 	unsigned threadid;
 
 	STARPU_PTHREAD_MUTEX_LOCK(&_starpu_fxt_started_mutex);
-	if (!(_starpu_fxt_willstart = starpu_get_env_number_default("STARPU_FXT_TRACE", 1)))
+	if (!(_starpu_fxt_willstart = starpu_fxt_is_enabled()))
 	{
 		STARPU_PTHREAD_COND_BROADCAST(&_starpu_fxt_started_cond);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&_starpu_fxt_started_mutex);
@@ -299,11 +304,6 @@ void _starpu_stop_fxt_profiling(void)
 	}
 }
 
-void _starpu_fxt_register_thread(unsigned cpuid)
-{
-	FUT_DO_PROBE2(FUT_NEW_LWP_CODE, cpuid, _starpu_gettid());
-}
-
 #else // STARPU_USE_FXT
 
 void starpu_fxt_autostart_profiling(int autostart STARPU_ATTRIBUTE_UNUSED)

+ 291 - 176
src/common/fxt.h

@@ -236,6 +236,31 @@
 
 #define _STARPU_FUT_DATA_REQUEST_CREATED   0x5185
 
+
+/* Predefined FUT key masks: one FUT_KEYMASKn per class of events, tested against fut_active by the FUT_FULL_PROBE* macros */
+#define _STARPU_FUT_KEYMASK_META           FUT_KEYMASK0
+#define _STARPU_FUT_KEYMASK_USER           FUT_KEYMASK1
+#define _STARPU_FUT_KEYMASK_TASK           FUT_KEYMASK2
+#define _STARPU_FUT_KEYMASK_TASK_VERBOSE   FUT_KEYMASK3
+#define _STARPU_FUT_KEYMASK_DATA           FUT_KEYMASK4
+#define _STARPU_FUT_KEYMASK_DATA_VERBOSE   FUT_KEYMASK5
+#define _STARPU_FUT_KEYMASK_WORKER         FUT_KEYMASK6
+#define _STARPU_FUT_KEYMASK_WORKER_VERBOSE FUT_KEYMASK7
+#define _STARPU_FUT_KEYMASK_DSM            FUT_KEYMASK8
+#define _STARPU_FUT_KEYMASK_DSM_VERBOSE    FUT_KEYMASK9
+#define _STARPU_FUT_KEYMASK_SCHED          FUT_KEYMASK10
+#define _STARPU_FUT_KEYMASK_SCHED_VERBOSE  FUT_KEYMASK11
+#define _STARPU_FUT_KEYMASK_LOCK           FUT_KEYMASK12
+#define _STARPU_FUT_KEYMASK_LOCK_VERBOSE   FUT_KEYMASK13
+#define _STARPU_FUT_KEYMASK_EVENT          FUT_KEYMASK14
+#define _STARPU_FUT_KEYMASK_EVENT_VERBOSE  FUT_KEYMASK15
+#define _STARPU_FUT_KEYMASK_MPI            FUT_KEYMASK16
+#define _STARPU_FUT_KEYMASK_MPI_VERBOSE    FUT_KEYMASK17
+#define _STARPU_FUT_KEYMASK_HYP            FUT_KEYMASK18
+#define _STARPU_FUT_KEYMASK_HYP_VERBOSE    FUT_KEYMASK19
+/* NOTE(review): the next define looks like an event code continuing the 0x51xx series above rather than a key mask; consider moving it next to _STARPU_FUT_DATA_REQUEST_CREATED */
+#define _STARPU_FUT_PAPI_TASK_EVENT_VALUE   0x5186
+
 extern unsigned long _starpu_job_cnt;
 
 static inline unsigned long _starpu_fxt_get_job_id(void)
@@ -266,12 +291,15 @@ extern int _starpu_fxt_willstart;
 extern starpu_pthread_mutex_t _starpu_fxt_started_mutex;
 extern starpu_pthread_cond_t _starpu_fxt_started_cond;
 
-static inline void _starpu_fxt_wait_initialisation()
+/* Wait until FxT is started, or is known not to start. Returns whether FxT was started. */
+static inline int _starpu_fxt_wait_initialisation()
 {
 	STARPU_PTHREAD_MUTEX_LOCK(&_starpu_fxt_started_mutex);
 	while (_starpu_fxt_willstart && !_starpu_fxt_started)
 		STARPU_PTHREAD_COND_WAIT(&_starpu_fxt_started_cond, &_starpu_fxt_started_mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&_starpu_fxt_started_mutex);
+
+	return _starpu_fxt_started;
 }
 
 extern unsigned long _starpu_submit_order;
@@ -294,25 +322,21 @@ void _starpu_stop_fxt_profiling(void);
 /* Generate the trace file. Used when catching signals SIGINT and SIGSEGV */
 void _starpu_fxt_dump_file(void);
 
-/* Associate the current processing unit to the identifier of the LWP that runs
- * the worker. */
-void _starpu_fxt_register_thread(unsigned);
-
 #ifdef FUT_NEEDS_COMMIT
 #define _STARPU_FUT_COMMIT(size) fut_commitstampedbuffer(size)
 #else
 #define _STARPU_FUT_COMMIT(size) do { } while (0)
 #endif
 
-#ifdef FUT_DO_PROBE1STR
-#define _STARPU_FUT_DO_PROBE1STR(CODE, P1, str) FUT_DO_PROBE1STR(CODE, P1, str)
+#ifdef FUT_FULL_PROBE1STR
+#define _STARPU_FUT_FULL_PROBE1STR(KEYMASK, CODE, P1, str) FUT_FULL_PROBE1STR(KEYMASK, CODE, P1, str) /* forward KEYMASK too, as FUT_FULL_PROBEn expects it first */
 #else
 /* Sometimes we need something a little more specific than the wrappers from
  * FxT: these macros add an event with a fixed number of numeric parameters
  * followed by a string. */
-#define _STARPU_FUT_DO_PROBE1STR(CODE, P1, str)			\
+#define _STARPU_FUT_FULL_PROBE1STR(KEYMASK, CODE, P1, str)			\
 do {									\
-    if(fut_active) {							\
+    if(KEYMASK & fut_active) {							\
 	/* No more than FXT_MAX_PARAMS args are allowed */		\
 	/* we add a \0 just in case ... */				\
 	size_t len = STARPU_MIN(strlen(str)+1, (FXT_MAX_PARAMS - 1)*sizeof(unsigned long));\
@@ -329,15 +353,15 @@ do {									\
 } while (0);
 #endif
 
-#ifdef FUT_DO_PROBE2STR
-#define _STARPU_FUT_DO_PROBE2STR(CODE, P1, P2, str) FUT_DO_PROBE2STR(CODE, P1, P2, str)
+#ifdef FUT_FULL_PROBE2STR
+#define _STARPU_FUT_FULL_PROBE2STR(KEYMASK, CODE, P1, P2, str) FUT_FULL_PROBE2STR(KEYMASK, CODE, P1, P2, str) /* forward KEYMASK too, as FUT_FULL_PROBEn expects it first */
 #else
 /* Sometimes we need something a little more specific than the wrappers from
  * FxT: these macros add an event with a fixed number of numeric parameters
  * followed by a string. */
-#define _STARPU_FUT_DO_PROBE2STR(CODE, P1, P2, str)			\
+#define _STARPU_FUT_FULL_PROBE2STR(KEYMASK, CODE, P1, P2, str)			\
 do {									\
-    if(fut_active) {							\
+    if(KEYMASK & fut_active) {							\
 	/* No more than FXT_MAX_PARAMS args are allowed */		\
 	/* we add a \0 just in case ... */				\
 	size_t len = STARPU_MIN(strlen(str)+1, (FXT_MAX_PARAMS - 2)*sizeof(unsigned long));\
@@ -355,12 +379,12 @@ do {									\
 } while (0);
 #endif
 
-#ifdef FUT_DO_PROBE3STR
-#define _STARPU_FUT_DO_PROBE3STR(CODE, P1, P2, P3, str) FUT_DO_PROBE3STR(CODE, P1, P2, P3, str)
+#ifdef FUT_FULL_PROBE3STR
+#define _STARPU_FUT_FULL_PROBE3STR(KEYMASK, CODE, P1, P2, P3, str) FUT_FULL_PROBE3STR(KEYMASK, CODE, P1, P2, P3, str) /* forward KEYMASK too, as FUT_FULL_PROBEn expects it first */
 #else
-#define _STARPU_FUT_DO_PROBE3STR(CODE, P1, P2, P3, str)			\
+#define _STARPU_FUT_FULL_PROBE3STR(KEYMASK, CODE, P1, P2, P3, str)			\
 do {									\
-    if(fut_active) {							\
+    if(KEYMASK & fut_active) {							\
 	/* No more than FXT_MAX_PARAMS args are allowed */		\
 	/* we add a \0 just in case ... */				\
 	size_t len = STARPU_MIN(strlen(str)+1, (FXT_MAX_PARAMS - 3)*sizeof(unsigned long));\
@@ -379,12 +403,12 @@ do {									\
 } while (0);
 #endif
 
-#ifdef FUT_DO_PROBE4STR
-#define _STARPU_FUT_DO_PROBE4STR(CODE, P1, P2, P3, P4, str) FUT_DO_PROBE4STR(CODE, P1, P2, P3, P4, str)
+#ifdef FUT_FULL_PROBE4STR
+#define _STARPU_FUT_FULL_PROBE4STR(KEYMASK, CODE, P1, P2, P3, P4, str) FUT_FULL_PROBE4STR(KEYMASK, CODE, P1, P2, P3, P4, str) /* forward KEYMASK too, as FUT_FULL_PROBEn expects it first */
 #else
-#define _STARPU_FUT_DO_PROBE4STR(CODE, P1, P2, P3, P4, str)		\
+#define _STARPU_FUT_FULL_PROBE4STR(KEYMASK, CODE, P1, P2, P3, P4, str)		\
 do {									\
-    if(fut_active) {							\
+    if(KEYMASK & fut_active) {							\
 	/* No more than FXT_MAX_PARAMS args are allowed */		\
 	/* we add a \0 just in case ... */				\
 	size_t len = STARPU_MIN(strlen(str)+1, (FXT_MAX_PARAMS - 4)*sizeof(unsigned long));\
@@ -404,12 +428,12 @@ do {									\
 } while (0);
 #endif
 
-#ifdef FUT_DO_PROBE5STR
-#define _STARPU_FUT_DO_PROBE5STR(CODE, P1, P2, P3, P4, P5, str) FUT_DO_PROBE5STR(CODE, P1, P2, P3, P4, P5, str)
+#ifdef FUT_FULL_PROBE5STR
+#define _STARPU_FUT_FULL_PROBE5STR(KEYMASK, CODE, P1, P2, P3, P4, P5, str) FUT_FULL_PROBE5STR(KEYMASK, CODE, P1, P2, P3, P4, P5, str) /* forward KEYMASK too, as FUT_FULL_PROBEn expects it first */
 #else
-#define _STARPU_FUT_DO_PROBE5STR(CODE, P1, P2, P3, P4, P5, str)		\
+#define _STARPU_FUT_FULL_PROBE5STR(KEYMASK, CODE, P1, P2, P3, P4, P5, str)		\
 do {									\
-    if(fut_active) {							\
+    if(KEYMASK & fut_active) {							\
 	/* No more than FXT_MAX_PARAMS args are allowed */		\
 	/* we add a \0 just in case ... */				\
 	size_t len = STARPU_MIN(strlen(str)+1, (FXT_MAX_PARAMS - 5)*sizeof(unsigned long));\
@@ -430,12 +454,12 @@ do {									\
 } while (0);
 #endif
 
-#ifdef FUT_DO_PROBE6STR
-#define _STARPU_FUT_DO_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str) FUT_DO_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str)
+#ifdef FUT_FULL_PROBE6STR
+#define _STARPU_FUT_FULL_PROBE6STR(KEYMASK, CODE, P1, P2, P3, P4, P5, P6, str) FUT_FULL_PROBE6STR(KEYMASK, CODE, P1, P2, P3, P4, P5, P6, str) /* forward KEYMASK too, as FUT_FULL_PROBEn expects it first */
 #else
-#define _STARPU_FUT_DO_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str)	\
+#define _STARPU_FUT_FULL_PROBE6STR(KEYMASK, CODE, P1, P2, P3, P4, P5, P6, str)	\
 do {									\
-    if(fut_active) {							\
+    if(KEYMASK & fut_active) {							\
 	/* No more than FXT_MAX_PARAMS args are allowed */		\
 	/* we add a \0 just in case ... */				\
 	size_t len = STARPU_MIN(strlen(str)+1, (FXT_MAX_PARAMS - 6)*sizeof(unsigned long));\
@@ -457,12 +481,12 @@ do {									\
 } while (0);
 #endif
 
-#ifdef FUT_DO_PROBE7STR
-#define _STARPU_FUT_DO_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str) FUT_DO_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str)
+#ifdef FUT_FULL_PROBE7STR
+#define _STARPU_FUT_FULL_PROBE7STR(KEYMASK, CODE, P1, P2, P3, P4, P5, P6, P7, str) FUT_FULL_PROBE7STR(KEYMASK, CODE, P1, P2, P3, P4, P5, P6, P7, str) /* forward KEYMASK too, as FUT_FULL_PROBEn expects it first */
 #else
-#define _STARPU_FUT_DO_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str)	\
+#define _STARPU_FUT_FULL_PROBE7STR(KEYMASK, CODE, P1, P2, P3, P4, P5, P6, P7, str)	\
 do {									\
-    if(fut_active) {							\
+    if(KEYMASK & fut_active) {							\
 	/* No more than FXT_MAX_PARAMS args are allowed */		\
 	/* we add a \0 just in case ... */				\
 	size_t len = STARPU_MIN(strlen(str)+1, (FXT_MAX_PARAMS - 7)*sizeof(unsigned long));\
@@ -497,12 +521,6 @@ do {									\
 	} while (0)
 #endif
 
-#ifndef FUT_DO_PROBE7
-#define FUT_DO_PROBE7(CODE,P1,P2,P3,P4,P5,P6,P7) do { \
-        FUT_RAW_PROBE7(FUT_CODE(CODE, 7),P1,P2,P3,P4,P5,P6,P7); \
-} while (0)
-#endif
-
 #ifndef FUT_RAW_ALWAYS_PROBE2
 #define FUT_RAW_ALWAYS_PROBE2(CODE,P1,P2) do {	\
 		unsigned long *__args __attribute__((unused))=	\
@@ -582,18 +600,92 @@ do {									\
 } while (0)
 
 
+/* full probes: fallbacks emitting the event only when a bit of KEYMASK is set in fut_active; KEYMASK is parenthesised so that composite masks (A | B) are tested as a whole */
+#ifndef FUT_FULL_PROBE0
+#define FUT_FULL_PROBE0(KEYMASK,CODE) do { \
+        if( (KEYMASK) & fut_active ) { \
+                FUT_RAW_ALWAYS_PROBE0(FUT_CODE(CODE, 0)); \
+        } \
+} while(0)
+#endif
+
+#ifndef FUT_FULL_PROBE1
+#define FUT_FULL_PROBE1(KEYMASK,CODE,P1) do { \
+        if( (KEYMASK) & fut_active ) { \
+                FUT_RAW_ALWAYS_PROBE1(FUT_CODE(CODE, 1),P1); \
+        } \
+} while(0)
+#endif
+
+#ifndef FUT_FULL_PROBE2
+#define FUT_FULL_PROBE2(KEYMASK,CODE,P1,P2) do { \
+        if( (KEYMASK) & fut_active ) { \
+                FUT_RAW_ALWAYS_PROBE2(FUT_CODE(CODE, 2),P1,P2); \
+        } \
+} while(0)
+#endif
+
+#ifndef FUT_FULL_PROBE3
+#define FUT_FULL_PROBE3(KEYMASK,CODE,P1,P2,P3) do { \
+        if( (KEYMASK) & fut_active ) { \
+                FUT_RAW_ALWAYS_PROBE3(FUT_CODE(CODE, 3),P1,P2,P3); \
+        } \
+} while(0)
+#endif
+
+#ifndef FUT_FULL_PROBE4
+#define FUT_FULL_PROBE4(KEYMASK,CODE,P1,P2,P3,P4) do { \
+        if( (KEYMASK) & fut_active ) { \
+                FUT_RAW_ALWAYS_PROBE4(FUT_CODE(CODE, 4),P1,P2,P3,P4); \
+        } \
+} while(0)
+#endif
+
+#ifndef FUT_FULL_PROBE5
+#define FUT_FULL_PROBE5(KEYMASK,CODE,P1,P2,P3,P4,P5) do { \
+        if( (KEYMASK) & fut_active ) { \
+                FUT_RAW_ALWAYS_PROBE5(FUT_CODE(CODE, 5),P1,P2,P3,P4,P5); \
+        } \
+} while(0)
+#endif
+
+#ifndef FUT_FULL_PROBE6
+#define FUT_FULL_PROBE6(KEYMASK,CODE,P1,P2,P3,P4,P5,P6) do { \
+        if( (KEYMASK) & fut_active ) { \
+                FUT_RAW_ALWAYS_PROBE6(FUT_CODE(CODE, 6),P1,P2,P3,P4,P5,P6); \
+        } \
+} while(0)
+#endif
+
+#ifndef FUT_FULL_PROBE7
+#define FUT_FULL_PROBE7(KEYMASK,CODE,P1,P2,P3,P4,P5,P6,P7) do { \
+        if( (KEYMASK) & fut_active ) { \
+                FUT_RAW_ALWAYS_PROBE7(FUT_CODE(CODE, 7),P1,P2,P3,P4,P5,P6,P7); \
+        } \
+} while(0)
+#endif
+
 
-/* workerkind = _STARPU_FUT_CPU_KEY for instance */
 #define _STARPU_TRACE_NEW_MEM_NODE(nodeid)			do {\
 	if (_starpu_fxt_started) \
 		FUT_DO_ALWAYS_PROBE2(_STARPU_FUT_NEW_MEM_NODE, nodeid, _starpu_gettid()); \
 } while (0)
 
-#define _STARPU_TRACE_WORKER_INIT_START(workerkind, workerid, devid, memnode, bindid, sync)	\
-	FUT_DO_PROBE7(_STARPU_FUT_WORKER_INIT_START, workerkind, workerid, devid, memnode, bindid, sync, _starpu_gettid());
+#define _STARPU_TRACE_REGISTER_THREAD(cpuid)			do {\
+	if (_starpu_fxt_started) /* only once FxT is started: record cpuid together with the calling thread's id */ \
+		FUT_DO_ALWAYS_PROBE2(FUT_NEW_LWP_CODE, cpuid, _starpu_gettid()); \
+} while (0)
 
-#define _STARPU_TRACE_WORKER_INIT_END(__workerid)				\
-	FUT_DO_PROBE2(_STARPU_FUT_WORKER_INIT_END, _starpu_gettid(), (__workerid));
+/* Emit _STARPU_FUT_WORKER_INIT_START with the worker description and the calling thread's id, only if FxT is started. workerkind = _STARPU_FUT_CPU_KEY for instance */
+#define _STARPU_TRACE_WORKER_INIT_START(workerkind, workerid, devid, memnode, bindid, sync)	do {\
+	if (_starpu_fxt_started) \
+		FUT_DO_ALWAYS_PROBE7(_STARPU_FUT_WORKER_INIT_START, workerkind, workerid, devid, memnode, bindid, sync, _starpu_gettid()); \
+} while (0)
+/* Emit _STARPU_FUT_WORKER_INIT_END with the calling thread's id and the worker id, only if FxT is started. */
+#define _STARPU_TRACE_WORKER_INIT_END(__workerid)		do {\
+	if (_starpu_fxt_started) \
+		FUT_DO_ALWAYS_PROBE2(_STARPU_FUT_WORKER_INIT_END, _starpu_gettid(), (__workerid)); \
+} while (0)
 
 #define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, perf_arch, workerid)				\
 do {									\
@@ -601,12 +693,12 @@ do {									\
 	if (name)                                                 \
 	{								\
 		/* we include the task name */			\
-		_STARPU_FUT_DO_PROBE5STR(_STARPU_FUT_START_CODELET_BODY, (job)->job_id, ((job)->task)->sched_ctx, workerid, starpu_worker_get_memory_node(workerid), 1, name); \
+		_STARPU_FUT_FULL_PROBE5STR(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_START_CODELET_BODY, (job)->job_id, ((job)->task)->sched_ctx, workerid, starpu_worker_get_memory_node(workerid), 1, name); \
 		if (model_name && strcmp(model_name, name))				\
-			_STARPU_FUT_DO_PROBE1STR(_STARPU_FUT_MODEL_NAME, (job)->job_id, model_name); \
+			_STARPU_FUT_FULL_PROBE1STR(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_MODEL_NAME, (job)->job_id, model_name); \
 	}								\
 	else {                                                          \
-		FUT_DO_PROBE5(_STARPU_FUT_START_CODELET_BODY, (job)->job_id, ((job)->task)->sched_ctx, workerid, starpu_worker_get_memory_node(workerid), 0); \
+		FUT_FULL_PROBE5(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_START_CODELET_BODY, (job)->job_id, ((job)->task)->sched_ctx, workerid, starpu_worker_get_memory_node(workerid), 0); \
 	}								\
 	{								\
 		if ((job)->task->cl)					\
@@ -621,14 +713,14 @@ do {									\
 				if (__handle->ops->describe)		\
 				{					\
 					__handle->ops->describe(__interface, __buf, sizeof(__buf));	\
-					_STARPU_FUT_DO_PROBE1STR(_STARPU_FUT_CODELET_DATA, workerid, __buf);	\
+					_STARPU_FUT_FULL_PROBE1STR(_STARPU_FUT_KEYMASK_DATA, _STARPU_FUT_CODELET_DATA, workerid, __buf);	\
 				}					\
-				FUT_DO_PROBE4(_STARPU_FUT_CODELET_DATA_HANDLE, (job)->job_id, (__handle), _starpu_data_get_size(__handle), STARPU_TASK_GET_MODE((job)->task, __i));	\
+				FUT_FULL_PROBE4(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_CODELET_DATA_HANDLE, (job)->job_id, (__handle), _starpu_data_get_size(__handle), STARPU_TASK_GET_MODE((job)->task, __i));	\
 			}						\
 		}							\
 		const size_t __job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));	\
 		const uint32_t __job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));\
-		FUT_DO_PROBE7(_STARPU_FUT_CODELET_DETAILS, ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->flops / 1000 / ((job)->task->cl && job->task->cl->type != STARPU_SEQ ? j->task_size : 1), (job)->task->tag_id, workerid, ((job)->job_id)); \
+		FUT_FULL_PROBE7(_STARPU_FUT_KEYMASK_TASK_VERBOSE, _STARPU_FUT_CODELET_DETAILS, ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->flops / 1000 / ((job)->task->cl && job->task->cl->type != STARPU_SEQ ? j->task_size : 1), (job)->task->tag_id, workerid, ((job)->job_id)); \
 	}								\
 } while(0);
 
@@ -638,59 +730,59 @@ do {									\
 	const uint32_t job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, perf_arch, nimpl, (job));\
 	char _archname[32]=""; \
 	starpu_perfmodel_get_arch_name(perf_arch, _archname, 32, 0);	\
-	_STARPU_FUT_DO_PROBE5STR(_STARPU_FUT_END_CODELET_BODY, (job)->job_id, (job_size), (job_hash), workerid, _starpu_gettid(), _archname); \
+	_STARPU_FUT_FULL_PROBE5STR(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_END_CODELET_BODY, (job)->job_id, (job_size), (job_hash), workerid, _starpu_gettid(), _archname); \
 } while(0);
 
 #define _STARPU_TRACE_START_EXECUTING()				\
-	FUT_DO_PROBE1(_STARPU_FUT_START_EXECUTING, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_WORKER_VERBOSE, _STARPU_FUT_START_EXECUTING, _starpu_gettid());
 
 #define _STARPU_TRACE_END_EXECUTING()				\
-	FUT_DO_PROBE1(_STARPU_FUT_END_EXECUTING, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_WORKER_VERBOSE, _STARPU_FUT_END_EXECUTING, _starpu_gettid());
 
 #define _STARPU_TRACE_START_CALLBACK(job)	\
-	FUT_DO_PROBE2(_STARPU_FUT_START_CALLBACK, job, _starpu_gettid());
+	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_WORKER_VERBOSE, _STARPU_FUT_START_CALLBACK, job, _starpu_gettid());
 
 #define _STARPU_TRACE_END_CALLBACK(job)	\
-	FUT_DO_PROBE2(_STARPU_FUT_END_CALLBACK, job, _starpu_gettid());
+	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_WORKER_VERBOSE, _STARPU_FUT_END_CALLBACK, job, _starpu_gettid());
 
 #define _STARPU_TRACE_JOB_PUSH(task, prio)	\
-	FUT_DO_PROBE3(_STARPU_FUT_JOB_PUSH, _starpu_get_job_associated_to_task(task)->job_id, prio, _starpu_gettid());
+	FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_SCHED, _STARPU_FUT_JOB_PUSH, _starpu_get_job_associated_to_task(task)->job_id, prio, _starpu_gettid());
 
 #define _STARPU_TRACE_JOB_POP(task, prio)	\
-	FUT_DO_PROBE3(_STARPU_FUT_JOB_POP, _starpu_get_job_associated_to_task(task)->job_id, prio, _starpu_gettid());
+	FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_SCHED, _STARPU_FUT_JOB_POP, _starpu_get_job_associated_to_task(task)->job_id, prio, _starpu_gettid());
 
 #define _STARPU_TRACE_UPDATE_TASK_CNT(counter)	\
-	FUT_DO_PROBE2(_STARPU_FUT_UPDATE_TASK_CNT, counter, _starpu_gettid())
+	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_UPDATE_TASK_CNT, counter, _starpu_gettid())
 
 #define _STARPU_TRACE_START_FETCH_INPUT(job)	\
-	FUT_DO_PROBE2(_STARPU_FUT_START_FETCH_INPUT_ON_TID, job, _starpu_gettid());
+	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_DSM, _STARPU_FUT_START_FETCH_INPUT_ON_TID, job, _starpu_gettid());
 
 #define _STARPU_TRACE_END_FETCH_INPUT(job)	\
-	FUT_DO_PROBE2(_STARPU_FUT_END_FETCH_INPUT_ON_TID, job, _starpu_gettid());
+	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_DSM, _STARPU_FUT_END_FETCH_INPUT_ON_TID, job, _starpu_gettid());
 
 #define _STARPU_TRACE_START_PUSH_OUTPUT(job)	\
-	FUT_DO_PROBE2(_STARPU_FUT_START_PUSH_OUTPUT_ON_TID, job, _starpu_gettid());
+	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_DSM, _STARPU_FUT_START_PUSH_OUTPUT_ON_TID, job, _starpu_gettid());
 
 #define _STARPU_TRACE_END_PUSH_OUTPUT(job)	\
-	FUT_DO_PROBE2(_STARPU_FUT_END_PUSH_OUTPUT_ON_TID, job, _starpu_gettid());
+	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_DSM, _STARPU_FUT_END_PUSH_OUTPUT_ON_TID, job, _starpu_gettid());
 
 #define _STARPU_TRACE_WORKER_END_FETCH_INPUT(job, id)	\
-	FUT_DO_PROBE2(_STARPU_FUT_END_FETCH_INPUT, job, id);
+	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_WORKER_VERBOSE, _STARPU_FUT_END_FETCH_INPUT, job, id);
 
 #define _STARPU_TRACE_WORKER_START_FETCH_INPUT(job, id)	\
-	FUT_DO_PROBE2(_STARPU_FUT_START_FETCH_INPUT, job, id);
+	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_WORKER_VERBOSE, _STARPU_FUT_START_FETCH_INPUT, job, id);
 
 #define _STARPU_TRACE_TAG(tag, job)	\
-	FUT_DO_PROBE2(_STARPU_FUT_TAG, tag, (job)->job_id)
+	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_TAG, tag, (job)->job_id)
 
 #define _STARPU_TRACE_TAG_DEPS(tag_child, tag_father)	\
-	FUT_DO_PROBE2(_STARPU_FUT_TAG_DEPS, tag_child, tag_father)
+	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_TAG_DEPS, tag_child, tag_father)
 
 #define _STARPU_TRACE_TASK_DEPS(job_prev, job_succ)	\
-	_STARPU_FUT_DO_PROBE4STR(_STARPU_FUT_TASK_DEPS, (job_prev)->job_id, (job_succ)->job_id, (job_succ)->task->type, 1, "task")
+	_STARPU_FUT_FULL_PROBE4STR(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_TASK_DEPS, (job_prev)->job_id, (job_succ)->job_id, (job_succ)->task->type, 1, "task")
 
 #define _STARPU_TRACE_GHOST_TASK_DEPS(ghost_prev_id, job_succ)		\
-	_STARPU_FUT_DO_PROBE4STR(_STARPU_FUT_TASK_DEPS, (ghost_prev_id), (job_succ)->job_id, (job_succ)->task->type, 1, "ghost")
+	_STARPU_FUT_FULL_PROBE4STR(_STARPU_FUT_KEYMASK_TASK_VERBOSE, _STARPU_FUT_TASK_DEPS, (ghost_prev_id), (job_succ)->job_id, (job_succ)->task->type, 1, "ghost")
 
 #define _STARPU_TRACE_TASK_NAME(job)						\
 do {										\
@@ -698,23 +790,23 @@ do {										\
         const char *model_name = _starpu_job_get_task_name((job));                       \
 	if (model_name)					                        \
 	{									\
-		_STARPU_FUT_DO_PROBE4STR(_STARPU_FUT_TASK_NAME, (job)->job_id, _starpu_gettid(), (long unsigned)exclude_from_dag, 1, model_name);\
+		_STARPU_FUT_FULL_PROBE4STR(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_TASK_NAME, (job)->job_id, _starpu_gettid(), (long unsigned)exclude_from_dag, 1, model_name);\
 	}									\
 	else {									\
-		FUT_DO_PROBE4(_STARPU_FUT_TASK_NAME, (job)->job_id, _starpu_gettid(), (long unsigned)exclude_from_dag, 0);\
+		FUT_FULL_PROBE4(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_TASK_NAME, (job)->job_id, _starpu_gettid(), (long unsigned)exclude_from_dag, 0);\
 	}									\
 } while(0);
 
 #define _STARPU_TRACE_TASK_COLOR(job)						\
 do { \
 	if ((job)->task->color != 0) \
-		FUT_DO_PROBE3(_STARPU_FUT_TASK_COLOR, (job)->job_id, (job)->task->color, _starpu_gettid()); \
+		FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_TASK_COLOR, (job)->job_id, (job)->task->color, _starpu_gettid()); \
 	else if ((job)->task->cl && (job)->task->cl->color != 0) \
-		FUT_DO_PROBE3(_STARPU_FUT_TASK_COLOR, (job)->job_id, (job)->task->cl->color, _starpu_gettid()); \
+		FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_TASK_COLOR, (job)->job_id, (job)->task->cl->color, _starpu_gettid()); \
 } while(0)
 
 #define _STARPU_TRACE_TASK_DONE(job)						\
-	FUT_DO_PROBE2(_STARPU_FUT_TASK_DONE, (job)->job_id, _starpu_gettid())
+	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_TASK_DONE, (job)->job_id, _starpu_gettid())
 
 #define _STARPU_TRACE_TAG_DONE(tag)						\
 do {										\
@@ -722,266 +814,285 @@ do {										\
         const char *model_name = _starpu_job_get_task_name((job));                       \
 	if (model_name)                                                         \
 	{									\
-          _STARPU_FUT_DO_PROBE3STR(_STARPU_FUT_TAG_DONE, (tag)->id, _starpu_gettid(), 1, model_name); \
+          _STARPU_FUT_FULL_PROBE3STR(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_TAG_DONE, (tag)->id, _starpu_gettid(), 1, model_name); \
 	}									\
 	else {									\
-		FUT_DO_PROBE3(_STARPU_FUT_TAG_DONE, (tag)->id, _starpu_gettid(), 0);\
+		FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_TAG_DONE, (tag)->id, _starpu_gettid(), 0);\
 	}									\
 } while(0);
 
 #define _STARPU_TRACE_DATA_NAME(handle, name) \
-	_STARPU_FUT_DO_PROBE1STR(_STARPU_FUT_DATA_NAME, handle, name)
+	_STARPU_FUT_FULL_PROBE1STR(_STARPU_FUT_KEYMASK_DATA, _STARPU_FUT_DATA_NAME, handle, name)
 
 #define _STARPU_TRACE_DATA_COORDINATES(handle, dim, v) do {\
 	if (_starpu_fxt_started) \
 	switch (dim) { \
-	case 1: FUT_DO_ALWAYS_PROBE3(_STARPU_FUT_DATA_COORDINATES, handle, dim, v[0]); break; \
-	case 2: FUT_DO_ALWAYS_PROBE4(_STARPU_FUT_DATA_COORDINATES, handle, dim, v[0], v[1]); break; \
-	case 3: FUT_DO_ALWAYS_PROBE5(_STARPU_FUT_DATA_COORDINATES, handle, dim, v[0], v[1], v[2]); break; \
-	case 4: FUT_DO_ALWAYS_PROBE6(_STARPU_FUT_DATA_COORDINATES, handle, dim, v[0], v[1], v[2], v[3]); break; \
-	default: FUT_DO_ALWAYS_PROBE7(_STARPU_FUT_DATA_COORDINATES, handle, dim, v[0], v[1], v[2], v[3], v[4]); break; \
+	case 1: FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_DATA_VERBOSE, _STARPU_FUT_DATA_COORDINATES, handle, dim, v[0]); break; \
+	case 2: FUT_FULL_PROBE4(_STARPU_FUT_KEYMASK_DATA_VERBOSE, _STARPU_FUT_DATA_COORDINATES, handle, dim, v[0], v[1]); break; \
+	case 3: FUT_FULL_PROBE5(_STARPU_FUT_KEYMASK_DATA_VERBOSE, _STARPU_FUT_DATA_COORDINATES, handle, dim, v[0], v[1], v[2]); break; \
+	case 4: FUT_FULL_PROBE6(_STARPU_FUT_KEYMASK_DATA_VERBOSE, _STARPU_FUT_DATA_COORDINATES, handle, dim, v[0], v[1], v[2], v[3]); break; \
+	default: FUT_FULL_PROBE7(_STARPU_FUT_KEYMASK_DATA_VERBOSE, _STARPU_FUT_DATA_COORDINATES, handle, dim, v[0], v[1], v[2], v[3], v[4]); break; \
 	} \
 } while (0)
 
 #define _STARPU_TRACE_DATA_COPY(src_node, dst_node, size)	\
-	FUT_DO_PROBE3(_STARPU_FUT_DATA_COPY, src_node, dst_node, size)
+	FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_DSM, _STARPU_FUT_DATA_COPY, src_node, dst_node, size)
 
 #define _STARPU_TRACE_DATA_WONT_USE(handle)						\
-	FUT_DO_PROBE4(_STARPU_FUT_DATA_WONT_USE, handle, _starpu_fxt_get_submit_order(), _starpu_fxt_get_job_id(), _starpu_gettid())
+	FUT_FULL_PROBE4(_STARPU_FUT_KEYMASK_DATA, _STARPU_FUT_DATA_WONT_USE, handle, _starpu_fxt_get_submit_order(), _starpu_fxt_get_job_id(), _starpu_gettid())
 
 #define _STARPU_TRACE_DATA_DOING_WONT_USE(handle)						\
-	FUT_DO_PROBE1(_STARPU_FUT_DATA_DOING_WONT_USE, handle)
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_DSM, _STARPU_FUT_DATA_DOING_WONT_USE, handle)
 
 #define _STARPU_TRACE_START_DRIVER_COPY(src_node, dst_node, size, com_id, prefetch, handle) \
-	FUT_DO_PROBE6(_STARPU_FUT_START_DRIVER_COPY, src_node, dst_node, size, com_id, prefetch, handle)
+	FUT_FULL_PROBE6(_STARPU_FUT_KEYMASK_DSM, _STARPU_FUT_START_DRIVER_COPY, src_node, dst_node, size, com_id, prefetch, handle)
 
 #define _STARPU_TRACE_END_DRIVER_COPY(src_node, dst_node, size, com_id, prefetch)	\
-	FUT_DO_PROBE5(_STARPU_FUT_END_DRIVER_COPY, src_node, dst_node, size, com_id, prefetch)
+	FUT_FULL_PROBE5(_STARPU_FUT_KEYMASK_DSM, _STARPU_FUT_END_DRIVER_COPY, src_node, dst_node, size, com_id, prefetch)
 
 #define _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node)	\
-	FUT_DO_PROBE2(_STARPU_FUT_START_DRIVER_COPY_ASYNC, src_node, dst_node)
+	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_DSM, _STARPU_FUT_START_DRIVER_COPY_ASYNC, src_node, dst_node)
 
 #define _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node)	\
-	FUT_DO_PROBE2(_STARPU_FUT_END_DRIVER_COPY_ASYNC, src_node, dst_node)
+	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_DSM, _STARPU_FUT_END_DRIVER_COPY_ASYNC, src_node, dst_node)
 
 #define _STARPU_TRACE_WORK_STEALING(empty_q, victim_q)		\
-	FUT_DO_PROBE2(_STARPU_FUT_WORK_STEALING, empty_q, victim_q)
+	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_SCHED_VERBOSE, _STARPU_FUT_WORK_STEALING, empty_q, victim_q)
 
-#define _STARPU_TRACE_WORKER_DEINIT_START			\
-	FUT_DO_PROBE1(_STARPU_FUT_WORKER_DEINIT_START, _starpu_gettid());
+#define _STARPU_TRACE_WORKER_DEINIT_START			do {\
+	if (_starpu_fxt_started) \
+		FUT_DO_ALWAYS_PROBE1(_STARPU_FUT_WORKER_DEINIT_START, _starpu_gettid()); \
+} while(0)
 
-#define _STARPU_TRACE_WORKER_DEINIT_END(workerkind)		\
-	FUT_DO_PROBE2(_STARPU_FUT_WORKER_DEINIT_END, workerkind, _starpu_gettid());
+#define _STARPU_TRACE_WORKER_DEINIT_END(workerkind)		do {\
+	if (_starpu_fxt_started) \
+		FUT_DO_ALWAYS_PROBE2(_STARPU_FUT_WORKER_DEINIT_END, workerkind, _starpu_gettid()); \
+} while(0)
 
 #define _STARPU_TRACE_WORKER_SCHEDULING_START	\
-	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SCHEDULING_START, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_WORKER_VERBOSE, _STARPU_FUT_WORKER_SCHEDULING_START, _starpu_gettid());
 
 #define _STARPU_TRACE_WORKER_SCHEDULING_END	\
-	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SCHEDULING_END, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_WORKER_VERBOSE, _STARPU_FUT_WORKER_SCHEDULING_END, _starpu_gettid());
 
 #define _STARPU_TRACE_WORKER_SCHEDULING_PUSH	\
-	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SCHEDULING_PUSH, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_WORKER_VERBOSE, _STARPU_FUT_WORKER_SCHEDULING_PUSH, _starpu_gettid());
 
 #define _STARPU_TRACE_WORKER_SCHEDULING_POP	\
-	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SCHEDULING_POP, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_WORKER_VERBOSE, _STARPU_FUT_WORKER_SCHEDULING_POP, _starpu_gettid());
 
 #define _STARPU_TRACE_WORKER_SLEEP_START	\
-	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SLEEP_START, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_WORKER, _STARPU_FUT_WORKER_SLEEP_START, _starpu_gettid());
 
 #define _STARPU_TRACE_WORKER_SLEEP_END	\
-	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SLEEP_END, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_WORKER, _STARPU_FUT_WORKER_SLEEP_END, _starpu_gettid());
 
 #define _STARPU_TRACE_TASK_SUBMIT(job, iter, subiter)	\
-	FUT_DO_PROBE7(_STARPU_FUT_TASK_SUBMIT, (job)->job_id, iter, subiter, (job)->task->no_submitorder?0:_starpu_fxt_get_submit_order(), (job)->task->priority, (job)->task->type, _starpu_gettid());
+	FUT_FULL_PROBE7(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_TASK_SUBMIT, (job)->job_id, iter, subiter, (job)->task->no_submitorder?0:_starpu_fxt_get_submit_order(), (job)->task->priority, (job)->task->type, _starpu_gettid());
 
 #define _STARPU_TRACE_TASK_SUBMIT_START()	\
-	FUT_DO_PROBE1(_STARPU_FUT_TASK_SUBMIT_START, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_TASK_VERBOSE, _STARPU_FUT_TASK_SUBMIT_START, _starpu_gettid());
 
 #define _STARPU_TRACE_TASK_SUBMIT_END()	\
-	FUT_DO_PROBE1(_STARPU_FUT_TASK_SUBMIT_END, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_TASK_VERBOSE, _STARPU_FUT_TASK_SUBMIT_END, _starpu_gettid());
 
 #define _STARPU_TRACE_TASK_THROTTLE_START()	\
-	FUT_DO_PROBE1(_STARPU_FUT_TASK_THROTTLE_START, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_TASK_THROTTLE_START, _starpu_gettid());
 
 #define _STARPU_TRACE_TASK_THROTTLE_END()	\
-	FUT_DO_PROBE1(_STARPU_FUT_TASK_THROTTLE_END, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_TASK_THROTTLE_END, _starpu_gettid());
 
 #define _STARPU_TRACE_TASK_BUILD_START()	\
-	FUT_DO_PROBE1(_STARPU_FUT_TASK_BUILD_START, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_TASK_VERBOSE, _STARPU_FUT_TASK_BUILD_START, _starpu_gettid());
 
 #define _STARPU_TRACE_TASK_BUILD_END()	\
-	FUT_DO_PROBE1(_STARPU_FUT_TASK_BUILD_END, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_TASK_VERBOSE, _STARPU_FUT_TASK_BUILD_END, _starpu_gettid());
 
 #define _STARPU_TRACE_TASK_MPI_DECODE_START()	\
-	FUT_DO_PROBE1(_STARPU_FUT_TASK_MPI_DECODE_START, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_MPI_VERBOSE, _STARPU_FUT_TASK_MPI_DECODE_START, _starpu_gettid());
 
 #define _STARPU_TRACE_TASK_MPI_DECODE_END()	\
-	FUT_DO_PROBE1(_STARPU_FUT_TASK_MPI_DECODE_END, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_MPI_VERBOSE, _STARPU_FUT_TASK_MPI_DECODE_END, _starpu_gettid());
 
 #define _STARPU_TRACE_TASK_MPI_PRE_START()	\
-	FUT_DO_PROBE1(_STARPU_FUT_TASK_MPI_PRE_START, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_MPI_VERBOSE, _STARPU_FUT_TASK_MPI_PRE_START, _starpu_gettid());
 
 #define _STARPU_TRACE_TASK_MPI_PRE_END()	\
-	FUT_DO_PROBE1(_STARPU_FUT_TASK_MPI_PRE_END, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_MPI_VERBOSE, _STARPU_FUT_TASK_MPI_PRE_END, _starpu_gettid());
 
 #define _STARPU_TRACE_TASK_MPI_POST_START()	\
-	FUT_DO_PROBE1(_STARPU_FUT_TASK_MPI_POST_START, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_MPI_VERBOSE, _STARPU_FUT_TASK_MPI_POST_START, _starpu_gettid());
 
 #define _STARPU_TRACE_TASK_MPI_POST_END()	\
-	FUT_DO_PROBE1(_STARPU_FUT_TASK_MPI_POST_END, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_MPI_VERBOSE, _STARPU_FUT_TASK_MPI_POST_END, _starpu_gettid());
 
 #define _STARPU_TRACE_TASK_WAIT_START(job)	\
-	FUT_DO_PROBE2(_STARPU_FUT_TASK_WAIT_START, (job)->job_id, _starpu_gettid());
+	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_TASK_VERBOSE, _STARPU_FUT_TASK_WAIT_START, (job)->job_id, _starpu_gettid());
 
 #define _STARPU_TRACE_TASK_WAIT_END()	\
-	FUT_DO_PROBE1(_STARPU_FUT_TASK_WAIT_END, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_TASK_VERBOSE, _STARPU_FUT_TASK_WAIT_END, _starpu_gettid());
 
 #define _STARPU_TRACE_TASK_WAIT_FOR_ALL_START()	\
-	FUT_DO_PROBE1(_STARPU_FUT_TASK_WAIT_FOR_ALL_START, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_TASK_VERBOSE, _STARPU_FUT_TASK_WAIT_FOR_ALL_START, _starpu_gettid());
 
 #define _STARPU_TRACE_TASK_WAIT_FOR_ALL_END()	\
-	FUT_DO_PROBE1(_STARPU_FUT_TASK_WAIT_FOR_ALL_END, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_TASK_VERBOSE, _STARPU_FUT_TASK_WAIT_FOR_ALL_END, _starpu_gettid());
 
 #define _STARPU_TRACE_USER_DEFINED_START	\
-	FUT_DO_PROBE1(_STARPU_FUT_USER_DEFINED_START, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_USER, _STARPU_FUT_USER_DEFINED_START, _starpu_gettid());
 
 #define _STARPU_TRACE_USER_DEFINED_END		\
-	FUT_DO_PROBE1(_STARPU_FUT_USER_DEFINED_END, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_USER, _STARPU_FUT_USER_DEFINED_END, _starpu_gettid());
 
 #define _STARPU_TRACE_START_ALLOC(memnode, size, handle, is_prefetch)               \
-       FUT_DO_PROBE5(_STARPU_FUT_START_ALLOC, memnode, _starpu_gettid(), size, handle, is_prefetch);
+       FUT_FULL_PROBE5(_STARPU_FUT_KEYMASK_DSM, _STARPU_FUT_START_ALLOC, memnode, _starpu_gettid(), size, handle, is_prefetch);
 
 #define _STARPU_TRACE_END_ALLOC(memnode, handle, r)            \
-       FUT_DO_PROBE4(_STARPU_FUT_END_ALLOC, memnode, _starpu_gettid(), handle, r);
+       FUT_FULL_PROBE4(_STARPU_FUT_KEYMASK_DSM, _STARPU_FUT_END_ALLOC, memnode, _starpu_gettid(), handle, r);
 
 #define _STARPU_TRACE_START_ALLOC_REUSE(memnode, size, handle, is_prefetch)         \
-       FUT_DO_PROBE5(_STARPU_FUT_START_ALLOC_REUSE, memnode, _starpu_gettid(), size, handle, is_prefetch);
+       FUT_FULL_PROBE5(_STARPU_FUT_KEYMASK_DSM_VERBOSE, _STARPU_FUT_START_ALLOC_REUSE, memnode, _starpu_gettid(), size, handle, is_prefetch);
 
 #define _STARPU_TRACE_END_ALLOC_REUSE(memnode, handle, r)              \
-       FUT_DO_PROBE4(_STARPU_FUT_END_ALLOC_REUSE, memnode, _starpu_gettid(), handle, r);
+       FUT_FULL_PROBE4(_STARPU_FUT_KEYMASK_DSM_VERBOSE, _STARPU_FUT_END_ALLOC_REUSE, memnode, _starpu_gettid(), handle, r);
 
 #define _STARPU_TRACE_START_FREE(memnode, size, handle)                \
-       FUT_DO_PROBE4(_STARPU_FUT_START_FREE, memnode, _starpu_gettid(), size, handle);
+       FUT_FULL_PROBE4(_STARPU_FUT_KEYMASK_DSM_VERBOSE, _STARPU_FUT_START_FREE, memnode, _starpu_gettid(), size, handle);
 
 #define _STARPU_TRACE_END_FREE(memnode, handle)                \
-       FUT_DO_PROBE3(_STARPU_FUT_END_FREE, memnode, _starpu_gettid(), handle);
+       FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_DSM_VERBOSE, _STARPU_FUT_END_FREE, memnode, _starpu_gettid(), handle);
 
 #define _STARPU_TRACE_START_WRITEBACK(memnode, handle)         \
-       FUT_DO_PROBE3(_STARPU_FUT_START_WRITEBACK, memnode, _starpu_gettid(), handle);
+       FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_DSM_VERBOSE, _STARPU_FUT_START_WRITEBACK, memnode, _starpu_gettid(), handle);
 
 #define _STARPU_TRACE_END_WRITEBACK(memnode, handle)           \
-       FUT_DO_PROBE3(_STARPU_FUT_END_WRITEBACK, memnode, _starpu_gettid(), handle);
+       FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_DSM_VERBOSE, _STARPU_FUT_END_WRITEBACK, memnode, _starpu_gettid(), handle);
 
 #define _STARPU_TRACE_USED_MEM(memnode,used)		\
-	FUT_DO_PROBE3(_STARPU_FUT_USED_MEM, memnode, used, _starpu_gettid());
+	FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_DSM_VERBOSE, _STARPU_FUT_USED_MEM, memnode, used, _starpu_gettid());
 
 #define _STARPU_TRACE_START_MEMRECLAIM(memnode,is_prefetch)		\
-	FUT_DO_PROBE3(_STARPU_FUT_START_MEMRECLAIM, memnode, is_prefetch, _starpu_gettid());
+	FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_DSM_VERBOSE, _STARPU_FUT_START_MEMRECLAIM, memnode, is_prefetch, _starpu_gettid());
 
 #define _STARPU_TRACE_END_MEMRECLAIM(memnode, is_prefetch)		\
-	FUT_DO_PROBE3(_STARPU_FUT_END_MEMRECLAIM, memnode, is_prefetch, _starpu_gettid());
+	FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_DSM_VERBOSE, _STARPU_FUT_END_MEMRECLAIM, memnode, is_prefetch, _starpu_gettid());
 
 #define _STARPU_TRACE_START_WRITEBACK_ASYNC(memnode)		\
-	FUT_DO_PROBE2(_STARPU_FUT_START_WRITEBACK_ASYNC, memnode, _starpu_gettid());
+	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_DSM_VERBOSE, _STARPU_FUT_START_WRITEBACK_ASYNC, memnode, _starpu_gettid());
 
 #define _STARPU_TRACE_END_WRITEBACK_ASYNC(memnode)		\
-	FUT_DO_PROBE2(_STARPU_FUT_END_WRITEBACK_ASYNC, memnode, _starpu_gettid());
+	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_DSM_VERBOSE, _STARPU_FUT_END_WRITEBACK_ASYNC, memnode, _starpu_gettid());
+
+#define _STARPU_TRACE_PAPI_TASK_EVENT(event_id, task, value)	\
+	FUT_DO_PROBE3(_STARPU_FUT_PAPI_TASK_EVENT_VALUE, event_id, _starpu_get_job_associated_to_task(task)->job_id, value)
 
 /* We skip these events because they are called so often that they cause FxT to
  * fail and make the overall trace unreadable anyway. */
 #define _STARPU_TRACE_START_PROGRESS(memnode)		\
-	FUT_DO_PROBE2(_STARPU_FUT_START_PROGRESS_ON_TID, memnode, _starpu_gettid());
+	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_WORKER_VERBOSE, _STARPU_FUT_START_PROGRESS_ON_TID, memnode, _starpu_gettid());
 
 #define _STARPU_TRACE_END_PROGRESS(memnode)		\
-	FUT_DO_PROBE2(_STARPU_FUT_END_PROGRESS_ON_TID, memnode, _starpu_gettid());
+	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_WORKER_VERBOSE, _STARPU_FUT_END_PROGRESS_ON_TID, memnode, _starpu_gettid());
 
 #define _STARPU_TRACE_USER_EVENT(code)			\
-	FUT_DO_PROBE2(_STARPU_FUT_USER_EVENT, code, _starpu_gettid());
+	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_USER, _STARPU_FUT_USER_EVENT, code, _starpu_gettid());
+
+#define _STARPU_TRACE_META(S)			\
+	FUT_FULL_PROBESTR(_STARPU_FUT_KEYMASK_META, _STARPU_FUT_EVENT,S)
 
 #define _STARPU_TRACE_SET_PROFILING(status)		\
-	FUT_DO_PROBE2(_STARPU_FUT_SET_PROFILING, status, _starpu_gettid());
+	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_META, _STARPU_FUT_SET_PROFILING, status, _starpu_gettid());
 
 #define _STARPU_TRACE_TASK_WAIT_FOR_ALL			\
-	FUT_DO_PROBE0(_STARPU_FUT_TASK_WAIT_FOR_ALL)
+	FUT_FULL_PROBE0(_STARPU_FUT_KEYMASK_TASK, _STARPU_FUT_TASK_WAIT_FOR_ALL)
+
+#define _STARPU_TRACE_EVENT_ALWAYS(S)			do {\
+	if (_starpu_fxt_started) \
+		FUT_DO_ALWAYS_PROBESTR(_STARPU_FUT_EVENT,S) \
+} while(0)
 
 #define _STARPU_TRACE_EVENT(S)			\
-	FUT_DO_PROBESTR(_STARPU_FUT_EVENT,S)
+	FUT_FULL_PROBESTR(_STARPU_FUT_KEYMASK_EVENT, _STARPU_FUT_EVENT,S)
+
+#define _STARPU_TRACE_EVENT_VERBOSE(S)			\
+	FUT_FULL_PROBESTR(_STARPU_FUT_KEYMASK_EVENT_VERBOSE, _STARPU_FUT_EVENT,S)
+
 
 #define _STARPU_TRACE_THREAD_EVENT(S)			\
-	_STARPU_FUT_DO_PROBE1STR(_STARPU_FUT_THREAD_EVENT, _starpu_gettid(), S)
+	_STARPU_FUT_FULL_PROBE1STR(_STARPU_FUT_KEYMASK_WORKER, _STARPU_FUT_THREAD_EVENT, _starpu_gettid(), S)
 
 #define _STARPU_TRACE_HYPERVISOR_BEGIN()  \
-	FUT_DO_PROBE1(_STARPU_FUT_HYPERVISOR_BEGIN, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_HYP, _STARPU_FUT_HYPERVISOR_BEGIN, _starpu_gettid());
 
 #define _STARPU_TRACE_HYPERVISOR_END() \
-	FUT_DO_PROBE1(_STARPU_FUT_HYPERVISOR_END, _starpu_gettid());
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_HYP, _STARPU_FUT_HYPERVISOR_END, _starpu_gettid());
 
 #ifdef STARPU_FXT_LOCK_TRACES
 
 #define _STARPU_TRACE_LOCKING_MUTEX()	do { \
 	const char *file; \
 	file = strrchr(__FILE__,'/') + 1; \
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_LOCKING_MUTEX,__LINE__,_starpu_gettid(),file); \
+	_STARPU_FUT_FULL_PROBE2STR(_STARPU_FUT_KEYMASK_LOCK, _STARPU_FUT_LOCKING_MUTEX,__LINE__,_starpu_gettid(),file); \
 } while (0)
 
 #define _STARPU_TRACE_MUTEX_LOCKED()	do { \
 	const char *file; \
 	file = strrchr(__FILE__,'/') + 1; \
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_MUTEX_LOCKED,__LINE__,_starpu_gettid(),file); \
+	_STARPU_FUT_FULL_PROBE2STR(_STARPU_FUT_KEYMASK_LOCK_VERBOSE, _STARPU_FUT_MUTEX_LOCKED,__LINE__,_starpu_gettid(),file); \
 } while(0)
 
 #define _STARPU_TRACE_UNLOCKING_MUTEX()	do { \
 	const char *file; \
 	file = strrchr(__FILE__,'/') + 1; \
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_UNLOCKING_MUTEX,__LINE__,_starpu_gettid(),file); \
+	_STARPU_FUT_FULL_PROBE2STR(_STARPU_FUT_KEYMASK_LOCK, _STARPU_FUT_UNLOCKING_MUTEX,__LINE__,_starpu_gettid(),file); \
 } while(0)
 
 #define _STARPU_TRACE_MUTEX_UNLOCKED()	do {\
 	const char *file; \
 	file = strrchr(__FILE__,'/') + 1; \
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_MUTEX_UNLOCKED,__LINE__,_starpu_gettid(),file); \
+	_STARPU_FUT_FULL_PROBE2STR(_STARPU_FUT_KEYMASK_LOCK_VERBOSE, _STARPU_FUT_MUTEX_UNLOCKED,__LINE__,_starpu_gettid(),file); \
 } while(0)
 
 #define _STARPU_TRACE_TRYLOCK_MUTEX()	do { \
 	const char *file; \
 	file = strrchr(__FILE__,'/') + 1; \
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_TRYLOCK_MUTEX,__LINE__,_starpu_gettid(),file); \
+	_STARPU_FUT_FULL_PROBE2STR(_STARPU_FUT_KEYMASK_LOCK, _STARPU_FUT_TRYLOCK_MUTEX,__LINE__,_starpu_gettid(),file); \
 } while(0)
 
 #define _STARPU_TRACE_RDLOCKING_RWLOCK()	do { \
 	const char *file; \
 	file = strrchr(__FILE__,'/') + 1; \
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_RDLOCKING_RWLOCK,__LINE__,_starpu_gettid(),file); \
+	_STARPU_FUT_FULL_PROBE2STR(_STARPU_FUT_KEYMASK_LOCK, _STARPU_FUT_RDLOCKING_RWLOCK,__LINE__,_starpu_gettid(),file); \
 } while(0)
 
 #define _STARPU_TRACE_RWLOCK_RDLOCKED()	do { \
 	const char *file; \
 	file = strrchr(__FILE__,'/') + 1; \
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_RWLOCK_RDLOCKED,__LINE__,_starpu_gettid(),file); \
+	_STARPU_FUT_FULL_PROBE2STR(_STARPU_FUT_KEYMASK_LOCK_VERBOSE, _STARPU_FUT_RWLOCK_RDLOCKED,__LINE__,_starpu_gettid(),file); \
 } while(0)
 
 #define _STARPU_TRACE_WRLOCKING_RWLOCK()	do { \
 	const char *file; \
 	file = strrchr(__FILE__,'/') + 1; \
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_WRLOCKING_RWLOCK,__LINE__,_starpu_gettid(),file); \
+	_STARPU_FUT_FULL_PROBE2STR(_STARPU_FUT_KEYMASK_LOCK, _STARPU_FUT_WRLOCKING_RWLOCK,__LINE__,_starpu_gettid(),file); \
 } while(0)
 
 #define _STARPU_TRACE_RWLOCK_WRLOCKED()	do { \
 	const char *file; \
 	file = strrchr(__FILE__,'/') + 1; \
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_RWLOCK_WRLOCKED,__LINE__,_starpu_gettid(),file); \
+	_STARPU_FUT_FULL_PROBE2STR(_STARPU_FUT_KEYMASK_LOCK_VERBOSE, _STARPU_FUT_RWLOCK_WRLOCKED,__LINE__,_starpu_gettid(),file); \
 } while(0)
 
 #define _STARPU_TRACE_UNLOCKING_RWLOCK()	do { \
 	const char *file; \
 	file = strrchr(__FILE__,'/') + 1; \
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_UNLOCKING_RWLOCK,__LINE__,_starpu_gettid(),file); \
+	_STARPU_FUT_FULL_PROBE2STR(_STARPU_FUT_KEYMASK_LOCK, _STARPU_FUT_UNLOCKING_RWLOCK,__LINE__,_starpu_gettid(),file); \
 } while(0)
 
 #define _STARPU_TRACE_RWLOCK_UNLOCKED()	do { \
 	const char *file; \
 	file = strrchr(__FILE__,'/') + 1; \
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_RWLOCK_UNLOCKED,__LINE__,_starpu_gettid(),file); \
+	_STARPU_FUT_FULL_PROBE2STR(_STARPU_FUT_KEYMASK_LOCK_VERBOSE, _STARPU_FUT_RWLOCK_UNLOCKED,__LINE__,_starpu_gettid(),file); \
 } while(0)
 
 #define STARPU_TRACE_SPINLOCK_CONDITITION (starpu_worker_get_type(starpu_worker_get_id()) == STARPU_CUDA_WORKER)
@@ -990,7 +1101,7 @@ do {										\
 	if (STARPU_TRACE_SPINLOCK_CONDITITION) { \
 		const char *xfile; \
 		xfile = strrchr(file,'/') + 1; \
-		_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_LOCKING_SPINLOCK,line,_starpu_gettid(),xfile); \
+		_STARPU_FUT_FULL_PROBE2STR(_STARPU_FUT_KEYMASK_LOCK_VERBOSE, _STARPU_FUT_LOCKING_SPINLOCK,line,_starpu_gettid(),xfile); \
 	} \
 } while(0)
 
@@ -998,7 +1109,7 @@ do {										\
 	if (STARPU_TRACE_SPINLOCK_CONDITITION) { \
 		const char *xfile; \
 		xfile = strrchr(file,'/') + 1; \
-		_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_SPINLOCK_LOCKED,line,_starpu_gettid(),xfile); \
+		_STARPU_FUT_FULL_PROBE2STR(_STARPU_FUT_KEYMASK_LOCK_VERBOSE, _STARPU_FUT_SPINLOCK_LOCKED,line,_starpu_gettid(),xfile); \
 	} \
 } while(0)
 
@@ -1006,7 +1117,7 @@ do {										\
 	if (STARPU_TRACE_SPINLOCK_CONDITITION) { \
 		const char *xfile; \
 		xfile = strrchr(file,'/') + 1; \
-		_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_UNLOCKING_SPINLOCK,line,_starpu_gettid(),xfile); \
+		_STARPU_FUT_FULL_PROBE2STR(_STARPU_FUT_KEYMASK_LOCK_VERBOSE, _STARPU_FUT_UNLOCKING_SPINLOCK,line,_starpu_gettid(),xfile); \
 	} \
 } while(0)
 
@@ -1014,7 +1125,7 @@ do {										\
 	if (STARPU_TRACE_SPINLOCK_CONDITITION) { \
 		const char *xfile; \
 		xfile = strrchr(file,'/') + 1; \
-		_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_SPINLOCK_UNLOCKED,line,_starpu_gettid(),xfile); \
+		_STARPU_FUT_FULL_PROBE2STR(_STARPU_FUT_KEYMASK_LOCK_VERBOSE, _STARPU_FUT_SPINLOCK_UNLOCKED,line,_starpu_gettid(),xfile); \
 	} \
 } while(0)
 
@@ -1022,32 +1133,32 @@ do {										\
 	if (STARPU_TRACE_SPINLOCK_CONDITITION) { \
 		const char *xfile; \
 		xfile = strrchr(file,'/') + 1; \
-		_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_TRYLOCK_SPINLOCK,line,_starpu_gettid(),xfile); \
+		_STARPU_FUT_FULL_PROBE2STR(_STARPU_FUT_KEYMASK_LOCK_VERBOSE, _STARPU_FUT_TRYLOCK_SPINLOCK,line,_starpu_gettid(),xfile); \
 	} \
 } while(0)
 
 #define _STARPU_TRACE_COND_WAIT_BEGIN()	do { \
 	const char *file; \
 	file = strrchr(__FILE__,'/') + 1; \
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_COND_WAIT_BEGIN,__LINE__,_starpu_gettid(),file); \
+	_STARPU_FUT_FULL_PROBE2STR(_STARPU_FUT_KEYMASK_LOCK, _STARPU_FUT_COND_WAIT_BEGIN,__LINE__,_starpu_gettid(),file); \
 } while(0)
 
 #define _STARPU_TRACE_COND_WAIT_END()	do { \
 	const char *file; \
 	file = strrchr(__FILE__,'/') + 1; \
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_COND_WAIT_END,__LINE__,_starpu_gettid(),file); \
+	_STARPU_FUT_FULL_PROBE2STR(_STARPU_FUT_KEYMASK_LOCK, _STARPU_FUT_COND_WAIT_END,__LINE__,_starpu_gettid(),file); \
 } while(0)
 
 #define _STARPU_TRACE_BARRIER_WAIT_BEGIN()	do { \
 	const char *file; \
 	file = strrchr(__FILE__,'/') + 1; \
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_BARRIER_WAIT_BEGIN,__LINE__,_starpu_gettid(),file); \
+	_STARPU_FUT_FULL_PROBE2STR(_STARPU_FUT_KEYMASK_LOCK, _STARPU_FUT_BARRIER_WAIT_BEGIN,__LINE__,_starpu_gettid(),file); \
 } while(0)
 
 #define _STARPU_TRACE_BARRIER_WAIT_END()	do { \
 	const char *file; \
 	file = strrchr(__FILE__,'/') + 1; \
-	_STARPU_FUT_DO_PROBE2STR(_STARPU_FUT_BARRIER_WAIT_END,__LINE__,_starpu_gettid(),file); \
+	_STARPU_FUT_FULL_PROBE2STR(_STARPU_FUT_KEYMASK_LOCK, _STARPU_FUT_BARRIER_WAIT_END,__LINE__,_starpu_gettid(),file); \
 } while(0)
 
 #else // !STARPU_FXT_LOCK_TRACES
@@ -1076,34 +1187,34 @@ do {										\
 #endif // STARPU_FXT_LOCK_TRACES
 
 #define _STARPU_TRACE_MEMORY_FULL(size)	\
-	FUT_DO_PROBE2(_STARPU_FUT_MEMORY_FULL,size,_starpu_gettid());
+	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_DSM, _STARPU_FUT_MEMORY_FULL,size,_starpu_gettid());
 
 #define _STARPU_TRACE_DATA_LOAD(workerid,size)	\
-	FUT_DO_PROBE2(_STARPU_FUT_DATA_LOAD, workerid, size);
+	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_DSM, _STARPU_FUT_DATA_LOAD, workerid, size);
 
 #define _STARPU_TRACE_START_UNPARTITION(handle, memnode)		\
-	FUT_DO_PROBE3(_STARPU_FUT_START_UNPARTITION_ON_TID, memnode, _starpu_gettid(), handle);
+	FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_DSM, _STARPU_FUT_START_UNPARTITION_ON_TID, memnode, _starpu_gettid(), handle);
 
 #define _STARPU_TRACE_END_UNPARTITION(handle, memnode)		\
-	FUT_DO_PROBE3(_STARPU_FUT_END_UNPARTITION_ON_TID, memnode, _starpu_gettid(), handle);
+	FUT_FULL_PROBE3(_STARPU_FUT_KEYMASK_DSM, _STARPU_FUT_END_UNPARTITION_ON_TID, memnode, _starpu_gettid(), handle);
 
 #define _STARPU_TRACE_SCHED_COMPONENT_PUSH_PRIO(workerid, ntasks, exp_len)		\
-	FUT_DO_PROBE4(_STARPU_FUT_SCHED_COMPONENT_PUSH_PRIO, _starpu_gettid(), workerid, ntasks, exp_len);
+	FUT_FULL_PROBE4(_STARPU_FUT_KEYMASK_SCHED, _STARPU_FUT_SCHED_COMPONENT_PUSH_PRIO, _starpu_gettid(), workerid, ntasks, exp_len);
 
 #define _STARPU_TRACE_SCHED_COMPONENT_POP_PRIO(workerid, ntasks, exp_len)		\
-	FUT_DO_PROBE4(_STARPU_FUT_SCHED_COMPONENT_POP_PRIO, _starpu_gettid(), workerid, ntasks, exp_len);
+	FUT_FULL_PROBE4(_STARPU_FUT_KEYMASK_SCHED, _STARPU_FUT_SCHED_COMPONENT_POP_PRIO, _starpu_gettid(), workerid, ntasks, exp_len);
 
 #define _STARPU_TRACE_SCHED_COMPONENT_NEW(component)		\
-	_STARPU_FUT_DO_PROBE1STR(_STARPU_FUT_SCHED_COMPONENT_NEW, component, (component)->name);
+	_STARPU_FUT_FULL_PROBE1STR(_STARPU_FUT_KEYMASK_SCHED, _STARPU_FUT_SCHED_COMPONENT_NEW, component, (component)->name);
 
 #define _STARPU_TRACE_SCHED_COMPONENT_CONNECT(parent, child)		\
-	FUT_DO_PROBE2(_STARPU_FUT_SCHED_COMPONENT_CONNECT, parent, child);
+	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_SCHED, _STARPU_FUT_SCHED_COMPONENT_CONNECT, parent, child);
 
 #define _STARPU_TRACE_SCHED_COMPONENT_PUSH(from, to, task)		\
-	FUT_DO_PROBE5(_STARPU_FUT_SCHED_COMPONENT_PUSH, _starpu_gettid(), from, to, task, (task)->priority);
+	FUT_FULL_PROBE5(_STARPU_FUT_KEYMASK_SCHED, _STARPU_FUT_SCHED_COMPONENT_PUSH, _starpu_gettid(), from, to, task, (task)->priority);
 
 #define _STARPU_TRACE_SCHED_COMPONENT_PULL(from, to, task)		\
-	FUT_DO_PROBE5(_STARPU_FUT_SCHED_COMPONENT_PULL, _starpu_gettid(), from, to, task, (task)->priority);
+	FUT_FULL_PROBE5(_STARPU_FUT_KEYMASK_SCHED, _STARPU_FUT_SCHED_COMPONENT_PULL, _starpu_gettid(), from, to, task, (task)->priority);
 
 #define _STARPU_TRACE_HANDLE_DATA_REGISTER(handle)	do {	\
 	const size_t __data_size = handle->ops->get_size(handle); \
@@ -1114,30 +1225,31 @@ do {										\
 		handle->ops->describe(__interface, __buf, sizeof(__buf)); \
 	else \
 		__buf[0] = 0; \
-	FUT_DO_PROBE4STR(_STARPU_FUT_HANDLE_DATA_REGISTER, handle, __data_size, __max_data_size, handle->home_node, __buf); \
+	_STARPU_FUT_FULL_PROBE4STR(_STARPU_FUT_KEYMASK_DATA, _STARPU_FUT_HANDLE_DATA_REGISTER, handle, __data_size, __max_data_size, handle->home_node, __buf); \
 } while (0)
 
 #define _STARPU_TRACE_HANDLE_DATA_UNREGISTER(handle)	\
-	FUT_DO_PROBE1(_STARPU_FUT_HANDLE_DATA_UNREGISTER, handle)
+	FUT_FULL_PROBE1(_STARPU_FUT_KEYMASK_DATA, _STARPU_FUT_HANDLE_DATA_UNREGISTER, handle)
 
 //Coherency Data Traces
 #define _STARPU_TRACE_DATA_STATE_INVALID(handle, node)      \
-       FUT_DO_PROBE2(_STARPU_FUT_DATA_STATE_INVALID, handle, node)
+       FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_DSM_VERBOSE, _STARPU_FUT_DATA_STATE_INVALID, handle, node)
 
 #define _STARPU_TRACE_DATA_STATE_OWNER(handle, node)           \
-       FUT_DO_PROBE2(_STARPU_FUT_DATA_STATE_OWNER, handle, node)
+       FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_DSM_VERBOSE, _STARPU_FUT_DATA_STATE_OWNER, handle, node)
 
 #define _STARPU_TRACE_DATA_STATE_SHARED(handle, node)          \
-       FUT_DO_PROBE2(_STARPU_FUT_DATA_STATE_SHARED, handle, node)
+       FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_DSM_VERBOSE, _STARPU_FUT_DATA_STATE_SHARED, handle, node)
 
 #define _STARPU_TRACE_DATA_REQUEST_CREATED(handle, orig, dest, prio, is_pre)          \
-       FUT_DO_PROBE5(_STARPU_FUT_DATA_REQUEST_CREATED, orig, dest, prio, handle, is_pre)
+       FUT_FULL_PROBE5(_STARPU_FUT_KEYMASK_DSM_VERBOSE, _STARPU_FUT_DATA_REQUEST_CREATED, orig, dest, prio, handle, is_pre)
 
 
 #else // !STARPU_USE_FXT
 
 /* Dummy macros in case FxT is disabled */
 #define _STARPU_TRACE_NEW_MEM_NODE(nodeid)		do {(void)(nodeid);} while(0)
+#define _STARPU_TRACE_REGISTER_THREAD(cpuid)		do {(void)(cpuid);} while(0)
 #define _STARPU_TRACE_WORKER_INIT_START(a,b,c,d,e,f)	do {(void)(a); (void)(b); (void)(c); (void)(d); (void)(e); (void)(f);} while(0)
 #define _STARPU_TRACE_WORKER_INIT_END(workerid)		do {(void)(workerid);} while(0)
 #define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, perf_arch, workerid) 	do {(void)(job); (void)(nimpl); (void)(perf_arch); (void)(workerid);} while(0)
@@ -1216,7 +1328,9 @@ do {										\
 #define _STARPU_TRACE_USER_EVENT(code)			do {(void)(code);} while(0)
 #define _STARPU_TRACE_SET_PROFILING(status)		do {(void)(status);} while(0)
 #define _STARPU_TRACE_TASK_WAIT_FOR_ALL()		do {} while(0)
+#define _STARPU_TRACE_EVENT_ALWAYS(S)				do {(void)(S);} while(0)
 #define _STARPU_TRACE_EVENT(S)				do {(void)(S);} while(0)
+#define _STARPU_TRACE_EVENT_VERBOSE(S)				do {(void)(S);} while(0)
 #define _STARPU_TRACE_THREAD_EVENT(S)			do {(void)(S);} while(0)
 #define _STARPU_TRACE_LOCKING_MUTEX()			do {} while(0)
 #define _STARPU_TRACE_MUTEX_LOCKED()			do {} while(0)
@@ -1258,6 +1372,7 @@ do {										\
 #define _STARPU_TRACE_DATA_STATE_OWNER(handle, node)	do {(void)(handle); (void)(node);} while(0)
 #define _STARPU_TRACE_DATA_STATE_SHARED(handle, node)	do {(void)(handle); (void)(node);} while(0)
 #define _STARPU_TRACE_DATA_REQUEST_CREATED(handle, orig, dest, prio, is_pre) do {(void)(handle); (void)(orig); (void)(dest); (void)(prio); (void)(is_pre);} while(0)
+#define _STARPU_TRACE_PAPI_TASK_EVENT(event_id, task, value) do {(void)(event_id); (void)(task); (void)(value);} while(0)
 
 #endif // STARPU_USE_FXT
 

+ 29 - 1
src/common/knobs.c

@@ -65,8 +65,19 @@ void _starpu_perf_counter_sample_exit(struct starpu_perf_counter_sample *sample)
 
 /* - */
 
-void _starpu_perf_counter_init(void)
+void _starpu_perf_counter_init(struct _starpu_machine_config *pconfig)
 {
+	if (pconfig->conf.start_perf_counter_collection)
+	{
+		/* start perf counter collection immediately */
+		pconfig->perf_counter_pause_depth = 0;
+	}
+	else
+	{
+		/* defer perf counter collection until call to
+		 * starpu_perf_counter_collection_start() */
+		pconfig->perf_counter_pause_depth = 1;
+	}
 	STARPU_ASSERT(!_starpu_machine_is_running());
 	_starpu_perf_counter_sample_init(&global_sample, starpu_perf_counter_scope_global);
 
@@ -84,6 +95,20 @@ void _starpu_perf_counter_exit(void)
 
 /* - */
 
+void starpu_perf_counter_collection_start()
+{
+	STARPU_HG_DISABLE_CHECKING(_starpu_config.perf_counter_pause_depth);
+	(void)STARPU_ATOMIC_ADD(&_starpu_config.perf_counter_pause_depth, -1);
+}
+
+void starpu_perf_counter_collection_stop()
+{
+	STARPU_HG_DISABLE_CHECKING(_starpu_config.perf_counter_pause_depth);
+	(void)STARPU_ATOMIC_ADD(&_starpu_config.perf_counter_pause_depth, +1);
+}
+
+/* - */
+
 int starpu_perf_counter_scope_name_to_id(const char * const name)
 {
 	if (strcmp(name, "global") == 0)
@@ -462,6 +487,9 @@ void _starpu_perf_counter_register_updater(enum starpu_perf_counter_scope scope,
 
 static void update_sample(struct starpu_perf_counter_sample *sample, void *context)
 {
+	if (sample->listener == NULL)
+		return;
+
 	_starpu_spin_lock(&sample->lock);
 	struct perf_counter_array *counters = _get_counters(sample->scope);
 

+ 1 - 1
src/common/knobs.h

@@ -202,7 +202,7 @@ static inline int _starpu_perf_counter_id_build(const enum starpu_perf_counter_s
 
 void _starpu_perf_counter_sample_init(struct starpu_perf_counter_sample *sample, enum starpu_perf_counter_scope scope);
 void _starpu_perf_counter_sample_exit(struct starpu_perf_counter_sample *sample);
-void _starpu_perf_counter_init(void);
+void _starpu_perf_counter_init(struct _starpu_machine_config *pconfig);
 void _starpu_perf_counter_exit(void);
 
 int _starpu_perf_counter_register(enum starpu_perf_counter_scope scope, const char *name, enum starpu_perf_counter_type type, const char *help);

+ 21 - 1
src/common/thread.c

@@ -85,9 +85,21 @@ int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_
 #else
 		host = MSG_get_host_by_name("MAIN");
 #endif
+
 	void *tsd;
 	_STARPU_CALLOC(tsd, MAX_TSD+1, sizeof(void*));
+
+#ifdef HAVE_SG_ACTOR_INIT
+	*thread= sg_actor_init(name, host);
+	sg_actor_data_set(*thread, tsd);
+	sg_actor_start(*thread, _starpu_simgrid_thread_start, 2, _args);
+#else
 	*thread = MSG_process_create_with_arguments(name, _starpu_simgrid_thread_start, tsd, host, 2, _args);
+#ifdef HAVE_SG_ACTOR_DATA
+	sg_actor_data_set(*thread, tsd);
+#endif
+#endif
+
 #if SIMGRID_VERSION >= 31500 && SIMGRID_VERSION != 31559
 #  ifdef HAVE_SG_ACTOR_REF
 	sg_actor_ref(*thread);
@@ -300,6 +312,9 @@ extern void *smpi_process_get_user_data();
 int starpu_pthread_setspecific(starpu_pthread_key_t key, const void *pointer)
 {
 	void **array;
+#ifdef HAVE_SG_ACTOR_DATA
+	array = sg_actor_data(sg_actor_self());
+#else
 #if defined(HAVE_SMPI_PROCESS_SET_USER_DATA) || defined(smpi_process_get_user_data)
 #if defined(HAVE_MSG_PROCESS_SELF_NAME) || defined(MSG_process_self_name)
 	const char *process_name = MSG_process_self_name();
@@ -316,6 +331,7 @@ int starpu_pthread_setspecific(starpu_pthread_key_t key, const void *pointer)
 	else
 #endif
 		array = MSG_process_get_data(MSG_process_self());
+#endif
 	array[key] = (void*) pointer;
 	return 0;
 }
@@ -323,6 +339,9 @@ int starpu_pthread_setspecific(starpu_pthread_key_t key, const void *pointer)
 void* starpu_pthread_getspecific(starpu_pthread_key_t key)
 {
 	void **array;
+#ifdef HAVE_SG_ACTOR_DATA
+	array = sg_actor_data(sg_actor_self());
+#else
 #if defined(HAVE_SMPI_PROCESS_SET_USER_DATA) || defined(smpi_process_get_user_data)
 #if defined(HAVE_MSG_PROCESS_SELF_NAME) || defined(MSG_process_self_name)
 	const char *process_name = MSG_process_self_name();
@@ -339,6 +358,7 @@ void* starpu_pthread_getspecific(starpu_pthread_key_t key)
 	else
 #endif
 		array = MSG_process_get_data(MSG_process_self());
+#endif
 	if (!array)
 		return NULL;
 	return array[key];
@@ -970,7 +990,7 @@ int _starpu_pthread_spin_do_lock(starpu_pthread_spinlock_t *lock)
 	while (1)
 	{
 		/* Tell releaser to wake us */
-		unsigned prev = starpu_xchg(&lock->taken, 2);
+		unsigned prev = STARPU_VAL_EXCHANGE(&lock->taken, 2);
 		if (prev == 0)
 			/* Ah, it just got released and we actually acquired
 			 * it!

+ 4 - 1
src/common/utils.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2017,2019                           Université de Bordeaux
+ * Copyright (C) 2010-2017,2019-2020                      Université de Bordeaux
  * Copyright (C) 2012,2016,2017                           Inria
  * Copyright (C) 2010-2017, 2019                          CNRS
  *
@@ -81,6 +81,9 @@ int _starpu_mkpath(const char *s, mode_t mode)
 	char *q, *r = NULL, *path = NULL, *up = NULL;
 	int rv = -1;
 
+	while (s[0] == '/' && s[1] == '/')
+		s++;
+
 	if (strcmp(s, ".") == 0 || strcmp(s, "/") == 0
 #if defined(_WIN32)
 		/* C:/ or C:\ */

+ 1 - 0
src/core/dependencies/implicit_data_deps.c

@@ -327,6 +327,7 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 					l->prev = NULL;
 					handle->last_submitted_accessors.next = &handle->last_submitted_accessors;
 					handle->last_submitted_accessors.prev = &handle->last_submitted_accessors;
+					handle->last_submitted_ghost_sync_id_is_valid = 0;
 				}
 				else if (ghost_accessors_id)
 				{

+ 3 - 0
src/core/disk.c

@@ -148,6 +148,9 @@ int starpu_disk_register(struct starpu_disk_ops *func, void *parameter, starpu_s
 		return -ENOENT;
 	if (size >= 0)
 		_starpu_memory_manager_set_global_memory_size(disk_memnode, size);
+
+	_starpu_mem_chunk_disk_register(disk_memnode);
+
 	return disk_memnode;
 }
 

+ 13 - 9
src/core/perfmodel/perfmodel_bus.c

@@ -227,7 +227,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 	/* Fill them */
 	memset(h_buffer, 0, size);
 	cudaMemset(d_buffer, 0, size);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
@@ -243,7 +243,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 	for (iter = 0; iter < NITER; iter++)
 	{
 		cudaMemcpy(d_buffer, h_buffer, size, cudaMemcpyHostToDevice);
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 	}
 	end = starpu_timing_now();
 	timing = end - start;
@@ -255,7 +255,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 	for (iter = 0; iter < NITER; iter++)
 	{
 		cudaMemcpy(h_buffer, d_buffer, size, cudaMemcpyDeviceToHost);
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 	}
 	end = starpu_timing_now();
 	timing = end - start;
@@ -267,7 +267,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 	for (iter = 0; iter < NITER; iter++)
 	{
 		cudaMemcpy(d_buffer, h_buffer, 1, cudaMemcpyHostToDevice);
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 	}
 	end = starpu_timing_now();
 	timing = end - start;
@@ -279,7 +279,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 	for (iter = 0; iter < NITER; iter++)
 	{
 		cudaMemcpy(h_buffer, d_buffer, 1, cudaMemcpyDeviceToHost);
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 	}
 	end = starpu_timing_now();
 	timing = end - start;
@@ -346,7 +346,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	cures = cudaMalloc((void **)&s_buffer, size);
 	STARPU_ASSERT(cures == cudaSuccess);
 	cudaMemset(s_buffer, 0, size);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 
 	/* Initialize CUDA context on the destination */
 	/* We do not need to enable OpenGL interoperability at this point,
@@ -372,7 +372,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	cures = cudaMalloc((void **)&d_buffer, size);
 	STARPU_ASSERT(cures == cudaSuccess);
 	cudaMemset(d_buffer, 0, size);
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 
 	unsigned iter;
 	double timing;
@@ -384,7 +384,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	for (iter = 0; iter < NITER; iter++)
 	{
 		cudaMemcpyPeer(d_buffer, dst, s_buffer, src, size);
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 	}
 	end = starpu_timing_now();
 	timing = end - start;
@@ -396,7 +396,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	for (iter = 0; iter < NITER; iter++)
 	{
 		cudaMemcpyPeer(d_buffer, dst, s_buffer, src, 1);
-		cudaThreadSynchronize();
+		cudaDeviceSynchronize();
 	}
 	end = starpu_timing_now();
 	timing = end - start;
@@ -1505,6 +1505,8 @@ static void load_bus_latency_file(void)
 	{
 		/* File does not exist yet or is bogus */
 		generate_bus_latency_file();
+		res = load_bus_latency_file_content();
+		STARPU_ASSERT(res);
 	}
 
 }
@@ -1944,6 +1946,8 @@ static void load_bus_bandwidth_file(void)
 	{
 		/* File does not exist yet or is bogus */
 		generate_bus_bandwidth_file();
+		res = load_bus_bandwidth_file_content();
+		STARPU_ASSERT(res);
 	}
 }
 

+ 3 - 2
src/core/perfmodel/regression.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2008-2011,2014,2015                      Université de Bordeaux
+ * Copyright (C) 2008-2011,2014,2015,2019                 Université de Bordeaux
  * Copyright (C) 2011                                     Inria
  * Copyright (C) 2010-2012,2015-2017                      CNRS
  *
@@ -17,6 +17,7 @@
  */
 
 #include <core/perfmodel/regression.h>
+#include <float.h>
 
 #define MAXREGITER	1000
 #define EPS 1.0e-10
@@ -129,7 +130,7 @@ static unsigned find_list_size(struct starpu_perfmodel_history_list *list_histor
 
 static double find_list_min(double *y, unsigned n)
 {
-	double min = 1.0e30;
+	double min = DBL_MAX;
 
 	unsigned i;
 	for (i = 0; i < n; i++)

+ 9 - 6
src/core/sched_ctx.c

@@ -1137,8 +1137,8 @@ static void fetch_tasks_from_empty_ctx_list(struct _starpu_sched_ctx *sched_ctx)
 
 		/* if no workers are able to execute the task, it will be put
 		 * in the empty_ctx_tasks list forever again */
-		unsigned nworkers = _starpu_nworkers_able_to_execute_task(old_task, sched_ctx);
-		STARPU_ASSERT(nworkers > 0);
+		unsigned able = _starpu_workers_able_to_execute_task(old_task, sched_ctx);
+		STARPU_ASSERT(able);
 
 		int ret =  _starpu_push_task_to_workers(old_task);
 		/* if we should stop poping from empty ctx tasks */
@@ -1421,9 +1421,9 @@ void starpu_sched_ctx_remove_workers(int *workers_to_remove, unsigned nworkers_t
 	}
 }
 
-int _starpu_nworkers_able_to_execute_task(struct starpu_task *task, struct _starpu_sched_ctx *sched_ctx)
+int _starpu_workers_able_to_execute_task(struct starpu_task *task, struct _starpu_sched_ctx *sched_ctx)
 {
-	unsigned nworkers = 0;
+	unsigned able = 0;
 
 	_starpu_sched_ctx_lock_read(sched_ctx->id);
 	struct starpu_worker_collection *workers = sched_ctx->workers;
@@ -1436,11 +1436,14 @@ int _starpu_nworkers_able_to_execute_task(struct starpu_task *task, struct _star
 		unsigned worker = workers->get_next(workers, &it);
 		STARPU_ASSERT_MSG(worker < STARPU_NMAXWORKERS, "worker id %u", worker);
 		if (starpu_worker_can_execute_task_first_impl(worker, task, NULL))
-			nworkers++;
+		{
+			able++;
+			break;
+		}
 	}
 	_starpu_sched_ctx_unlock_read(sched_ctx->id);
 
-	return nworkers;
+	return able;
 }
 
 /* unused sched_ctx have the id STARPU_NMAX_SCHED_CTXS */

+ 2 - 2
src/core/sched_ctx.h

@@ -233,8 +233,8 @@ unsigned _starpu_sched_ctx_last_worker_awake(struct _starpu_worker *worker);
  * id set by its last call, or the id of the initial context */
 unsigned _starpu_sched_ctx_get_current_context();
 
-/* verify how many workers can execute a certain task */
-int _starpu_nworkers_able_to_execute_task(struct starpu_task *task, struct _starpu_sched_ctx *sched_ctx);
+/* verify that some worker can execute a certain task */
+int _starpu_workers_able_to_execute_task(struct starpu_task *task, struct _starpu_sched_ctx *sched_ctx);
 
 void _starpu_fetch_tasks_from_empty_ctx_list(struct _starpu_sched_ctx *sched_ctx);
 

+ 13 - 14
src/core/sched_policy.c

@@ -405,7 +405,7 @@ static int _starpu_push_task_on_specific_worker(struct starpu_task *task, int wo
 			struct starpu_task *alias = starpu_task_dup(task);
 			alias->destroy = 1;
 
-			_STARPU_TRACE_JOB_PUSH(alias, alias->priority > 0);
+			_STARPU_TRACE_JOB_PUSH(alias, alias->priority);
 			worker = _starpu_get_worker_struct(combined_workerid[j]);
 			ret |= _starpu_push_local_task(worker, alias, 0);
 		}
@@ -446,7 +446,7 @@ int _starpu_repush_task(struct _starpu_job *j)
 		0
 #endif
 		;
-	if (!j->internal && !continuation)
+	if (!_starpu_perf_counter_paused() && !j->internal && !continuation)
 	{
 		(void) STARPU_ATOMIC_ADD64(& _starpu_task__g_current_submitted__value, -1);
 		int64_t value = STARPU_ATOMIC_ADD64(& _starpu_task__g_current_ready__value, 1);
@@ -466,9 +466,9 @@ int _starpu_repush_task(struct _starpu_job *j)
 	{
 		/*if there are workers in the ctx that are not able to execute tasks
 		  we consider the ctx empty */
-		unsigned nworkers = _starpu_nworkers_able_to_execute_task(task, sched_ctx);
+		unsigned able = _starpu_workers_able_to_execute_task(task, sched_ctx);
 
-		if(nworkers == 0)
+		if(!able)
 		{
 			_starpu_sched_ctx_lock_write(sched_ctx->id);
 			starpu_task_list_push_front(&sched_ctx->empty_ctx_tasks, task);
@@ -494,7 +494,7 @@ int _starpu_repush_task(struct _starpu_job *j)
 	 * corresponding dependencies */
 	if (task->cl == NULL || task->where == STARPU_NOWHERE)
 	{
-		if (!j->internal)
+		if (!_starpu_perf_counter_paused() && !j->internal)
 		{
 			(void)STARPU_ATOMIC_ADD64(& _starpu_task__g_current_ready__value, -1);
 			if (task->cl && task->cl->perf_counter_values)
@@ -536,9 +536,8 @@ int _starpu_repush_task(struct _starpu_job *j)
 int _starpu_push_task_to_workers(struct starpu_task *task)
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(task->sched_ctx);
-	unsigned nworkers = 0;
 
-	_STARPU_TRACE_JOB_PUSH(task, task->priority > 0);
+	_STARPU_TRACE_JOB_PUSH(task, task->priority);
 
 	/* if the contexts still does not have workers put the task back to its place in
 	   the empty ctx list */
@@ -546,9 +545,9 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 	{
 		/*if there are workers in the ctx that are not able to execute tasks
 		  we consider the ctx empty */
-		nworkers = _starpu_nworkers_able_to_execute_task(task, sched_ctx);
+		unsigned able = _starpu_workers_able_to_execute_task(task, sched_ctx);
 
-		if (nworkers == 0)
+		if (!able)
 		{
 			_starpu_sched_ctx_lock_write(sched_ctx->id);
 			starpu_task_list_push_back(&sched_ctx->empty_ctx_tasks, task);
@@ -584,7 +583,7 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 		/* When a task can only be executed on a given arch and we have
 		 * only one memory node for that arch, we can systematically
 		 * prefetch before the scheduling decision. */
-		if (starpu_get_prefetch_flag())
+		if (starpu_get_prefetch_flag() && starpu_memory_nodes_get_count() > 1)
 		{
 			if (task->where == STARPU_CPU && config->cpus_nodeid >= 0)
 				starpu_prefetch_task_input_on_node(task, config->cpus_nodeid);
@@ -629,7 +628,7 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 					if (job->task_size > 1)
 					{
 						alias = starpu_task_dup(task);
-						_STARPU_TRACE_JOB_PUSH(alias, alias->priority > 0);
+						_STARPU_TRACE_JOB_PUSH(alias, alias->priority);
 						alias->destroy = 1;
 					}
 					else
@@ -642,7 +641,7 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 		{
 			STARPU_ASSERT(sched_ctx->sched_policy->push_task);
 			/* check out if there are any workers in the context */
-			nworkers = starpu_sched_ctx_get_nworkers(sched_ctx->id);
+			unsigned nworkers = starpu_sched_ctx_get_nworkers(sched_ctx->id);
 			if (nworkers == 0)
 				ret = -1;
 			else
@@ -670,7 +669,7 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 		if(ret == -1)
 		{
 			_STARPU_MSG("repush task \n");
-			_STARPU_TRACE_JOB_POP(task, task->priority > 0);
+			_STARPU_TRACE_JOB_POP(task, task->priority);
 			ret = _starpu_push_task_to_workers(task);
 		}
 	}
@@ -697,7 +696,7 @@ int _starpu_pop_task_end(struct starpu_task *task)
 {
 	if (!task)
 		return 0;
-	_STARPU_TRACE_JOB_POP(task, task->priority > 0);
+	_STARPU_TRACE_JOB_POP(task, task->priority);
 	return 0;
 }
 

+ 57 - 32
src/core/simgrid.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2016,2017                                Inria
- * Copyright (C) 2012,2013,2015-2019                      CNRS
- * Copyright (C) 2012-2019                                Université de Bordeaux
+ * Copyright (C) 2012,2013,2015-2020                      CNRS
+ * Copyright (C) 2012-2020                                Université de Bordeaux
  * Copyright (C) 2013                                     Thibaut Lambert
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -68,7 +68,7 @@ static struct transfer_runner
 	starpu_sem_t sem;
 	starpu_pthread_t runner;
 } transfer_runner[STARPU_MAXNODES][STARPU_MAXNODES];
-static int transfer_execute(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[] STARPU_ATTRIBUTE_UNUSED);
+static void *transfer_execute(void *arg);
 
 starpu_pthread_queue_t _starpu_simgrid_task_queue[STARPU_NMAXWORKERS];
 static struct worker_runner
@@ -77,7 +77,7 @@ static struct worker_runner
 	starpu_sem_t sem;
 	starpu_pthread_t runner;
 } worker_runner[STARPU_NMAXWORKERS];
-static int task_execute(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[] STARPU_ATTRIBUTE_UNUSED);
+static void *task_execute(void *arg);
 
 #if defined(HAVE_SG_ZONE_GET_BY_NAME) || defined(sg_zone_get_by_name)
 #define HAVE_STARPU_SIMGRID_GET_AS_BY_NAME
@@ -316,6 +316,11 @@ int do_starpu_main(int argc, char *argv[])
 	/* FIXME: Ugly work-around for bug in simgrid: the MPI context is not properly set at MSG process startup */
 	starpu_sleep(0.000001);
 
+	if (!starpu_main)
+	{
+		_STARPU_ERROR("In simgrid mode, the file containing the main() function of this application needs to be compiled with starpu.h or starpu_simgrid_wrap.h included, to properly rename it into starpu_main\n");
+	}
+
 	main_ret = starpu_main(argc, argv);
 	return main_ret;
 }
@@ -368,11 +373,9 @@ int main(int argc, char **argv)
 	int i;
 	for (i = 0; i < argc; i++)
 		argv_cpy[i] = strdup(argv[i]);
-	void **tsd;
-	_STARPU_CALLOC(tsd, MAX_TSD+1, sizeof(void*));
 
 	/* Run the application in a separate thread */
-	MSG_process_create_with_arguments("main", &do_starpu_main, tsd, _starpu_simgrid_get_host_by_name("MAIN"), argc, argv_cpy);
+	_starpu_simgrid_actor_create("main", &do_starpu_main, _starpu_simgrid_get_host_by_name("MAIN"), argc, argv_cpy);
 
 	/* And run maestro in the main thread */
 	MSG_main();
@@ -392,7 +395,7 @@ void _starpu_simgrid_init_early(int *argc STARPU_ATTRIBUTE_UNUSED, char ***argv
 #ifdef HAVE_SG_CONFIG_CONTINUE_AFTER_HELP
 	sg_config_continue_after_help();
 #endif
-#if defined(HAVE_MSG_PROCESS_ATTACH) || defined(MSG_process_attach)
+#if defined(HAVE_MSG_PROCESS_ATTACH) || defined(MSG_process_attach) || defined(HAVE_SG_ACTOR_ATTACH)
 	if (simgrid_started < 2 && !_starpu_simgrid_running_smpi())
 	{
 		/* "Cannot create_maestro with this ContextFactory.
@@ -411,10 +414,18 @@ void _starpu_simgrid_init_early(int *argc STARPU_ATTRIBUTE_UNUSED, char ***argv
 		SIMIX_set_maestro(maestro, NULL);
 		/* Initialize simgrid */
 		_starpu_start_simgrid(argc, *argv);
+
 		/* And attach the main thread to the main simgrid process */
 		void **tsd;
 		_STARPU_CALLOC(tsd, MAX_TSD+1, sizeof(void*));
+
+#if defined(HAVE_SG_ACTOR_ATTACH) && defined (HAVE_SG_ACTOR_DATA)
+		sg_actor_t actor = sg_actor_attach("main", NULL, _starpu_simgrid_get_host_by_name("MAIN"), NULL);
+		sg_actor_data_set(actor, tsd);
+#else
 		MSG_process_attach("main", tsd, _starpu_simgrid_get_host_by_name("MAIN"), NULL);
+#endif
+
 		/* We initialized through MSG_process_attach */
 		simgrid_started = 3;
 	}
@@ -431,12 +442,16 @@ void _starpu_simgrid_init_early(int *argc STARPU_ATTRIBUTE_UNUSED, char ***argv
 #ifndef STARPU_STATIC_ONLY
 		_STARPU_ERROR("Simgrid currently does not support privatization for dynamically-linked libraries in SMPI. Please reconfigure and build StarPU with --disable-shared");
 #endif
-#ifdef HAVE_MSG_PROCESS_USERDATA_INIT
+#if defined(HAVE_MSG_PROCESS_USERDATA_INIT) && !defined(HAVE_SG_ACTOR_DATA)
 		MSG_process_userdata_init();
 #endif
 		void **tsd;
 		_STARPU_CALLOC(tsd, MAX_TSD+1, sizeof(void*));
+#ifdef HAVE_SG_ACTOR_DATA
+		sg_actor_data_set(sg_actor_self(), tsd);
+#else
 		smpi_process_set_user_data(tsd);
+#endif
 	}
 	unsigned i;
 	for (i = 0; i < STARPU_MAXNODES; i++)
@@ -454,21 +469,22 @@ void _starpu_simgrid_init(void)
 	{
 		char s[32];
 		snprintf(s, sizeof(s), "worker %u runner", i);
-		void **tsd;
-		_STARPU_CALLOC(tsd, MAX_TSD+1, sizeof(void*));
 		starpu_sem_init(&worker_runner[i].sem, 0, 0);
-		tsd[0] = (void*)(uintptr_t) i;
-		worker_runner[i].runner = MSG_process_create_with_arguments(s, task_execute, tsd, _starpu_simgrid_get_host_by_worker(_starpu_get_worker_struct(i)), 0, NULL);
+		starpu_pthread_create_on(s, &worker_runner[i].runner, NULL, task_execute, (void*)(uintptr_t) i, _starpu_simgrid_get_host_by_worker(_starpu_get_worker_struct(i)));
 	}
 }
 
 void _starpu_simgrid_deinit_late(void)
 {
-#if defined(HAVE_MSG_PROCESS_ATTACH) || defined(MSG_process_attach)
+#if defined(HAVE_MSG_PROCESS_ATTACH) || defined(MSG_process_attach) || defined(HAVE_SG_ACTOR_ATTACH)
 	if (simgrid_started == 3)
 	{
 		/* Started with MSG_process_attach, now detach */
+#ifdef HAVE_SG_ACTOR_ATTACH
+		sg_actor_detach();
+#else
 		MSG_process_detach();
+#endif
 		simgrid_started = 0;
 	}
 #endif
@@ -553,12 +569,9 @@ struct task
 };
 
 /* Actually execute the task.  */
-static int task_execute(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[] STARPU_ATTRIBUTE_UNUSED)
+static void *task_execute(void *arg)
 {
-	/* FIXME: Ugly work-around for bug in simgrid: the MPI context is not properly set at MSG process startup */
-	starpu_sleep(0.000001);
-
-	unsigned workerid = (uintptr_t) STARPU_PTHREAD_GETSPECIFIC(0);
+	unsigned workerid = (uintptr_t) arg;
 	struct worker_runner *w = &worker_runner[workerid];
 
 	_STARPU_DEBUG("worker runner %u started\n", workerid);
@@ -630,13 +643,13 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 		/* This is not useful to include in simulation (and probably
 		 * doesn't have a perfmodel anyway) */
 		return;
-	
+
 	if (isnan(length))
 	{
 		length = starpu_task_expected_length(starpu_task, perf_arch, j->nimpl);
 		STARPU_ASSERT_MSG(!_STARPU_IS_ZERO(length) && !isnan(length),
-				"Codelet %s does not have a perfmodel, or is not calibrated enough, please re-run in non-simgrid mode until it is calibrated",
-			_starpu_job_get_model_name(j));
+				  "Codelet %s does not have a perfmodel (in directory %s), or is not calibrated enough, please re-run in non-simgrid mode until it is calibrated",
+				  _starpu_job_get_model_name(j), _starpu_get_perf_model_dir_codelet());
                 /* TODO: option to add variance according to performance model,
                  * to be able to easily check scheduling robustness */
 	}
@@ -799,10 +812,7 @@ static void transfer_queue(struct transfer *transfer)
 		{
 			char s[64];
 			snprintf(s, sizeof(s), "transfer %u-%u runner", src, dst);
-			void **tsd;
-			_STARPU_CALLOC(tsd, MAX_TSD+1, sizeof(void*));
-			tsd[0] = (void*)(uintptr_t)((src<<16) + dst);
-			t->runner = MSG_process_create_with_arguments(s, transfer_execute, tsd, _starpu_simgrid_get_memnode_host(src), 0, NULL);
+			starpu_pthread_create_on(s, &t->runner, NULL, transfer_execute, (void*)(uintptr_t)((src<<16) + dst), _starpu_simgrid_get_memnode_host(src));
 			starpu_sem_init(&t->sem, 0, 0);
 		}
 		STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
@@ -824,12 +834,9 @@ static void transfer_queue(struct transfer *transfer)
 }
 
 /* Actually execute the transfer, and then start transfers waiting for this one.  */
-static int transfer_execute(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[] STARPU_ATTRIBUTE_UNUSED)
+static void *transfer_execute(void *arg)
 {
-	/* FIXME: Ugly work-around for bug in simgrid: the MPI context is not properly set at MSG process startup */
-	starpu_sleep(0.000001);
-
-	unsigned src_dst = (uintptr_t) STARPU_PTHREAD_GETSPECIFIC(0);
+	unsigned src_dst = (uintptr_t) arg;
 	unsigned src = src_dst >> 16;
 	unsigned dst = src_dst & 0xffff;
 	struct transfer_runner *t = &transfer_runner[src][dst];
@@ -1105,6 +1112,24 @@ _starpu_simgrid_thread_start(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[])
 	return 0;
 }
 
+starpu_pthread_t _starpu_simgrid_actor_create(const char *name, xbt_main_func_t code, starpu_sg_host_t host, int argc, char *argv[])
+{
+	void **tsd;
+	starpu_pthread_t actor;
+	_STARPU_CALLOC(tsd, MAX_TSD+1, sizeof(void*));
+#ifdef HAVE_SG_ACTOR_INIT
+	actor = sg_actor_init(name, host);
+	sg_actor_data_set(actor, tsd);
+	sg_actor_start(actor, code, argc, argv);
+#else
+	actor = MSG_process_create_with_arguments(name, code, tsd, host, argc, argv);
+#ifdef HAVE_SG_ACTOR_DATA
+	sg_actor_data_set(actor, tsd);
+#endif
+#endif
+	return actor;
+}
+
 starpu_sg_host_t _starpu_simgrid_get_memnode_host(unsigned node)
 {
 	const char *fmt;
@@ -1217,7 +1242,7 @@ void _starpu_simgrid_count_ngpus(void)
 						found = 1;
 						break;
 					}
-					
+
 				if (!found)
 					continue;
 

+ 19 - 6
src/core/task.c

@@ -586,6 +586,11 @@ void _starpu_codelet_check_deprecated_fields(struct starpu_codelet *cl)
 {
 	if (!cl)
 		return;
+	if (cl->checked)
+	{
+		STARPU_RMB();
+		return;
+	}
 
 	uint32_t where = cl->where;
 	int is_where_unset = where == 0;
@@ -727,6 +732,9 @@ void _starpu_codelet_check_deprecated_fields(struct starpu_codelet *cl)
 	}
 
 	cl->where = where;
+
+	STARPU_WMB();
+	cl->checked = 1;
 }
 
 void _starpu_task_check_deprecated_fields(struct starpu_task *task STARPU_ATTRIBUTE_UNUSED)
@@ -867,7 +875,7 @@ int _starpu_task_submit(struct starpu_task *task, int nodeps)
 		0
 #endif
 		;
-	if (!j->internal && !continuation)
+	if (!_starpu_perf_counter_paused() && !j->internal && !continuation)
 	{
 		(void) STARPU_ATOMIC_ADD64(&_starpu_task__g_total_submitted__value, 1);
 		int64_t value = STARPU_ATOMIC_ADD64(&_starpu_task__g_current_submitted__value, 1);
@@ -1145,7 +1153,8 @@ int _starpu_task_wait_for_all_and_return_nb_waited_tasks(void)
 int starpu_task_wait_for_all(void)
 {
 	_starpu_task_wait_for_all_and_return_nb_waited_tasks();
-	_starpu_perf_counter_update_global_sample();
+	if (!_starpu_perf_counter_paused())
+		_starpu_perf_counter_update_global_sample();
 	return 0;
 }
 
@@ -1162,7 +1171,8 @@ int _starpu_task_wait_for_all_in_ctx_and_return_nb_waited_tasks(unsigned sched_c
 int starpu_task_wait_for_all_in_ctx(unsigned sched_ctx)
 {
 	_starpu_task_wait_for_all_in_ctx_and_return_nb_waited_tasks(sched_ctx);
-	_starpu_perf_counter_update_global_sample();
+	if (!_starpu_perf_counter_paused())
+		_starpu_perf_counter_update_global_sample();
 	return 0;
 }
 
@@ -1204,7 +1214,8 @@ int starpu_task_wait_for_n_submitted(unsigned n)
 		_STARPU_DEBUG("Waiting for tasks submitted to context %u\n", sched_ctx_id);
 		_starpu_wait_for_n_submitted_tasks_of_sched_ctx(sched_ctx_id, n);
 	}
-	_starpu_perf_counter_update_global_sample();
+	if (!_starpu_perf_counter_paused())
+		_starpu_perf_counter_update_global_sample();
 	return 0;
 }
 
@@ -1212,7 +1223,8 @@ int starpu_task_wait_for_n_submitted_in_ctx(unsigned sched_ctx, unsigned n)
 {
 	_starpu_wait_for_n_submitted_tasks_of_sched_ctx(sched_ctx, n);
 
-	_starpu_perf_counter_update_global_sample();
+	if (!_starpu_perf_counter_paused())
+		_starpu_perf_counter_update_global_sample();
 	return 0;
 }
 /*
@@ -1248,7 +1260,8 @@ int starpu_task_wait_for_no_ready(void)
 		}
 	}
 
-	_starpu_perf_counter_update_global_sample();
+	if (!_starpu_perf_counter_paused())
+		_starpu_perf_counter_update_global_sample();
 	return 0;
 }
 

+ 16 - 15
src/core/topology.c

@@ -319,16 +319,15 @@ int _starpu_task_data_get_node_on_node(struct starpu_task *task, unsigned index,
 		// TODO: rather leave in DDR
 		node = local_node;
 		break;
-        case STARPU_SPECIFIC_NODE_LOCAL_OR_CPU:
+	case STARPU_SPECIFIC_NODE_LOCAL_OR_CPU:
 		if (task->handles[index]->per_node[local_node].state != STARPU_INVALID) {
-			   
+			/* It is here already, rather access it from here */
 			node = local_node;
 		} else {
-                        /* It is not here already, do not bother moving it */
-                        node = STARPU_MAIN_RAM;
-                    }
+			/* It is not here already, do not bother moving it */
+			node = STARPU_MAIN_RAM;
+		}
 		break;
-      
 	}
 	return node;
 }
@@ -353,15 +352,15 @@ int _starpu_task_data_get_node_on_worker(struct starpu_task *task, unsigned inde
 		// TODO: rather leave in DDR
 		node = local_node;
 		break;
-        case STARPU_SPECIFIC_NODE_LOCAL_OR_CPU:
-                if (task->handles[index]->per_node[local_node].state != STARPU_INVALID) {
-                /* It is here already, rather access it from here */
-                node = local_node;
-                 } else {
-                /* It is not here already, do not bother moving it */
-                node = STARPU_MAIN_RAM;
-                 }
-                break;
+	case STARPU_SPECIFIC_NODE_LOCAL_OR_CPU:
+		if (task->handles[index]->per_node[local_node].state != STARPU_INVALID) {
+			/* It is here already, rather access it from here */
+			node = local_node;
+		} else {
+			/* It is not here already, do not bother moving it */
+			node = STARPU_MAIN_RAM;
+		}
+		break;
 	}
 	return node;
 }
@@ -3039,6 +3038,8 @@ int _starpu_build_topology(struct _starpu_machine_config *config, int no_mp_conf
 
 	_starpu_init_workers_binding_and_memory(config, no_mp_config);
 
+	_starpu_mem_chunk_init_last();
+
 	config->cpus_nodeid = -1;
 	config->cuda_nodeid = -1;
 	config->opencl_nodeid = -1;

+ 5 - 2
src/core/tree.c

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2014,2015                                Inria
- * Copyright (C) 2014-2017, 2019                          CNRS
- * Copyright (C) 2014,2016                                Université de Bordeaux
+ * Copyright (C) 2014-2017,2019                           CNRS
+ * Copyright (C) 2014,2016,2019                           Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -112,6 +112,9 @@ struct starpu_tree* starpu_tree_get_neighbour(struct starpu_tree *tree, struct s
 
 	if (father == NULL) return NULL;
 
+	if (father == tree && father->arity == 0)
+		return tree;
+
 	for(st = 0; st < father->arity; st++)
 	{
 		if(&father->nodes[st] == node)

+ 29 - 15
src/core/workers.c

@@ -778,7 +778,7 @@ void _starpu_driver_start(struct _starpu_worker *worker, unsigned fut_key, unsig
 	(void) devid;
 
 #ifdef STARPU_USE_FXT
-	_starpu_fxt_register_thread(worker->bindid);
+	_STARPU_TRACE_REGISTER_THREAD(worker->bindid);
 	_starpu_worker_start(worker, fut_key, sync);
 #endif
 	_starpu_set_local_worker_key(worker);
@@ -1178,6 +1178,9 @@ int starpu_conf_init(struct starpu_conf *conf)
 
 	/* 64MiB by default */
 	conf->trace_buffer_size = starpu_get_env_number_default("STARPU_TRACE_BUFFER_SIZE", 64) << 20;
+
+	/* Do not start performance counter collection by default */
+	conf->start_perf_counter_collection = 0;
 	return 0;
 }
 
@@ -1233,35 +1236,46 @@ struct starpu_tree* starpu_workers_get_tree(void)
 	return _starpu_config.topology.tree;
 }
 
+#if HWLOC_API_VERSION >= 0x20000
+#define NORMAL_CHILD(obj) 1
+#else
+#define NORMAL_CHILD(obj) ((obj)->type < HWLOC_OBJ_BRIDGE)
+#endif
+
 #ifdef STARPU_HAVE_HWLOC
 static void _fill_tree(struct starpu_tree *tree, hwloc_obj_t curr_obj, unsigned depth, hwloc_topology_t topology, struct starpu_tree *father)
 {
 	unsigned i, j;
 	unsigned arity;
-	if (curr_obj->arity == 1)
-	{
-		/* Nothing interestin here, skip level */
-		_fill_tree(tree, curr_obj->children[0], depth+1, topology, father);
-		return;
-	}
-	starpu_tree_insert(tree, curr_obj->logical_index, depth, curr_obj->type == HWLOC_OBJ_PU, curr_obj->arity, father);
+#if HWLOC_API_VERSION >= 0x20000
+	arity = curr_obj->arity;
+#else
 	arity = 0;
 	for(i = 0; i < curr_obj->arity; i++)
 	{
-		hwloc_obj_t child = curr_obj->children[i];
-		if (child->type == HWLOC_OBJ_BRIDGE && (!child->cpuset || hwloc_bitmap_iszero(child->cpuset)))
+		if (!NORMAL_CHILD(curr_obj->children[i]))
 			/* I/O stuff, stop caring */
-			continue;
+			break;
 		arity++;
 	}
+#endif
+
+	if (arity == 1)
+	{
+		/* Nothing interesting here, skip level */
+		_fill_tree(tree, curr_obj->children[0], depth+1, topology, father);
+		return;
+	}
+
+	starpu_tree_insert(tree, curr_obj->logical_index, depth, curr_obj->type == HWLOC_OBJ_PU, arity, father);
 	starpu_tree_prepare_children(arity, tree);
 	j = 0;
 	for(i = 0; i < arity; i++)
 	{
 		hwloc_obj_t child = curr_obj->children[i];
-		if (child->type == HWLOC_OBJ_BRIDGE && (!child->cpuset || hwloc_bitmap_iszero(child->cpuset)))
-			/* I/O stuff, stop caring */
-			continue;
+		if (!NORMAL_CHILD(child))
+			/* I/O stuff, stop caring (shouldn't happen, though) */
+			break;
 #if 0
 		char string[128];
 		hwloc_obj_snprintf(string, sizeof(string), topology, child, "#", 0);
@@ -1626,7 +1640,7 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 	}
 
 	_starpu_initialize_registered_performance_models();
-	_starpu_perf_counter_init();
+	_starpu_perf_counter_init(&_starpu_config);
 	_starpu_perf_knob_init();
 
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)

+ 10 - 0
src/core/workers.h

@@ -447,6 +447,9 @@ struct _starpu_machine_config
 
 	int watchdog_ok;
 
+	/** When >0, StarPU should stop performance counters collection. */
+	int perf_counter_pause_depth;
+
 	starpu_pthread_mutex_t submitted_mutex;
 };
 
@@ -1195,6 +1198,13 @@ void _starpu_worker_refuse_task(struct _starpu_worker *worker, struct starpu_tas
 void _starpu_set_catch_signals(int do_catch_signal);
 int _starpu_get_catch_signals(void);
 
+/* Performance Monitoring */
+/** Return non-zero when the pause depth recorded in the global machine
+ * configuration is strictly positive, i.e. when performance counter
+ * collection should currently be stopped. */
+static inline int _starpu_perf_counter_paused(void)
+{
+	int pause_depth;
+
+	/* Read barrier issued before sampling the depth */
+	STARPU_RMB();
+	pause_depth = _starpu_config.perf_counter_pause_depth;
+	return pause_depth > 0;
+}
+
 /* @}*/
 
 #endif // __WORKERS_H__

+ 140 - 7
src/datawizard/copy_driver.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2008-2019                                Université de Bordeaux
+ * Copyright (C) 2008-2020                                Université de Bordeaux
  * Copyright (C) 2011-2013,2016,2017                      Inria
  * Copyright (C) 2010,2011,2013,2015-2019                 CNRS
  *
@@ -188,13 +188,13 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 #endif
 
 	struct _starpu_node_ops *node_ops = _starpu_memory_node_get_node_ops(src_node);
-	if (node_ops && node_ops->copy_data_to[dst_kind])
+	if (node_ops && node_ops->copy_interface_to[dst_kind])
 	{
-		return node_ops->copy_data_to[dst_kind](handle, src_interface, src_node, dst_interface, dst_node, req);
+		return node_ops->copy_interface_to[dst_kind](handle, src_interface, src_node, dst_interface, dst_node, req);
 	}
 	else
 	{
-		STARPU_ABORT_MSG("No copy_data_to function defined from node %s to node %s\n", _starpu_node_get_prefix(starpu_node_get_kind(src_node)), _starpu_node_get_prefix(starpu_node_get_kind(dst_node)));
+		STARPU_ABORT_MSG("No copy_interface_to function defined from node %s to node %s\n", _starpu_node_get_prefix(starpu_node_get_kind(src_node)), _starpu_node_get_prefix(starpu_node_get_kind(dst_node)));
 	}
 #endif /* !SIMGRID */
 }
@@ -307,16 +307,16 @@ int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, u
 
 	struct _starpu_node_ops *node_ops = _starpu_memory_node_get_node_ops(src_node);
 
-	if (node_ops && node_ops->copy_interface_to[dst_kind])
+	if (node_ops && node_ops->copy_data_to[dst_kind])
 	{
-		return node_ops->copy_interface_to[dst_kind](src, src_offset, src_node,
+		return node_ops->copy_data_to[dst_kind](src, src_offset, src_node,
 							     dst, dst_offset, dst_node,
 							     size,
 							     async_channel);
 	}
 	else
 	{
-		STARPU_ABORT_MSG("No copy_interface_to function defined from node %s to node %s\n", _starpu_node_get_prefix(starpu_node_get_kind(src_node)), _starpu_node_get_prefix(starpu_node_get_kind(dst_node)));
+		STARPU_ABORT_MSG("No copy_data_to function defined from node %s to node %s\n", _starpu_node_get_prefix(starpu_node_get_kind(src_node)), _starpu_node_get_prefix(starpu_node_get_kind(dst_node)));
 		return -1;
 	}
 
@@ -348,6 +348,139 @@ if (src_kind == STARPU_FPGA_RAM || dst_kind == STARPU_CPU_RAM)
 
 }
 
+/* Copy numblocks blocks of blocksize bytes each from src (on src_node) to dst
+ * (on dst_node). Consecutive blocks are ld_src bytes apart in the source and
+ * ld_dst bytes apart in the destination; both must be at least blocksize.
+ *
+ * When both layouts are contiguous, a single starpu_interface_copy() call is
+ * issued. Otherwise the copy2d_data_to method of the source node is used when
+ * one is defined for the destination node kind, and as a last resort each
+ * block is copied separately.
+ *
+ * Returns the value of the single underlying copy on the first two paths. On
+ * the per-block path, returns 0 when every block copy returned 0, and -EAGAIN
+ * when at least one of them returned non-zero. */
+int starpu_interface_copy2d(uintptr_t src, size_t src_offset, unsigned src_node,
+			    uintptr_t dst, size_t dst_offset, unsigned dst_node,
+			    size_t blocksize,
+			    size_t numblocks, size_t ld_src, size_t ld_dst,
+			    void *async_data)
+{
+	int ret = 0;
+	/* Same type as numblocks, so that the counter cannot wrap before reaching it */
+	size_t i;
+	struct _starpu_async_channel *async_channel = async_data;
+	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);
+	struct _starpu_node_ops *node_ops = _starpu_memory_node_get_node_ops(src_node);
+
+	STARPU_ASSERT_MSG(ld_src >= blocksize, "block size %lu is bigger than ld %lu in source", (unsigned long) blocksize, (unsigned long) ld_src);
+	STARPU_ASSERT_MSG(ld_dst >= blocksize, "block size %lu is bigger than ld %lu in destination", (unsigned long) blocksize, (unsigned long) ld_dst);
+
+	if (ld_src == blocksize && ld_dst == blocksize)
+		/* Optimize contiguous case */
+		return starpu_interface_copy(src, src_offset, src_node,
+					     dst, dst_offset, dst_node,
+					     blocksize * numblocks, async_data);
+
+	if (node_ops && node_ops->copy2d_data_to[dst_kind])
+		/* Hardware-optimized non-contiguous case */
+		return node_ops->copy2d_data_to[dst_kind](src, src_offset, src_node,
+							     dst, dst_offset, dst_node,
+							     blocksize,
+							     numblocks, ld_src, ld_dst,
+							     async_channel);
+
+	/* Generic fallback: one plain copy per block */
+	for (i = 0; i < numblocks; i++)
+	{
+		if (starpu_interface_copy(src, src_offset + i*ld_src, src_node,
+					  dst, dst_offset + i*ld_dst, dst_node,
+					  blocksize, async_data))
+			ret = -EAGAIN;
+	}
+
+	return ret;
+}
+
+/* Copy numblocks_2 groups of numblocks_1 blocks of blocksize bytes each from
+ * src (on src_node) to dst (on dst_node). Within a group, consecutive blocks
+ * are ld1_src / ld1_dst bytes apart; consecutive groups are ld2_src / ld2_dst
+ * bytes apart.
+ *
+ * When both layouts are fully contiguous, a single starpu_interface_copy()
+ * call is issued. Otherwise the copy3d_data_to method of the source node is
+ * used when one is defined for the destination node kind, and as a last
+ * resort each group is copied with starpu_interface_copy2d().
+ *
+ * Returns the value of the single underlying copy on the first two paths. On
+ * the per-group path, returns 0 when every group copy returned 0, and -EAGAIN
+ * when at least one of them returned non-zero. */
+int starpu_interface_copy3d(uintptr_t src, size_t src_offset, unsigned src_node,
+			    uintptr_t dst, size_t dst_offset, unsigned dst_node,
+			    size_t blocksize,
+			    size_t numblocks_1, size_t ld1_src, size_t ld1_dst,
+			    size_t numblocks_2, size_t ld2_src, size_t ld2_dst,
+			    void *async_data)
+{
+	int ret = 0;
+	/* Same type as numblocks_2, so that the counter cannot wrap before reaching it */
+	size_t i;
+	struct _starpu_async_channel *async_channel = async_data;
+	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);
+	struct _starpu_node_ops *node_ops = _starpu_memory_node_get_node_ops(src_node);
+
+	STARPU_ASSERT_MSG(ld1_src >= blocksize, "block size %lu is bigger than ld %lu in source", (unsigned long) blocksize, (unsigned long) ld1_src);
+	STARPU_ASSERT_MSG(ld1_dst >= blocksize, "block size %lu is bigger than ld %lu in destination", (unsigned long) blocksize, (unsigned long) ld1_dst);
+
+	STARPU_ASSERT_MSG(ld2_src >= numblocks_1 * ld1_src, "block group size %lu is bigger than group ld %lu in source", (unsigned long) (numblocks_1 * ld1_src), (unsigned long) ld2_src);
+	STARPU_ASSERT_MSG(ld2_dst >= numblocks_1 * ld1_dst, "block group size %lu is bigger than group ld %lu in destination", (unsigned long) (numblocks_1 * ld1_dst), (unsigned long) ld2_dst);
+
+	/* Contiguous means: blocks are adjacent within a group (ld1 == blocksize)
+	 * and groups are adjacent to each other (ld2 == size of one group). */
+	if (ld1_src == blocksize && ld2_src == blocksize * numblocks_1 &&
+	    ld1_dst == blocksize && ld2_dst == blocksize * numblocks_1)
+		/* Optimize contiguous case */
+		return starpu_interface_copy(src, src_offset, src_node,
+					     dst, dst_offset, dst_node,
+					     blocksize * numblocks_1 * numblocks_2,
+					     async_data);
+
+	if (node_ops && node_ops->copy3d_data_to[dst_kind])
+		/* Hardware-optimized non-contiguous case */
+		return node_ops->copy3d_data_to[dst_kind](src, src_offset, src_node,
+							     dst, dst_offset, dst_node,
+							     blocksize,
+							     numblocks_1, ld1_src, ld1_dst,
+							     numblocks_2, ld2_src, ld2_dst,
+							     async_channel);
+
+
+	/* Generic fallback: one 2D copy per group */
+	for (i = 0; i < numblocks_2; i++)
+	{
+		if (starpu_interface_copy2d(src, src_offset + i*ld2_src, src_node,
+					    dst, dst_offset + i*ld2_dst, dst_node,
+					    blocksize, numblocks_1, ld1_src, ld1_dst,
+					    async_data))
+			ret = -EAGAIN;
+	}
+
+	return ret;
+}
+
+/* Copy numblocks_3 sets of numblocks_2 groups of numblocks_1 blocks of
+ * blocksize bytes each from src (on src_node) to dst (on dst_node). The
+ * ld1_*, ld2_* and ld3_* parameters give the distance in bytes between
+ * consecutive blocks, groups and sets respectively, on each side.
+ *
+ * When both layouts are fully contiguous, a single starpu_interface_copy()
+ * call is issued; otherwise each set is copied with starpu_interface_copy3d().
+ *
+ * Returns the value of the single underlying copy on the contiguous path. On
+ * the per-set path, returns 0 when every set copy returned 0, and -EAGAIN
+ * when at least one of them returned non-zero. */
+int starpu_interface_copy4d(uintptr_t src, size_t src_offset, unsigned src_node,
+			    uintptr_t dst, size_t dst_offset, unsigned dst_node,
+			    size_t blocksize,
+			    size_t numblocks_1, size_t ld1_src, size_t ld1_dst,
+			    size_t numblocks_2, size_t ld2_src, size_t ld2_dst,
+			    size_t numblocks_3, size_t ld3_src, size_t ld3_dst,
+			    void *async_data)
+{
+	int ret = 0;
+	/* Same type as numblocks_3, so that the counter cannot wrap before reaching it */
+	size_t i;
+
+	STARPU_ASSERT_MSG(ld1_src >= blocksize, "block size %lu is bigger than ld %lu in source", (unsigned long) blocksize, (unsigned long) ld1_src);
+	STARPU_ASSERT_MSG(ld1_dst >= blocksize, "block size %lu is bigger than ld %lu in destination", (unsigned long) blocksize, (unsigned long) ld1_dst);
+
+	STARPU_ASSERT_MSG(ld2_src >= numblocks_1 * ld1_src, "block group size %lu is bigger than group ld %lu in source", (unsigned long) (numblocks_1 * ld1_src), (unsigned long) ld2_src);
+	STARPU_ASSERT_MSG(ld2_dst >= numblocks_1 * ld1_dst, "block group size %lu is bigger than group ld %lu in destination", (unsigned long) (numblocks_1 * ld1_dst), (unsigned long) ld2_dst);
+
+	STARPU_ASSERT_MSG(ld3_src >= numblocks_2 * ld2_src, "block group group size %lu is bigger than group group ld %lu in source", (unsigned long) (numblocks_2 * ld2_src), (unsigned long) ld3_src);
+	STARPU_ASSERT_MSG(ld3_dst >= numblocks_2 * ld2_dst, "block group group size %lu is bigger than group group ld %lu in destination", (unsigned long) (numblocks_2 * ld2_dst), (unsigned long) ld3_dst);
+
+	/* Contiguous means: each stride equals the size of the level below it */
+	if (ld1_src == blocksize && ld1_dst == blocksize &&
+	    ld2_src == blocksize * numblocks_1 &&
+	    ld2_dst == blocksize * numblocks_1 &&
+	    ld3_src == blocksize * numblocks_1 * numblocks_2 &&
+	    ld3_dst == blocksize * numblocks_1 * numblocks_2)
+		/* Optimize contiguous case */
+		return starpu_interface_copy(src, src_offset, src_node,
+					     dst, dst_offset, dst_node,
+					     blocksize * numblocks_1 * numblocks_2 * numblocks_3,
+					     async_data);
+
+	/* Probably won't ever have a 4D interface in drivers :) */
+
+	/* Generic fallback: one 3D copy per set */
+	for (i = 0; i < numblocks_3; i++)
+	{
+		if (starpu_interface_copy3d(src, src_offset + i*ld3_src, src_node,
+					    dst, dst_offset + i*ld3_dst, dst_node,
+					    blocksize,
+					    numblocks_1, ld1_src, ld1_dst,
+					    numblocks_2, ld2_src, ld2_dst,
+					    async_data))
+			ret = -EAGAIN;
+	}
+
+	return ret;
+}
+
 void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel)
 {
 

+ 5 - 0
src/datawizard/filters.c

@@ -684,6 +684,7 @@ void _starpu_data_partition_submit(starpu_data_handle_t initial_handle, unsigned
 	_starpu_spin_lock(&initial_handle->header_lock);
 	STARPU_ASSERT_MSG(initial_handle->partitioned == 0, "One can't submit several partition plannings at the same time");
 	STARPU_ASSERT_MSG(initial_handle->readonly == 0, "One can't submit a partition planning while a readonly partitioning is active");
+	STARPU_ASSERT_MSG(nparts > 0, "One can't partition into 0 parts");
 	initial_handle->partitioned++;
 	initial_handle->active_children = children[0]->siblings;
 	_starpu_spin_unlock(&initial_handle->header_lock);
@@ -743,6 +744,7 @@ void starpu_data_partition_readonly_submit(starpu_data_handle_t initial_handle,
 	STARPU_ASSERT_MSG(initial_handle->sequential_consistency, "partition planning is currently only supported for data with sequential consistency");
 	_starpu_spin_lock(&initial_handle->header_lock);
 	STARPU_ASSERT_MSG(initial_handle->partitioned == 0 || initial_handle->readonly, "One can't submit a readonly partition planning at the same time as a readwrite partition planning");
+	STARPU_ASSERT_MSG(nparts > 0, "One can't partition into 0 parts");
 	initial_handle->partitioned++;
 	initial_handle->readonly = 1;
 	if (initial_handle->nactive_readonly_children < initial_handle->partitioned)
@@ -779,6 +781,7 @@ void starpu_data_partition_readwrite_upgrade_submit(starpu_data_handle_t initial
 	_starpu_spin_lock(&initial_handle->header_lock);
 	STARPU_ASSERT_MSG(initial_handle->partitioned == 1, "One can't upgrade a readonly partition planning to readwrite while other readonly partition plannings are active");
 	STARPU_ASSERT_MSG(initial_handle->readonly == 1, "One can only upgrade a readonly partition planning");
+	STARPU_ASSERT_MSG(nparts > 0, "One can't partition into 0 parts");
 	initial_handle->readonly = 0;
 	initial_handle->active_children = initial_handle->active_readonly_children[0];
 	initial_handle->active_readonly_children[0] = NULL;
@@ -805,6 +808,7 @@ void _starpu_data_unpartition_submit(starpu_data_handle_t initial_handle, unsign
 	STARPU_ASSERT_MSG(gather_node == initial_handle->home_node || gather_node == -1, "gathering node different from home node is currently not supported");
 	_starpu_spin_lock(&initial_handle->header_lock);
 	STARPU_ASSERT_MSG(initial_handle->partitioned >= 1, "No partition planning is active for handle %p", initial_handle);
+	STARPU_ASSERT_MSG(nparts > 0, "One can't partition into 0 parts");
 	if (initial_handle->readonly)
 	{
 		/* Replace this children set with the last set in the list of readonly children sets */
@@ -899,6 +903,7 @@ void starpu_data_unpartition_readonly_submit(starpu_data_handle_t initial_handle
 	STARPU_ASSERT_MSG(gather_node == initial_handle->home_node || gather_node == -1, "gathering node different from home node is currently not supported");
 	_starpu_spin_lock(&initial_handle->header_lock);
 	STARPU_ASSERT_MSG(initial_handle->partitioned >= 1, "No partition planning is active for handle %p", initial_handle);
+	STARPU_ASSERT_MSG(nparts > 0, "One can't partition into 0 parts");
 	initial_handle->readonly = 1;
 	_starpu_spin_unlock(&initial_handle->header_lock);
 

+ 41 - 1
src/datawizard/interfaces/bcsr_filters.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2008-2011,2013,2014,2016,2019            Université de Bordeaux
+ * Copyright (C) 2008-2011,2013,2014,2016,2019-2020       Université de Bordeaux
  * Copyright (C) 2010                                     Mehdi Juhoor
  * Copyright (C) 2010,2011,2013,2015,2017,2019            CNRS
  *
@@ -20,6 +20,46 @@
 #include <common/config.h>
 #include <datawizard/filters.h>
 
+/* Filter function filling child number id (out of nparts) of a BCSR data.
+ *
+ * The father's nrow rows are divided by
+ * starpu_filter_nparts_compute_chunk_size_and_offset(), which yields the
+ * number of rows of this child and the index of its first row. The child then
+ * points into the father's nzval, colind and rowptr arrays at the positions
+ * matching that first row; no array is duplicated here.
+ *
+ * When the father has no nzval buffer, only the child's interface id is set
+ * and the other child fields are left untouched. */
+void starpu_bcsr_filter_vertical_block(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, unsigned nparts)
+{
+	struct starpu_bcsr_interface *bcsr_father = (struct starpu_bcsr_interface *) father_interface;
+	struct starpu_bcsr_interface *bcsr_child = (struct starpu_bcsr_interface *) child_interface;
+
+	size_t elemsize = bcsr_father->elemsize;
+	uint32_t firstentry = bcsr_father->firstentry;
+	uint32_t r = bcsr_father->r;
+	uint32_t c = bcsr_father->c;
+	uint32_t *rowptr = bcsr_father->rowptr;
+
+	unsigned child_nrow;
+	size_t child_rowoffset;
+
+	STARPU_ASSERT_MSG(bcsr_father->id == STARPU_BCSR_INTERFACE_ID, "%s can only be applied on a bcsr data", __func__);
+
+	bcsr_child->id = bcsr_father->id;
+
+	if (!bcsr_father->nzval)
+		/* Not supported yet */
+		return;
+
+	starpu_filter_nparts_compute_chunk_size_and_offset(bcsr_father->nrow, nparts, 1, id, 1, &child_nrow, &child_rowoffset);
+
+	/* child blocks indexes between these (0-based) */
+	uint32_t start_block = rowptr[child_rowoffset] - firstentry;
+	uint32_t end_block = rowptr[child_rowoffset + child_nrow] - firstentry;
+
+	/* Widen to size_t before multiplying, so that the byte offset is not
+	 * truncated to 32 bits */
+	bcsr_child->nzval = bcsr_father->nzval + (size_t) start_block * r * c * elemsize;
+	bcsr_child->nnz = end_block - start_block;
+	bcsr_child->nrow = child_nrow;
+	bcsr_child->colind = bcsr_father->colind + start_block;
+	bcsr_child->rowptr = rowptr + child_rowoffset;
+
+	/* rowptr entries of the child are still expressed relative to the
+	 * father's numbering, hence the shifted first entry */
+	bcsr_child->firstentry = firstentry + start_block;
+	bcsr_child->r = r;
+	bcsr_child->c = c;
+	bcsr_child->elemsize = elemsize;
+}
+
 void starpu_bcsr_filter_canonical_block(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, STARPU_ATTRIBUTE_UNUSED unsigned nparts)
 {
 	struct starpu_bcsr_interface *bcsr_father = (struct starpu_bcsr_interface *) father_interface;

+ 0 - 0
src/datawizard/interfaces/block_interface.c


Some files were not shown because too many files changed in this diff