Explorar el Código

Merge branch 'master' into knobs

Nathalie Furmento hace 6 años
padre
commit
4c96f37f15
Se han modificado 75 ficheros con 1562 adiciones y 305 borrados
  1. 33 1
      ChangeLog
  2. 9 0
      configure.ac
  3. 2 2
      doc/doxygen/Makefile.am
  4. 17 14
      doc/doxygen/chapters/101_building.doxy
  5. 68 63
      doc/doxygen/chapters/210_check_list_performance.doxy
  6. 11 11
      doc/doxygen/chapters/301_tasks.doxy
  7. 7 2
      doc/doxygen/chapters/310_data_management.doxy
  8. 1 1
      doc/doxygen/chapters/380_offline_performance_tools.doxy
  9. 5 5
      doc/doxygen/chapters/470_simgrid.doxy
  10. 9 9
      doc/doxygen/chapters/501_environment_variables.doxy
  11. 5 5
      doc/doxygen/chapters/510_configure_options.doxy
  12. 4 4
      doc/doxygen/chapters/api/threads.doxy
  13. 2 2
      doc/doxygen/refman.tex
  14. 3 3
      examples/callback/prologue.c
  15. 2 2
      examples/cholesky/cholesky.sh
  16. 2 0
      examples/cpp/add_vectors_interface.cpp
  17. 1 1
      examples/dependency/sequential_consistency.c
  18. 1 1
      examples/dependency/task_end_dep_add.c
  19. 2 2
      examples/pipeline/pipeline.c
  20. 6 1
      include/starpu_config.h.in
  21. 13 0
      include/starpu_data_interfaces.h
  22. 41 15
      include/starpu_task.h
  23. 85 2
      include/starpu_task_util.h
  24. 51 3
      include/starpu_thread.h
  25. 1 1
      include/starpu_worker.h
  26. 38 1
      mpi/src/starpu_mpi_task_insert.c
  27. 47 0
      mpi/src/starpu_mpi_task_insert_fortran.c
  28. 5 5
      mpi/tests/callback.c
  29. 5 1
      src/common/fxt.c
  30. 4 3
      src/common/fxt.h
  31. 144 8
      src/common/thread.c
  32. 2 2
      src/common/thread.h
  33. 11 0
      src/common/timing.c
  34. 8 0
      src/common/utils.c
  35. 1 1
      src/core/dependencies/data_concurrency.c
  36. 12 7
      src/core/dependencies/implicit_data_deps.c
  37. 1 1
      src/core/disk.c
  38. 3 1
      src/core/perfmodel/perfmodel_bus.c
  39. 1 0
      src/core/sched_ctx.h
  40. 128 37
      src/core/simgrid.c
  41. 5 5
      src/core/simgrid.h
  42. 2 2
      src/core/simgrid_cpp.cpp
  43. 1 1
      src/core/task.c
  44. 8 4
      src/core/topology.c
  45. 1 0
      src/core/workers.h
  46. 10 2
      src/datawizard/coherency.c
  47. 1 0
      src/datawizard/coherency.h
  48. 1 1
      src/datawizard/copy_driver.c
  49. 2 2
      src/datawizard/data_request.c
  50. 2 2
      src/datawizard/datawizard.c
  51. 2 2
      src/datawizard/filters.c
  52. 1 1
      src/datawizard/malloc.c
  53. 3 3
      src/datawizard/memory_nodes.h
  54. 7 2
      src/debug/traces/starpu_fxt.c
  55. 2 2
      src/drivers/cpu/driver_cpu.c
  56. 9 3
      src/drivers/cuda/driver_cuda.c
  57. 10 5
      src/drivers/opencl/driver_opencl.c
  58. 20 4
      src/util/fstarpu.c
  59. 5 1
      src/util/openmp_runtime_support.c
  60. 17 7
      src/util/starpu_create_sync_task.c
  61. 128 0
      src/util/starpu_task_insert_utils.c
  62. 2 1
      starpufft/include/starpufft.h
  63. 5 4
      starpufft/src/Makefile.am
  64. 13 1
      starpufft/src/starpufftx.c
  65. 191 0
      starpufft/src/starpufftx3d.c
  66. 30 4
      starpufft/tests/testx.c
  67. 45 5
      tests/datawizard/data_implicit_deps.c
  68. 6 0
      tests/datawizard/variable_size.c
  69. 6 2
      tests/fault-tolerance/retry.c
  70. 5 5
      tests/main/codelet_null_callback.c
  71. 3 3
      tests/model-checking/Makefile
  72. 31 6
      tests/model-checking/prio_list.c
  73. 31 2
      tests/model-checking/starpu_barrier.c
  74. 3 2
      tests/sched_ctx/sched_ctx_hierarchy.c
  75. 163 4
      tools/starpu_replay.c

+ 33 - 1
ChangeLog

@@ -20,8 +20,40 @@ StarPU 1.4.0 (svn revision xxxx)
 ==============================================
 New features:
   * Fault tolerance support with starpu_task_ft_failed().
+  * Add get_max_size method to data interfaces for applications using data with
+    variable size to express their maximal potential size.
 
-StarPU 1.3.2 (git revision xxx)
+Small changes:
+  * Use the S4U interface of Simgrid instead of xbt and MSG.
+
+StarPU 1.3.3 (git revision xxx)
+==============================================
+
+New features:
+  * New semantic for starpu_task_insert() and alike parameters
+    STARPU_CALLBACK_ARG, STARPU_PROLOGUE_CALLBACK_ARG, and
+    STARPU_PROLOGUE_CALLBACK_POP_ARG which set respectively
+    starpu_task::callback_arg_free,
+    starpu_task::prologue_callback_arg_free and
+    starpu_task::prologue_callback_pop_arg_free to 1 when used.
+    New parameters STARPU_CALLBACK_ARG_NFREE,
+    STARPU_CALLBACK_WITH_ARG_NFREE, STARPU_PROLOGUE_CALLBACK_ARG_NFREE, and
+    STARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE which set the corresponding
+    fields of starpu_task to 0.
+  * starpufft: Support 3D.
+
+Small features:
+  * New starpu_task_insert() and alike parameter STARPU_TASK_WORKERIDS
+    allowing to set the fields starpu_task::workerids_len and
+    starpu_task::workerids
+  * New starpu_task_insert() and alike parameters
+    STARPU_SEQUENTIAL_CONSISTENCY, STARPU_TASK_NO_SUBMITORDER and
+    STARPU_TASK_PROFILING_INFO
+  * New function starpu_create_callback_task() which creates and
+    submits an empty task with the specified callback
+
+
+StarPU 1.3.2 (git revision af22a20fc00a37addf3cc6506305f89feed940b0)
 ==============================================
 
 Small changes:

+ 9 - 0
configure.ac

@@ -176,12 +176,19 @@ if test x$enable_simgrid = xyes ; then
 	AC_CHECK_HEADERS([simgrid/host.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_HOST_H], [1], [Define to 1 if you have host.h in simgrid/.])])
 	AC_CHECK_HEADERS([simgrid/simdag.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_SIMDAG_H], [1], [Define to 1 if you have simdag.h in simgrid/.])])
 	AC_CHECK_HEADERS([xbt/synchro.h], [AC_DEFINE([STARPU_HAVE_XBT_SYNCHRO_H], [1], [Define to 1 if you have synchro.h in xbt/.])])
+	AC_CHECK_HEADERS([simgrid/actor.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_ACTOR_H], [1], [Define to 1 if you have actor.h in simgrid/.])])
+	AC_CHECK_HEADERS([simgrid/semaphore.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_SEMAPHORE_H], [1], [Define to 1 if you have semaphore.h in simgrid/.])])
+	AC_CHECK_HEADERS([simgrid/mutex.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_MUTEX_H], [1], [Define to 1 if you have mutex.h in simgrid/.])])
+	AC_CHECK_HEADERS([simgrid/cond.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_COND_H], [1], [Define to 1 if you have cond.h in simgrid/.])])
+	AC_CHECK_HEADERS([simgrid/barrier.h], [AC_DEFINE([STARPU_HAVE_SIMGRID_BARRIER_H], [1], [Define to 1 if you have barrier.h in simgrid/.])])
+	AC_CHECK_HEADERS([simgrid/engine.h])
 	AC_CHECK_TYPES([smx_actor_t], [AC_DEFINE([STARPU_HAVE_SMX_ACTOR_T], [1], [Define to 1 if you have the smx_actor_t type.])], [], [[#include <simgrid/simix.h>]])
 
 	# Latest functions
 	AC_CHECK_FUNCS([MSG_process_attach MSG_zone_get_hosts MSG_process_self_name MSG_process_userdata_init])
 	AC_CHECK_FUNCS([xbt_mutex_try_acquire smpi_process_set_user_data sg_zone_get_by_name sg_link_name sg_host_route sg_host_self sg_host_speed simcall_process_create sg_config_continue_after_help])
 	AC_CHECK_FUNCS([xbt_barrier_init], [AC_DEFINE([STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT], [1], [Define to 1 if you have the `xbt_barrier_init' function.])])
+	AC_CHECK_FUNCS([sg_actor_sleep_for sg_actor_self sg_actor_ref sg_host_get_properties sg_host_send_to sg_cfg_set_int sg_actor_self_execute simgrid_get_clock])
 	AC_CHECK_DECLS([smpi_process_set_user_data], [], [], [[#include <smpi/smpi.h>]])
 
 	# Oldies for compatibility with older simgrid
@@ -3433,6 +3440,8 @@ AC_CONFIG_COMMANDS([executable-scripts], [
   test -e tests/microbenchs/parallel_independent_heterogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_heterogeneous_tasks.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_independent_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks_data.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_independent_homogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks.sh tests/microbenchs/
+  test -e tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh tests/microbenchs/
+  test -e tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh tests/microbenchs/
   mkdir -p tests/datawizard
   test -e tests/datawizard/locality.sh || ln -sf $ac_abs_top_srcdir/tests/datawizard/locality.sh tests/datawizard/
   mkdir -p tests/overlap

+ 2 - 2
doc/doxygen/Makefile.am

@@ -155,7 +155,7 @@ chapters/version.sty: $(chapters)
                 if test -f $(top_srcdir)/doc/doxygen/$$f ; then $(PROG_STAT) --format=%Y $(top_srcdir)/doc/doxygen/$$f ; fi \
         done | sort -r | head -1 > timestamp_sty
 	@if test -s timestamp_sty ; then \
-		LC_ALL=C $(PROG_DATE) --date=@`cat timestamp_sty` +"%d %B %Y" > timestamp_sty_updated ;\
+		LC_ALL=C $(PROG_DATE) --date=@`cat timestamp_sty` +"%F" > timestamp_sty_updated ;\
 		LC_ALL=C $(PROG_DATE) --date=@`cat timestamp_sty` +"%B %Y" > timestamp_sty_updated_month ;\
 	fi
 	@if test -s timestamp_sty_updated ; then \
@@ -174,7 +174,7 @@ chapters/version.html: $(chapters)
                 if test -f $(top_srcdir)/doc/doxygen/$$f ; then $(PROG_STAT) --format=%Y $(top_srcdir)/doc/doxygen/$$f ; fi \
         done | sort -r | head -1 > timestamp_html
 	@if test -s timestamp_html ; then \
-		LC_ALL=C $(PROG_DATE) --date=@`cat timestamp_html` +"%d %B %Y" > timestamp_html_updated ;\
+		LC_ALL=C $(PROG_DATE) --date=@`cat timestamp_html` +"%F" > timestamp_html_updated ;\
 		LC_ALL=C $(PROG_DATE) --date=@`cat timestamp_html` +"%B %Y" > timestamp_html_updated_month ;\
 	fi
 	@echo "This manual documents the usage of StarPU version $(VERSION)." > $(top_srcdir)/doc/doxygen/chapters/version.html

+ 17 - 14
doc/doxygen/chapters/101_building.doxy

@@ -55,7 +55,7 @@ location.
 
 If <c>libhwloc</c> is not available on your system, the option
 \ref without-hwloc "--without-hwloc" should be explicitly given when calling the
-<c>configure</c> script.
+script <c>configure</c>.
 
 
 \subsection GettingSources Getting Sources
@@ -88,8 +88,8 @@ $ git clone https://scm.gforge.inria.fr/anonscm/git/starpu/starpu.git
 
 Running <c>autogen.sh</c> is not necessary when using the tarball
 releases of StarPU.  However when using the source code from the git
-repository, you first need to generate the configure scripts and the
-Makefiles. This requires the availability of <c>autoconf</c> and
+repository, you first need to generate the script <c>configure</c> and the
+different Makefiles. This requires the availability of <c>autoconf</c> and
 <c>automake</c> >= 2.60.
 
 \verbatim
@@ -113,7 +113,7 @@ is advised to put them all in a separate directory. It is then
 easier to cleanup, and this allows to compile several configurations
 out of the same source tree. To do so, simply enter the directory
 where you want the compilation to produce its files, and invoke the
-<c>configure</c> script located in the StarPU source directory.
+script <c>configure</c> located in the StarPU source directory.
 
 \verbatim
 $ mkdir build
@@ -139,7 +139,7 @@ $ make
 Once everything is built, you may want to test the result. An
 extensive set of regression tests is provided with StarPU. Running the
 tests is done by calling <c>make check</c>. These tests are run every night
-and the result from the main profile is publicly available (http://starpu.gforge.inria.fr/testing/).
+and the result from the main profile is publicly available (http://starpu.gforge.inria.fr/testing/master/).
 
 \verbatim
 $ make check
@@ -246,7 +246,7 @@ int main(void)
     {
         return 1;
     }
-    printf("%d CPU coress\n", starpu_worker_get_count_by_type(STARPU_CPU_WORKER));
+    printf("%d CPU cores\n", starpu_worker_get_count_by_type(STARPU_CPU_WORKER));
     printf("%d CUDA GPUs\n", starpu_worker_get_count_by_type(STARPU_CUDA_WORKER));
     printf("%d OpenCL GPUs\n", starpu_worker_get_count_by_type(STARPU_OPENCL_WORKER));
     starpu_shutdown();
@@ -273,7 +273,7 @@ int main(void)
     {
         return 1;
     }
-    printf("%d CPU coress\n", starpu_worker_get_count_by_type(STARPU_CPU_WORKER));
+    printf("%d CPU cores\n", starpu_worker_get_count_by_type(STARPU_CPU_WORKER));
     printf("%d CUDA GPUs\n", starpu_worker_get_count_by_type(STARPU_CUDA_WORKER));
     printf("%d OpenCL GPUs\n", starpu_worker_get_count_by_type(STARPU_OPENCL_WORKER));
     starpu_shutdown();
@@ -428,12 +428,13 @@ While StarPU tasks are executing, the application is not supposed to do
 computations in the threads it starts itself, tasks should be used instead.
 
 If the application needs to reserve some cores for its own computations, it
-can do so with the starpu_conf::reserve_ncpus field, get the core IDs with
+can do so with the field starpu_conf::reserve_ncpus, get the core IDs with
 starpu_get_next_bindid(), and bind to them with starpu_bind_thread_on().
 
-Another option is for the application to put the whole StarPU on pause with
-starpu_pause() before performing its own computations, and let StarPU resume
-executing tasks with starpu_resume().
+Another option is for the application to pause StarPU by calling
+starpu_pause(), then to perform its own computations, and then to
+resume StarPU by calling starpu_resume() so that StarPU can execute
+tasks.
 
 \subsection EnablingOpenCL Enabling OpenCL
 
@@ -499,12 +500,12 @@ multiplication using BLAS and cuBLAS. They output the obtained GFlops.
 
 <c>lu_*</c> perform an LU factorization. They use different dependency primitives.
 
-\subsection SimulatedBenchmarks Simulated benchmarks
+\subsection SimulatedBenchmarks Simulated Benchmarks
 
 It can also be convenient to try simulated benchmarks, if you want to give a try
 at CPU-GPU scheduling without actually having a GPU at hand. This can be done by
-using the simgrid version of StarPU: first install the simgrid simulator from
-http://simgrid.gforge.inria.fr/ (we tested with simgrid from 3.11 to 3.16, and
+using the SimGrid version of StarPU: first install the SimGrid simulator from
+http://simgrid.gforge.inria.fr/ (we tested with SimGrid from 3.11 to 3.16, and
 3.18 to 3.22, other versions may have compatibility issues, 3.17 notably does
 not build at all. MPI simulation does not work with version 3.22),
 then configure StarPU with \ref enable-simgrid
@@ -527,4 +528,6 @@ Performance models are available for <c>cholesky_*</c>, <c>lu_*</c>, <c>*gemm</c
 320, 640, or 960 (plus 1440 for sirocco), and for <c>stencil</c> with block size 128x128x128, 192x192x192, and
 256x256x256.
 
+Read the chapter \ref SimGridSupport for more information on the SimGrid support.
+
 */

+ 68 - 63
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -28,8 +28,9 @@ will show roughly where time is spent, and focus correspondingly.
 
 \section CheckTaskSize Check Task Size
 
-Make sure that your tasks are not too small, because the StarPU runtime overhead
-is not completely zero. You can run the tasks_size_overhead.sh script to get an
+Make sure that your tasks are not too small, as the StarPU runtime overhead
+is not completely zero. As explained in \ref TaskSizeOverhead, you can
+run the script \c tasks_size_overhead.sh to get an
 idea of the scalability of tasks depending on their duration (in µs), on your
 own system.
 
@@ -40,19 +41,18 @@ much bigger than this.
 of cores, so it's better to try to get 10ms-ish tasks.
 
 Tasks durations can easily be observed when performance models are defined (see
-\ref PerformanceModelExample) by using the <c>starpu_perfmodel_plot</c> or
-<c>starpu_perfmodel_display</c> tool (see \ref PerformanceOfCodelets)
+\ref PerformanceModelExample) by using the tools <c>starpu_perfmodel_plot</c> or
+<c>starpu_perfmodel_display</c> (see \ref PerformanceOfCodelets)
 
 When using parallel tasks, the problem is even worse since StarPU has to
-synchronize the execution of tasks.
+synchronize the tasks execution.
 
 \section ConfigurationImprovePerformance Configuration Which May Improve Performance
 
-The \ref enable-fast "--enable-fast" \c configure option disables all
+The \c configure option \ref enable-fast "--enable-fast" disables all
 assertions. This makes StarPU more performant for really small tasks by
 disabling all sanity checks. Only use this for measurements and production, not for development, since this will drop all basic checks.
 
-
 \section DataRelatedFeaturesToImprovePerformance Data Related Features Which May Improve Performance
 
 link to \ref DataManagement
@@ -81,14 +81,14 @@ link to \ref StaticScheduling
 
 For proper overlapping of asynchronous GPU data transfers, data has to be pinned
 by CUDA. Data allocated with starpu_malloc() is always properly pinned. If the
-application is registering to StarPU some data which has not been allocated with
-starpu_malloc(), it should use starpu_memory_pin() to pin it.
+application registers to StarPU some data which has not been allocated with
+starpu_malloc(), starpu_memory_pin() should be called to pin the data memory.
 
 Due to CUDA limitations, StarPU will have a hard time overlapping its own
 communications and the codelet computations if the application does not use a
 dedicated CUDA stream for its computations instead of the default stream,
-which synchronizes all operations of the GPU. StarPU provides one by the use
-of starpu_cuda_get_local_stream() which can be used by all CUDA codelet
+which synchronizes all operations of the GPU. The function
+starpu_cuda_get_local_stream() returns a stream which can be used by all CUDA codelet
 operations to avoid this issue. For instance:
 
 \code{.c}
@@ -105,11 +105,11 @@ If some CUDA calls are made without specifying this local stream,
 synchronization needs to be explicited with cudaThreadSynchronize() around these
 calls, to make sure that they get properly synchronized with the calls using
 the local stream. Notably, \c cudaMemcpy() and \c cudaMemset() are actually
-asynchronous and need such explicit synchronization! Use cudaMemcpyAsync() and
-cudaMemsetAsync() instead.
+asynchronous and need such explicit synchronization! Use \c cudaMemcpyAsync() and
+\c cudaMemsetAsync() instead.
 
-Calling starpu_cublas_init() makes StarPU already do appropriate calls for the
-CUBLAS library. Some libraries like Magma may however change the current stream of CUBLAS v1,
+Calling starpu_cublas_init() ensures that StarPU properly calls the
+CUBLAS library functions. Some libraries like Magma may however change the current stream of CUBLAS v1,
 one then has to call <c>cublasSetKernelStream(</c>starpu_cuda_get_local_stream()<c>)</c> at
 the beginning of the codelet to make sure that CUBLAS is really using the proper
 stream. When using CUBLAS v2, starpu_cublas_get_local_handle() can be called to queue CUBLAS
@@ -147,14 +147,14 @@ triggered by the completion of the kernel.
 Using the flag ::STARPU_CUDA_ASYNC also permits to enable concurrent kernel
 execution, on cards which support it (Kepler and later, notably). This is
 enabled by setting the environment variable \ref STARPU_NWORKER_PER_CUDA to the
-number of kernels to execute concurrently.  This is useful when kernels are
+number of kernels to be executed concurrently.  This is useful when kernels are
 small and do not feed the whole GPU with threads to run.
 
-Concerning memory allocation, you should really not use \c cudaMalloc/ \c cudaFree
-within the kernel, since \c cudaFree introduces a awfully lot of synchronizations
+Concerning memory allocation, you should really not use \c cudaMalloc()/ \c cudaFree()
+within the kernel, since \c cudaFree() introduces an awful lot of synchronizations
 within CUDA itself. You should instead add a parameter to the codelet with the
 ::STARPU_SCRATCH mode access. You can then pass to the task a handle registered
-with the desired size but with the \c NULL pointer, that handle can even be the
+with the desired size but with the \c NULL pointer, the handle can even be
 shared between tasks, StarPU will allocate per-task data on the fly before task
 execution, and reuse the allocated data between tasks.
 
@@ -177,8 +177,8 @@ kernel startup and completion.
 
 It may happen that for some reason, StarPU does not make progress for a long
 period of time.  This is sometimes due to contention inside StarPU, but
-sometimes this is due to external reasons, such as stuck MPI driver, or CUDA
-driver, etc.
+sometimes this is due to external reasons, such as a stuck MPI or CUDA
+driver.
 
 <c>export STARPU_WATCHDOG_TIMEOUT=10000</c> (\ref STARPU_WATCHDOG_TIMEOUT)
 
@@ -187,30 +187,34 @@ any task for 10ms, but lets the application continue normally. In addition to th
 
 <c>export STARPU_WATCHDOG_CRASH=1</c> (\ref STARPU_WATCHDOG_CRASH)
 
-raises <c>SIGABRT</c> in this condition, thus allowing to catch the situation in gdb.
+raises <c>SIGABRT</c> in this condition, thus allowing to catch the
+situation in \c gdb.
+
 It can also be useful to type <c>handle SIGABRT nopass</c> in <c>gdb</c> to be able to let
 the process continue, after inspecting the state of the process.
 
 \section HowToLimitMemoryPerNode How to Limit Memory Used By StarPU And Cache Buffer Allocations
 
 By default, StarPU makes sure to use at most 90% of the memory of GPU devices,
-moving data in and out of the device as appropriate and with prefetch and
-writeback optimizations. Concerning the main memory, by default it will not
-limit its consumption, since by default it has nowhere to push the data to when
-memory gets tight. This also means that by default StarPU will not cache buffer
-allocations in main memory, since it does not know how much of the system memory
-it can afford.
-
-In the case of GPUs, the \ref STARPU_LIMIT_CUDA_MEM, \ref STARPU_LIMIT_CUDA_devid_MEM,
-\ref STARPU_LIMIT_OPENCL_MEM, and \ref STARPU_LIMIT_OPENCL_devid_MEM environment variables
-can be used to control how
-much (in MiB) of the GPU device memory should be used at most by StarPU (their
-default values are 90% of the available memory).
-
-In the case of the main memory, the \ref STARPU_LIMIT_CPU_MEM environment
-variable can be used to specify how much (in MiB) of the main memory should be
-used at most by StarPU for buffer allocations. This way, StarPU will be able to
-cache buffer allocations (which can be a real benefit if a lot of bufferes are
+moving data in and out of the device as appropriate, as well as using
+prefetch and writeback optimizations.
+
+The environment variables \ref STARPU_LIMIT_CUDA_MEM, \ref STARPU_LIMIT_CUDA_devid_MEM,
+\ref STARPU_LIMIT_OPENCL_MEM, and \ref STARPU_LIMIT_OPENCL_devid_MEM
+can be used to control how much (in MiB) of the GPU device memory
+should be used at most by StarPU (the default value is to use 90% of the
+available memory).
+
+By default, the usage of the main memory is not limited, as the
+default mechanisms do not provide means to evict main memory when it
+gets too tight. This also means that by default StarPU will not cache buffer
+allocations in main memory, since it does not know how much of the
+system memory it can afford.
+
+The environment variable \ref STARPU_LIMIT_CPU_MEM can be used to
+specify how much (in MiB) of the main memory should be used at most by
+StarPU for buffer allocations. This way, StarPU will be able to
+cache buffer allocations (which can be a real benefit if a lot of buffers are
 involved, or if allocation fragmentation can become a problem), and when using
 \ref OutOfCore, StarPU will know when it should evict data out to the disk.
 
@@ -233,8 +237,8 @@ caches or data out to the disk, starpu_memory_allocate() can be used to
 specify an amount of memory to be accounted for. starpu_memory_deallocate()
 can be used to account freed memory back. Those can for instance be used by data
 interfaces with dynamic data buffers: instead of using starpu_malloc_on_node(),
-they would dynamically allocate data with malloc/realloc, and notify starpu of
-the delta thanks to starpu_memory_allocate() and starpu_memory_deallocate() calls.
+they would dynamically allocate data with \c malloc()/\c realloc(), and notify StarPU of
+the delta by calling starpu_memory_allocate() and starpu_memory_deallocate().
 
 starpu_memory_get_total() and starpu_memory_get_available()
 can be used to get an estimation of how much memory is available.
@@ -251,7 +255,7 @@ to reserve this amount immediately.
 
 It is possible to reduce the memory footprint of the task and data internal
 structures of StarPU by describing the shape of your machine and/or your
-application at the \c configure step.
+application when calling \c configure.
 
 To reduce the memory footprint of the data internal structures of StarPU, one
 can set the
@@ -271,28 +275,27 @@ execution. For example, in the Cholesky factorization (dense linear algebra
 application), the GEMM task uses up to 3 buffers, so it is possible to set the
 maximum number of task buffers to 3 to run a Cholesky factorization on StarPU.
 
-The size of the various structures of StarPU can be printed by 
+The size of the various structures of StarPU can be printed by
 <c>tests/microbenchs/display_structures_size</c>.
 
-It is also often useless to submit *all* the tasks at the same time. One can
-make the starpu_task_submit() function block when a reasonable given number of
-tasks have been submitted, by setting the \ref STARPU_LIMIT_MIN_SUBMITTED_TASKS and
-\ref STARPU_LIMIT_MAX_SUBMITTED_TASKS environment variables, for instance:
+It is also often useless to submit *all* the tasks at the same time.
+Task submission can be blocked when a reasonable given number of
+tasks have been submitted, by setting the environment variables \ref
+STARPU_LIMIT_MIN_SUBMITTED_TASKS and \ref STARPU_LIMIT_MAX_SUBMITTED_TASKS.
 
 <c>
 export STARPU_LIMIT_MAX_SUBMITTED_TASKS=10000
-
 export STARPU_LIMIT_MIN_SUBMITTED_TASKS=9000
 </c>
 
-To make StarPU block submission when 10000 tasks are submitted, and unblock
+will make StarPU block submission when 10000 tasks are submitted, and unblock
 submission when only 9000 tasks are still submitted, i.e. 1000 tasks have
 completed among the 10000 which were submitted when submission was blocked. Of
 course this may reduce parallelism if the threshold is set too low. The precise
 balance depends on the application task graph.
 
 An idea of how much memory is used for tasks and data handles can be obtained by
-setting the \ref STARPU_MAX_MEMORY_USE environment variable to <c>1</c>.
+setting the environment variable \ref STARPU_MAX_MEMORY_USE to <c>1</c>.
 
 \section HowtoReuseMemory How To Reuse Memory
 
@@ -303,7 +306,7 @@ tasks. For this system to work with MPI tasks, you need to submit tasks progress
 of as soon as possible, because in the case of MPI receives, the allocation cache check for reusing data
 buffers will be done at submission time, not at execution time.
 
-You have two options to control the task submission flow. The first one is by
+There are two options to control the task submission flow. The first one is by
 controlling the number of submitted tasks during the whole execution. This can
 be done either by setting the environment variables
 \ref STARPU_LIMIT_MAX_SUBMITTED_TASKS and \ref STARPU_LIMIT_MIN_SUBMITTED_TASKS to
@@ -348,11 +351,12 @@ To force continuing calibration,
 use <c>export STARPU_CALIBRATE=1</c> (\ref STARPU_CALIBRATE). This may be necessary if your application
 has not-so-stable performance. StarPU will force calibration (and thus ignore
 the current result) until 10 (<c>_STARPU_CALIBRATION_MINIMUM</c>) measurements have been
-made on each architecture, to avoid badly scheduling tasks just because the
+made on each architecture, to avoid bad scheduling decisions just because the
 first measurements were not so good. Details on the current performance model status
-can be obtained from the tool <c>starpu_perfmodel_display</c>: the <c>-l</c>
-option lists the available performance models, and the <c>-s</c> option permits
-to choose the performance model to be displayed. The result looks like:
+can be obtained with the tool <c>starpu_perfmodel_display</c>: the
+option <c>-l</c> lists the available performance models, and the
+option <c>-s</c> allows choosing the performance model to be
+displayed. The result looks like:
 
 \verbatim
 $ starpu_perfmodel_display -s starpu_slu_lu_model_11
@@ -364,7 +368,7 @@ e5a07e31  4096     0.000000e+00  1.717457e+01  5.190038e+00  14
 ...
 \endverbatim
 
-Which shows that for the LU 11 kernel with a 1MiB matrix, the average
+which shows that for the LU 11 kernel with a 1MiB matrix, the average
 execution time on CPUs was about 25ms, with a 0.2ms standard deviation, over
 8 samples. It is a good idea to check this before doing actual performance
 measurements.
@@ -373,7 +377,7 @@ A graph can be drawn by using the tool <c>starpu_perfmodel_plot</c>:
 
 \verbatim
 $ starpu_perfmodel_plot -s starpu_slu_lu_model_11
-4096 16384 65536 262144 1048576 4194304 
+4096 16384 65536 262144 1048576 4194304
 $ gnuplot starpu_starpu_slu_lu_model_11.gp
 $ gv starpu_starpu_slu_lu_model_11.eps
 \endverbatim
@@ -451,28 +455,29 @@ STARPU_BUS_STATS=1</c> and <c>export STARPU_WORKER_STATS=1</c> .
 \section OverheadProfiling Overhead Profiling
 
 \ref OfflinePerformanceTools can already provide an idea of to what extent and
-which part of StarPU bring overhead on the execution time. To get a more precise
-analysis of the parts of StarPU which bring most overhead, <c>gprof</c> can be used.
+which part of StarPU brings an overhead on the execution time. To get a more precise
+analysis of which parts of StarPU bring the most overhead, <c>gprof</c> can be used.
 
 First, recompile and reinstall StarPU with <c>gprof</c> support:
 
 \code
-./configure --enable-perf-debug --disable-shared --disable-build-tests --disable-build-examples
+../configure --enable-perf-debug --disable-shared --disable-build-tests --disable-build-examples
 \endcode
 
 Make sure not to leave a dynamic version of StarPU in the target path: remove
 any remaining <c>libstarpu-*.so</c>
 
 Then relink your application with the static StarPU library, make sure that
-running <c>ldd</c> on your application does not mention any libstarpu
+running <c>ldd</c> on your application does not mention any \c libstarpu
 (i.e. it's really statically-linked).
 
 \code
 gcc test.c -o test $(pkg-config --cflags starpu-1.3) $(pkg-config --libs starpu-1.3)
 \endcode
 
-Now you can run your application, and a <c>gmon.out</c> file should appear in the
-current directory, you can process it by running <c>gprof</c> on your application:
+Now you can run your application; this will create a file
+<c>gmon.out</c> in the current directory, which can be processed by
+running <c>gprof</c> on your application:
 
 \code
 gprof ./test

+ 11 - 11
doc/doxygen/chapters/301_tasks.doxy

@@ -40,7 +40,7 @@ impact that has on the target machine.
 \section TaskSubmission Task Submission
 
 To let StarPU make online optimizations, tasks should be submitted
-asynchronously as much as possible. Ideally, all the tasks should be
+asynchronously as much as possible. Ideally, all tasks should be
 submitted, and mere calls to starpu_task_wait_for_all() or
 starpu_data_unregister() be done to wait for
 termination. StarPU will then be able to rework the whole schedule, overlap
@@ -52,7 +52,7 @@ By default, StarPU will consider the tasks in the order they are submitted by
 the application. If the application programmer knows that some tasks should
 be performed in priority (for instance because their output is needed by many
 other tasks and may thus be a bottleneck if not executed early
-enough), the field starpu_task::priority should be set to transmit the
+enough), the field starpu_task::priority should be set to provide the
 priority information to StarPU.
 
 \section TaskDependencies Task Dependencies
@@ -165,14 +165,14 @@ starpu_task_insert(&dummy_big_cl,
 \endcode
 
 The whole code for this complex data interface is available in the
-directory <c>examples/basic_examples/dynamic_handles.c</c>.
+file <c>examples/basic_examples/dynamic_handles.c</c>.
 
 \section SettingVariableDataHandlesForATask Setting a Variable Number Of Data Handles For a Task
 
-Normally, the number of data handles given to a task is fixed in the
-starpu_codelet::nbuffers codelet field. This field can however be set to
-\ref STARPU_VARIABLE_NBUFFERS, in which case the starpu_task::nbuffers task field
-must be set, and the starpu_task::modes field (or starpu_task::dyn_modes field,
+Normally, the number of data handles given to a task is set with
+starpu_codelet::nbuffers. This field can however be set to
+\ref STARPU_VARIABLE_NBUFFERS, in which case starpu_task::nbuffers
+must be set, and starpu_task::modes (or starpu_task::dyn_modes,
 see \ref SettingManyDataHandlesForATask) should be used to specify the modes for
 the handles.
 
@@ -215,7 +215,7 @@ struct starpu_codelet cl =
 
 Schedulers which are multi-implementation aware (only <c>dmda</c> and
 <c>pheft</c> for now) will use the performance models of all the
-implementations it was given, and pick the one which seems to be the fastest.
+provided implementations, and pick the one which seems to be the fastest.
 
 \section EnablingImplementationAccordingToCapabilities Enabling Implementation According To Capabilities
 
@@ -333,7 +333,7 @@ struct starpu_codelet cl =
 };
 \endcode
 
-Note: the most generic variant should be provided first, as some schedulers are
+Note that the most generic variant should be provided first, as some schedulers are
 not able to try the different variants.
 
 \section InsertTaskUtility Insert Task Utility
@@ -341,7 +341,7 @@ not able to try the different variants.
 StarPU provides the wrapper function starpu_task_insert() to ease
 the creation and submission of tasks.
 
-Here the implementation of the codelet:
+Here is the implementation of a codelet:
 
 \code{.c}
 void func_cpu(void *descr[], void *_args)
@@ -477,7 +477,7 @@ ret = starpu_task_get_task_succs(task, sizeof(tasks)/sizeof(*tasks), tasks);
 \section ParallelTasks Parallel Tasks
 
 StarPU can leverage existing parallel computation libraries by the means of
-parallel tasks. A parallel task is a task which gets worked on by a set of CPUs
+parallel tasks. A parallel task is a task which is run by a set of CPUs
 (called a parallel or combined worker) at the same time, by using an existing
 parallel CPU implementation of the computation to be achieved. This can also be
 useful to improve the load balance between slow CPUs and fast GPUs: since CPUs

+ 7 - 2
doc/doxygen/chapters/310_data_management.doxy

@@ -22,11 +22,16 @@ TODO: intro which mentions consistency among other things
 
 \section DataInterface Data Interface
 
-StarPU provides several data interfaces for programmers to describe the data layout of their application. There are predefined interfaces already available in StarPU. Users can define new data interfaces as explained in \ref DefiningANewDataInterface. All functions provided by StarPU are documented in \ref API_Data_Interfaces. You will find a short list below.
+StarPU provides several data interfaces for programmers to describe
+the data layout of their application. There are predefined interfaces
+already available in StarPU. Users can define new data interfaces as
+explained in \ref DefiningANewDataInterface. All functions provided by
+StarPU are documented in \ref API_Data_Interfaces. You will find a
+short list below.
 
 \subsection VariableDataInterface Variable Data Interface
 
-A variable is a given size byte element, typically a scalar. Here an
+A variable is a given-size byte element, typically a scalar. Here is an
 example of how to register a variable data to StarPU by using
 starpu_variable_data_register().
 

+ 1 - 1
doc/doxygen/chapters/380_offline_performance_tools.doxy

@@ -438,7 +438,7 @@ histogram of the codelet execution time distribution.
 
 More than just codelet performance, it is interesting to get statistics over all
 kinds of StarPU states (allocations, data transfers, etc.). This is particularly
-useful to check what may have gone wrong in the accurracy of the simgrid
+useful to check what may have gone wrong in the accuracy of the SimGrid
 simulation.
 
 This requires the <c>R</c> statistical tool, with the <c>plyr</c>,

+ 5 - 5
doc/doxygen/chapters/470_simgrid.doxy

@@ -23,14 +23,14 @@
 /*! \page SimGridSupport SimGrid Support
 
 StarPU can use SimGrid in order to simulate execution on an arbitrary
-platform. This was tested with simgrid from 3.11 to 3.16, and 3.18 to 3.22.
+platform. This was tested with SimGrid from 3.11 to 3.16, and 3.18 to 3.23.
 Other versions may have compatibility issues. 3.17 notably does not build at
 all. MPI simulation does not work with version 3.22.
 
 \section Preparing Preparing Your Application For Simulation
 
 There are a few technical details which need to be handled for an application to
-be simulated through Simgrid.
+be simulated through SimGrid.
 
 If the application uses <c>gettimeofday</c> to make its
 performance measurements, the real time will be used, which will be bogus. To
@@ -38,19 +38,19 @@ get the simulated time, it has to use starpu_timing_now() which returns the
 virtual timestamp in us.
 
 For some technical reason, the application's .c file which contains \c main() has
-to be recompiled with \c starpu_simgrid_wrap.h, which in the simgrid case will <c># define main()</c>
+to be recompiled with \c starpu_simgrid_wrap.h, which in the SimGrid case will <c># define main()</c>
 into <c>starpu_main()</c>, and it is \c libstarpu which will provide the real \c main() and
 will call the application's \c main().
 
 To be able to test with crazy data sizes, one may want to only allocate
 application data if the macro \c STARPU_SIMGRID is not defined.  Passing a <c>NULL</c> pointer to
 \c starpu_data_register functions is fine, data will never be read/written to by
-StarPU in Simgrid mode anyway.
+StarPU in SimGrid mode anyway.
 
 To be able to run the application with e.g. CUDA simulation on a system which
 does not have CUDA installed, one can fill the starpu_codelet::cuda_funcs with \c (void*)1, to
 express that there is a CUDA implementation, even if one does not actually
-provide it. StarPU will not actually run it in Simgrid mode anyway by default
+provide it. StarPU will not actually run it in SimGrid mode anyway by default
 (unless the ::STARPU_CODELET_SIMGRID_EXECUTE or ::STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT
 flags are set in the codelet)
 

+ 9 - 9
doc/doxygen/chapters/501_environment_variables.doxy

@@ -647,7 +647,7 @@ STARPU_MPI_DRIVER_CALL_FREQUENCY environment variable set to a positive value.
 \anchor STARPU_SIMGRID_TRANSFER_COST
 \addindex __env__STARPU_SIMGRID_TRANSFER_COST
 When set to 1 (which is the default), data transfers (over PCI bus, typically) are taken into account
-in simgrid mode.
+in SimGrid mode.
 </dd>
 
 <dt>STARPU_SIMGRID_CUDA_MALLOC_COST</dt>
@@ -655,7 +655,7 @@ in simgrid mode.
 \anchor STARPU_SIMGRID_CUDA_MALLOC_COST
 \addindex __env__STARPU_SIMGRID_CUDA_MALLOC_COST
 When set to 1 (which is the default), CUDA malloc costs are taken into account
-in simgrid mode.
+in SimGrid mode.
 </dd>
 
 <dt>STARPU_SIMGRID_CUDA_QUEUE_COST</dt>
@@ -663,14 +663,14 @@ in simgrid mode.
 \anchor STARPU_SIMGRID_CUDA_QUEUE_COST
 \addindex __env__STARPU_SIMGRID_CUDA_QUEUE_COST
 When set to 1 (which is the default), CUDA task and transfer queueing costs are
-taken into account in simgrid mode.
+taken into account in SimGrid mode.
 </dd>
 
 <dt>STARPU_PCI_FLAT</dt>
 <dd>
 \anchor STARPU_PCI_FLAT
 \addindex __env__STARPU_PCI_FLAT
-When unset or set to 0, the platform file created for simgrid will
+When unset or set to 0, the platform file created for SimGrid will
 contain PCI bandwidths and routes.
 </dd>
 
@@ -678,7 +678,7 @@ contain PCI bandwidths and routes.
 <dd>
 \anchor STARPU_SIMGRID_QUEUE_MALLOC_COST
 \addindex __env__STARPU_SIMGRID_QUEUE_MALLOC_COST
-When unset or set to 1, simulate within simgrid the GPU transfer queueing.
+When unset or set to 1, simulate within SimGrid the GPU transfer queueing.
 </dd>
 
 <dt>STARPU_MALLOC_SIMULATION_FOLD</dt>
@@ -695,7 +695,7 @@ MiB. The default is 1, thus allowing 64GiB virtual memory when Linux's
 \anchor STARPU_SIMGRID_TASK_SUBMIT_COST
 \addindex __env__STARPU_SIMGRID_TASK_SUBMIT_COST
 When set to 1 (which is the default), task submission costs are taken into
-account in simgrid mode. This provides more accurate simgrid predictions,
+account in SimGrid mode. This provides more accurate SimGrid predictions,
 especially for the beginning of the execution.
 </dd>
 
@@ -704,7 +704,7 @@ especially for the beginning of the execution.
 \anchor STARPU_SIMGRID_FETCHING_INPUT_COST
 \addindex __env__STARPU_SIMGRID_FETCHING_INPUT_COST
 When set to 1 (which is the default), fetching input costs are taken into
-account in simgrid mode. This provides more accurate simgrid predictions,
+account in SimGrid mode. This provides more accurate SimGrid predictions,
 especially regarding data transfers.
 </dd>
 
@@ -713,7 +713,7 @@ especially regarding data transfers.
 \anchor STARPU_SIMGRID_SCHED_COST
 \addindex __env__STARPU_SIMGRID_SCHED_COST
 When set to 1 (0 is the default), scheduling costs are taken into
-account in simgrid mode. This provides more accurate simgrid predictions,
+account in SimGrid mode. This provides more accurate SimGrid predictions,
 and allows studying scheduling overhead of the runtime system. However,
 it also makes simulation non-deterministic.
 </dd>
@@ -1174,7 +1174,7 @@ average.
 \addindex __env__STARPU_RAND_SEED
 The random scheduler and some examples use random numbers for their own
 working. Depending on the examples, the seed is by default just always 0 or
-the current time() (unless simgrid mode is enabled, in which case it is always
+the current time() (unless SimGrid mode is enabled, in which case it is always
 0). \ref STARPU_RAND_SEED allows to set the seed to a specific value.
 </dd>
 

+ 5 - 5
doc/doxygen/chapters/510_configure_options.doxy

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2013,2015-2017                      Inria
- * Copyright (C) 2010-2017, 2019                                CNRS
+ * Copyright (C) 2010-2017, 2019                          CNRS
  * Copyright (C) 2009-2011,2013-2018                      Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -281,7 +281,7 @@ contain the OpenCL shared libraries---e.g. <c>libOpenCL.so</c>. This defaults to
 \addindex __configure__--enable-opencl-simulator
 Enable considering the provided OpenCL implementation as a simulator, i.e. use
 the kernel duration returned by OpenCL profiling information as wallclock time
-instead of the actual measured real time. This requires simgrid support.
+instead of the actual measured real time. This requires SimGrid support.
 </dd>
 
 <dt>--enable-maximplementations=<c>count</c></dt>
@@ -679,10 +679,10 @@ Enable memory statistics (\ref MemoryFeedback).
 <dd>
 \anchor enable-simgrid
 \addindex __configure__--enable-simgrid
-Enable simulation of execution in simgrid, to allow easy experimentation with
+Enable simulation of execution in SimGrid, to allow easy experimentation with
 various numbers of cores and GPUs, or amount of memory, etc. Experimental.
 
-The path to simgrid can be specified through the <c>SIMGRID_CFLAGS</c> and
+The path to SimGrid can be specified through the <c>SIMGRID_CFLAGS</c> and
 <c>SIMGRID_LIBS</c> environment variables, for instance:
 
 \verbatim
@@ -727,7 +727,7 @@ Use the smpirun at <c>path</c>
 <dd>
 \anchor enable-simgrid-mc
 \addindex __configure__--enable-simgrid-mc
-Enable the Model Checker in simulation of execution in simgrid, to allow
+Enable the Model Checker in simulation of execution in SimGrid, to allow
 exploring various execution paths.
 </dd>
 

+ 4 - 4
doc/doxygen/chapters/api/threads.doxy

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2017                                CNRS
+ * Copyright (C) 2010-2017, 2019                          CNRS
  * Copyright (C) 2009-2011,2014,2016                      Université de Bordeaux
  * Copyright (C) 2011,2012                                Inria
  *
@@ -20,7 +20,7 @@
 
 \brief This section describes the thread facilities provided
 by StarPU. The thread functions are either implemented on top of the
-pthread library or the Simgrid library when the simulated performance
+pthread library or the SimGrid library when the simulated performance
 mode is enabled (\ref SimGridSupport).
 
 \def STARPU_PTHREAD_CREATE_ON
@@ -359,8 +359,8 @@ todo
 \fn void starpu_sleep(float nb_sec)
 \ingroup API_Threads
 Similar to calling Unix' \c sleep function, except that it takes a float
-to allow sub-second sleeping, and when StarPU is compiled in simgrid mode it
-does not really sleep but just makes simgrid record that the thread has taken
+to allow sub-second sleeping, and when StarPU is compiled in SimGrid mode it
+does not really sleep but just makes SimGrid record that the thread has taken
 some time to sleep.
 
 */

+ 2 - 2
doc/doxygen/refman.tex

@@ -37,7 +37,7 @@ Generated by Doxygen.
 This manual documents the usage of StarPU version \STARPUVERSION. Its contents
 was last updated on \STARPUUPDATED.\\
 
-Copyright © 2009–2018 Université de Bordeaux\\
+Copyright © 2009–2018 Université de Bordeaux
 
 Copyright © 2010-2018 CNRS
 
@@ -339,7 +339,7 @@ Documentation License”.
 \hypertarget{GNUFreeDocumentationLicense}{}
 \input{GNUFreeDocumentationLicense}
 
-\part{Index}
+%\part{Index}
 \addcontentsline{toc}{chapter}{Index}
 \printindex
 

+ 3 - 3
examples/callback/prologue.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2013,2014                                Inria
- * Copyright (C) 2010-2017                                CNRS
+ * Copyright (C) 2010-2017,2019                           CNRS
  * Copyright (C) 2009,2010,2013-2015                      Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -92,9 +92,9 @@ int main(void)
 	ret = starpu_task_insert(&cl,
 				 STARPU_RW, handle,
 				 STARPU_PROLOGUE_CALLBACK, prologue_callback_func,
-				 STARPU_PROLOGUE_CALLBACK_ARG, &x,
+				 STARPU_PROLOGUE_CALLBACK_ARG_NFREE, &x,
 				 STARPU_PROLOGUE_CALLBACK_POP, pop_prologue_callback_func,
-				 STARPU_PROLOGUE_CALLBACK_POP_ARG, 5,
+				 STARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE, 5,
 				 0);
 	if (ret == -ENODEV) goto enodev;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");

+ 2 - 2
examples/cholesky/cholesky.sh

@@ -1,7 +1,7 @@
 #!/bin/bash
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2018                                     Université de Bordeaux
+# Copyright (C) 2018-2019                                Université de Bordeaux
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -61,5 +61,5 @@ do
 	COMMA=", "
 done
 gnuplot cholesky.gp
-gv $OUTFILE
+#gv $OUTFILE
 true

+ 2 - 0
examples/cpp/add_vectors_interface.cpp

@@ -280,6 +280,7 @@ static struct starpu_data_interface_ops interface_vector_cpp_ops =
 	.pointer_is_inside = vector_cpp_pointer_is_inside,
 	.get_size = vector_cpp_interface_get_size,
 	.get_alloc_size = NULL,
+	.get_max_size = NULL,
 	.footprint = footprint_vector_cpp_interface_crc32,
 	.alloc_footprint = NULL,
 	.compare = vector_cpp_compare,
@@ -307,6 +308,7 @@ static struct starpu_data_interface_ops interface_vector_cpp_ops =
 	vector_cpp_pointer_is_inside,
 	vector_cpp_interface_get_size,
 	NULL,
+	NULL,
 	footprint_vector_cpp_interface_crc32,
 	NULL,
 	vector_cpp_compare,

+ 1 - 1
examples/dependency/sequential_consistency.c

@@ -69,7 +69,7 @@ void cpu_codeletA(void *descr[], void *args)
 
 	ret = starpu_task_insert(&clB,
 				 STARPU_RW, value_handle,
-				 STARPU_CALLBACK_WITH_ARG, starpu_tag_notify_from_apps, tagHoldC,
+				 STARPU_CALLBACK_WITH_ARG_NFREE, starpu_tag_notify_from_apps, tagHoldC,
 				 STARPU_HANDLES_SEQUENTIAL_CONSISTENCY, handle_sequential_consistency,
 				 STARPU_NAME, "taskB",
 				 0);

+ 1 - 1
examples/dependency/task_end_dep_add.c

@@ -47,7 +47,7 @@ void cpu_codelet(void *descr[], void *args)
 	starpu_task_end_dep_add(task, 1);
 
 	starpu_task_insert(&cl2,
-			   STARPU_CALLBACK_WITH_ARG, starpu_task_end_dep_release, task,
+			   STARPU_CALLBACK_WITH_ARG_NFREE, starpu_task_end_dep_release, task,
 			   0);
 	STARPU_ASSERT(*val == INIT);
 	*val *= 2;

+ 2 - 2
examples/pipeline/pipeline.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012-2015,2017                           CNRS
+ * Copyright (C) 2012-2015,2017,2019                      CNRS
  * Copyright (C) 2012,2014-2017                           Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -243,7 +243,7 @@ int main(void)
 
 		ret = starpu_task_insert(&pipeline_codelet_sum,
 				STARPU_R, buffersY[l%K],
-				STARPU_CALLBACK_WITH_ARG, (void (*)(void*))sem_post, &sems[l%C],
+				STARPU_CALLBACK_WITH_ARG_NFREE, (void (*)(void*))sem_post, &sems[l%C],
 				STARPU_TAG_ONLY, (starpu_tag_t) l,
 				0);
 		if (ret == -ENODEV) goto enodev;

+ 6 - 1
include/starpu_config.h.in

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011,2012,2014,2016,2017                 Inria
- * Copyright (C) 2009-2018                                Université de Bordeaux
+ * Copyright (C) 2009-2019                                Université de Bordeaux
  * Copyright (C) 2010-2017,2019                           CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -43,6 +43,11 @@
 #undef STARPU_SIMGRID_MC
 #undef STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT
 #undef STARPU_HAVE_SIMGRID_MSG_H
+#undef STARPU_HAVE_SIMGRID_ACTOR_H
+#undef STARPU_HAVE_SIMGRID_SEMAPHORE_H
+#undef STARPU_HAVE_SIMGRID_MUTEX_H
+#undef STARPU_HAVE_SIMGRID_COND_H
+#undef STARPU_HAVE_SIMGRID_BARRIER_H
 #undef STARPU_HAVE_XBT_SYNCHRO_H
 #undef STARPU_HAVE_VALGRIND_H
 #undef STARPU_HAVE_MEMCHECK_H

+ 13 - 0
include/starpu_data_interfaces.h

@@ -456,6 +456,14 @@ struct starpu_data_interface_ops
 	size_t 		 (*get_alloc_size)		(starpu_data_handle_t handle);
 
 	/**
+	   Return the maximum size that the data may need to increase to. For
+	   instance, in the case of compressed matrix tiles this is the size
+	   when the block is fully dense.
+	   This is currently only used for feedback tools.
+	*/
+	size_t 		 (*get_max_size)		(starpu_data_handle_t handle);
+
+	/**
 	  Return a 32bit footprint which characterizes the data size and layout (nx, ny, ld, elemsize, etc.), required for indexing performance models.
 
 	  starpu_hash_crc32c_be() and alike can be used to produce this 32bit value from various types of values.
@@ -656,6 +664,11 @@ size_t starpu_data_get_size(starpu_data_handle_t handle);
 size_t starpu_data_get_alloc_size(starpu_data_handle_t handle);
 
 /**
+   Return the maximum size that the \p handle data may need to increase to.
+*/
+starpu_ssize_t starpu_data_get_max_size(starpu_data_handle_t handle);
+
+/**
    Return the handle corresponding to the data pointed to by the \p ptr host pointer.
 */
 starpu_data_handle_t starpu_data_lookup(const void *ptr);

+ 41 - 15
include/starpu_task.h

@@ -722,7 +722,9 @@ struct starpu_task
 
 	   With starpu_task_insert() and alike this can be specified thanks to
 	   ::STARPU_CALLBACK followed by the function pointer, or thanks to
-	   ::STARPU_CALLBACK_WITH_ARG followed by the function pointer and the argument.
+	   ::STARPU_CALLBACK_WITH_ARG (or
+	   ::STARPU_CALLBACK_WITH_ARG_NFREE) followed by the function
+	   pointer and the argument.
 	*/
 	void (*callback_func)(void *);
 	/**
@@ -733,7 +735,9 @@ struct starpu_task
 
 	   With starpu_task_insert() and alike this can be specified thanks to
 	   ::STARPU_CALLBACK_ARG followed by the argument, or thanks to
-	   ::STARPU_CALLBACK_WITH_ARG followed by the function pointer and the argument.
+	   ::STARPU_CALLBACK_WITH_ARG or
+	   ::STARPU_CALLBACK_WITH_ARG_NFREE followed by the function
+	   pointer and the argument.
 	*/
 	void *callback_arg;
 
@@ -751,6 +755,7 @@ struct starpu_task
 	   ::STARPU_PROLOGUE_CALLBACK followed by the function pointer.
 	*/
 	void (*prologue_callback_func)(void *);
+
 	/**
 	   Optional field, the default value is <c>NULL</c>. This is
 	   the pointer passed to the prologue callback function. This
@@ -758,7 +763,7 @@ struct starpu_task
 	   starpu_task::prologue_callback_func is set to <c>NULL</c>.
 
 	   With starpu_task_insert() and alike this can be specified thanks to
-	   ::STARPU_PROLOGUE_CALLBACK followed by the function pointer.
+	   ::STARPU_PROLOGUE_CALLBACK_ARG followed by the argument
 	*/
 	void *prologue_callback_arg;
 
@@ -789,6 +794,7 @@ struct starpu_task
 	   ::STARPU_CL_ARGS.
 	*/
 	unsigned cl_arg_free:1;
+
 	/**
 	   Optional field. In case starpu_task::callback_arg was
 	   allocated by the application through <c>malloc()</c>,
@@ -796,9 +802,12 @@ struct starpu_task
 	   automatically call <c>free(callback_arg)</c> when
 	   destroying the task.
 
-	   TODO: does not have a starpu_task_insert() equivalent
+	   With starpu_task_insert() and alike, this is set to 1 when using
+	   ::STARPU_CALLBACK_ARG or ::STARPU_CALLBACK_WITH_ARG, or set
+	   to 0 when using ::STARPU_CALLBACK_ARG_NFREE or ::STARPU_CALLBACK_WITH_ARG_NFREE
 	*/
 	unsigned callback_arg_free:1;
+
 	/**
 	   Optional field. In case starpu_task::prologue_callback_arg
 	   was allocated by the application through <c>malloc()</c>,
@@ -806,9 +815,12 @@ struct starpu_task
 	   StarPU automatically call
 	   <c>free(prologue_callback_arg)</c> when destroying the task.
 
-	   TODO: does not have a starpu_task_insert() equivalent
+	   With starpu_task_insert() and alike this is set to 1 when using
+	   ::STARPU_PROLOGUE_CALLBACK_ARG, or set to 0 when using
+	   ::STARPU_PROLOGUE_CALLBACK_ARG_NFREE
 	*/
 	unsigned prologue_callback_arg_free:1;
+
 	/**
 	   Optional field. In case starpu_task::prologue_callback_pop_arg
 	   was allocated by the application through <c>malloc()</c>,
@@ -817,7 +829,9 @@ struct starpu_task
 	   <c>free(prologue_callback_pop_arg)</c> when destroying the
 	   task.
 
-	   TODO: does not have a starpu_task_insert() equivalent
+	   With starpu_task_insert() and alike this is set to 1 when using
+	   ::STARPU_PROLOGUE_CALLBACK_POP_ARG, or set to 0 when using
+	   ::STARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE
 	*/
 	unsigned prologue_callback_pop_arg_free:1;
 
@@ -840,7 +854,8 @@ struct starpu_task
 	   this flag permits to disable sequential consistency for
 	   this task, even if data have it enabled.
 
-	   TODO: does not have a starpu_task_insert() equivalent
+	   With starpu_task_insert() and alike this can be specified thanks to
+	   ::STARPU_SEQUENTIAL_CONSISTENCY followed by an unsigned.
 	*/
 	unsigned sequential_consistency:1;
 
@@ -914,7 +929,9 @@ struct starpu_task
 	/**
 	   do not allocate a submitorder id for this task
 
-	   TODO: does not have a starpu_task_insert() equivalent
+	   With starpu_task_insert() and alike this can be specified
+	   thanks to ::STARPU_TASK_NO_SUBMITORDER followed by
+	   an unsigned.
 	*/
 	unsigned no_submitorder:1;
 
@@ -969,7 +986,10 @@ struct starpu_task
 	   workers which are allowed to execute the task.
 	   starpu_task::workerid takes precedence over this.
 
-	   TODO: does not have a starpu_task_insert() equivalent
+	   With starpu_task_insert() and alike, this can be specified
+	   along the field workerids_len thanks to ::STARPU_TASK_WORKERIDS
+	   followed by a number of workers and an array of bits whose
+	   size is the number of workers.
 	*/
 	uint32_t *workerids;
 
@@ -977,7 +997,10 @@ struct starpu_task
 	   Optional field. This provides the number of uint32_t values
 	   in the starpu_task::workerids array.
 
-	   TODO: does not have a starpu_task_insert() equivalent
+	   With starpu_task_insert() and alike, this can be specified
+	   along the field workerids thanks to ::STARPU_TASK_WORKERIDS
+	   followed by a number of workers and an array of bits whose
+	   size is the number of workers.
 	*/
 	unsigned workerids_len;
 
@@ -1072,7 +1095,9 @@ struct starpu_task
 	/**
 	   Optional field. Profiling information for the task.
 
-	   TODO: does not have a starpu_task_insert() equivalent
+	   With starpu_task_insert() and alike this can be specified thanks to
+	   ::STARPU_TASK_PROFILING_INFO followed by a pointer to the
+	   appropriate struct.
 	*/
 	struct starpu_profiling_task_info *profiling_info;
 
@@ -1515,13 +1540,15 @@ void starpu_task_set_implementation(struct starpu_task *task, unsigned impl);
 unsigned starpu_task_get_implementation(struct starpu_task *task);
 
 /**
-   Create (and submit) an empty task that unlocks a tag once all its
+   Create and submit an empty task that unlocks a tag once all its
    dependencies are fulfilled.
  */
 void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t *deps, void (*callback)(void *), void *callback_arg);
 
-
-
+/**
+   Create and submit an empty task with the given callback
+ */
+void starpu_create_callback_task(void (*callback)(void *), void *callback_arg);
 
 /**
    Function to be used as a prologue callback to enable fault tolerance for the
@@ -1536,7 +1563,6 @@ void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t
  */
 void starpu_task_ft_prologue(void *check_ft);
 
-
 /**
    Create a try-task for a \p meta_task, given a \p template_task task
    template. The meta task can be passed as template on the first call, but

+ 85 - 2
include/starpu_task_util.h

@@ -57,7 +57,7 @@ extern "C"
    Used when calling starpu_task_insert(), must be followed by two
    pointers: one to a callback function, and the other to be given as
    an argument to the callback function; this is equivalent to using
-   both ::STARPU_CALLBACK and ::STARPU_CALLBACK_WITH_ARG.
+   both ::STARPU_CALLBACK and ::STARPU_CALLBACK_ARG.
 */
 #define STARPU_CALLBACK_WITH_ARG (3<<STARPU_MODE_SHIFT)
 
@@ -112,9 +112,30 @@ extern "C"
 */
 #define STARPU_SCHED_CTX	 (13<<STARPU_MODE_SHIFT)
 
+/**
+   Used when calling starpu_task_insert(), must be followed by a
+   pointer to a prologue callback function
+*/
 #define STARPU_PROLOGUE_CALLBACK   (14<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_task_insert(), must be followed by a
+   pointer to be given as an argument to the prologue callback
+   function
+*/
 #define STARPU_PROLOGUE_CALLBACK_ARG (15<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_task_insert(), must be followed by a
+   pointer to a prologue callback pop function
+*/
 #define STARPU_PROLOGUE_CALLBACK_POP   (16<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_task_insert(), must be followed by a
+   pointer to be given as an argument to the prologue callback pop
+   function
+*/
 #define STARPU_PROLOGUE_CALLBACK_POP_ARG (17<<STARPU_MODE_SHIFT)
 
 /**
@@ -220,7 +241,69 @@ extern "C"
 */
 #define STARPU_TASK_END_DEP	(32<<STARPU_MODE_SHIFT)
 
-#define STARPU_SHIFTED_MODE_MAX (33<<STARPU_MODE_SHIFT)
+/**
+   Used when calling starpu_task_insert(), must be followed by an
+   unsigned being a number of workers, and an array of bits whose size
+   is the number of workers, the array indicates the set of workers
+   which are allowed to execute the task.
+*/
+#define STARPU_TASK_WORKERIDS (33<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_task_insert(), must be followed by an
+   unsigned which sets the sequential consistency for the data
+   parameters of the task.
+*/
+#define STARPU_SEQUENTIAL_CONSISTENCY (34<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_task_insert() and alike, must be followed
+   by a pointer to a struct starpu_profiling_task_info
+ */
+#define STARPU_TASK_PROFILING_INFO (35<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_task_insert() and alike, must be followed
+   by an unsigned specifying not to allocate a submitorder id for the task
+ */
+#define STARPU_TASK_NO_SUBMITORDER (36<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_task_insert(), similarly to
+   ::STARPU_CALLBACK_ARG, must be followed by a pointer to be given as
+   an argument to the callback function, the argument will not be
+   freed, i.e. starpu_task::callback_arg_free will be set to 0
+*/
+#define STARPU_CALLBACK_ARG_NFREE	 (37<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_task_insert(), similarly to
+   ::STARPU_CALLBACK_WITH_ARG, must be followed by two pointers: one
+   to a callback function, and the other to be given as an argument to
+   the callback function; this is equivalent to using both
+   ::STARPU_CALLBACK and ::STARPU_CALLBACK_ARG_NFREE.
+*/
+#define STARPU_CALLBACK_WITH_ARG_NFREE	 (38<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_task_insert(), similarly to
+   ::STARPU_PROLOGUE_CALLBACK_ARG, must be followed by a
+   pointer to be given as an argument to the prologue callback
+   function, the argument will not be
+   freed, i.e. starpu_task::prologue_callback_arg_free will be set to 0
+*/
+#define STARPU_PROLOGUE_CALLBACK_ARG_NFREE (39<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_task_insert(), similarly to
+   ::STARPU_PROLOGUE_CALLBACK_POP_ARG, must be followed by a pointer
+   to be given as an argument to the prologue callback pop function,
+   the argument will not be freed, i.e.
+   starpu_task::prologue_callback_pop_arg_free will be set to 0
+*/
+#define STARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE (40<<STARPU_MODE_SHIFT)
+
+#define STARPU_SHIFTED_MODE_MAX (41<<STARPU_MODE_SHIFT)
 
 /**
    Set the given \p task corresponding to \p cl with the following arguments.

+ 51 - 3
include/starpu_thread.h

@@ -30,6 +30,21 @@
 #else
 #include <xbt/synchro_core.h>
 #endif
+#ifdef STARPU_HAVE_SIMGRID_ACTOR_H
+#include <simgrid/actor.h>
+#endif
+#ifdef STARPU_HAVE_SIMGRID_SEMAPHORE_H
+#include <simgrid/semaphore.h>
+#endif
+#ifdef STARPU_HAVE_SIMGRID_MUTEX_H
+#include <simgrid/mutex.h>
+#endif
+#ifdef STARPU_HAVE_SIMGRID_COND_H
+#include <simgrid/cond.h>
+#endif
+#ifdef STARPU_HAVE_SIMGRID_BARRIER_H
+#include <simgrid/barrier.h>
+#endif
 #ifdef STARPU_HAVE_SIMGRID_MSG_H
 #include <simgrid/msg.h>
 #else
@@ -52,12 +67,21 @@ extern "C"
 
 #ifdef STARPU_SIMGRID
 
+#ifdef STARPU_HAVE_SIMGRID_ACTOR_H
+typedef sg_actor_t starpu_pthread_t;
+#else
 typedef msg_process_t starpu_pthread_t;
+#endif
 typedef int starpu_pthread_attr_t;
 
+#ifdef STARPU_HAVE_SIMGRID_ACTOR_H
+typedef sg_host_t starpu_sg_host_t;
+#else
+typedef msg_host_t starpu_sg_host_t;
+#endif
 int starpu_pthread_equal(starpu_pthread_t t1, starpu_pthread_t t2);
 starpu_pthread_t starpu_pthread_self(void);
-int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg, msg_host_t host);
+int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg, starpu_sg_host_t host);
 int starpu_pthread_create(starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg);
 int starpu_pthread_join(starpu_pthread_t thread, void **retval);
 int starpu_pthread_exit(void *retval) STARPU_ATTRIBUTE_NORETURN;
@@ -97,7 +121,11 @@ typedef pthread_attr_t starpu_pthread_attr_t;
  */
 
 #ifdef STARPU_SIMGRID
+#ifdef STARPU_HAVE_SIMGRID_MUTEX_H
+typedef sg_mutex_t starpu_pthread_mutex_t;
+#else
 typedef xbt_mutex_t starpu_pthread_mutex_t;
+#endif
 typedef int starpu_pthread_mutexattr_t;
 
 #define STARPU_PTHREAD_MUTEX_INITIALIZER NULL
@@ -173,7 +201,11 @@ typedef pthread_key_t starpu_pthread_key_t;
 
 #ifdef STARPU_SIMGRID
 
+#ifdef STARPU_HAVE_SIMGRID_COND_H
+typedef sg_cond_t starpu_pthread_cond_t;
+#else
 typedef xbt_cond_t starpu_pthread_cond_t;
+#endif
 typedef int starpu_pthread_condattr_t;
 #define STARPU_PTHREAD_COND_INITIALIZER NULL
 
@@ -211,7 +243,11 @@ int starpu_pthread_cond_wait(starpu_pthread_cond_t *cond, starpu_pthread_mutex_t
 
 #ifdef STARPU_SIMGRID
 
+#ifdef STARPU_HAVE_SIMGRID_MUTEX_H
+typedef sg_mutex_t starpu_pthread_rwlock_t;
+#else
 typedef xbt_mutex_t starpu_pthread_rwlock_t;
+#endif
 typedef int starpu_pthread_rwlockattr_t;
 
 int starpu_pthread_rwlock_init(starpu_pthread_rwlock_t *rwlock, const starpu_pthread_rwlockattr_t *attr);
@@ -252,10 +288,18 @@ int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock);
 
 #if defined(STARPU_SIMGRID) || (!defined(STARPU_HAVE_PTHREAD_BARRIER) && (!defined(_MSC_VER) || defined(BUILDING_STARPU)))
 
-#if defined(STARPU_SIMGRID) && (defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT) || defined(xbt_barrier_init))
+#if defined(STARPU_SIMGRID) && (defined(STARPU_HAVE_SIMGRID_BARRIER_H) || defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT) || defined(xbt_barrier_init))
+#ifdef STARPU_HAVE_SIMGRID_BARRIER_H
+typedef sg_bar_t starpu_pthread_barrier_t;
+#else
 typedef xbt_bar_t starpu_pthread_barrier_t;
+#endif
 typedef int starpu_pthread_barrierattr_t;
-#define STARPU_PTHREAD_BARRIER_SERIAL_THREAD XBT_BARRIER_SERIAL_PROCESS
+#ifdef SG_BARRIER_SERIAL_THREAD
+#  define STARPU_PTHREAD_BARRIER_SERIAL_THREAD SG_BARRIER_SERIAL_THREAD
+#else
+#  define STARPU_PTHREAD_BARRIER_SERIAL_THREAD -1
+#endif
 #else
 typedef struct {
 	starpu_pthread_mutex_t mutex;
@@ -419,7 +463,11 @@ int starpu_pthread_wait_destroy(starpu_pthread_wait_t *w);
 
 #ifdef STARPU_SIMGRID
 
+#ifdef STARPU_HAVE_SIMGRID_SEMAPHORE_H
+typedef sg_sem_t starpu_sem_t;
+#else
 typedef msg_sem_t starpu_sem_t;
+#endif
 int starpu_sem_destroy(starpu_sem_t *);
 int starpu_sem_getvalue(starpu_sem_t *, int *);
 int starpu_sem_init(starpu_sem_t *, int, unsigned);

+ 1 - 1
include/starpu_worker.h

@@ -488,7 +488,7 @@ int starpu_combined_worker_get_size(void);
 
 /**
    Return the rank of the current thread within the combined worker.
-   Can only be used in ::STARPU_FORKJOIN parallel tasks, to know which
+   Can only be used in ::STARPU_SPMD parallel tasks, to know which
    part of the task to work on.
 */
 int starpu_combined_worker_get_rank(void);

+ 38 - 1
mpi/src/starpu_mpi_task_insert.c

@@ -375,10 +375,19 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
 			(void)va_arg(varg_list_copy, _starpu_callback_func_t);
 			(void)va_arg(varg_list_copy, void *);
 		}
+		else if (arg_type==STARPU_CALLBACK_WITH_ARG_NFREE)
+		{
+			(void)va_arg(varg_list_copy, _starpu_callback_func_t);
+			(void)va_arg(varg_list_copy, void *);
+		}
 		else if (arg_type==STARPU_CALLBACK_ARG)
 		{
 			(void)va_arg(varg_list_copy, void *);
 		}
+		else if (arg_type==STARPU_CALLBACK_ARG_NFREE)
+		{
+			(void)va_arg(varg_list_copy, void *);
+		}
 		else if (arg_type==STARPU_PRIORITY)
 		{
 			prio = va_arg(varg_list_copy, int);
@@ -411,6 +420,10 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
                 {
                         (void)va_arg(varg_list_copy, void *);
                 }
+                else if (arg_type==STARPU_PROLOGUE_CALLBACK_ARG_NFREE)
+                {
+                        (void)va_arg(varg_list_copy, void *);
+                }
                 else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP)
                 {
 			(void)va_arg(varg_list_copy, _starpu_callback_func_t);
@@ -419,6 +432,10 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
                 {
                         (void)va_arg(varg_list_copy, void *);
 		}
+                else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE)
+                {
+                        (void)va_arg(varg_list_copy, void *);
+		}
 		else if (arg_type==STARPU_EXECUTE_WHERE)
 		{
 			// the flag is decoded and set later when
@@ -469,6 +486,23 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
 		{
 			(void)va_arg(varg_list_copy, int);
 		}
+		else if (arg_type==STARPU_TASK_WORKERIDS)
+		{
+			(void)va_arg(varg_list_copy, unsigned);
+			(void)va_arg(varg_list_copy, uint32_t*);
+		}
+		else if (arg_type==STARPU_SEQUENTIAL_CONSISTENCY)
+		{
+			(void)va_arg(varg_list_copy, unsigned);
+		}
+		else if (arg_type==STARPU_TASK_PROFILING_INFO)
+		{
+			(void)va_arg(varg_list_copy, struct starpu_profiling_task_info *);
+		}
+		else if (arg_type==STARPU_TASK_NO_SUBMITORDER)
+		{
+			(void)va_arg(varg_list_copy, unsigned);
+		}
 		else
 		{
 			STARPU_ABORT_MSG("Unrecognized argument %d, did you perhaps forget to end arguments with 0?\n", arg_type);
@@ -552,6 +586,9 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru
 
 		*task = starpu_task_create();
 		(*task)->cl_arg_free = 1;
+		(*task)->callback_arg_free = 1;
+		(*task)->prologue_callback_arg_free = 1;
+		(*task)->prologue_callback_pop_arg_free = 1;
 
 		va_copy(varg_list_copy, varg_list);
 		_starpu_task_insert_create(codelet, *task, varg_list_copy);
@@ -824,7 +861,7 @@ void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle,
 				// Submit taskA
 				starpu_task_insert(&_starpu_mpi_redux_data_read_cl,
 						   STARPU_R, data_handle,
-						   STARPU_CALLBACK_WITH_ARG, _starpu_mpi_redux_data_recv_callback, args,
+						   STARPU_CALLBACK_WITH_ARG_NFREE, _starpu_mpi_redux_data_recv_callback, args,
 						   0);
 			}
 		}

+ 47 - 0
mpi/src/starpu_mpi_task_insert_fortran.c

@@ -210,11 +210,23 @@ int _fstarpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_no
 			arg_i++;
 			/* void* */
 		}
+		else if (arg_type==STARPU_CALLBACK_WITH_ARG_NFREE)
+		{
+			arg_i++;
+			/* _starpu_callback_func_t */
+			arg_i++;
+			/* void* */
+		}
 		else if (arg_type==STARPU_CALLBACK_ARG)
 		{
 			arg_i++;
 			/* void* */
 		}
+		else if (arg_type==STARPU_CALLBACK_ARG_NFREE)
+		{
+			arg_i++;
+			/* void* */
+		}
 		else if (arg_type==STARPU_PRIORITY)
 		{
 			prio = *(int *)arglist[arg_i];
@@ -255,6 +267,11 @@ int _fstarpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_no
 			arg_i++;
 			/* void* */
                 }
+                else if (arg_type==STARPU_PROLOGUE_CALLBACK_ARG_NFREE)
+                {
+			arg_i++;
+			/* void* */
+                }
                 else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP)
                 {
 			arg_i++;
@@ -265,6 +282,11 @@ int _fstarpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_no
 			arg_i++;
 			/* void* */
 		}
+                else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE)
+                {
+			arg_i++;
+			/* void* */
+		}
 		else if (arg_type==STARPU_EXECUTE_WHERE)
 		{
 			arg_i++;
@@ -320,6 +342,28 @@ int _fstarpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_no
 			arg_i++;
 			/* int */
 		}
+		else if (arg_type==STARPU_TASK_WORKERIDS)
+		{
+			arg_i++;
+			/* unsigned */
+			arg_i++;
+			/* uint32_t* */
+		}
+		else if (arg_type==STARPU_SEQUENTIAL_CONSISTENCY)
+		{
+			arg_i++;
+			/* unsigned */
+		}
+		else if (arg_type==STARPU_TASK_PROFILING_INFO)
+		{
+			arg_i++;
+			/* struct starpu_profiling_task_info * */
+		}
+		else if (arg_type==STARPU_TASK_NO_SUBMITORDER)
+		{
+			arg_i++;
+			/* unsigned */
+		}
 		else
 		{
 			STARPU_ABORT_MSG("Unrecognized argument %d, did you perhaps forget to end arguments with 0?\n", arg_type);
@@ -400,6 +444,9 @@ int _fstarpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, str
 
 		*task = starpu_task_create();
 		(*task)->cl_arg_free = 1;
+		(*task)->callback_arg_free = 1;
+		(*task)->prologue_callback_arg_free = 1;
+		(*task)->prologue_callback_pop_arg_free = 1;
 
 		_fstarpu_task_insert_create(codelet, *task, arglist);
 		return 0;

+ 5 - 5
mpi/tests/callback.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2013-2015,2017                           CNRS
+ * Copyright (C) 2013-2015,2017,2019                      CNRS
  * Copyright (C) 2014,2015,2017,2018                      Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -71,7 +71,7 @@ int main(int argc, char **argv)
 	ret = starpu_mpi_task_insert(MPI_COMM_WORLD,
 				     NULL,
 				     STARPU_EXECUTE_ON_NODE, 0,
-				     STARPU_CALLBACK_WITH_ARG, callback, &x,
+				     STARPU_CALLBACK_WITH_ARG_NFREE, callback, &x,
 				     0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_task_insert");
 
@@ -81,7 +81,7 @@ int main(int argc, char **argv)
 				     NULL,
 				     STARPU_EXECUTE_ON_NODE, 0,
 				     STARPU_CALLBACK, callback,
-				     STARPU_CALLBACK_ARG, &x,
+				     STARPU_CALLBACK_ARG_NFREE, &x,
 				     0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
@@ -93,7 +93,7 @@ int main(int argc, char **argv)
 				     NULL,
 				     STARPU_EXECUTE_ON_NODE, 0,
 				     STARPU_PROLOGUE_CALLBACK, prologue_callback,
-				     STARPU_PROLOGUE_CALLBACK_ARG, &y,
+				     STARPU_PROLOGUE_CALLBACK_ARG_NFREE, &y,
 				     0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
@@ -103,7 +103,7 @@ int main(int argc, char **argv)
 				     &my_codelet,
 				     STARPU_EXECUTE_ON_NODE, 0,
 				     STARPU_PROLOGUE_CALLBACK_POP, prologue_callback,
-				     STARPU_PROLOGUE_CALLBACK_POP_ARG, &y,
+				     STARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE, &y,
 				     0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 

+ 5 - 1
src/common/fxt.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2012,2013,2015                           Inria
- * Copyright (C) 2008-2018                                Université de Bordeaux
+ * Copyright (C) 2008-2019                                Université de Bordeaux
  * Copyright (C) 2010-2018                                CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -69,7 +69,11 @@ long _starpu_gettid(void)
 	 * Don't use the TSD, this is getting called before we would have the
 	 * time to allocate it.  */
 #ifdef STARPU_SIMGRID
+#  ifdef HAVE_SG_ACTOR_SELF
+	return (uintptr_t) sg_actor_self();
+#  else
 	return (uintptr_t) MSG_process_self();
+#  endif
 #else
 #if defined(__linux__)
 	return syscall(SYS_gettid);

+ 4 - 3
src/common/fxt.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011-2017                                Inria
- * Copyright (C) 2008-2018                                Université de Bordeaux
+ * Copyright (C) 2008-2019                                Université de Bordeaux
  * Copyright (C) 2013                                     Joris Pablo
  * Copyright (C) 2018                                     Federal University of Rio Grande do Sul (UFRGS)
  * Copyright (C) 2010-2019                                CNRS
@@ -1106,13 +1106,14 @@ do {										\
 
 #define _STARPU_TRACE_HANDLE_DATA_REGISTER(handle)	do {	\
 	const size_t __data_size = handle->ops->get_size(handle); \
-	char __buf[(FXT_MAX_PARAMS-2)*sizeof(long)]; \
+	const starpu_ssize_t __max_data_size = _starpu_data_get_max_size(handle); \
+	char __buf[(FXT_MAX_PARAMS-4)*sizeof(long)]; \
 	void *__interface = handle->per_node[0].data_interface; \
 	if (handle->ops->describe) \
 		handle->ops->describe(__interface, __buf, sizeof(__buf)); \
 	else \
 		__buf[0] = 0; \
-	FUT_DO_PROBE3STR(_STARPU_FUT_HANDLE_DATA_REGISTER, handle, __data_size, handle->home_node, __buf); \
+	FUT_DO_PROBE4STR(_STARPU_FUT_HANDLE_DATA_REGISTER, handle, __data_size, __max_data_size, handle->home_node, __buf); \
 } while (0)
 
 #define _STARPU_TRACE_HANDLE_DATA_UNREGISTER(handle)	\

+ 144 - 8
src/common/thread.c

@@ -65,10 +65,14 @@ int starpu_pthread_equal(starpu_pthread_t t1, starpu_pthread_t t2)
 
 starpu_pthread_t starpu_pthread_self(void)
 {
+#ifdef HAVE_SG_ACTOR_SELF
+	return sg_actor_self();
+#else
 	return MSG_process_self();
+#endif
 }
 
-int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_pthread_attr_t *attr STARPU_ATTRIBUTE_UNUSED, void *(*start_routine) (void *), void *arg, msg_host_t host)
+int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_pthread_attr_t *attr STARPU_ATTRIBUTE_UNUSED, void *(*start_routine) (void *), void *arg, starpu_sg_host_t host)
 {
 	char **_args;
 	_STARPU_MALLOC(_args, 3*sizeof(char*));
@@ -76,12 +80,20 @@ int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_
 	asprintf(&_args[1], "%p", arg);
 	_args[2] = NULL;
 	if (!host)
+#ifdef STARPU_HAVE_SIMGRID_HOST_H
+		host = sg_host_by_name("MAIN");
+#else
 		host = MSG_get_host_by_name("MAIN");
+#endif
 	void *tsd;
 	_STARPU_CALLOC(tsd, MAX_TSD+1, sizeof(void*));
 	*thread = MSG_process_create_with_arguments(name, _starpu_simgrid_thread_start, tsd, host, 2, _args);
 #if SIMGRID_VERSION >= 31500 && SIMGRID_VERSION != 31559
+#  ifdef HAVE_SG_ACTOR_REF
+	sg_actor_ref(*thread);
+#  else
 	MSG_process_ref(*thread);
+#  endif
 #endif
 	return 0;
 }
@@ -94,19 +106,31 @@ int starpu_pthread_create(starpu_pthread_t *thread, const starpu_pthread_attr_t
 int starpu_pthread_join(starpu_pthread_t thread STARPU_ATTRIBUTE_UNUSED, void **retval STARPU_ATTRIBUTE_UNUSED)
 {
 #if SIMGRID_VERSION >= 31400
+#  ifdef STARPU_HAVE_SIMGRID_ACTOR_H
+	sg_actor_join(thread, 1000000);
+#  else
 	MSG_process_join(thread, 1000000);
+#  endif
 #if SIMGRID_VERSION >= 31500 && SIMGRID_VERSION != 31559
+#  ifdef HAVE_SG_ACTOR_REF
+	sg_actor_unref(thread);
+#  else
 	MSG_process_unref(thread);
+#  endif
 #endif
 #else
-	MSG_process_sleep(1);
+	starpu_sleep(1);
 #endif
 	return 0;
 }
 
 int starpu_pthread_exit(void *retval STARPU_ATTRIBUTE_UNUSED)
 {
+#ifdef HAVE_SG_ACTOR_SELF
+	sg_actor_kill(sg_actor_self());
+#else
 	MSG_process_kill(MSG_process_self());
+#endif
 	STARPU_ABORT_MSG("MSG_process_kill(MSG_process_self()) returned?!");
 }
 
@@ -128,14 +152,22 @@ int starpu_pthread_attr_setdetachstate(starpu_pthread_attr_t *attr STARPU_ATTRIB
 
 int starpu_pthread_mutex_init(starpu_pthread_mutex_t *mutex, const starpu_pthread_mutexattr_t *mutexattr STARPU_ATTRIBUTE_UNUSED)
 {
+#ifdef STARPU_HAVE_SIMGRID_MUTEX_H
+	*mutex = sg_mutex_init();
+#else
 	*mutex = xbt_mutex_init();
+#endif
 	return 0;
 }
 
 int starpu_pthread_mutex_destroy(starpu_pthread_mutex_t *mutex)
 {
 	if (*mutex)
+#ifdef STARPU_HAVE_SIMGRID_MUTEX_H
+		sg_mutex_destroy(*mutex);
+#else
 		xbt_mutex_destroy(*mutex);
+#endif
 	return 0;
 }
 
@@ -148,16 +180,28 @@ int starpu_pthread_mutex_lock(starpu_pthread_mutex_t *mutex)
 	if (!*mutex)
 	{
 		/* Here we may get preempted */
+#ifdef STARPU_HAVE_SIMGRID_MUTEX_H
+		sg_mutex_t new_mutex = sg_mutex_init();
+#else
 		xbt_mutex_t new_mutex = xbt_mutex_init();
+#endif
 		if (!*mutex)
 			*mutex = new_mutex;
 		else
 			/* Somebody already initialized it while we were
-			 * calling xbt_mutex_init, this one is now useless */
+			 * calling sg_mutex_init, this one is now useless */
+#ifdef STARPU_HAVE_SIMGRID_MUTEX_H
+			sg_mutex_destroy(new_mutex);
+#else
 			xbt_mutex_destroy(new_mutex);
+#endif
 	}
 
+#ifdef STARPU_HAVE_SIMGRID_MUTEX_H
+	sg_mutex_lock(*mutex);
+#else
 	xbt_mutex_acquire(*mutex);
+#endif
 
 	_STARPU_TRACE_MUTEX_LOCKED();
 
@@ -168,7 +212,11 @@ int starpu_pthread_mutex_unlock(starpu_pthread_mutex_t *mutex)
 {
 	_STARPU_TRACE_UNLOCKING_MUTEX();
 
+#ifdef STARPU_HAVE_SIMGRID_MUTEX_H
+	sg_mutex_unlock(*mutex);
+#else
 	xbt_mutex_release(*mutex);
+#endif
 
 	_STARPU_TRACE_MUTEX_UNLOCKED();
 
@@ -180,7 +228,9 @@ int starpu_pthread_mutex_trylock(starpu_pthread_mutex_t *mutex)
 	int ret;
 	_STARPU_TRACE_TRYLOCK_MUTEX();
 
-#if defined(HAVE_XBT_MUTEX_TRY_ACQUIRE) || defined(xbt_mutex_try_acquire)
+#ifdef STARPU_HAVE_SIMGRID_MUTEX_H
+	ret = sg_mutex_try_lock(*mutex);
+#elif defined(HAVE_XBT_MUTEX_TRY_ACQUIRE) || defined(xbt_mutex_try_acquire)
 	ret = xbt_mutex_try_acquire(*mutex);
 #else
 	ret = simcall_mutex_trylock((smx_mutex_t)*mutex);
@@ -296,7 +346,11 @@ void* starpu_pthread_getspecific(starpu_pthread_key_t key)
 
 int starpu_pthread_cond_init(starpu_pthread_cond_t *cond, starpu_pthread_condattr_t *cond_attr STARPU_ATTRIBUTE_UNUSED)
 {
+#ifdef STARPU_HAVE_SIMGRID_COND_H
+	*cond = sg_cond_init();
+#else
 	*cond = xbt_cond_init();
+#endif
 	return 0;
 }
 
@@ -307,27 +361,43 @@ static void _starpu_pthread_cond_auto_init(starpu_pthread_cond_t *cond)
 	if (!*cond)
 	{
 		/* Here we may get preempted */
+#ifdef STARPU_HAVE_SIMGRID_COND_H
+		sg_cond_t new_cond = sg_cond_init();
+#else
 		xbt_cond_t new_cond = xbt_cond_init();
+#endif
 		if (!*cond)
 			*cond = new_cond;
 		else
 			/* Somebody already initialized it while we were
 			 * calling sg_cond_init/xbt_cond_init, this one is now useless */
+#ifdef STARPU_HAVE_SIMGRID_COND_H
+			sg_cond_destroy(new_cond);
+#else
 			xbt_cond_destroy(new_cond);
+#endif
 	}
 }
 
 int starpu_pthread_cond_signal(starpu_pthread_cond_t *cond)
 {
 	_starpu_pthread_cond_auto_init(cond);
+#ifdef STARPU_HAVE_SIMGRID_COND_H
+	sg_cond_notify_one(*cond);
+#else
 	xbt_cond_signal(*cond);
+#endif
 	return 0;
 }
 
 int starpu_pthread_cond_broadcast(starpu_pthread_cond_t *cond)
 {
 	_starpu_pthread_cond_auto_init(cond);
+#ifdef STARPU_HAVE_SIMGRID_COND_H
+	sg_cond_notify_all(*cond);
+#else
 	xbt_cond_broadcast(*cond);
+#endif
 	return 0;
 }
 
@@ -336,7 +406,11 @@ int starpu_pthread_cond_wait(starpu_pthread_cond_t *cond, starpu_pthread_mutex_t
 	_STARPU_TRACE_COND_WAIT_BEGIN();
 
 	_starpu_pthread_cond_auto_init(cond);
+#ifdef STARPU_HAVE_SIMGRID_COND_H
+	sg_cond_wait(*cond, *mutex);
+#else
 	xbt_cond_wait(*cond, *mutex);
+#endif
 
 	_STARPU_TRACE_COND_WAIT_END();
 
@@ -357,7 +431,11 @@ int starpu_pthread_cond_timedwait(starpu_pthread_cond_t *cond, starpu_pthread_mu
 	_STARPU_TRACE_COND_WAIT_BEGIN();
 
 	_starpu_pthread_cond_auto_init(cond);
+#ifdef STARPU_HAVE_SIMGRID_COND_H
+	ret = sg_cond_wait_for(*cond, *mutex, delay) ? ETIMEDOUT : 0;
+#else
 	ret = xbt_cond_timedwait(*cond, *mutex, delay) ? ETIMEDOUT : 0;
+#endif
 
 	_STARPU_TRACE_COND_WAIT_END();
 
@@ -370,10 +448,17 @@ int starpu_pthread_cond_timedwait(starpu_pthread_cond_t *cond, starpu_pthread_mu
 int starpu_pthread_cond_destroy(starpu_pthread_cond_t *cond)
 {
 	if (*cond)
+#ifdef STARPU_HAVE_SIMGRID_COND_H
+		sg_cond_destroy(*cond);
+#else
 		xbt_cond_destroy(*cond);
+#endif
 	return 0;
 }
 
+/* TODO: use rwlocks
+ * https://gforge.inria.fr/tracker/index.php?func=detail&aid=17213&group_id=12&atid=165
+ */
 int starpu_pthread_rwlock_init(starpu_pthread_rwlock_t *restrict rwlock, const starpu_pthread_rwlockattr_t *restrict attr STARPU_ATTRIBUTE_UNUSED)
 {
 	return starpu_pthread_mutex_init(rwlock, NULL);
@@ -438,7 +523,32 @@ int starpu_pthread_rwlock_unlock(starpu_pthread_rwlock_t *rwlock)
 	return p_ret;
 }
 
-#if defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT) || defined(xbt_barrier_init)
+#ifdef STARPU_HAVE_SIMGRID_BARRIER_H
+int starpu_pthread_barrier_init(starpu_pthread_barrier_t *restrict barrier, const starpu_pthread_barrierattr_t *restrict attr STARPU_ATTRIBUTE_UNUSED, unsigned count)
+{
+	*barrier = sg_barrier_init(count);
+	return 0;
+}
+
+int starpu_pthread_barrier_destroy(starpu_pthread_barrier_t *barrier)
+{
+	if (*barrier)
+		sg_barrier_destroy(*barrier);
+	return 0;
+}
+
+int starpu_pthread_barrier_wait(starpu_pthread_barrier_t *barrier)
+{
+	int ret;
+
+	_STARPU_TRACE_BARRIER_WAIT_BEGIN();
+
+	ret = sg_barrier_wait(*barrier);
+
+	_STARPU_TRACE_BARRIER_WAIT_END();
+	return ret;
+}
+#elif defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT) || defined(xbt_barrier_init)
 int starpu_pthread_barrier_init(starpu_pthread_barrier_t *restrict barrier, const starpu_pthread_barrierattr_t *restrict attr STARPU_ATTRIBUTE_UNUSED, unsigned count)
 {
 	*barrier = xbt_barrier_init(count);
@@ -454,12 +564,14 @@ int starpu_pthread_barrier_destroy(starpu_pthread_barrier_t *barrier)
 
 int starpu_pthread_barrier_wait(starpu_pthread_barrier_t *barrier)
 {
+	int ret;
+
 	_STARPU_TRACE_BARRIER_WAIT_BEGIN();
 
-	xbt_barrier_wait(*barrier);
+	ret = xbt_barrier_wait(*barrier);
 
 	_STARPU_TRACE_BARRIER_WAIT_END();
-	return 0;
+	return ret;
 }
 #endif /* defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT) */
 
@@ -602,7 +714,7 @@ int starpu_pthread_queue_destroy(starpu_pthread_queue_t *q)
 
 #endif /* STARPU_SIMGRID */
 
-#if (defined(STARPU_SIMGRID) && (!defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT)) && !defined(xbt_barrier_init)) || (!defined(STARPU_SIMGRID) && !defined(STARPU_HAVE_PTHREAD_BARRIER))
+#if (defined(STARPU_SIMGRID) && !defined(STARPU_HAVE_SIMGRID_BARRIER_H) && !defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT) && !defined(xbt_barrier_init)) || (!defined(STARPU_SIMGRID) && !defined(STARPU_HAVE_PTHREAD_BARRIER))
 int starpu_pthread_barrier_init(starpu_pthread_barrier_t *restrict barrier, const starpu_pthread_barrierattr_t *restrict attr STARPU_ATTRIBUTE_UNUSED, unsigned count)
 {
 	int ret = starpu_pthread_mutex_init(&barrier->mutex, NULL);
@@ -929,32 +1041,52 @@ void _starpu_pthread_spin_do_unlock(starpu_pthread_spinlock_t *lock)
 
 int starpu_sem_destroy(starpu_sem_t *sem)
 {
+#ifdef STARPU_HAVE_SIMGRID_SEMAPHORE_H
+	sg_sem_destroy(*sem);
+#else
 	MSG_sem_destroy(*sem);
+#endif
 	return 0;
 }
 
 int starpu_sem_init(starpu_sem_t *sem, int pshared, unsigned value)
 {
 	STARPU_ASSERT_MSG(pshared == 0, "pshared semaphores not supported under simgrid");
+#ifdef STARPU_HAVE_SIMGRID_SEMAPHORE_H
+	*sem = sg_sem_init(value);
+#else
 	*sem = MSG_sem_init(value);
+#endif
 	return 0;
 }
 
 int starpu_sem_post(starpu_sem_t *sem)
 {
+#ifdef STARPU_HAVE_SIMGRID_SEMAPHORE_H
+	sg_sem_release(*sem);
+#else
 	MSG_sem_release(*sem);
+#endif
 	return 0;
 }
 
 int starpu_sem_wait(starpu_sem_t *sem)
 {
+#ifdef STARPU_HAVE_SIMGRID_SEMAPHORE_H
+	sg_sem_acquire(*sem);
+#else
 	MSG_sem_acquire(*sem);
+#endif
 	return 0;
 }
 
 int starpu_sem_trywait(starpu_sem_t *sem)
 {
+#ifdef STARPU_HAVE_SIMGRID_SEMAPHORE_H
+	if (sg_sem_would_block(*sem))
+#else
 	if (MSG_sem_would_block(*sem))
+#endif
 		return EAGAIN;
 	starpu_sem_wait(sem);
 	return 0;
@@ -963,7 +1095,11 @@ int starpu_sem_trywait(starpu_sem_t *sem)
 int starpu_sem_getvalue(starpu_sem_t *sem, int *sval)
 {
 #if SIMGRID_VERSION > 31300
+#  ifdef STARPU_HAVE_SIMGRID_SEMAPHORE_H
+	*sval = sg_sem_get_capacity(*sem);
+#  else
 	*sval = MSG_sem_get_capacity(*sem);
+#  endif
 	return 0;
 #else
 	(void) sem;

+ 2 - 2
src/common/thread.h

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2012,2013                                Inria
  * Copyright (C) 2010-2017                                CNRS
- * Copyright (C) 2010-2014,2016,2017                      Université de Bordeaux
+ * Copyright (C) 2010-2014,2016,2017,2019                 Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -53,7 +53,7 @@ static inline int _starpu_pthread_spin_lock(starpu_pthread_spinlock_t *lock)
 		}
 		/* Give hand to another thread, hopefully the one which has the
 		 * spinlock and probably just has also a short-lived mutex. */
-		MSG_process_sleep(0.000001);
+		starpu_sleep(0.000001);
 		STARPU_UYIELD();
 	}
 #elif defined(STARPU_LINUX_SYS) && defined(STARPU_HAVE_XCHG)

+ 11 - 0
src/common/timing.c

@@ -24,6 +24,9 @@
 
 #ifdef STARPU_SIMGRID
 #include <core/simgrid.h>
+#ifdef HAVE_SIMGRID_ENGINE_H
+#include <simgrid/engine.h>
+#endif
 #endif
 
 #if defined(_WIN32) && !defined(__MINGW32__) && !defined(__CYGWIN__)
@@ -37,7 +40,11 @@ void _starpu_timing_init(void)
 
 void _starpu_clock_gettime(struct timespec *ts)
 {
+#ifdef HAVE_SIMGRID_GET_CLOCK
+	double now = simgrid_get_clock();
+#else
 	double now = MSG_get_clock();
+#endif
 	ts->tv_sec = floor(now);
 	ts->tv_nsec = floor((now - ts->tv_sec) * 1000000000);
 }
@@ -246,7 +253,11 @@ double starpu_timing_timespec_to_us(struct timespec *ts)
 double starpu_timing_now(void)
 {
 #ifdef STARPU_SIMGRID
+#  ifdef HAVE_SIMGRID_GET_CLOCK
+	return simgrid_get_clock()*1000000;
+#  else
 	return MSG_get_clock()*1000000;
+#  endif
 #else
 	struct timespec now;
 	_starpu_clock_gettime(&now);

+ 8 - 0
src/common/utils.c

@@ -536,7 +536,11 @@ void _starpu_gethostname(char *hostname, size_t size)
 void starpu_sleep(float nb_sec)
 {
 #ifdef STARPU_SIMGRID
+#  ifdef HAVE_SG_ACTOR_SLEEP_FOR
+	sg_actor_sleep_for(nb_sec);
+#  else
 	MSG_process_sleep(nb_sec);
+#  endif
 #elif defined(STARPU_HAVE_WINDOWS)
 	Sleep(nb_sec * 1000);
 #else
@@ -552,7 +556,11 @@ void starpu_sleep(float nb_sec)
 void starpu_usleep(float nb_micro_sec)
 {
 #ifdef STARPU_SIMGRID
+#  ifdef HAVE_SG_ACTOR_SLEEP_FOR
+	sg_actor_sleep_for(nb_micro_sec / 1000000);
+#  else
 	MSG_process_sleep(nb_micro_sec / 1000000);
+#  endif
 #elif defined(STARPU_HAVE_WINDOWS)
 	Sleep(nb_micro_sec / 1000);
 #elif HAVE_UNISTD_H

+ 1 - 1
src/core/dependencies/data_concurrency.c

@@ -245,7 +245,7 @@ static unsigned _starpu_attempt_to_submit_data_request(unsigned request_from_cod
  * This is typically used for nodeps tasks, for which a previous task has already
  * waited for the proper conditions, and we just need to take another reference
  * for overall reference coherency.
-/* No lock is held, this acquires and releases the handle header lock */
+ * No lock is held, this acquires and releases the handle header lock */
 static void _starpu_take_data(unsigned request_from_codelet,
 						       starpu_data_handle_t handle, enum starpu_data_access_mode mode,
 						       struct _starpu_job *j)

+ 12 - 7
src/core/dependencies/implicit_data_deps.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011,2012,2016                           Inria
- * Copyright (C) 2010-2018                                Université de Bordeaux
+ * Copyright (C) 2010-2019                                Université de Bordeaux
  * Copyright (C) 2010-2013,2015-2018                      CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -273,7 +273,8 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 			_STARPU_DEP_DEBUG("dependency\n");
 
 			if ((l != &handle->last_submitted_accessors && l->next != &handle->last_submitted_accessors)
-					|| (handle->last_submitted_ghost_accessors_id && handle->last_submitted_ghost_accessors_id->next))
+					|| (handle->last_submitted_ghost_accessors_id && handle->last_submitted_ghost_accessors_id->next)
+					|| (l != &handle->last_submitted_accessors && handle->last_submitted_ghost_accessors_id))
 			{
 				/* Several previous accessors */
 
@@ -314,10 +315,12 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 			}
 			else
 			{
-				/* One previous accessor, make it the sync
-				 * task, and start depending on it. */
+				struct _starpu_jobid_list *ghost_accessors_id = handle->last_submitted_ghost_accessors_id;
+				/* At most one previous accessor or one ghost */
 				if (l != &handle->last_submitted_accessors)
 				{
+					/* One accessor, make it the sync task,
+					 * and start depending on it. */
 					_STARPU_DEP_DEBUG("One previous accessor, depending on it\n");
 					handle->last_sync_task = l->task;
 					l->next = NULL;
@@ -325,13 +328,15 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 					handle->last_submitted_accessors.next = &handle->last_submitted_accessors;
 					handle->last_submitted_accessors.prev = &handle->last_submitted_accessors;
 				}
-				else if (handle->last_submitted_ghost_accessors_id)
+				else if (ghost_accessors_id)
 				{
+					/* One ghost, just remember its id */
 					_STARPU_DEP_DEBUG("No more currently running accessor, but a ghost id, taking it.\n");
-					handle->last_submitted_ghost_sync_id = handle->last_submitted_ghost_accessors_id->id;
+					handle->last_submitted_ghost_sync_id = ghost_accessors_id->id;
 					handle->last_submitted_ghost_sync_id_is_valid = 1;
-					free(handle->last_submitted_ghost_accessors_id);
+					STARPU_ASSERT(!ghost_accessors_id->next);
 					handle->last_submitted_ghost_accessors_id = NULL;
+					free(ghost_accessors_id);
 				}
 				else
 				{

+ 1 - 1
src/core/disk.c

@@ -137,7 +137,7 @@ int starpu_disk_register(struct starpu_disk_ops *func, void *parameter, starpu_s
 #ifdef STARPU_SIMGRID
 	char name[16];
 	snprintf(name, sizeof(name), "DISK%d", n);
-	msg_host_t host = _starpu_simgrid_get_host_by_name(name);
+	starpu_sg_host_t host = _starpu_simgrid_get_host_by_name(name);
 	STARPU_ASSERT_MSG(host, "Could not find disk %s in platform file", name);
 	_starpu_simgrid_memory_node_set_host(disk_memnode, host);
 #endif

+ 3 - 1
src/core/perfmodel/perfmodel_bus.c

@@ -91,6 +91,7 @@ static unsigned ncpus = 0;
 static unsigned nnumas = 0;
 static unsigned ncuda = 0;
 static unsigned nopencl = 0;
+#ifndef STARPU_SIMGRID
 static unsigned nmic = 0;
 static unsigned nmpi_ms = 0;
 
@@ -99,7 +100,6 @@ static unsigned nmpi_ms = 0;
 static double numa_latency[STARPU_MAXNUMANODES][STARPU_MAXNUMANODES];
 static double numa_timing[STARPU_MAXNUMANODES][STARPU_MAXNUMANODES];
 
-#ifndef STARPU_SIMGRID
 static uint64_t cuda_size[STARPU_MAXCUDADEVS];
 #endif
 #ifdef STARPU_USE_CUDA
@@ -675,6 +675,7 @@ static void measure_bandwidth_between_host_and_dev(int dev, struct dev_timing *d
 }
 #endif /* defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) */
 
+#if !defined(STARPU_SIMGRID)
 static void measure_bandwidth_latency_between_numa(int numa_src, int numa_dst)
 {
 #if defined(STARPU_HAVE_HWLOC)
@@ -733,6 +734,7 @@ static void measure_bandwidth_latency_between_numa(int numa_src, int numa_dst)
 		numa_latency[numa_src][numa_dst] = 0;
 	}
 }
+#endif
 
 static void benchmark_all_gpu_devices(void)
 {

+ 1 - 0
src/core/sched_ctx.h

@@ -26,6 +26,7 @@
 #include <starpu_scheduler.h>
 #include <common/config.h>
 #include <common/barrier_counter.h>
+#include <common/utils.h>
 #include <profiling/profiling.h>
 #include <semaphore.h>
 #include <core/task.h>

+ 128 - 37
src/core/simgrid.c

@@ -65,7 +65,7 @@ starpu_pthread_queue_t _starpu_simgrid_transfer_queue[STARPU_MAXNODES];
 static struct transfer_runner
 {
 	struct transfer *first_transfer, *last_transfer;
-	msg_sem_t sem;
+	starpu_sem_t sem;
 	msg_process_t runner;
 } transfer_runner[STARPU_MAXNODES][STARPU_MAXNODES];
 static int transfer_execute(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[] STARPU_ATTRIBUTE_UNUSED);
@@ -74,7 +74,7 @@ starpu_pthread_queue_t _starpu_simgrid_task_queue[STARPU_NMAXWORKERS];
 static struct worker_runner
 {
 	struct task *first_task, *last_task;
-	msg_sem_t sem;
+	starpu_sem_t sem;
 	msg_process_t runner;
 } worker_runner[STARPU_NMAXWORKERS];
 static int task_execute(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[] STARPU_ATTRIBUTE_UNUSED);
@@ -139,7 +139,11 @@ int _starpu_simgrid_get_nbhosts(const char *prefix)
 		snprintf(name, sizeof(name), STARPU_MPI_AS_PREFIX"%d", starpu_mpi_world_rank());
 #if defined(HAVE_MSG_ZONE_GET_HOSTS) || defined(MSG_zone_get_hosts)
 		hosts = xbt_dynar_new(sizeof(sg_host_t), NULL);
+#  if defined(HAVE_SG_ZONE_GET_BY_NAME) || defined(sg_zone_get_by_name)
+		sg_zone_get_hosts(_starpu_simgrid_get_as_by_name(name), hosts);
+#  else
 		MSG_zone_get_hosts(_starpu_simgrid_get_as_by_name(name), hosts);
+#  endif
 #else
 		hosts = MSG_environment_as_get_hosts(_starpu_simgrid_get_as_by_name(name));
 #endif
@@ -151,14 +155,22 @@ int _starpu_simgrid_get_nbhosts(const char *prefix)
 #endif /* HAVE_STARPU_SIMGRID_GET_AS_BY_NAME */
 	}
 	else
+#ifdef STARPU_HAVE_SIMGRID_HOST_H
+		hosts = sg_hosts_as_dynar();
+#else
 		hosts = MSG_hosts_as_dynar();
+#endif
 	nb = xbt_dynar_length(hosts);
 
 	ret = 0;
 	for (i = 0; i < nb; i++)
 	{
 		const char *name;
+#ifdef STARPU_HAVE_SIMGRID_HOST_H
+		name = sg_host_get_name(xbt_dynar_get_as(hosts, i, msg_host_t));
+#else
 		name = MSG_host_get_name(xbt_dynar_get_as(hosts, i, msg_host_t));
+#endif
 		if (!strncmp(name, prefix, len))
 			ret++;
 	}
@@ -178,10 +190,18 @@ unsigned long long _starpu_simgrid_get_memsize(const char *prefix, unsigned devi
 	if (!host)
 		return 0;
 
+#ifdef HAVE_SG_HOST_GET_PROPERTIES
+	if (!sg_host_get_properties(host))
+#else
 	if (!MSG_host_get_properties(host))
+#endif
 		return 0;
 
+#ifdef HAVE_SG_HOST_GET_PROPERTIES
+	memsize = sg_host_get_property_value(host, "memsize");
+#else
 	memsize = MSG_host_get_property_value(host, "memsize");
+#endif
 	if (!memsize)
 		return 0;
 
@@ -195,10 +215,18 @@ msg_host_t _starpu_simgrid_get_host_by_name(const char *name)
 		char mpiname[32];
 		STARPU_ASSERT(starpu_mpi_world_rank);
 		snprintf(mpiname, sizeof(mpiname), STARPU_MPI_AS_PREFIX"%d-%s", starpu_mpi_world_rank(), name);
+#ifdef STARPU_HAVE_SIMGRID_HOST_H
+		return sg_host_by_name(mpiname);
+#else
 		return MSG_get_host_by_name(mpiname);
+#endif
 	}
 	else
+#ifdef STARPU_HAVE_SIMGRID_HOST_H
+		return sg_host_by_name(name);
+#else
 		return MSG_get_host_by_name(name);
+#endif
 }
 
 msg_host_t _starpu_simgrid_get_host_by_worker(struct _starpu_worker *worker)
@@ -261,7 +289,9 @@ void _starpu_start_simgrid(int *argc, char **argv)
 		stack_size = rlim.rlim_cur / 1024;
 #endif
 
-#if SIMGRID_VERSION < 31300
+#ifdef HAVE_SG_CFG_SET_INT
+	sg_cfg_set_int("contexts/stack-size", stack_size);
+#elif SIMGRID_VERSION < 31300
 	extern xbt_cfg_t _sg_cfg_set;
 	xbt_cfg_set_int(_sg_cfg_set, "contexts/stack_size", stack_size);
 #else
@@ -284,7 +314,7 @@ static int main_ret;
 int do_starpu_main(int argc, char *argv[])
 {
 	/* FIXME: Ugly work-around for bug in simgrid: the MPI context is not properly set at MSG process startup */
-	MSG_process_sleep(0.000001);
+	starpu_sleep(0.000001);
 
 	main_ret = starpu_main(argc, argv);
 	return main_ret;
@@ -342,7 +372,7 @@ int main(int argc, char **argv)
 	_STARPU_CALLOC(tsd, MAX_TSD+1, sizeof(void*));
 
 	/* Run the application in a separate thread */
-	MSG_process_create_with_arguments("main", &do_starpu_main, tsd, MSG_get_host_by_name("MAIN"), argc, argv_cpy);
+	MSG_process_create_with_arguments("main", &do_starpu_main, tsd, _starpu_simgrid_get_host_by_name("MAIN"), argc, argv_cpy);
 
 	/* And run maestro in the main thread */
 	MSG_main();
@@ -369,8 +399,12 @@ void _starpu_simgrid_init_early(int *argc STARPU_ATTRIBUTE_UNUSED, char ***argv
 		 * Try using --cfg=contexts/factory:thread instead."
 		 * See https://github.com/simgrid/simgrid/issues/141 */
 		_STARPU_DISP("Warning: In simgrid mode, the file containing the main() function of this application should to be compiled with starpu.h or starpu_simgrid_wrap.h included, to properly rename it into starpu_main to avoid having to use --cfg=contexts/factory:thread which reduces performance\n");
-#if SIMGRID_VERSION >= 31400 /* Only recent versions of simgrid support setting xbt_cfg_set_string before starting simgrid */
+#if SIMGRID_VERSION >= 31400 /* Only recent versions of simgrid support setting sg_cfg_set_string before starting simgrid */
+#  ifdef HAVE_SG_CFG_SET_INT
+		sg_cfg_set_string("contexts/factory", "thread");
+#  else
 		xbt_cfg_set_string("contexts/factory", "thread");
+#  endif
 #endif
 		/* We didn't catch application's main. */
 		/* Start maestro as a separate thread */
@@ -380,7 +414,7 @@ void _starpu_simgrid_init_early(int *argc STARPU_ATTRIBUTE_UNUSED, char ***argv
 		/* And attach the main thread to the main simgrid process */
 		void **tsd;
 		_STARPU_CALLOC(tsd, MAX_TSD+1, sizeof(void*));
-		MSG_process_attach("main", tsd, MSG_get_host_by_name("MAIN"), NULL);
+		MSG_process_attach("main", tsd, _starpu_simgrid_get_host_by_name("MAIN"), NULL);
 		/* We initialized through MSG_process_attach */
 		simgrid_started = 3;
 	}
@@ -422,7 +456,7 @@ void _starpu_simgrid_init(void)
 		snprintf(s, sizeof(s), "worker %u runner", i);
 		void **tsd;
 		_STARPU_CALLOC(tsd, MAX_TSD+1, sizeof(void*));
-		worker_runner[i].sem = MSG_sem_init(0);
+		starpu_sem_init(&worker_runner[i].sem, 0, 0);
 		tsd[0] = (void*)(uintptr_t) i;
 		worker_runner[i].runner = MSG_process_create_with_arguments(s, task_execute, tsd, _starpu_simgrid_get_host_by_worker(_starpu_get_worker_struct(i)), 0, NULL);
 	}
@@ -451,15 +485,17 @@ void _starpu_simgrid_deinit(void)
 			struct transfer_runner *t = &transfer_runner[i][j];
 			if (t->runner)
 			{
-				MSG_sem_release(t->sem);
-#if SIMGRID_VERSION >= 31400
+				starpu_sem_post(&t->sem);
+#ifdef STARPU_HAVE_SIMGRID_ACTOR_H
+				sg_actor_join(t->runner, 1000000);
+#elif SIMGRID_VERSION >= 31400
 				MSG_process_join(t->runner, 1000000);
 #else
-				MSG_process_sleep(1);
+				starpu_sleep(1);
 #endif
 				STARPU_ASSERT(t->first_transfer == NULL);
 				STARPU_ASSERT(t->last_transfer == NULL);
-				MSG_sem_destroy(t->sem);
+				starpu_sem_destroy(&t->sem);
 			}
 		}
 		/* FIXME: queue not empty at this point, needs proper unregistration */
@@ -468,21 +504,29 @@ void _starpu_simgrid_deinit(void)
 	for (i = 0; i < starpu_worker_get_count(); i++)
 	{
 		struct worker_runner *w = &worker_runner[i];
-		MSG_sem_release(w->sem);
-#if SIMGRID_VERSION >= 31400
+		starpu_sem_post(&w->sem);
+#ifdef STARPU_HAVE_SIMGRID_ACTOR_H
+		sg_actor_join(w->runner, 1000000);
+#elif SIMGRID_VERSION >= 31400
 		MSG_process_join(w->runner, 1000000);
 #else
-		MSG_process_sleep(1);
+		starpu_sleep(1);
 #endif
 		STARPU_ASSERT(w->first_task == NULL);
 		STARPU_ASSERT(w->last_task == NULL);
-		MSG_sem_destroy(w->sem);
+		starpu_sem_destroy(&w->sem);
 		starpu_pthread_queue_destroy(&_starpu_simgrid_task_queue[i]);
 	}
 
 #if SIMGRID_VERSION >= 31300
 	/* clean-atexit introduced in simgrid 3.13 */
+#  ifdef HAVE_SG_CFG_SET_INT
+	if ( sg_cfg_get_boolean("debug/clean-atexit"))
+#  elif SIMGRID_VERSION >= 32300
+	if ( xbt_cfg_get_boolean("debug/clean-atexit"))
+#  else
 	if ( xbt_cfg_get_boolean("clean-atexit"))
+#  endif
 	{
 		_starpu_simgrid_deinit_late();
 	}
@@ -495,7 +539,11 @@ void _starpu_simgrid_deinit(void)
 
 struct task
 {
+#ifdef HAVE_SG_ACTOR_SELF_EXECUTE
+	double flops;
+#else
 	msg_task_t task;
+#endif
 
 	/* communication termination signalization */
 	unsigned *finished;
@@ -508,7 +556,7 @@ struct task
 static int task_execute(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[] STARPU_ATTRIBUTE_UNUSED)
 {
 	/* FIXME: Ugly work-around for bug in simgrid: the MPI context is not properly set at MSG process startup */
-	MSG_process_sleep(0.000001);
+	starpu_sleep(0.000001);
 
 	unsigned workerid = (uintptr_t) STARPU_PTHREAD_GETSPECIFIC(0);
 	struct worker_runner *w = &worker_runner[workerid];
@@ -518,7 +566,7 @@ static int task_execute(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[] STARPU_AT
 	{
 		struct task *task;
 
-		MSG_sem_acquire(w->sem);
+		starpu_sem_wait(&w->sem);
 		if (!runners_running)
 			break;
 
@@ -528,8 +576,12 @@ static int task_execute(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[] STARPU_AT
 			w->last_task = NULL;
 
 		_STARPU_DEBUG("task %p started\n", task);
+#ifdef HAVE_SG_ACTOR_SELF_EXECUTE
+		sg_actor_self_execute(task->flops);
+#else
 		MSG_task_execute(task->task);
 		MSG_task_destroy(task->task);
+#endif
 		_STARPU_DEBUG("task %p finished\n", task);
 
 		*task->finished = 1;
@@ -569,7 +621,10 @@ void _starpu_simgrid_wait_tasks(int workerid)
 void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length, unsigned *finished)
 {
 	struct starpu_task *starpu_task = j->task;
+	double flops;
+#ifndef HAVE_SG_ACTOR_SELF_EXECUTE
 	msg_task_t simgrid_task;
+#endif
 
 	if (j->internal)
 		/* This is not useful to include in simulation (and probably
@@ -586,23 +641,33 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
                  * to be able to easily check scheduling robustness */
 	}
 
-	simgrid_task = MSG_task_create(_starpu_job_get_task_name(j),
 #if defined(HAVE_SG_HOST_SPEED) || defined(sg_host_speed)
-			length/1000000.0*sg_host_speed(MSG_host_self()),
+#  if defined(HAVE_SG_HOST_SELF) || defined(sg_host_self)
+	flops = length/1000000.0*sg_host_speed(sg_host_self());
+#  else
+	flops = length/1000000.0*sg_host_speed(MSG_host_self());
+#  endif
 #elif defined HAVE_MSG_HOST_GET_SPEED || defined(MSG_host_get_speed)
-			length/1000000.0*MSG_host_get_speed(MSG_host_self()),
+	flops = length/1000000.0*MSG_host_get_speed(MSG_host_self());
 #else
-			length/1000000.0*MSG_get_host_speed(MSG_host_self()),
+	flops = length/1000000.0*MSG_get_host_speed(MSG_host_self());
+#endif
+
+#ifndef HAVE_SG_ACTOR_SELF_EXECUTE
+	simgrid_task = MSG_task_create(_starpu_job_get_task_name(j), flops, 0, NULL);
 #endif
-			0, NULL);
 
 	if (finished == NULL)
 	{
 		/* Synchronous execution */
 		/* First wait for previous tasks */
 		_starpu_simgrid_wait_tasks(workerid);
+#ifdef HAVE_SG_ACTOR_SELF_EXECUTE
+		sg_actor_self_execute(flops);
+#else
 		MSG_task_execute(simgrid_task);
 		MSG_task_destroy(simgrid_task);
+#endif
 	}
 	else
 	{
@@ -610,13 +675,17 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 		struct task *task;
 		struct worker_runner *w = &worker_runner[workerid];
 		_STARPU_MALLOC(task, sizeof(*task));
+#ifdef HAVE_SG_ACTOR_SELF_EXECUTE
+		task->flops = flops;
+#else
 		task->task = simgrid_task;
+#endif
 		task->finished = finished;
 		*finished = 0;
 		task->next = NULL;
 		/* Sleep 10µs for the GPU task queueing */
 		if (_starpu_simgrid_queue_malloc_cost())
-			MSG_process_sleep(0.000010);
+			starpu_sleep(0.000010);
 		if (w->last_task)
 		{
 			/* Already running a task, queue */
@@ -629,7 +698,7 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 			w->first_task = task;
 			w->last_task = task;
 		}
-		MSG_sem_release(w->sem);
+		starpu_sem_post(&w->sem);
 	}
 }
 
@@ -639,7 +708,11 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 
 /* Note: simgrid is not parallel, so there is no need to hold locks for management of transfers.  */
 LIST_TYPE(transfer,
+#ifdef HAVE_SG_HOST_SEND_TO
+	size_t size;
+#else
 	msg_task_t task;
+#endif
 	int src_node;
 	int dst_node;
 	int run_node;
@@ -730,7 +803,7 @@ static void transfer_queue(struct transfer *transfer)
 			_STARPU_CALLOC(tsd, MAX_TSD+1, sizeof(void*));
 			tsd[0] = (void*)(uintptr_t)((src<<16) + dst);
 			t->runner = MSG_process_create_with_arguments(s, transfer_execute, tsd, _starpu_simgrid_get_memnode_host(src), 0, NULL);
-			t->sem = MSG_sem_init(0);
+			starpu_sem_init(&t->sem, 0, 0);
 		}
 		STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
 	}
@@ -747,14 +820,14 @@ static void transfer_queue(struct transfer *transfer)
 		t->first_transfer = transfer;
 		t->last_transfer = transfer;
 	}
-	MSG_sem_release(t->sem);
+	starpu_sem_post(&t->sem);
 }
 
 /* Actually execute the transfer, and then start transfers waiting for this one.  */
 static int transfer_execute(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[] STARPU_ATTRIBUTE_UNUSED)
 {
 	/* FIXME: Ugly work-around for bug in simgrid: the MPI context is not properly set at MSG process startup */
-	MSG_process_sleep(0.000001);
+	starpu_sleep(0.000001);
 
 	unsigned src_dst = (uintptr_t) STARPU_PTHREAD_GETSPECIFIC(0);
 	unsigned src = src_dst >> 16;
@@ -766,7 +839,7 @@ static int transfer_execute(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[] STARP
 	{
 		struct transfer *transfer;
 
-		MSG_sem_acquire(t->sem);
+		starpu_sem_wait(&t->sem);
 		if (!runners_running)
 			break;
 		transfer = t->first_transfer;
@@ -774,11 +847,21 @@ static int transfer_execute(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[] STARP
 		if (t->last_transfer == transfer)
 			t->last_transfer = NULL;
 
+#ifdef HAVE_SG_HOST_SEND_TO
+		if (transfer->size)
+#else
 		if (transfer->task)
+#endif
 		{
 			_STARPU_DEBUG("transfer %p started\n", transfer);
+#ifdef HAVE_SG_HOST_SEND_TO
+			sg_host_send_to(_starpu_simgrid_memory_node_get_host(transfer->src_node),
+					_starpu_simgrid_memory_node_get_host(transfer->dst_node),
+					transfer->size);
+#else
 			MSG_task_execute(transfer->task);
 			MSG_task_destroy(transfer->task);
+#endif
 			_STARPU_DEBUG("transfer %p finished\n", transfer);
 		}
 
@@ -873,7 +956,11 @@ static void _starpu_simgrid_wait_transfers(void)
 	struct transfer *sync = transfer_new();
 	struct transfer *cur;
 
+#ifdef HAVE_SG_HOST_SEND_TO
+	sync->size = 0;
+#else
 	sync->task = NULL;
+#endif
 	sync->finished = &finished;
 
 	sync->src_node = STARPU_MAIN_RAM;
@@ -931,12 +1018,19 @@ int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node,
 	if (!simgrid_transfer_cost)
 		return 0;
 
+	union _starpu_async_channel_event *event, myevent;
+	double start = 0.;
+	struct transfer *transfer = transfer_new();
+
+	_STARPU_DEBUG("creating transfer %p for %lu bytes\n", transfer, (unsigned long) size);
+
+#ifdef HAVE_SG_HOST_SEND_TO
+	transfer->size = size;
+#else
 	msg_task_t task;
 	msg_host_t *hosts;
 	double *computation;
 	double *communication;
-	union _starpu_async_channel_event *event, myevent;
-	double start = 0.;
 
 	_STARPU_CALLOC(hosts, 2, sizeof(*hosts));
 	_STARPU_CALLOC(computation, 2, sizeof(*computation));
@@ -949,11 +1043,8 @@ int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node,
 
 	task = MSG_parallel_task_create("copy", 2, hosts, computation, communication, NULL);
 
-	struct transfer *transfer = transfer_new();
-
-	_STARPU_DEBUG("creating transfer %p for %lu bytes\n", transfer, (unsigned long) size);
-
 	transfer->task = task;
+#endif
 	transfer->src_node = src_node;
 	transfer->dst_node = dst_node;
 	transfer->run_node = starpu_worker_get_local_memory_node();
@@ -976,7 +1067,7 @@ int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node,
 
 	/* Sleep 10µs for the GPU transfer queueing */
 	if (_starpu_simgrid_queue_malloc_cost())
-		MSG_process_sleep(0.000010);
+		starpu_sleep(0.000010);
 	transfer_submit(transfer);
 	/* Note: from here, transfer might be already freed */
 
@@ -1007,7 +1098,7 @@ _starpu_simgrid_thread_start(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[])
 	void *arg = (void*) (uintptr_t) strtol(argv[1], NULL, 16);
 
 	/* FIXME: Ugly work-around for bug in simgrid: the MPI context is not properly set at MSG process startup */
-	MSG_process_sleep(0.000001);
+	starpu_sleep(0.000001);
 
 	/* _args is freed with process context */
 	f(arg);

+ 5 - 5
src/core/simgrid.h

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2016,2017                                Inria
  * Copyright (C) 2013,2017                                CNRS
- * Copyright (C) 2012-2018                                Université de Bordeaux
+ * Copyright (C) 2012-2019                                Université de Bordeaux
  * Copyright (C) 2013                                     Thibaut Lambert
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -62,10 +62,10 @@ void _starpu_simgrid_sync_gpus(void);
 /* Return the number of hosts prefixed by PREFIX */
 int _starpu_simgrid_get_nbhosts(const char *prefix);
 unsigned long long _starpu_simgrid_get_memsize(const char *prefix, unsigned devid);
-msg_host_t _starpu_simgrid_get_host_by_name(const char *name);
-msg_host_t _starpu_simgrid_get_memnode_host(unsigned node);
+starpu_sg_host_t _starpu_simgrid_get_host_by_name(const char *name);
+starpu_sg_host_t _starpu_simgrid_get_memnode_host(unsigned node);
 struct _starpu_worker;
-msg_host_t _starpu_simgrid_get_host_by_worker(struct _starpu_worker *worker);
+starpu_sg_host_t _starpu_simgrid_get_host_by_worker(struct _starpu_worker *worker);
 void _starpu_simgrid_get_platform_path(int version, char *path, size_t maxlen);
 msg_as_t _starpu_simgrid_get_as_by_name(const char *name);
 #pragma weak starpu_mpi_world_rank
@@ -99,7 +99,7 @@ void _starpu_simgrid_xbt_thread_create(const char *name, void_f_pvoid_t code,
 #define _SIMGRID_TIMER_END		\
 		if (__timer) {		\
 			xbt_os_threadtimer_stop(__timer);		\
-			MSG_process_sleep(xbt_os_timer_elapsed(__timer));\
+			starpu_sleep(xbt_os_timer_elapsed(__timer));\
 			xbt_os_timer_free(__timer);		\
 		}	\
 	}

+ 2 - 2
src/core/simgrid_cpp.cpp

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2016,2017                                CNRS
- * Copyright (C) 2012-2018                                Université de Bordeaux
+ * Copyright (C) 2012-2019                                Université de Bordeaux
  * Copyright (C) 2016,2017                                Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -52,7 +52,7 @@ static int _starpu_simgrid_xbt_thread_create_wrapper(int argc STARPU_ATTRIBUTE_U
 #endif
 {
 	/* FIXME: Ugly work-around for bug in simgrid: the MPI context is not properly set at MSG process startup */
-	MSG_process_sleep(0.000001);
+	starpu_sleep(0.000001);
 
 #ifdef HAVE_SMX_ACTOR_T
 	smx_actor_t

+ 1 - 1
src/core/task.c

@@ -943,7 +943,7 @@ int _starpu_task_submit(struct starpu_task *task, int nodeps)
 	ret = _starpu_submit_job(j, nodeps);
 #ifdef STARPU_SIMGRID
 	if (_starpu_simgrid_task_submit_cost())
-		MSG_process_sleep(0.000001);
+		starpu_sleep(0.000001);
 #endif
 
 	if (is_sync)

+ 8 - 4
src/core/topology.c

@@ -1876,7 +1876,7 @@ void _starpu_destroy_machine_config(struct _starpu_machine_config *config)
 #endif
 }
 
-int _starpu_bind_thread_on_cpu(int cpuid STARPU_ATTRIBUTE_UNUSED, int workerid STARPU_ATTRIBUTE_UNUSED, const char *name)
+int _starpu_bind_thread_on_cpu(int cpuid STARPU_ATTRIBUTE_UNUSED, int workerid STARPU_ATTRIBUTE_UNUSED, const char *name STARPU_ATTRIBUTE_UNUSED)
 {
 	int ret = 0;
 #ifdef STARPU_SIMGRID
@@ -2143,7 +2143,7 @@ static void _starpu_init_numa_node(struct _starpu_machine_config *config)
 
 #ifdef STARPU_SIMGRID
 	char name[16];
-	msg_host_t host;
+	starpu_sg_host_t host;
 #endif
 
 	numa_enabled = starpu_get_env_number_default("STARPU_USE_NUMA", 0);
@@ -2540,10 +2540,14 @@ static void _starpu_init_workers_binding_and_memory(struct _starpu_machine_confi
 					const char* cuda_memcpy_peer;
 					char name[16];
 					snprintf(name, sizeof(name), "CUDA%u", devid);
-					msg_host_t host = _starpu_simgrid_get_host_by_name(name);
+					starpu_sg_host_t host = _starpu_simgrid_get_host_by_name(name);
 					STARPU_ASSERT(host);
 					_starpu_simgrid_memory_node_set_host(memory_node, host);
+#  ifdef STARPU_HAVE_SIMGRID_ACTOR_H
+					cuda_memcpy_peer = sg_host_get_property_value(host, "memcpy_peer");
+#  else
 					cuda_memcpy_peer = MSG_host_get_property_value(host, "memcpy_peer");
+#  endif
 #endif /* SIMGRID */
 					if (
 #ifdef STARPU_SIMGRID
@@ -2636,7 +2640,7 @@ static void _starpu_init_workers_binding_and_memory(struct _starpu_machine_confi
 #ifdef STARPU_SIMGRID
 					char name[16];
 					snprintf(name, sizeof(name), "OpenCL%u", devid);
-					msg_host_t host = _starpu_simgrid_get_host_by_name(name);
+					starpu_sg_host_t host = _starpu_simgrid_get_host_by_name(name);
 					STARPU_ASSERT(host);
 					_starpu_simgrid_memory_node_set_host(memory_node, host);
 #endif /* SIMGRID */

+ 1 - 0
src/core/workers.h

@@ -31,6 +31,7 @@
 #include <common/timing.h>
 #include <common/fxt.h>
 #include <common/thread.h>
+#include <common/utils.h>
 #include <core/jobs.h>
 #include <core/perfmodel/perfmodel.h>
 #include <core/sched_policy.h>

+ 10 - 2
src/datawizard/coherency.c

@@ -814,6 +814,14 @@ size_t _starpu_data_get_alloc_size(starpu_data_handle_t handle)
 		return handle->ops->get_size(handle);
 }
 
+starpu_ssize_t _starpu_data_get_max_size(starpu_data_handle_t handle)
+{
+	if (handle->ops->get_max_size)
+		return handle->ops->get_max_size(handle);
+	else
+		return -1;
+}
+
 uint32_t _starpu_data_get_footprint(starpu_data_handle_t handle)
 {
 	return handle->footprint;
@@ -1130,7 +1138,7 @@ int _starpu_fetch_task_input(struct starpu_task *task, struct _starpu_job *j, in
 					_starpu_fetch_task_input_cb, worker, 0, "_starpu_fetch_task_input");
 #ifdef STARPU_SIMGRID
 			if (_starpu_simgrid_fetching_input_cost())
-				MSG_process_sleep(0.000001);
+				starpu_sleep(0.000001);
 #endif
 			if (STARPU_UNLIKELY(ret))
 			{
@@ -1145,7 +1153,7 @@ int _starpu_fetch_task_input(struct starpu_task *task, struct _starpu_job *j, in
 			ret = fetch_data(handle, node, local_replicate, mode, 0);
 #ifdef STARPU_SIMGRID
 			if (_starpu_simgrid_fetching_input_cost())
-				MSG_process_sleep(0.000001);
+				starpu_sleep(0.000001);
 #endif
 			if (STARPU_UNLIKELY(ret))
 				goto enomem;

+ 1 - 0
src/datawizard/coherency.h

@@ -307,6 +307,7 @@ uint32_t _starpu_get_data_refcnt(struct _starpu_data_state *state, unsigned node
 
 size_t _starpu_data_get_size(starpu_data_handle_t handle);
 size_t _starpu_data_get_alloc_size(starpu_data_handle_t handle);
+starpu_ssize_t _starpu_data_get_max_size(starpu_data_handle_t handle);
 
 uint32_t _starpu_data_get_footprint(starpu_data_handle_t handle);
 

+ 1 - 1
src/datawizard/copy_driver.c

@@ -162,12 +162,12 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
 	return _starpu_simgrid_transfer(handle->ops->get_size(handle), src_node, dst_node, req);
 #else /* !SIMGRID */
-	enum starpu_node_kind src_kind = starpu_node_get_kind(src_node);
 	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);
 	void *src_interface = src_replicate->data_interface;
 	void *dst_interface = dst_replicate->data_interface;
 
 #if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
+	enum starpu_node_kind src_kind = starpu_node_get_kind(src_node);
 	if ((src_kind == STARPU_CUDA_RAM) || (dst_kind == STARPU_CUDA_RAM))
 	{
 		unsigned devid;

+ 2 - 2
src/datawizard/data_request.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2008-2017                                Université de Bordeaux
+ * Copyright (C) 2008-2017, 2019                                Université de Bordeaux
  * Copyright (C) 2011,2016,2017                           Inria
  * Copyright (C) 2018                                     Federal University of Rio Grande do Sul (UFRGS)
  * Copyright (C) 2010-2019                                CNRS
@@ -664,7 +664,7 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 			 * rather have the caller block, and explicitly wait
 			 * for eviction to happen.
 			 */
-			MSG_process_sleep(0.000001);
+			starpu_sleep(0.000001);
 			_starpu_wake_all_blocked_workers_on_node(src_node);
 		}
 #elif !defined(STARPU_NON_BLOCKING_DRIVERS)

+ 2 - 2
src/datawizard/datawizard.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2016,2017                                Inria
  * Copyright (C) 2010,2011,2013,2015,2017                 CNRS
- * Copyright (C) 2009-2018                                Université de Bordeaux
+ * Copyright (C) 2009-2019                                Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -34,7 +34,7 @@ int ___starpu_datawizard_progress(unsigned memory_node, unsigned may_alloc, unsi
 
 #ifdef STARPU_SIMGRID
 	/* XXX */
-	MSG_process_sleep(0.000001);
+	starpu_sleep(0.000001);
 #endif
 	STARPU_UYIELD();
 

+ 2 - 2
src/datawizard/filters.c

@@ -854,12 +854,12 @@ void _starpu_data_unpartition_submit(starpu_data_handle_t initial_handle, unsign
 		ret = starpu_task_insert(initial_handle->switch_cl, STARPU_W, initial_handle, STARPU_DATA_MODE_ARRAY, descr, n,
 					 STARPU_NAME, "unpartition",
 					 STARPU_HANDLES_SEQUENTIAL_CONSISTENCY, handles_sequential_consistency,
-					 STARPU_CALLBACK_WITH_ARG, callback_func, callback_arg,
+					 STARPU_CALLBACK_WITH_ARG_NFREE, callback_func, callback_arg,
 					 0);
 	else
 		ret = starpu_task_insert(initial_handle->switch_cl, STARPU_W, initial_handle, STARPU_DATA_MODE_ARRAY, descr, n,
 					 STARPU_NAME, "unpartition",
-					 STARPU_CALLBACK_WITH_ARG, callback_func, callback_arg,
+					 STARPU_CALLBACK_WITH_ARG_NFREE, callback_func, callback_arg,
 					 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 

+ 1 - 1
src/datawizard/malloc.c

@@ -200,7 +200,7 @@ int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int fl
 		 * instead of computing an average value.
 		 */
 			if (_starpu_simgrid_cuda_malloc_cost())
-				MSG_process_sleep((float) dim * 0.000650 / 1048576.);
+				starpu_sleep((float) dim * 0.000650 / 1048576.);
 #else /* STARPU_SIMGRID */
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_HAVE_CUDA_MEMCPY_PEER

+ 3 - 3
src/datawizard/memory_nodes.h

@@ -51,7 +51,7 @@ struct _starpu_memory_node_descr
 	unsigned nworkers[STARPU_MAXNODES];
 
 #ifdef STARPU_SIMGRID
-	msg_host_t host[STARPU_MAXNODES];
+	starpu_sg_host_t host[STARPU_MAXNODES];
 #endif
 
 	// TODO move this 2 lists outside struct _starpu_memory_node_descr
@@ -92,12 +92,12 @@ static inline unsigned _starpu_memory_node_get_nworkers(unsigned node)
 }
 
 #ifdef STARPU_SIMGRID
-static inline void _starpu_simgrid_memory_node_set_host(unsigned node, msg_host_t host)
+static inline void _starpu_simgrid_memory_node_set_host(unsigned node, starpu_sg_host_t host)
 {
 	_starpu_descr.host[node] = host;
 }
 
-static inline msg_host_t _starpu_simgrid_memory_node_get_host(unsigned node)
+static inline starpu_sg_host_t _starpu_simgrid_memory_node_get_host(unsigned node)
 {
 	return _starpu_descr.host[node];
 }

+ 7 - 2
src/debug/traces/starpu_fxt.c

@@ -274,6 +274,7 @@ struct data_info
 	unsigned long handle;
 	char *name;
 	size_t size;
+	starpu_ssize_t max_size;
 	char *description;
 	unsigned dimensions;
 	unsigned long *dims;
@@ -295,6 +296,7 @@ static struct data_info *get_data(unsigned long handle, int mpi_rank)
 		data->handle = handle;
 		data->name = NULL;
 		data->size = 0;
+		data->max_size = -1;
 		data->description = 0;
 		data->dimensions = 0;
 		data->dims = NULL;
@@ -323,6 +325,8 @@ static void data_dump(struct data_info *data)
 		free(data->name);
 	}
 	fprintf(data_file, "Size: %lu\n", (unsigned long) data->size);
+	if (data->max_size != -1)
+		fprintf(data_file, "MaxSize: %lu\n", (unsigned long) data->max_size);
 	if (data->description)
 	{
 		fprintf(data_file, "Description: %s\n", data->description);
@@ -2083,10 +2087,11 @@ static void handle_data_register(struct fxt_ev_64 *ev, struct starpu_fxt_options
 	unsigned long handle = ev->param[0];
 	char *prefix = options->file_prefix;
 	struct data_info *data = get_data(handle, options->file_rank);
-	char *description = get_fxt_string(ev, 3);
+	char *description = get_fxt_string(ev, 4);
 
 	data->size = ev->param[1];
-	data->home_node = ev->param[2];
+	data->max_size = ev->param[2];
+	data->home_node = ev->param[3];
 	if (description[0])
 		data->description = strdup(description);
 

+ 2 - 2
src/drivers/cpu/driver_cpu.c

@@ -140,7 +140,7 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 		{
 			/* Wait for other threads to exit barrier_wait so we
 			 * can safely drop the job structure */
-			MSG_process_sleep(0.0000001);
+			starpu_sleep(0.0000001);
 			j->after_work_busy_barrier = 0;
 		}
 #else
@@ -332,7 +332,7 @@ int _starpu_cpu_driver_run_once(struct _starpu_worker *cpu_worker)
 	}
   #else
 	/* Previous simgrid versions don't really permit to use wait_timedwait in C */
-	MSG_process_sleep(0.001);
+	starpu_sleep(0.001);
   #endif
  #endif
 #endif

+ 9 - 3
src/drivers/cuda/driver_cuda.c

@@ -1439,8 +1439,12 @@ int _starpu_cuda_is_direct_access_supported(unsigned node, unsigned handling_nod
 	(void) node;
 	if (starpu_node_get_kind(handling_node) == STARPU_CUDA_RAM)
 	{
-		msg_host_t host = _starpu_simgrid_get_memnode_host(handling_node);
+		starpu_sg_host_t host = _starpu_simgrid_get_memnode_host(handling_node);
+#  ifdef STARPU_HAVE_SIMGRID_ACTOR_H
+		const char* cuda_memcpy_peer = sg_host_get_property_value(host, "memcpy_peer");
+#  else
 		const char* cuda_memcpy_peer = MSG_host_get_property_value(host, "memcpy_peer");
+#  endif
 		return cuda_memcpy_peer && atoll(cuda_memcpy_peer);
 	}
 	else
@@ -1472,7 +1476,7 @@ uintptr_t _starpu_cuda_malloc_on_node(unsigned dst_node, size_t size, int flags)
 	/* Sleep for the allocation */
 	STARPU_PTHREAD_MUTEX_LOCK(&cuda_alloc_mutex);
 	if (_starpu_simgrid_cuda_malloc_cost())
-		MSG_process_sleep(0.000175);
+		starpu_sleep(0.000175);
 	if (!last[dst_node])
 		last[dst_node] = 1<<10;
 	addr = last[dst_node];
@@ -1513,6 +1517,8 @@ uintptr_t _starpu_cuda_malloc_on_node(unsigned dst_node, size_t size, int flags)
 
 void _starpu_cuda_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags)
 {
+	(void) dst_node;
+	(void) addr;
 	(void) size;
 	(void) flags;
 
@@ -1521,7 +1527,7 @@ void _starpu_cuda_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, i
 	STARPU_PTHREAD_MUTEX_LOCK(&cuda_alloc_mutex);
 	/* Sleep for the free */
 	if (_starpu_simgrid_cuda_malloc_cost())
-		MSG_process_sleep(0.000750);
+		starpu_sleep(0.000750);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
 	/* CUDA also synchronizes roughly everything on cudaFree */
 	_starpu_simgrid_sync_gpus();

+ 10 - 5
src/drivers/opencl/driver_opencl.c

@@ -967,7 +967,11 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 			struct starpu_profiling_task_info *profiling_info = task->profiling_info;
 			STARPU_ASSERT_MSG(profiling_info->used_cycles, "Application kernel must call starpu_opencl_collect_stats to collect simulated time");
 #if defined(HAVE_SG_HOST_SPEED) || defined(sg_host_speed)
+#  if defined(HAVE_SG_HOST_SELF) || defined(sg_host_self)
+			length = ((double) profiling_info->used_cycles)/sg_host_speed(sg_host_self());
+#  else
 			length = ((double) profiling_info->used_cycles)/sg_host_speed(MSG_host_self());
+#  endif
 #elif defined HAVE_MSG_HOST_GET_SPEED || defined(MSG_host_get_speed)
 			length = ((double) profiling_info->used_cycles)/MSG_host_get_speed(MSG_host_self());
 #else
@@ -975,7 +979,7 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 #endif
 			/* And give the simulated time to simgrid */
 			simulate = 1;
-		#endif
+#endif
 		}
 		else if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT && !async)
 			{
@@ -1294,7 +1298,7 @@ uintptr_t _starpu_opencl_malloc_on_node(unsigned dst_node, size_t size, int flag
 	/* Sleep for the allocation */
 	STARPU_PTHREAD_MUTEX_LOCK(&opencl_alloc_mutex);
 	if (_starpu_simgrid_cuda_malloc_cost())
-		MSG_process_sleep(0.000175);
+		starpu_sleep(0.000175);
 	if (!last[dst_node])
 		last[dst_node] = 1<<10;
 	addr = last[dst_node];
@@ -1320,14 +1324,15 @@ uintptr_t _starpu_opencl_malloc_on_node(unsigned dst_node, size_t size, int flag
 
 void _starpu_opencl_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags)
 {
-	(void)flags;
-	(void)size;
 	(void)dst_node;
+	(void)addr;
+	(void)size;
+	(void)flags;
 #ifdef STARPU_SIMGRID
 	STARPU_PTHREAD_MUTEX_LOCK(&opencl_alloc_mutex);
 	/* Sleep for the free */
 	if (_starpu_simgrid_cuda_malloc_cost())
-		MSG_process_sleep(0.000750);
+		starpu_sleep(0.000750);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&opencl_alloc_mutex);
 #else
 	cl_int err;

+ 20 - 4
src/util/fstarpu.c

@@ -41,11 +41,15 @@ static const intptr_t fstarpu_task_deps_array	= STARPU_TASK_DEPS_ARRAY;
 static const intptr_t fstarpu_task_end_deps_array	= STARPU_TASK_END_DEPS_ARRAY;
 static const intptr_t fstarpu_callback	= STARPU_CALLBACK;
 static const intptr_t fstarpu_callback_with_arg	= STARPU_CALLBACK_WITH_ARG;
+static const intptr_t fstarpu_callback_with_arg_nfree	= STARPU_CALLBACK_WITH_ARG_NFREE;
 static const intptr_t fstarpu_callback_arg	= STARPU_CALLBACK_ARG;
+static const intptr_t fstarpu_callback_arg_nfree= STARPU_CALLBACK_ARG_NFREE;
 static const intptr_t fstarpu_prologue_callback	= STARPU_PROLOGUE_CALLBACK;
-static const intptr_t fstarpu_prologue_callback_arg	= STARPU_PROLOGUE_CALLBACK_ARG;
-static const intptr_t fstarpu_prologue_callback_pop	= STARPU_PROLOGUE_CALLBACK_POP;
-static const intptr_t fstarpu_prologue_callback_pop_arg	= STARPU_PROLOGUE_CALLBACK_POP_ARG;
+static const intptr_t fstarpu_prologue_callback_arg	  = STARPU_PROLOGUE_CALLBACK_ARG;
+static const intptr_t fstarpu_prologue_callback_arg_nfree = STARPU_PROLOGUE_CALLBACK_ARG_NFREE;
+static const intptr_t fstarpu_prologue_callback_pop	  	= STARPU_PROLOGUE_CALLBACK_POP;
+static const intptr_t fstarpu_prologue_callback_pop_arg	  	= STARPU_PROLOGUE_CALLBACK_POP_ARG;
+static const intptr_t fstarpu_prologue_callback_pop_arg_nfree	 = STARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE;
 static const intptr_t fstarpu_priority	= STARPU_PRIORITY;
 static const intptr_t fstarpu_execute_on_node	= STARPU_EXECUTE_ON_NODE;
 static const intptr_t fstarpu_execute_on_data	= STARPU_EXECUTE_ON_DATA;
@@ -63,6 +67,10 @@ static const intptr_t fstarpu_handles_sequential_consistency	= STARPU_HANDLES_SE
 static const intptr_t fstarpu_task_end_dep	= STARPU_TASK_END_DEP;
 static const intptr_t fstarpu_task_synchronous	= STARPU_TASK_SYNCHRONOUS;
 static const intptr_t fstarpu_node_selection_policy	= STARPU_NODE_SELECTION_POLICY;
+static const intptr_t fstarpu_task_workerids = STARPU_TASK_WORKERIDS;
+static const intptr_t fstarpu_sequential_consistency = STARPU_SEQUENTIAL_CONSISTENCY;
+static const intptr_t fstarpu_task_profiling_info = STARPU_TASK_PROFILING_INFO;
+static const intptr_t fstarpu_task_no_submitorder = STARPU_TASK_NO_SUBMITORDER;
 
 static const intptr_t fstarpu_value = STARPU_VALUE;
 static const intptr_t fstarpu_sched_ctx = STARPU_SCHED_CTX;
@@ -117,11 +125,15 @@ intptr_t fstarpu_get_constant(char *s)
 	else if	(!strcmp(s, "FSTARPU_TASK_END_DEPS_ARRAY"))	{ return fstarpu_task_end_deps_array; }
 	else if	(!strcmp(s, "FSTARPU_CALLBACK"))	{ return fstarpu_callback; }
 	else if	(!strcmp(s, "FSTARPU_CALLBACK_WITH_ARG"))	{ return fstarpu_callback_with_arg; }
-	else if	(!strcmp(s, "FSTARPU_CALLBACK_ARG"))	{ return fstarpu_callback_arg; }
+	else if	(!strcmp(s, "FSTARPU_CALLBACK_WITH_ARG_NFREE"))	{ return fstarpu_callback_with_arg_nfree; }
+	else if	(!strcmp(s, "FSTARPU_CALLBACK_ARG"))		{ return fstarpu_callback_arg; }
+	else if	(!strcmp(s, "FSTARPU_CALLBACK_ARG_NFREE"))	{ return fstarpu_callback_arg_nfree; }
 	else if	(!strcmp(s, "FSTARPU_PROLOGUE_CALLBACK"))	{ return fstarpu_prologue_callback; }
 	else if	(!strcmp(s, "FSTARPU_PROLOGUE_CALLBACK_ARG"))	{ return fstarpu_prologue_callback_arg; }
+	else if	(!strcmp(s, "FSTARPU_PROLOGUE_CALLBACK_ARG_NFREE"))	{ return fstarpu_prologue_callback_arg_nfree; }
 	else if	(!strcmp(s, "FSTARPU_PROLOGUE_CALLBACK_POP"))	{ return fstarpu_prologue_callback_pop; }
 	else if	(!strcmp(s, "FSTARPU_PROLOGUE_CALLBACK_POP_ARG"))	{ return fstarpu_prologue_callback_pop_arg; }
+	else if	(!strcmp(s, "FSTARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE"))	{ return fstarpu_prologue_callback_pop_arg_nfree; }
 	else if	(!strcmp(s, "FSTARPU_PRIORITY"))	{ return fstarpu_priority; }
 	else if	(!strcmp(s, "FSTARPU_EXECUTE_ON_NODE"))	{ return fstarpu_execute_on_node; }
 	else if	(!strcmp(s, "FSTARPU_EXECUTE_ON_DATA"))	{ return fstarpu_execute_on_data; }
@@ -140,7 +152,11 @@ intptr_t fstarpu_get_constant(char *s)
 	else if (!strcmp(s, "FSTARPU_TASK_COLOR"))	{ return fstarpu_task_color; }
 	else if (!strcmp(s, "FSTARPU_HANDLES_SEQUENTIAL_CONSISTENCY"))	{ return fstarpu_handles_sequential_consistency; }
 	else if (!strcmp(s, "FSTARPU_TASK_END_DEP"))	{ return fstarpu_task_end_dep; }
+	else if (!strcmp(s, "FSTARPU_TASK_WORKERIDS"))	{ return fstarpu_task_workerids; }
 	else if (!strcmp(s, "FSTARPU_TASK_SYNCHRONOUS"))	{ return fstarpu_task_synchronous; }
+	else if (!strcmp(s, "FSTARPU_SEQUENTIAL_CONSISTENCY"))	{ return fstarpu_sequential_consistency; }
+	else if (!strcmp(s, "FSTARPU_TASK_PROFILING_INFO"))	{ return fstarpu_task_profiling_info; }
+	else if (!strcmp(s, "FSTARPU_TASK_NO_SUBMITORDER"))	{ return fstarpu_task_no_submitorder; }
 
 	else if (!strcmp(s, "FSTARPU_CPU_WORKER"))	{ return fstarpu_cpu_worker; }
 	else if (!strcmp(s, "FSTARPU_CUDA_WORKER"))	{ return fstarpu_cuda_worker; }

+ 5 - 1
src/util/openmp_runtime_support.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2014-2018                                Inria
  * Copyright (C) 2014-2017,2019                           CNRS
- * Copyright (C) 2015,2017                                Université de Bordeaux
+ * Copyright (C) 2015,2017,2019                           Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -938,7 +938,11 @@ int starpu_omp_init(void)
 #ifdef STARPU_SIMGRID
 	/* XXX: ideally we'd pass the real argc/argv.  */
 	/* We have to tell simgrid to avoid cleaning up at exit, since that's before our destructor :/ */
+#  if SIMGRID_VERSION >= 32300
+	char *argv[] = { "program", "--cfg=debug/clean-atexit:0", NULL };
+#  else
 	char *argv[] = { "program", "--cfg=clean-atexit:0", NULL };
+#  endif
 	int argc = sizeof(argv) / sizeof(argv[0]) - 1;
 	char **_argv = argv;
 	/* Initialize simgrid before anything else.  */

+ 17 - 7
src/util/starpu_create_sync_task.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2011,2012                                Inria
  * Copyright (C) 2010,2011,2014                           Université de Bordeaux
- * Copyright (C) 2010-2013,2015,2017                      CNRS
+ * Copyright (C) 2010-2013,2015,2017,2019                 CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -20,12 +20,7 @@
 #include <common/config.h>
 #include <core/task.h>
 
-/* This creates (and submits) an empty task that unlocks a tag once all its
- * dependencies are fulfilled. */
-/* TODO it would be nice to have such a function without sync_tag in case we
- * just want to execute the callback. */
-void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t *deps,
-				void (*callback)(void *), void *callback_arg)
+void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t *deps, void (*callback)(void *), void *callback_arg)
 {
 	starpu_tag_declare_deps_array(sync_tag, ndeps, deps);
 
@@ -45,3 +40,18 @@ void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t
 	int sync_ret = _starpu_task_submit_internally(sync_task);
 	STARPU_ASSERT(!sync_ret);
 }
+
+/* Submit an empty task (no codelet) whose only purpose is to carry
+ * callback(callback_arg) through the internal task submission path. */
+void starpu_create_callback_task(void (*callback)(void *), void *callback_arg)
+{
+	/* No codelet: the task itself has nothing to execute */
+	struct starpu_task *cb_task = starpu_task_create();
+	cb_task->cl = NULL;
+	cb_task->name = "empty_task";
+
+	/* Attach the caller's callback and its argument */
+	cb_task->callback_func = callback;
+	cb_task->callback_arg = callback_arg;
+
+	int ret = _starpu_task_submit_internally(cb_task);
+	STARPU_ASSERT(!ret);
+}

+ 128 - 0
src/util/starpu_task_insert_utils.c

@@ -125,10 +125,19 @@ int _starpu_codelet_pack_args(void **arg_buffer, size_t *arg_buffer_size, va_lis
 			va_arg(varg_list, _starpu_callback_func_t);
 			va_arg(varg_list, void *);
 		}
+		else if (arg_type==STARPU_CALLBACK_WITH_ARG_NFREE)
+		{
+			va_arg(varg_list, _starpu_callback_func_t);
+			va_arg(varg_list, void *);
+		}
 		else if (arg_type==STARPU_CALLBACK_ARG)
 		{
 			(void)va_arg(varg_list, void *);
 		}
+		else if (arg_type==STARPU_CALLBACK_ARG_NFREE)
+		{
+			(void)va_arg(varg_list, void *);
+		}
 		else if (arg_type==STARPU_PROLOGUE_CALLBACK)
 		{
 			va_arg(varg_list, _starpu_callback_func_t);
@@ -137,6 +146,10 @@ int _starpu_codelet_pack_args(void **arg_buffer, size_t *arg_buffer_size, va_lis
 		{
 			(void)va_arg(varg_list, void *);
 		}
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK_ARG_NFREE)
+		{
+			(void)va_arg(varg_list, void *);
+		}
 		else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP)
 		{
 			va_arg(varg_list, _starpu_callback_func_t);
@@ -145,6 +158,10 @@ int _starpu_codelet_pack_args(void **arg_buffer, size_t *arg_buffer_size, va_lis
 		{
 			(void)va_arg(varg_list, void *);
 		}
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE)
+		{
+			(void)va_arg(varg_list, void *);
+		}
 		else if (arg_type==STARPU_PRIORITY)
 		{
 			(void)va_arg(varg_list, int);
@@ -213,6 +230,23 @@ int _starpu_codelet_pack_args(void **arg_buffer, size_t *arg_buffer_size, va_lis
 		{
 			(void)va_arg(varg_list, int);
 		}
+		else if (arg_type==STARPU_TASK_WORKERIDS)
+		{
+			(void)va_arg(varg_list, unsigned);
+			(void)va_arg(varg_list, uint32_t*);
+		}
+		else if (arg_type==STARPU_SEQUENTIAL_CONSISTENCY)
+		{
+			(void)va_arg(varg_list, unsigned);
+		}
+		else if (arg_type==STARPU_TASK_PROFILING_INFO)
+		{
+			(void)va_arg(varg_list, struct starpu_profiling_task_info *);
+		}
+		else if (arg_type==STARPU_TASK_NO_SUBMITORDER)
+		{
+			(void)va_arg(varg_list, unsigned);
+		}
 		else
 		{
 			STARPU_ABORT_MSG("Unrecognized argument %d, did you perhaps forget to end arguments with 0?\n", arg_type);
@@ -414,10 +448,23 @@ int _starpu_task_insert_create(struct starpu_codelet *cl, struct starpu_task *ta
 		{
 			task->callback_func = va_arg(varg_list, _starpu_callback_func_t);
 			task->callback_arg = va_arg(varg_list, void *);
+			task->callback_arg_free = 1;
+		}
+		else if (arg_type==STARPU_CALLBACK_WITH_ARG_NFREE)
+		{
+			task->callback_func = va_arg(varg_list, _starpu_callback_func_t);
+			task->callback_arg = va_arg(varg_list, void *);
+			task->callback_arg_free = 0;
 		}
 		else if (arg_type==STARPU_CALLBACK_ARG)
 		{
 			task->callback_arg = va_arg(varg_list, void *);
+			task->callback_arg_free = 1;
+		}
+		else if (arg_type==STARPU_CALLBACK_ARG_NFREE)
+		{
+			task->callback_arg = va_arg(varg_list, void *);
+			task->callback_arg_free = 0;
 		}
 		else if (arg_type==STARPU_PROLOGUE_CALLBACK)
 		{
@@ -426,6 +473,12 @@ int _starpu_task_insert_create(struct starpu_codelet *cl, struct starpu_task *ta
 		else if (arg_type==STARPU_PROLOGUE_CALLBACK_ARG)
 		{
 			task->prologue_callback_arg = va_arg(varg_list, void *);
+			task->prologue_callback_arg_free = 1;
+		}
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK_ARG_NFREE)
+		{
+			task->prologue_callback_arg = va_arg(varg_list, void *);
+			task->prologue_callback_arg_free = 0;
 		}
 		else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP)
 		{
@@ -434,6 +487,12 @@ int _starpu_task_insert_create(struct starpu_codelet *cl, struct starpu_task *ta
 		else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP_ARG)
 		{
 			task->prologue_callback_pop_arg = va_arg(varg_list, void *);
+			task->prologue_callback_pop_arg_free = 1;
+		}
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE)
+		{
+			task->prologue_callback_pop_arg = va_arg(varg_list, void *);
+			task->prologue_callback_pop_arg_free = 0;
 		}
 		else if (arg_type==STARPU_PRIORITY)
 		{
@@ -528,6 +587,23 @@ int _starpu_task_insert_create(struct starpu_codelet *cl, struct starpu_task *ta
 			int end_dep = va_arg(varg_list, int);
 			starpu_task_end_dep_add(task, end_dep);
 		}
+		else if (arg_type==STARPU_TASK_WORKERIDS)
+		{
+			task->workerids_len = va_arg(varg_list, unsigned);
+			task->workerids = va_arg(varg_list, uint32_t*);
+		}
+		else if (arg_type==STARPU_SEQUENTIAL_CONSISTENCY)
+		{
+			task->sequential_consistency = va_arg(varg_list, unsigned);
+		}
+		else if (arg_type==STARPU_TASK_PROFILING_INFO)
+		{
+			task->profiling_info = va_arg(varg_list, struct starpu_profiling_task_info *);
+		}
+		else if (arg_type==STARPU_TASK_NO_SUBMITORDER)
+		{
+			task->no_submitorder = va_arg(varg_list, unsigned);
+		}
 		else
 		{
 			STARPU_ABORT_MSG("Unrecognized argument %d, did you perhaps forget to end arguments with 0?\n", arg_type);
@@ -668,11 +744,27 @@ int _fstarpu_task_insert_create(struct starpu_codelet *cl, struct starpu_task *t
 			task->callback_func = (_starpu_callback_func_t)arglist[arg_i];
 			arg_i++;
 			task->callback_arg = arglist[arg_i];
+			task->callback_arg_free = 1;
+		}
+		else if (arg_type == STARPU_CALLBACK_WITH_ARG_NFREE)
+		{
+			arg_i++;
+			task->callback_func = (_starpu_callback_func_t)arglist[arg_i];
+			arg_i++;
+			task->callback_arg = arglist[arg_i];
+			task->callback_arg_free = 0;
 		}
 		else if (arg_type == STARPU_CALLBACK_ARG)
 		{
 			arg_i++;
 			task->callback_arg = arglist[arg_i];
+			task->callback_arg_free = 1;
+		}
+		else if (arg_type == STARPU_CALLBACK_ARG_NFREE)
+		{
+			arg_i++;
+			task->callback_arg = arglist[arg_i];
+			task->callback_arg_free = 0;
 		}
 		else if (arg_type == STARPU_PROLOGUE_CALLBACK)
 		{
@@ -683,6 +775,13 @@ int _fstarpu_task_insert_create(struct starpu_codelet *cl, struct starpu_task *t
 		{
 			arg_i++;
 			task->prologue_callback_arg = arglist[arg_i];
+			task->prologue_callback_arg_free = 1;
+		}
+		else if (arg_type == STARPU_PROLOGUE_CALLBACK_ARG_NFREE)
+		{
+			arg_i++;
+			task->prologue_callback_arg = arglist[arg_i];
+			task->prologue_callback_arg_free = 0;
 		}
 		else if (arg_type == STARPU_PROLOGUE_CALLBACK_POP)
 		{
@@ -693,6 +792,13 @@ int _fstarpu_task_insert_create(struct starpu_codelet *cl, struct starpu_task *t
 		{
 			arg_i++;
 			task->prologue_callback_pop_arg = arglist[arg_i];
+			task->prologue_callback_pop_arg_free = 1;
+		}
+		else if (arg_type == STARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE)
+		{
+			arg_i++;
+			task->prologue_callback_pop_arg = arglist[arg_i];
+			task->prologue_callback_pop_arg_free = 0;
 		}
 		else if (arg_type == STARPU_PRIORITY)
 		{
@@ -796,6 +902,28 @@ int _fstarpu_task_insert_create(struct starpu_codelet *cl, struct starpu_task *t
 			arg_i++;
 			starpu_task_end_dep_add(task, *(int*)arglist[arg_i]);
 		}
+		else if (arg_type==STARPU_TASK_WORKERIDS)
+		{
+			arg_i++;
+			task->workerids_len = *(unsigned *)arglist[arg_i];
+			arg_i++;
+			task->workerids = (uint32_t *)arglist[arg_i];
+		}
+		else if (arg_type==STARPU_SEQUENTIAL_CONSISTENCY)
+		{
+			arg_i++;
+			task->sequential_consistency = *(unsigned *)arglist[arg_i];
+		}
+		else if (arg_type==STARPU_TASK_PROFILING_INFO)
+		{
+			arg_i++;
+			task->profiling_info = (struct starpu_profiling_task_info *)arglist[arg_i];
+		}
+		else if (arg_type==STARPU_TASK_NO_SUBMITORDER)
+		{
+			arg_i++;
+			task->no_submitorder = *(unsigned *)arglist[arg_i];
+		}
 		else
 		{
 			STARPU_ABORT_MSG("unknown/unsupported argument %d, did you perhaps forget to end arguments with 0?", arg_type);

+ 2 - 1
starpufft/include/starpufft.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010,2012,2014,2015,2017,2019            CNRS
- * Copyright (C) 2009,2011,2014                           Université de Bordeaux
+ * Copyright (C) 2009,2011,2014,2019                      Université de Bordeaux
  * Copyright (C) 2012                                     Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -43,6 +43,7 @@ typedef struct starpufft(plan) *starpufft(plan); \
 \
 starpufft(plan) starpufft(plan_dft_1d)(int n, int sign, unsigned flags); \
 starpufft(plan) starpufft(plan_dft_2d)(int n, int m, int sign, unsigned flags); \
+starpufft(plan) starpufft(plan_dft_3d)(int n, int m, int p, int sign, unsigned flags); \
 starpufft(plan) starpufft(plan_dft_r2c_1d)(int n, unsigned flags); \
 starpufft(plan) starpufft(plan_dft_c2r_1d)(int n, unsigned flags); \
 \

+ 5 - 4
starpufft/src/Makefile.am

@@ -2,7 +2,7 @@
 #
 # Copyright (C) 2017                                     Inria
 # Copyright (C) 2010-2019                                CNRS
-# Copyright (C) 2009-2017                                Université de Bordeaux
+# Copyright (C) 2009-2017,2019                           Université de Bordeaux
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -28,6 +28,7 @@ EXTRA_DIST =			\
 	starpufftx.c		\
 	starpufftx1d.c		\
 	starpufftx2d.c		\
+	starpufftx3d.c		\
 	cuda_kernels.cu		\
 	cudaf_kernels.cu	\
 	cudax_kernels.cu
@@ -47,13 +48,13 @@ NVCCFLAGS += -Xcompiler -fPIC -Xlinker -fPIC
 cudaf_kernels.o: cudaf_kernels.cu
 	$(V_nvcc) $(NVCC) $(AM_CPPFLAGS) $< -c -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS)
 
-libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += cudaf_kernels.cu
+libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_LIBADD += cudaf_kernels.o
 
 if STARPU_HAVE_CUFFTDOUBLECOMPLEX
 cuda_kernels.o: cuda_kernels.cu
-	$(NVCC) $(AM_CPPFLAGS) $< -c -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS)
+	$(V_nvcc) $(NVCC) $(AM_CPPFLAGS) $< -c -o $@ --compiler-options -fno-strict-aliasing  $(NVCCFLAGS)
 
-libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += cuda_kernels.cu
+libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_LIBADD += cuda_kernels.o
 endif
 
 libstarpufft_@STARPU_EFFECTIVE_VERSION@_la_LIBADD +=  $(STARPU_CUDA_LDFLAGS)

+ 13 - 1
starpufft/src/starpufftx.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2010-2015,2017                           CNRS
  * Copyright (C) 2012,2013,2017                           Inria
- * Copyright (C) 2009-2012,2014,2017                      Université de Bordeaux
+ * Copyright (C) 2009-2012,2014,2017,2019                 Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -199,6 +199,7 @@ static int can_execute(unsigned workerid, struct starpu_task *task STARPU_ATTRIB
 
 #include "starpufftx1d.c"
 #include "starpufftx2d.c"
+#include "starpufftx3d.c"
 
 struct starpu_task *
 STARPUFFT(start)(STARPUFFT(plan) plan, void *_in, void *_out)
@@ -243,6 +244,17 @@ STARPUFFT(start)(STARPUFFT(plan) plan, void *_in, void *_out)
 			}
 			task = STARPUFFT(start2dC2C)(plan, plan->in_handle, plan->out_handle);
 			break;
+		case 3:
+			starpu_vector_data_register(&plan->in_handle, STARPU_MAIN_RAM, (uintptr_t) plan->in, plan->totsize, sizeof(STARPUFFT(complex)));
+			if (!PARALLEL)
+				starpu_vector_data_register(&plan->out_handle, STARPU_MAIN_RAM, (uintptr_t) plan->out, plan->totsize, sizeof(STARPUFFT(complex)));
+			if (PARALLEL)
+			{
+				for (z = 0; z < plan->totsize1; z++)
+					plan->twist1_tasks[z]->handles[0] = plan->in_handle;
+			}
+			task = STARPUFFT(start3dC2C)(plan, plan->in_handle, plan->out_handle);
+			break;
 		default:
 			STARPU_ABORT();
 			break;

+ 191 - 0
starpufft/src/starpufftx3d.c

@@ -0,0 +1,191 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2017                                CNRS
+ * Copyright (C) 2013,2014, 2019                                Université de Bordeaux
+ * Copyright (C) 2012,2013                                Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+/*
+ *
+ * Sequential version
+ *
+ */
+
+#ifdef __STARPU_USE_CUDA
+/* Create, on the calling worker, the cuFFT plan for one fft of size n,m,p
+ * and bind it to that worker's local CUDA stream. */
+static void
+STARPUFFT(fft_3d_plan_gpu)(void *args)
+{
+	STARPUFFT(plan) plan = args;
+	cufftResult cures;
+	int n = plan->n[0];
+	int m = plan->n[1];
+	int p = plan->n[2];
+	int workerid = starpu_worker_get_id_check();
+
+	cures = cufftPlan3d(&plan->plans[workerid].plan_cuda, n, m, p, _CUFFT_C2C);
+	if (cures != CUFFT_SUCCESS)
+		STARPU_CUFFT_REPORT_ERROR(cures);
+	/* Keep the result: the check below must test this call, not cufftPlan3d */
+	cures = cufftSetStream(plan->plans[workerid].plan_cuda, starpu_cuda_get_local_stream());
+	if (cures != CUFFT_SUCCESS)
+		STARPU_CUFFT_REPORT_ERROR(cures);
+}
+
+/* CUDA implementation of the codelet: run the worker's cuFFT plan from
+ * descr[0] to descr[1], forward when plan->sign is -1, inverse otherwise. */
+static void
+STARPUFFT(fft_3d_kernel_gpu)(void *descr[], void *args)
+{
+	STARPUFFT(plan) plan = args;
+	cufftResult cures;
+
+	_cufftComplex * restrict in = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[0]);
+	_cufftComplex * restrict out = (_cufftComplex *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	int workerid = starpu_worker_get_id_check();
+
+	task_per_worker[workerid]++;	/* count the task for this worker */
+
+	cures = _cufftExecC2C(plan->plans[workerid].plan_cuda, in, out, plan->sign == -1 ? CUFFT_FORWARD : CUFFT_INVERSE);
+	if (cures != CUFFT_SUCCESS)
+		STARPU_CUFFT_REPORT_ERROR(cures);
+
+	/* The plan was bound to the local stream: wait for the fft queued on it */
+	cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+#endif
+
+#ifdef STARPU_HAVE_FFTW
+/* Perform one fft of size n,m,p */
+static void
+STARPUFFT(fft_3d_kernel_cpu)(void *descr[], void *_args)
+{
+	STARPUFFT(plan) plan = _args;
+	int workerid = starpu_worker_get_id_check();
+
+	task_per_worker[workerid]++;	/* count the task for this worker */
+
+	STARPUFFT(complex) * restrict in = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[0]);
+	STARPUFFT(complex) * restrict out = (STARPUFFT(complex) *)STARPU_VECTOR_GET_PTR(descr[1]);
+
+	_FFTW(execute_dft)(plan->plans[workerid].plan_cpu, in, out);	/* run this worker's plan on these buffers */
+}
+#endif
+
+/* Performance model attached to the 3D fft codelet */
+static struct starpu_perfmodel STARPUFFT(fft_3d_model) = {
+	.symbol = TYPE"fft_3d",
+	.type = STARPU_HISTORY_BASED
+};
+
+/* Codelet for one whole 3D fft: buffer 0 is read, buffer 1 is written.
+ * Only the implementations compiled in (CUDA and/or FFTW) are advertised. */
+static struct starpu_codelet STARPUFFT(fft_3d_codelet) = {
+	.name = "fft_3d_codelet",
+	.nbuffers = 2,
+	.modes = {STARPU_R, STARPU_W},
+	.model = &STARPUFFT(fft_3d_model),
+	CAN_EXECUTE
+#ifdef STARPU_HAVE_FFTW
+	.cpu_funcs = {STARPUFFT(fft_3d_kernel_cpu)},
+#endif
+#ifdef __STARPU_USE_CUDA
+	.cuda_funcs = {STARPUFFT(fft_3d_kernel_gpu)},
+#endif
+	.where =
+#ifdef __STARPU_USE_CUDA
+		STARPU_CUDA|
+#endif
+#ifdef STARPU_HAVE_FFTW
+		STARPU_CPU|
+#endif
+		0
+};
+
+/* Build a plan for one complex-to-complex 3D fft of size n,m,p.
+ * sign is handed to FFTW for the CPU plans and selects forward/inverse on
+ * CUDA; flags must currently be 0.  The parallel variant is not implemented. */
+STARPUFFT(plan)
+STARPUFFT(plan_dft_3d)(int n, int m, int p, int sign, unsigned flags)
+{
+	unsigned workerid;
+
+if (PARALLEL) {
+	/* TODO */
+	STARPU_ASSERT(0);
+}
+
+	/* TODO: flags? Automatically set FFTW_MEASURE on calibration? */
+	STARPU_ASSERT(flags == 0);
+
+	/* Zero-initialized plan; fail loudly rather than dereference NULL */
+	STARPUFFT(plan) plan = calloc(1, sizeof(*plan));
+	STARPU_ASSERT(plan);
+
+	plan->dim = 3;
+	plan->n = malloc(plan->dim * sizeof(*plan->n));
+	STARPU_ASSERT(plan->n);
+	plan->n[0] = n;
+	plan->n[1] = m;
+	plan->n[2] = p;
+
+	/* Number of elements of the whole n x m x p array: the in/out vectors
+	 * are registered with this many elements, so all three dimensions count */
+	plan->totsize = n * m * p;
+
+	plan->type = C2C;
+	plan->sign = sign;
+
+	/* Initialize per-worker working set */
+	for (workerid = 0; workerid < starpu_worker_get_count(); workerid++) {
+		switch (starpu_worker_get_type(workerid)) {
+		case STARPU_CPU_WORKER:
+#ifdef STARPU_HAVE_FFTW
+			/* fft plan: one fft of size n, m, p. */
+			plan->plans[workerid].plan_cpu = _FFTW(plan_dft_3d)(n, m, p, NULL, (void*) 1, sign, _FFTW_FLAGS);
+			STARPU_ASSERT(plan->plans[workerid].plan_cpu);
+#else
+/* #warning libstarpufft can not work correctly if libfftw3 is not installed */
+#endif
+			break;
+		case STARPU_CUDA_WORKER:
+			/* cuFFT plans are created below, on the workers themselves */
+			break;
+		default:
+			/* Do not care, we won't be executing anything there. */
+			break;
+		}
+	}
+#ifdef __STARPU_USE_CUDA
+	starpu_execute_on_each_worker(STARPUFFT(fft_3d_plan_gpu), plan, STARPU_CUDA);
+#endif
+
+	return plan;
+}
+
+/* Submit the single task computing the whole 3D fft from in to out.
+ * Returns the submitted (non-detached) task, or NULL when
+ * starpu_task_submit() reports -ENODEV. */
+static struct starpu_task *
+STARPUFFT(start3dC2C)(STARPUFFT(plan) plan, starpu_data_handle_t in, starpu_data_handle_t out)
+{
+	STARPU_ASSERT(plan->type == C2C);
+	int ret;
+
+if (PARALLEL) {
+	/* TODO */
+	STARPU_ASSERT(0);
+	/* Only reached if the assertion is compiled out: never fall off the
+	 * end of a function returning a pointer */
+	return NULL;
+} else /* !PARALLEL */ {
+	struct starpu_task *task;
+
+	/* Create FFT task */
+	task = starpu_task_create();
+	task->detach = 0;
+	task->cl = &STARPUFFT(fft_3d_codelet);
+	task->handles[0] = in;
+	task->handles[1] = out;
+	task->cl_arg = plan;
+
+	ret = starpu_task_submit(task);
+	if (ret == -ENODEV) return NULL;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+	return task;
+}
+}

+ 30 - 4
starpufft/tests/testx.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2010-2015,2017                           CNRS
  * Copyright (C) 2012,2013,2017                           Inria
- * Copyright (C) 2009-2012,2014                           Université de Bordeaux
+ * Copyright (C) 2009-2012,2014,2019                      Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -111,7 +111,7 @@ int main(int argc, char *argv[])
 {
 	int i, ret;
 	int size;
-	int n = 0, m = 0;
+	int n = 0, m = 0, p = 0;
 	STARPUFFT(plan) plan;
 	starpu_data_handle_t in_handle, out_handle;
 #ifdef STARPU_HAVE_FFTW
@@ -157,6 +157,15 @@ int main(int argc, char *argv[])
 		/* 2D */
 		size = n * m;
 	}
+	else if (argc == 4)
+	{
+		n = atoi(argv[1]);
+		m = atoi(argv[2]);
+		p = atoi(argv[3]);
+
+		/* 3D */
+		size = n * m * p;
+	}
 	else
 	{
 		assert(0);
@@ -166,10 +175,12 @@ int main(int argc, char *argv[])
 	bytes = size * sizeof(STARPUFFT(complex));
 #endif
 
-	STARPUFFT(complex) *in = STARPUFFT(malloc)(size * sizeof(*in));
+	STARPUFFT(complex) *in_orig = STARPUFFT(malloc)(size * sizeof(*in_orig));
 	starpu_srand48(0);
 	for (i = 0; i < size; i++)
-		in[i] = starpu_drand48() + I * starpu_drand48();
+		in_orig[i] = starpu_drand48() + I * starpu_drand48();
+
+	STARPUFFT(complex) *in = STARPUFFT(malloc)(size * sizeof(*in));
 
 	STARPUFFT(complex) *out = STARPUFFT(malloc)(size * sizeof(*out));
 
@@ -203,12 +214,23 @@ int main(int argc, char *argv[])
 		STARPU_ASSERT(cufftPlan2d(&cuda_plan, n, m, _CUFFT_C2C) == CUFFT_SUCCESS);
 #endif
 	}
+	else if (argc == 4)
+	{
+		plan = STARPUFFT(plan_dft_3d)(n, m, p, SIGN, 0);
+#ifdef STARPU_HAVE_FFTW
+		fftw_plan = _FFTW(plan_dft_3d)(n, m, p, NULL, (void*) 1, SIGN, FFTW_ESTIMATE);
+#endif
+#ifdef STARPU_USE_CUDA
+		STARPU_ASSERT(cufftPlan3d(&cuda_plan, n, m, p, _CUFFT_C2C) == CUFFT_SUCCESS);
+#endif
+	}
 	else
 	{
 		assert(0);
 	}
 
 #ifdef STARPU_HAVE_FFTW
+	memcpy(in, in_orig, size * sizeof(*in));
 	gettimeofday(&begin, NULL);
 	_FFTW(execute_dft)(fftw_plan, in, out_fftw);
 	gettimeofday(&end, NULL);
@@ -217,6 +239,7 @@ int main(int argc, char *argv[])
 	printf("FFTW took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing);
 #endif
 #ifdef STARPU_USE_CUDA
+	memcpy(in, in_orig, size * sizeof(*in));
 	gettimeofday(&begin, NULL);
 	if (cufftExecC2C(cuda_plan, (cufftComplex*) in, (cufftComplex*) out_cuda, CUFFT_FORWARD) != CUFFT_SUCCESS)
 		printf("erf2\n");
@@ -228,6 +251,7 @@ int main(int argc, char *argv[])
 	printf("CUDA took %2.2f ms (%2.2f MB/s)\n\n", timing/1000, bytes/timing);
 #endif
 
+	memcpy(in, in_orig, size * sizeof(*in));
 	ret = STARPUFFT(execute)(plan, in, out);
 	if (ret == -1) return 77;
 	STARPUFFT(showstats)(stdout);
@@ -240,6 +264,7 @@ int main(int argc, char *argv[])
 #endif
 
 #if 1
+	memcpy(in, in_orig, size * sizeof(*in));
 	starpu_vector_data_register(&in_handle, STARPU_MAIN_RAM, (uintptr_t) in, size, sizeof(*in));
 	starpu_vector_data_register(&out_handle, STARPU_MAIN_RAM, (uintptr_t) out, size, sizeof(*out));
 
@@ -275,6 +300,7 @@ int main(int argc, char *argv[])
 #endif
 #endif
 
+	STARPUFFT(free)(in_orig);
 	STARPUFFT(free)(in);
 	STARPUFFT(free)(out);
 

+ 45 - 5
tests/datawizard/data_implicit_deps.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2010,2011,2013-2016,2019                 Université de Bordeaux
  * Copyright (C) 2011-2013                                Inria
- * Copyright (C) 2010-2013,2015,2017                      CNRS
+ * Copyright (C) 2010-2013,2015,2017,2019                 CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -82,6 +82,24 @@ void g_cuda(void *descr[], void *arg)
 }
 #endif
 
+#ifdef STARPU_USE_OPENCL
+/* OpenCL implementation of g: after a 100 ms pause, store 42 into the
+ * variable behind descr[0]. */
+void g_opencl(void *descr[], void *arg)
+{
+	(void)arg;
+	STARPU_SKIP_IF_VALGRIND;
+
+	const unsigned answer = 42;
+	cl_mem dev_var = (cl_mem) STARPU_VARIABLE_GET_PTR(descr[0]);
+
+	usleep(100000);
+	cl_command_queue cq;
+	starpu_opencl_get_current_queue(&cq);
+
+	/* Blocking write of the constant into the OpenCL buffer */
+	clEnqueueWriteBuffer(cq, dev_var, CL_TRUE, 0, sizeof(answer), &answer, 0, NULL, NULL);
+	clFinish(cq);
+}
+#endif
+
 static struct starpu_codelet cl_g =
 {
 	.modes = { STARPU_RW, STARPU_R, STARPU_RW },
@@ -89,8 +107,9 @@ static struct starpu_codelet cl_g =
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {g_cuda},
 #endif
-	// TODO
-	//.opencl_funcs = {g},
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {g_opencl},
+#endif
 	.cpu_funcs_name = {"g"},
 	.nbuffers = 3,
 };
@@ -122,6 +141,26 @@ void h_cuda(void *descr[], void *arg)
 }
 #endif
 
+#ifdef STARPU_USE_OPENCL
+/* OpenCL implementation of h: read back the variable behind descr[0] and
+ * check that it holds 42. */
+void h_opencl(void *descr[], void *arg)
+{
+	(void)arg;
+	STARPU_SKIP_IF_VALGRIND;
+
+	unsigned value = 0;
+	cl_command_queue cq;
+	cl_mem dev_var = (cl_mem) STARPU_VARIABLE_GET_PTR(descr[0]);
+
+	/* Blocking read of the OpenCL buffer back into value */
+	starpu_opencl_get_current_queue(&cq);
+	clEnqueueReadBuffer(cq, dev_var, CL_TRUE, 0, sizeof(value), &value, 0, NULL, NULL);
+	clFinish(cq);
+
+	FPRINTF(stderr, "VAR %u (should be 42)\n", value);
+	STARPU_ASSERT(value == 42);
+}
+#endif
+
 static struct starpu_codelet cl_h =
 {
 	.modes = { STARPU_RW, STARPU_R, STARPU_RW },
@@ -129,8 +168,9 @@ static struct starpu_codelet cl_h =
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {h_cuda},
 #endif
-	// TODO
-	//.opencl_funcs = {h},
+#ifdef STARPU_USE_OPENCL
+	.opencl_funcs = {h_opencl},
+#endif
 	.cpu_funcs_name = {"h"},
 	.nbuffers = 3
 };

+ 6 - 0
tests/datawizard/variable_size.c

@@ -116,6 +116,11 @@ static size_t variable_size_get_size(starpu_data_handle_t handle)
 	return interface->size;
 }
 
+/* get_max_size hook: reports FULLSIZE regardless of the handle */
+static size_t variable_size_get_max_size(starpu_data_handle_t handle STARPU_ATTRIBUTE_UNUSED)
+{
+	return FULLSIZE;
+}
+
 static uint32_t variable_size_footprint(starpu_data_handle_t handle)
 {
 	return starpu_hash_crc32c_be(variable_size_get_size(handle), 0);
@@ -195,6 +200,7 @@ static struct starpu_data_interface_ops starpu_interface_variable_size_ops =
 	.free_data_on_node = free_variable_size_on_node,
 	.copy_methods = &variable_size_copy_data_methods,
 	.get_size = variable_size_get_size,
+	.get_max_size = variable_size_get_max_size,
 	.footprint = variable_size_footprint,
 	.compare = variable_size_compare,
 	.interfaceid = STARPU_UNKNOWN_INTERFACE_ID,

+ 6 - 2
tests/fault-tolerance/retry.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011-2013,2015,2017                      CNRS
+ * Copyright (C) 2011-2013,2015,2017,2019                 CNRS
  * Copyright (C) 2017                                     Inria
  * Copyright (C) 2019                                     Université de Bordeaux
  *
@@ -86,6 +86,10 @@ int main(void)
         starpu_data_handle_t h_x, h_y;
 	int ret, ret1;
 
+	if (starpu_get_env_number_default("STARPU_GLOBAL_ARBITER", 0) > 0)
+		/* TODO _submit_job_take_data_deps */
+		return STARPU_TEST_SKIPPED;
+
 	ret = starpu_init(NULL);
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
@@ -96,7 +100,7 @@ int main(void)
 	retry = 0;
 	ret1 = starpu_task_insert(&my_codelet,
 				  STARPU_PROLOGUE_CALLBACK, starpu_task_ft_prologue,
-				  STARPU_PROLOGUE_CALLBACK_ARG, check_ft,
+				  STARPU_PROLOGUE_CALLBACK_ARG_NFREE, check_ft,
 				  STARPU_R, h_x,
 				  STARPU_W, h_y,
 				  0);

+ 5 - 5
tests/main/codelet_null_callback.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2013-2015,2017                           CNRS
+ * Copyright (C) 2013-2015,2017,2019                      CNRS
  * Copyright (C) 2015,2016                                Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -71,25 +71,25 @@ int main(int argc, char **argv)
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
 	ret = starpu_task_insert(NULL,
-				 STARPU_CALLBACK_WITH_ARG, callback, &x,
+				 STARPU_CALLBACK_WITH_ARG_NFREE, callback, &x,
 				 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
 	ret = starpu_task_insert(NULL,
 				 STARPU_CALLBACK, callback2,
-				 STARPU_CALLBACK_ARG, &x2,
+				 STARPU_CALLBACK_ARG_NFREE, &x2,
 				 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
 	ret = starpu_task_insert(NULL,
 				 STARPU_PROLOGUE_CALLBACK, prologue_callback,
-				 STARPU_PROLOGUE_CALLBACK_ARG, &y,
+				 STARPU_PROLOGUE_CALLBACK_ARG_NFREE, &y,
 				 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 
 	ret = starpu_task_insert(NULL,
 				 STARPU_PROLOGUE_CALLBACK_POP, prologue_callback_pop,
-				 STARPU_PROLOGUE_CALLBACK_POP_ARG, &z,
+				 STARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE, &z,
 				 0);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
 

+ 3 - 3
tests/model-checking/Makefile

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2017  Université de Bordeaux
+# Copyright (C) 2017, 2019  Université de Bordeaux
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -15,8 +15,8 @@
 
 STARPU=../../
 CPPFLAGS=-I$(STARPU)/src -I$(STARPU)/include -I.
-CFLAGS=-Wall -Wextra -g -DNOCONFIG
-LDFLAGS=-lsimgrid
+CFLAGS+=-Wall -Wextra -g -DNOCONFIG
+LDFLAGS+=-lsimgrid -lm -Wl,-znorelro -Wl,-znoseparate-code
 
 MC_FLAGS=--cfg=model-check/reduction:none
 

+ 31 - 6
tests/model-checking/prio_list.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2017                                     CNRS
  * Copyright (C) 2017                                     Inria
- * Copyright (C) 2017                                     Université de Bordeaux
+ * Copyright (C) 2017,2019                                Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -25,7 +25,14 @@
 #define _GNU_SOURCE 1
 // Assuming recent simgrid
 #define STARPU_HAVE_SIMGRID_MSG_H
+#define STARPU_HAVE_SIMGRID_SEMAPHORE_H
+#define STARPU_HAVE_SIMGRID_MUTEX_H
+#define STARPU_HAVE_SIMGRID_COND_H
+#define STARPU_HAVE_SIMGRID_BARRIER_H
 #define STARPU_HAVE_XBT_SYNCHRO_H
+#define HAVE_SIMGRID_GET_CLOCK
+#define HAVE_SG_ACTOR_SLEEP_FOR
+#define HAVE_SG_CFG_SET_INT
 #endif
 #include <unistd.h>
 #include <stdlib.h>
@@ -62,7 +69,15 @@
 
 // MC_ignore
 
+#ifdef STARPU_HAVE_SIMGRID_MUTEX_H
+sg_mutex_t mutex[NLISTS];
+#define mutex_lock(l) sg_mutex_lock(l)
+#define mutex_unlock(l) sg_mutex_unlock(l)
+#else
 xbt_mutex_t mutex[NLISTS];
+#define mutex_lock(l) xbt_mutex_acquire(l)
+#define mutex_unlock(l) xbt_mutex_release(l)
+#endif
 
 
 LIST_TYPE(foo,
@@ -114,13 +129,13 @@ int worker(int argc, char *argv[])
 			elem->prio = res%10;
 			lrand48_r(&buffer, &res);
 			elem->back = res%2;
-			xbt_mutex_acquire(mutex[l]);
+			mutex_lock(mutex[l]);
 			if (elem->back)
 				foo_prio_list_push_back(&mylist[l], elem);
 			else
 				foo_prio_list_push_front(&mylist[l], elem);
 			check_list_prio(&mylist[l]);
-			xbt_mutex_release(mutex[l]);
+			mutex_unlock(mutex[l]);
 		}
 
 		for (i = 0; i < NELEMENTS; i++)
@@ -128,18 +143,22 @@ int worker(int argc, char *argv[])
 			lrand48_r(&buffer, &res);
 			n = res%(NELEMENTS-i);
 
-			xbt_mutex_acquire(mutex[l]);
+			mutex_lock(mutex[l]);
 			for (elem  = foo_prio_list_begin(&mylist[l]);
 			     n--;
 			     elem  = foo_prio_list_next(&mylist[l], elem))
 				;
 			foo_prio_list_erase(&mylist[l], elem);
 			check_list_prio(&mylist[l]);
-			xbt_mutex_release(mutex[l]);
+			mutex_unlock(mutex[l]);
 		}
 
 		/* horrible way to wait for list getting empty */
+#ifdef HAVE_SG_ACTOR_SLEEP_FOR
+		sg_actor_sleep_for(1000);
+#else
 		MSG_process_sleep(1000);
+#endif
 	}
 
 	return 0;
@@ -151,7 +170,11 @@ int master(int argc, char *argv[])
 
 	for (l = 0; l < NLISTS; l++)
 	{
+#ifdef STARPU_HAVE_SIMGRID_MUTEX_H
+		mutex[l] = sg_mutex_init();
+#else
 		mutex[l] = xbt_mutex_init();
+#endif
 		foo_prio_list_init(&mylist[l]);
 	}
 
@@ -177,7 +200,9 @@ int main(int argc, char *argv[])
 	}
 	srand48(0);
 	MSG_init(&argc, argv);
-#if SIMGRID_VERSION_MAJOR < 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR < 13)
+#ifdef HAVE_SG_CFG_SET_INT
+	sg_cfg_set_int("contexts/stack-size", 128);
+#elif SIMGRID_VERSION_MAJOR < 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR < 13)
 	extern xbt_cfg_t _sg_cfg_set;
 	xbt_cfg_set_int(_sg_cfg_set, "contexts/stack-size", 128);
 #else

+ 31 - 2
tests/model-checking/starpu_barrier.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2017                                     CNRS
- * Copyright (C) 2017                                     Université de Bordeaux
+ * Copyright (C) 2017,2019                                Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -19,6 +19,10 @@
 #define _STARPU_MALLOC(p, s) do {p = malloc(s);} while (0)
 #define _STARPU_CALLOC(p, n, s) do {p = calloc(n, s);} while (0)
 #define _STARPU_REALLOC(p, s) do {p = realloc(p, s);} while (0)
+#define STARPU_HG_DISABLE_CHECKING(v) ((void) 0)
+#define STARPU_HG_ENABLE_CHECKING(v) ((void) 0)
+#define ANNOTATE_HAPPENS_AFTER(v) ((void) 0)
+#define ANNOTATE_HAPPENS_BEFORE(v) ((void) 0)
 
 #define STARPU_DEBUG_PREFIX "[starpu]"
 #ifdef STARPU_VERBOSE
@@ -37,7 +41,14 @@
 #endif
 // Assuming recent simgrid
 #define STARPU_HAVE_SIMGRID_MSG_H
+#define STARPU_HAVE_SIMGRID_SEMAPHORE_H
+#define STARPU_HAVE_SIMGRID_MUTEX_H
+#define STARPU_HAVE_SIMGRID_COND_H
+#define STARPU_HAVE_SIMGRID_BARRIER_H
 #define STARPU_HAVE_XBT_SYNCHRO_H
+#define HAVE_SIMGRID_GET_CLOCK
+#define HAVE_SG_ACTOR_SLEEP_FOR
+#define HAVE_SG_CFG_SET_INT
 #endif
 #include <unistd.h>
 #include <stdlib.h>
@@ -65,12 +76,28 @@ _starpu_simgrid_thread_start(int argc, char *argv[])
 
 static void _starpu_clock_gettime(struct timespec *ts)
 {
+#ifdef HAVE_SIMGRID_GET_CLOCK
+	double now = simgrid_get_clock();
+#else
 	double now = MSG_get_clock();
+#endif
 	ts->tv_sec = floor(now);
 	ts->tv_nsec = floor((now - ts->tv_sec) * 1000000000);
 }
 
+void starpu_sleep(float nb_sec)
+{	/* Forward nb_sec to sg_actor_sleep_for() when HAVE_SG_ACTOR_SLEEP_FOR is defined, otherwise to MSG_process_sleep(). */
+#ifdef HAVE_SG_ACTOR_SLEEP_FOR
+	sg_actor_sleep_for(nb_sec);
+#else
+	MSG_process_sleep(nb_sec);
+#endif
+}
+
 #include <common/barrier.c>
+#undef STARPU_DEBUG
+int starpu_worker_get_id(void) { return 0; }	/* stub, always 0; presumably satisfies a reference from the common/thread.c include that follows -- confirm */
+static inline unsigned _starpu_worker_mutex_is_sched_mutex(int workerid, starpu_pthread_mutex_t *mutex) { return 0; }	/* stub, always 0; both parameters are ignored */
 #include <common/thread.c>
 
 #ifndef NTHREADS
@@ -125,7 +152,9 @@ int main(int argc, char *argv[])
 	}
 	srand48(0);
 	MSG_init(&argc, argv);
-#if SIMGRID_VERSION_MAJOR < 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR < 13)
+#ifdef HAVE_SG_CFG_SET_INT
+	sg_cfg_set_int("contexts/stack-size", 128);
+#elif SIMGRID_VERSION_MAJOR < 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR < 13)
 	extern xbt_cfg_t _sg_cfg_set;
 	xbt_cfg_set_int(_sg_cfg_set, "contexts/stack-size", 128);
 #else

+ 3 - 2
tests/sched_ctx/sched_ctx_hierarchy.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2017                                     CNRS
+ * Copyright (C) 2017, 2019                               CNRS
  * Copyright (C) 2017                                     Inria
  * Copyright (C) 2017                                     Université de Bordeaux
  *
@@ -21,7 +21,8 @@
 
 void free_codelet(void *arg)
 {
-	free(arg);
+	// Intentionally empty: StarPU frees this function's argument automatically,
+	// so it must not be released a second time here.
 }
 
 void func_cpu_bis(void *descr[], void *_args)

+ 163 - 4
tools/starpu_replay.c

@@ -126,6 +126,163 @@ static struct perfmodel
 	char * model_name;
 } * model_hash;
 
+
+
+/*
+ * Replay data interface
+ * We don't care about many things anyway, essentially only sizes.
+ */
+
+struct replay_interface
+{
+	enum starpu_data_interface_id id;	/* set from replay_interface_ops.interfaceid in replay_data_register() */
+	starpu_data_handle_t orig_handle;	/* handle value supplied by the caller of replay_data_register() */
+	size_t size;	/* returned by replay_get_size(); also used for footprint, compare and copy length */
+	size_t alloc_size;	/* amount passed to starpu_memory_allocate()/starpu_memory_deallocate() */
+	size_t max_size;	/* returned by replay_get_max_size() */
+};
+/* Forward declaration: replay_data_register() uses the ops table before its initializer, which must follow the callbacks it points to. */
+static struct starpu_data_interface_ops replay_interface_ops;
+static void register_replay(starpu_data_handle_t handle, unsigned home_node, void *data_interface)
+{	/* register_data_handle callback: copy every field of data_interface into the handle's interface on each node below STARPU_MAXNODES. */
+	(void) home_node;	/* unused: all nodes receive the same field values */
+	struct replay_interface *replay_interface = data_interface;
+	unsigned node;
+	for (node = 0; node < STARPU_MAXNODES; node++)
+	{
+		struct replay_interface *local_interface =
+			starpu_data_get_interface_on_node(handle, node);
+
+		local_interface->id = replay_interface->id;
+		local_interface->orig_handle = replay_interface->orig_handle;
+		local_interface->size = replay_interface->size;
+		local_interface->alloc_size = replay_interface->alloc_size;
+		local_interface->max_size = replay_interface->max_size;
+	}
+}
+/* Register *handleptr with the replay interface: fill a local descriptor from the arguments and pass it to starpu_data_register(). */
+static void replay_data_register(starpu_data_handle_t *handleptr, starpu_data_handle_t orig_handle, int home_node, size_t size, size_t alloc_size, size_t max_size)
+{
+	if (replay_interface_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
+	{	/* no id assigned yet: take the next free interface id and keep it in the ops table */
+		replay_interface_ops.interfaceid = starpu_data_interface_get_next_id();
+	}
+	struct replay_interface interface = {
+		.id = replay_interface_ops.interfaceid,
+		.orig_handle = orig_handle,
+		.size = size,
+		.alloc_size = alloc_size,
+		.max_size = max_size,
+	};
+
+	starpu_data_register(handleptr, home_node, &interface, &replay_interface_ops);
+}
+/* get_size callback: return the size field of the handle's interface on STARPU_MAIN_RAM. */
+static size_t replay_get_size(starpu_data_handle_t handle)
+{
+	struct replay_interface *interface =
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+	return interface->size;
+}
+/* get_alloc_size callback: return the alloc_size field of the handle's interface on STARPU_MAIN_RAM. */
+static size_t replay_get_alloc_size(starpu_data_handle_t handle)
+{
+	struct replay_interface *interface =
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+	return interface->alloc_size;
+}
+/* get_max_size callback: return the max_size field of the handle's interface on STARPU_MAIN_RAM. */
+static size_t replay_get_max_size(starpu_data_handle_t handle)
+{
+	struct replay_interface *interface =
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+	return interface->max_size;
+}
+/* footprint callback: the footprint depends only on the size, hashed with starpu_hash_crc32c_be(size, 0). */
+static uint32_t replay_footprint(starpu_data_handle_t handle)
+{
+	return starpu_hash_crc32c_be(replay_get_size(handle), 0);
+}
+/* compare callback: return non-zero when both interfaces have the same size field, 0 otherwise. */
+static int replay_compare(void *data_interface_a, void *data_interface_b)
+{
+	struct replay_interface *replay_a = data_interface_a;
+	struct replay_interface *replay_b = data_interface_b;
+
+	/* Two variables are considered compatible if they have the same size */
+	return replay_a->size == replay_b->size;
+}
+/* display callback: print "size/alloc_size/max_size" followed by a tab to f, read from the interface on STARPU_MAIN_RAM. */
+static void display_replay(starpu_data_handle_t handle, FILE *f)
+{
+	struct replay_interface *replay_interface =
+		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+
+	fprintf(f, "%lu/%lu/%lu\t",
+			(unsigned long) replay_interface->size,	/* size_t cast to match %lu */
+			(unsigned long) replay_interface->alloc_size,
+			(unsigned long) replay_interface->max_size);
+}
+/* describe callback: write "r" + "size/alloc_size/max_size" + tab into buf (bounded by size) and return snprintf()'s result. */
+static starpu_ssize_t describe_replay(void *data_interface, char *buf, size_t size)
+{
+	struct replay_interface *replay_interface = data_interface;
+	return snprintf(buf, size, "r%lu/%lu/%lu\t",
+			(unsigned long) replay_interface->size,
+			(unsigned long) replay_interface->alloc_size,
+			(unsigned long) replay_interface->max_size);
+}
+/* allocate_data_on_node callback: only account for alloc_size bytes on dst_node; no buffer pointer is stored in the interface. */
+static starpu_ssize_t allocate_replay_on_node(void *data_interface, unsigned dst_node)
+{
+	struct replay_interface *replay_interface = data_interface;
+	starpu_memory_allocate(dst_node, replay_interface->alloc_size, STARPU_MEMORY_OVERFLOW);	/* result ignored */
+	return 0;	/* always 0, whatever starpu_memory_allocate() returned */
+}
+/* free_data_on_node callback: give back the alloc_size bytes accounted on dst_node by allocate_replay_on_node(). */
+static void free_replay_on_node(void *data_interface, unsigned dst_node)
+{
+	struct replay_interface *replay_interface = data_interface;
+	starpu_memory_deallocate(dst_node, replay_interface->alloc_size);
+}
+/* any_to_any copy method: request a transfer of src->size bytes from src_node to dst_node and return starpu_interface_copy()'s result. */
+static int replay_copy(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)
+{
+	(void) dst_interface;	/* unused: only the source size is read */
+	struct replay_interface *src = src_interface;
+
+	/* We don't care about pointers: the literal 1 and 0 arguments are placeholders (presumably address and offset -- confirm against starpu_interface_copy) */
+	return starpu_interface_copy(1, 0, src_node, 1, 0, dst_node, src->size, async_data);
+}
+/* Copy methods of the replay interface: only any_to_any is set, pointing at replay_copy. */
+static const struct starpu_data_copy_methods replay_copy_data_methods =
+{
+	.any_to_any = replay_copy,
+};
+/* Operations table of the replay interface, wiring the callbacks defined in this section. */
+static struct starpu_data_interface_ops replay_interface_ops =
+{
+	.register_data_handle = register_replay,
+	.allocate_data_on_node = allocate_replay_on_node,
+	.free_data_on_node = free_replay_on_node,
+	.copy_methods = &replay_copy_data_methods,
+	.get_size = replay_get_size,
+	.get_alloc_size = replay_get_alloc_size,
+	.get_max_size = replay_get_max_size,
+	.footprint = replay_footprint,
+	.compare = replay_compare,
+	.interfaceid = STARPU_UNKNOWN_INTERFACE_ID,	/* replaced by a real id in replay_data_register() */
+	.interface_size = sizeof(struct replay_interface),
+	.display = display_replay,
+	.pack_data = NULL,	/* no pack method provided */
+	.unpack_data = NULL,	/* no unpack method provided */
+	.describe = describe_replay,
+
+	/* We want to observe actual allocations/deallocations */
+	.dontcache = 1,
+};
+
+
 /* [SUBMITORDER] The tree of the submit order */
 
 static struct starpu_rbtree tree = STARPU_RBTREE_INITIALIZER;
@@ -296,7 +453,9 @@ static void variable_data_register_check(size_t * array_of_size, int nb_handles)
 
 				handles_cell->handle = handles_ptr[h]; /* Get the hidden key (initial handle from the file) to store it as a key*/
 
-				starpu_variable_data_register(handles_ptr+h, STARPU_MAIN_RAM, (uintptr_t) 1, array_of_size[h]);
+				replay_data_register(handles_ptr+h, handles_ptr[h],
+						modes_ptr[h] & STARPU_R ? STARPU_MAIN_RAM : -1,
+						array_of_size[h], array_of_size[h], array_of_size[h]);
 
 				handles_cell->mem_ptr = handles_ptr[h]; /* Store the new value of the handle into the hash table */
 
@@ -708,7 +867,7 @@ int main(int argc, char **argv)
 						int one = 0;
 						for (i = 0; i < narch ; i++)
 						{
-							struct starpu_perfmodel_arch *arch = starpu_perfmodel_arch_comb_fetch(i);
+							arch = starpu_perfmodel_arch_comb_fetch(i);
 							perfTime[i] = starpu_perfmodel_history_based_expected_perf(&realmodel->perfmodel, arch, footprint);
 							if (!(perfTime[i] == 0 || isnan(perfTime[i])))
 								one = 1;
@@ -867,7 +1026,7 @@ int main(int argc, char **argv)
 		else if (TEST("Modes"))
 		{
 			char * buffer = s + 7;
-			int mode_i = 0;
+			unsigned mode_i = 0;
 			const char * delim = " ";
 			char * token = strtok(buffer, delim);
 
@@ -902,7 +1061,7 @@ int main(int argc, char **argv)
 			char *  buffer = s + 7;
 			const char * delim = " ";
 			char * token = strtok(buffer, delim);
-			int k = 0;
+			unsigned k = 0;
 
 			_STARPU_MALLOC(sizes_set, nb_parameters * sizeof(size_t));