Browse Source

Merge branch 'master' into fpga2

Nathalie Furmento 5 years ago
parent
commit
b2b6d1e715
100 changed files with 1427 additions and 638 deletions
  1. 3 0
      ChangeLog
  2. 8 2
      configure.ac
  3. 0 8
      doc/doxygen/Makefile.am
  4. 23 3
      doc/doxygen/chapters/101_building.doxy
  5. 7 2
      doc/doxygen/chapters/410_mpi_support.doxy
  6. 2 2
      doc/doxygen/chapters/450_native_fortran_support.doxy
  7. 32 2
      doc/doxygen/chapters/501_environment_variables.doxy
  8. 8 1
      doc/doxygen/chapters/510_configure_options.doxy
  9. 0 25
      doc/doxygen/chapters/api/codelet_and_tasks.doxy
  10. 0 28
      doc/doxygen/chapters/api/mic_extensions.doxy
  11. 0 34
      doc/doxygen/chapters/api/mpi.doxy
  12. 0 35
      doc/doxygen/chapters/api/opencl_extensions.doxy
  13. 0 25
      doc/doxygen/chapters/api/openmp_runtime_support.doxy
  14. 0 25
      doc/doxygen/chapters/api/scheduling_contexts.doxy
  15. 0 24
      doc/doxygen/chapters/api/scheduling_policy.doxy
  16. 0 12
      doc/doxygen/chapters/api/versioning.doxy
  17. 0 39
      doc/doxygen/chapters/api/workers.doxy
  18. 0 1
      doc/doxygen/doxygen-config.cfg.in
  19. 87 5
      doc/doxygen_dev/Makefile.am
  20. 85 2
      doc/doxygen_dev/doxygen-config.cfg.in
  21. 86 2
      doc/doxygen_dev/refman.tex
  22. 3 3
      examples/Makefile.am
  23. 0 3
      examples/Makefile.am.tmp.y
  24. 64 58
      examples/cpp/add_vectors_interface.cpp
  25. 2 2
      examples/interface/complex_interface.c
  26. 2 2
      examples/mult/sgemm.sh
  27. 5 4
      examples/scheduler/schedulers.sh
  28. 5 4
      examples/scheduler/schedulers_context.sh
  29. 17 2
      examples/stencil/implicit-stencil.c
  30. 17 2
      examples/stencil/stencil.c
  31. 52 0
      include/starpu_clusters.h
  32. 128 3
      include/starpu_config.h.in
  33. 3 3
      include/starpu_openmp.h
  34. 12 0
      include/starpu_sched_component.h
  35. 3 0
      include/starpu_task_util.h
  36. 5 1
      include/starpu_thread.h
  37. 10 4
      mpi/examples/Makefile.am
  38. 1 1
      mpi/examples/filters/filter.c
  39. 8 0
      mpi/examples/helper.h
  40. 3 3
      mpi/examples/matrix_decomposition/mpi_cholesky.c
  41. 2 2
      mpi/examples/matrix_decomposition/mpi_cholesky_distributed.c
  42. 10 0
      mpi/examples/matrix_decomposition/mpi_decomposition_params.c
  43. 10 0
      mpi/examples/mpi_lu/plu_example.c
  44. 10 0
      mpi/examples/mpi_lu/plu_implicit_example.c
  45. 10 0
      mpi/examples/mpi_lu/plu_outofcore_example.c
  46. 1 1
      mpi/examples/native_fortran/nf_basic_ring.f90
  47. 2 1
      mpi/examples/native_fortran/nf_mm.f90
  48. 2 1
      mpi/examples/native_fortran/nf_mm_task_build.f90
  49. 87 3
      mpi/examples/user_datatype/my_interface.c
  50. 4 1
      mpi/examples/user_datatype/my_interface.h
  51. 98 0
      mpi/examples/user_datatype/user_datatype2.c
  52. 25 25
      mpi/include/fstarpu_mpi_mod.f90
  53. 28 3
      mpi/include/starpu_mpi.h
  54. 8 3
      mpi/src/mpi/starpu_mpi_mpi.c
  55. 4 4
      mpi/src/mpi/starpu_mpi_mpi_backend.c
  56. 23 22
      mpi/src/nmad/starpu_mpi_nmad.c
  57. 13 12
      mpi/src/starpu_mpi_coop_sends.c
  58. 45 13
      mpi/src/starpu_mpi_datatype.c
  59. 2 0
      mpi/src/starpu_mpi_init.c
  60. 2 0
      mpi/src/starpu_mpi_private.h
  61. 11 7
      mpi/src/starpu_mpi_select_node.c
  62. 1 1
      mpi/src/starpu_mpi_task_insert_fortran.c
  63. 2 2
      mpi/tests/insert_task_owner2.c
  64. 9 9
      mpi/tests/policy_selection.c
  65. 7 7
      mpi/tests/policy_selection2.c
  66. 33 1
      mpi/tests/sendrecv_bench.c
  67. 3 1
      mpi/tests/user_defined_datatype_value.h
  68. 2 0
      src/common/barrier.h
  69. 2 0
      src/common/barrier_counter.h
  70. 100 38
      src/common/fxt.h
  71. 25 16
      src/common/graph.h
  72. 3 1
      src/common/knobs.h
  73. 0 5
      src/common/list.h
  74. 2 0
      src/common/prio_list.h
  75. 23 21
      src/common/rbtree.h
  76. 20 18
      src/common/rbtree_i.h
  77. 9 7
      src/common/rwlock.h
  78. 2 0
      src/common/starpu_spinlock.h
  79. 20 1
      src/common/thread.c
  80. 2 0
      src/common/thread.h
  81. 2 0
      src/common/timing.h
  82. 6 4
      src/common/utils.h
  83. 2 0
      src/core/combined_workers.h
  84. 2 0
      src/core/debug.h
  85. 4 0
      src/core/dependencies/cg.c
  86. 11 9
      src/core/dependencies/cg.h
  87. 2 0
      src/core/dependencies/data_concurrency.h
  88. 8 0
      src/core/dependencies/dependencies.c
  89. 4 2
      src/core/dependencies/implicit_data_deps.h
  90. 2 0
      src/core/dependencies/tags.c
  91. 2 0
      src/core/dependencies/tags.h
  92. 2 0
      src/core/dependencies/task_deps.c
  93. 3 1
      src/core/detect_combined_workers.h
  94. 10 10
      src/core/disk.h
  95. 2 0
      src/core/disk_ops/unistd/disk_unistd_global.h
  96. 2 0
      src/core/drivers.h
  97. 15 13
      src/core/errorcheck.h
  98. 2 0
      src/core/idle_hook.h
  99. 3 1
      src/core/jobs.c
  100. 0 0
      src/core/jobs.h

+ 3 - 0
ChangeLog

@@ -48,6 +48,9 @@ Small features:
     them from starpu_interface_copy2d and 3d.
   * New function starpu_task_watchdog_set_hook to specify a function
     to be called when the watchdog is raised
+  * Add STARPU_LIMIT_CPU_NUMA_MEM environment variable.
+  * Add STARPU_WORKERS_GETBIND environment variable.
+  * Add STARPU_SCHED_SIMPLE_DECIDE_ALWAYS modular scheduler flag.
 
 StarPU 1.3.3 (git revision 11afc5b007fe1ab1c729b55b47a5a98ef7f3cfad)
 ====================================================================

+ 8 - 2
configure.ac

@@ -288,8 +288,8 @@ if test x$enable_simgrid = xyes ; then
 	AC_CHECK_TYPES([smx_actor_t], [AC_DEFINE([STARPU_HAVE_SMX_ACTOR_T], [1], [Define to 1 if you have the smx_actor_t type.])], [], [[#include <simgrid/simix.h>]])
 
 	# Latest functions
-	AC_CHECK_FUNCS([MSG_process_attach sg_actor_attach sg_actor_init MSG_zone_get_hosts sg_zone_get_hosts MSG_process_self_name MSG_process_userdata_init sg_actor_data])
-	AC_CHECK_FUNCS([xbt_mutex_try_acquire smpi_process_set_user_data SMPI_thread_create sg_zone_get_by_name sg_link_name sg_host_route sg_host_self sg_host_speed simcall_process_create sg_config_continue_after_help])
+	AC_CHECK_FUNCS([MSG_process_attach sg_actor_attach sg_actor_init sg_actor_set_stacksize MSG_zone_get_hosts sg_zone_get_hosts MSG_process_self_name MSG_process_userdata_init sg_actor_data])
+	AC_CHECK_FUNCS([xbt_mutex_try_acquire smpi_process_set_user_data SMPI_thread_create sg_zone_get_by_name sg_link_name sg_host_route sg_host_self sg_host_list sg_host_speed simcall_process_create sg_config_continue_after_help])
 	AC_CHECK_FUNCS([simgrid_init], [AC_DEFINE([STARPU_SIMGRID_HAVE_SIMGRID_INIT], [1], [Define to 1 if you have the `simgrid_init' function.])])
 	AC_CHECK_FUNCS([xbt_barrier_init], [AC_DEFINE([STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT], [1], [Define to 1 if you have the `xbt_barrier_init' function.])])
 	AC_CHECK_FUNCS([sg_actor_sleep_for sg_actor_self sg_actor_ref sg_host_get_properties sg_host_send_to sg_host_sendto sg_cfg_set_int sg_actor_self_execute sg_actor_execute simgrid_get_clock])
@@ -3598,6 +3598,7 @@ AC_CONFIG_COMMANDS([executable-scripts], [
   chmod +x tests/model-checking/starpu-mc.sh
   chmod +x examples/loader-cross.sh
   chmod +x examples/stencil/loader-cross.sh
+  chmod +x tools/starpu_env
   chmod +x tools/starpu_codelet_profile
   chmod +x tools/starpu_codelet_histo_profile
   chmod +x tools/starpu_mpi_comm_matrix.py
@@ -3642,6 +3643,9 @@ AC_CONFIG_COMMANDS([executable-scripts], [
   test -e examples/cholesky/cholesky.sh || ln -sf $ac_abs_top_srcdir/examples/cholesky/cholesky.sh examples/cholesky/
   mkdir -p examples/mult
   test -e examples/mult/sgemm.sh || ln -sf $ac_abs_top_srcdir/examples/mult/sgemm.sh examples/mult/
+  mkdir -p examples/scheduler
+  test -e examples/scheduler/schedulers.sh || ln -sf $ac_abs_top_srcdir/examples/scheduler/schedulers.sh examples/scheduler
+  test -e examples/scheduler/schedulers_context.sh || ln -sf $ac_abs_top_srcdir/examples/scheduler/schedulers_context.sh examples/scheduler
   test -e tools/starpu_paje_draw_histogram.R || ln -sf $ac_abs_top_srcdir/tools/starpu_paje_draw_histogram.R tools/starpu_paje_draw_histogram.R
   test -e tools/starpu_paje_state_stats.R || ln -sf $ac_abs_top_srcdir/tools/starpu_paje_state_stats.R tools/starpu_paje_state_stats.R
   test -e tools/starpu_trace_state_stats.py || ln -sf $ac_abs_top_srcdir/tools/starpu_trace_state_stats.py tools/starpu_trace_state_stats.py
@@ -3672,6 +3676,8 @@ AC_OUTPUT([
 	Makefile
 	src/Makefile
 	tools/Makefile
+	tools/replay/Makefile
+	tools/starpu_env
 	tools/starpu_codelet_profile
 	tools/starpu_codelet_histo_profile
 	tools/starpu_mpi_comm_matrix.py

+ 0 - 8
doc/doxygen/Makefile.am

@@ -96,16 +96,8 @@ chapters =	\
 	chapters/code/disk_copy.c \
 	chapters/code/disk_compute.c \
 	chapters/code/nf_initexit.f90 \
-	chapters/api/codelet_and_tasks.doxy \
 	chapters/api/fft_support.doxy \
-	chapters/api/mpi.doxy \
-	chapters/api/opencl_extensions.doxy \
-	chapters/api/openmp_runtime_support.doxy \
-	chapters/api/mic_extensions.doxy \
-	chapters/api/scheduling_contexts.doxy \
-	chapters/api/scheduling_policy.doxy \
 	chapters/api/versioning.doxy \
-	chapters/api/workers.doxy \
 	chapters/api/threads.doxy
 
 images = 	\

+ 23 - 3
doc/doxygen/chapters/101_building.doxy

@@ -152,6 +152,14 @@ configuration:
 $ make install
 \endverbatim
 
+If you have let StarPU install in <c>/usr/local/</c>, you additionally need to run
+
+\verbatim
+$ sudo ldconfig
+\endverbatim
+
+so the libraries can be found by the system.
+
 Libtool interface versioning information is included in
 library names (<c>libstarpu-1.3.so</c>, <c>libstarpumpi-1.3.so</c> and
 <c>libstarpufft-1.3.so</c>).
@@ -219,6 +227,17 @@ $ starpu_machine_display
 If it does not, please check the output of \c lstopo from \c hwloc and report
 the issue to the \c hwloc project, since this is what StarPU uses to detect the hardware.
 
+<br>
+A tool is provided to help set all the environment variables
+needed by StarPU. Once StarPU is installed in a specific directory,
+sourcing the script <c>bin/starpu_env</c> will set in your current
+environment the variables <c>STARPU_PATH</c>, <c>LD_LIBRARY_PATH</c>,
+<c>PKG_CONFIG_PATH</c>, <c>PATH</c> and <c>MANPATH</c>.
+
+\verbatim
+$ source $STARPU_PATH/bin/starpu_env
+\endverbatim
+
 \subsection IntegratingStarPUInABuildSystem Integrating StarPU in a Build System
 
 \subsubsection StarPUInMake Integrating StarPU in a Make Build System
@@ -504,9 +523,10 @@ It can also be convenient to try simulated benchmarks, if you want to give a try
 at CPU-GPU scheduling without actually having a GPU at hand. This can be done by
 using the SimGrid version of StarPU: first install the SimGrid simulator from
 http://simgrid.gforge.inria.fr/ (we tested with SimGrid from 3.11 to 3.16, and
-3.18 to 3.22, other versions may have compatibility issues, 3.17 notably does
-not build at all. MPI simulation does not work with version 3.22),
-then configure StarPU with \ref enable-simgrid
+3.18 to 3.25. SimGrid versions 3.25 and above need to be configured with -Denable_msg=ON.
+Other versions may have compatibility issues, 3.17 notably does
+not build at all. MPI simulation does not work with version 3.22).
+Then configure StarPU with \ref enable-simgrid
 "--enable-simgrid" and rebuild and install it, and then you can simulate the performance for a
 few virtualized systems shipped along StarPU: attila, mirage, idgraf, and sirocco.
 

File diff suppressed because it is too large
+ 7 - 2
doc/doxygen/chapters/410_mpi_support.doxy


+ 2 - 2
doc/doxygen/chapters/450_native_fortran_support.doxy

@@ -25,7 +25,7 @@ Every symbol of the Native Fortran support API is prefixed by
 
 Note: Mixing uses of <c>fstarpu_</c> and <c>starpu_</c>
 symbols in the same Fortran code has unspecified behaviour.
-See \ref APIMIX for a discussion about valid and unspecified
+See \ref NFAPIMIX for a discussion about valid and unspecified
 combinations.
 
 \section NFImplementation Implementation Details and Specificities
@@ -40,7 +40,7 @@ standard. It has currently been tested successfully with GNU GFortran 4.9,
 GFortran 5.x, GFortran 6.x and the Intel Fortran Compiler >= 2016. It is known
 not to work with GNU GFortran < 4.9, Intel Fortran Compiler < 2016.
 
-See Section \ref OldFortran on information on how to write StarPU
+See Section \ref NFOldFortran on information on how to write StarPU
 Fortran code with older compilers.
 
 \subsection NFConfiguration Configuration

+ 32 - 2
doc/doxygen/chapters/501_environment_variables.doxy

@@ -184,6 +184,16 @@ Setting it to non-zero will prevent StarPU from binding its threads to
 CPUs. This is for instance useful when running the testsuite in parallel.
 </dd>
 
+<dt>STARPU_WORKERS_GETBIND</dt>
+<dd>
+\anchor STARPU_WORKERS_GETBIND
+\addindex __env__STARPU_WORKERS_GETBIND
+Setting it to non-zero makes StarPU use the OS-provided CPU binding to determine
+how many and which CPU cores it should use. This is notably useful when running
+several StarPU-MPI processes on the same host, to let the MPI launcher set the
+CPUs to be used.
+</dd>
+
 <dt>STARPU_WORKERS_CPUID</dt>
 <dd>
 \anchor STARPU_WORKERS_CPUID
@@ -757,6 +767,13 @@ and allows studying scheduling overhead of the runtime system. However,
 it also makes simulation non-deterministic.
 </dd>
 
+<dt>STARPU_SINK</dt>
+<dd>
+\anchor STARPU_SINK
+\addindex __env__STARPU_SINK
+Variable defined by StarPU when running MPI Xeon PHI on the sink.
+</dd>
+
 </dl>
 
 \section MiscellaneousAndDebug Miscellaneous And Debug
@@ -952,8 +969,21 @@ STARPU_MPI_MEM_THROTTLE is set to 1.
 <dd>
 \anchor STARPU_LIMIT_CPU_NUMA_devid_MEM
 \addindex __env__STARPU_LIMIT_CPU_NUMA_devid_MEM
-Specify the maximum number of megabytes that should be
-available to the application on the NUMA node with the OS identifier <c>devid</c>.
+Specify the maximum number of megabytes that should be available to the
+application on the NUMA node with the OS identifier <c>devid</c>.  Setting it
+overrides the value of STARPU_LIMIT_CPU_MEM.
+</dd>
+
+<dt>STARPU_LIMIT_CPU_NUMA_MEM</dt>
+<dd>
+\anchor STARPU_LIMIT_CPU_NUMA_MEM
+\addindex __env__STARPU_LIMIT_CPU_NUMA_MEM
+Specify the maximum number of megabytes that should be available to the
+application on each NUMA node. This is the same as specifying that same amount
+with \ref STARPU_LIMIT_CPU_NUMA_devid_MEM for each NUMA node number. The total
+memory available to StarPU will thus be this amount multiplied by the number of
+NUMA nodes used by StarPU. Any \ref STARPU_LIMIT_CPU_NUMA_devid_MEM additionally
+specified will take precedence over STARPU_LIMIT_CPU_NUMA_MEM.
 </dd>
 
 <dt>STARPU_MINIMUM_AVAILABLE_MEM</dt>

+ 8 - 1
doc/doxygen/chapters/510_configure_options.doxy

@@ -726,12 +726,19 @@ exploring various execution paths.
 <dd>
 \anchor enable-calibration-heuristic
 \addindex __configure__--enable-calibration-heuristic
-Allows to set the maximum authorized percentage of deviation
+Allow setting the maximum authorized percentage of deviation
 for the history-based calibrator of StarPU. A correct value
 of this parameter must be in [0..100]. The default value of
 this parameter is 10. Experimental.
 </dd>
 
+<dt>--disable-mlr</dt>
+<dd>
+\anchor disable-mlr
+\addindex __configure__--disable-mlr
+Allow disabling multiple linear regression models (see \ref PerformanceModelExample).
+</dd>
+
 </dl>
 
 */

+ 0 - 25
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -1,25 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*! \ingroup API_Codelet_And_Tasks
-
-\def STARPU_NMAXBUFS
-\ingroup API_Codelet_And_Tasks
-Define the maximum number of buffers that tasks will be able to take
-as parameters. The default value is 8, it can be changed by using the
-configure option \ref enable-maxbuffers "--enable-maxbuffers".
-
-*/

+ 0 - 28
doc/doxygen/chapters/api/mic_extensions.doxy

@@ -1,28 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*! \ingroup API_MIC_Extensions
-
-\def STARPU_USE_MIC
-\ingroup API_MIC_Extensions
-Defined when StarPU has been installed with MIC support.
-It should be used in your code to detect the availability of MIC.
-
-\def STARPU_MAXMICDEVS
-\ingroup API_MIC_Extensions
-Define the maximum number of MIC devices that are supported by StarPU.
-
-*/

+ 0 - 34
doc/doxygen/chapters/api/mpi.doxy

@@ -1,34 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*! \ingroup API_MPI_Support MPI Support
-
-\def STARPU_USE_MPI
-\ingroup API_MPI_Support
-Defined when StarPU has been installed with MPI support. It should be
-used in your code to detect the availability of MPI.
-
-@name MPI Master Slave
-\anchor MPIMasterSlaveSupport
-\ingroup API_MPI_Support
-
-\def STARPU_USE_MPI_MASTER_SLAVE
-\ingroup API_MPI_Support
-Defined when StarPU has been installed with MPI Master Slave
-support. It should be used in your code to detect the availability of
-MPI Master Slave.
-
-*/

+ 0 - 35
doc/doxygen/chapters/api/opencl_extensions.doxy

@@ -1,35 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*! \ingroup API_OpenCL_Extensions
-
-\def STARPU_USE_OPENCL
-\ingroup API_OpenCL_Extensions
-Defined when StarPU has been installed with
-OpenCL support. It should be used in your code to detect the
-availability of OpenCL as shown in \ref FullSourceCodeVectorScal.
-
-\def STARPU_MAXOPENCLDEVS
-\ingroup API_OpenCL_Extensions
-Define the maximum number of OpenCL devices that are
-supported by StarPU.
-
-\def STARPU_OPENCL_DATADIR
-\ingroup API_OpenCL_Extensions
-Define the directory in which the OpenCL codelets of the
-applications provided with StarPU have been installed.
-
-*/

+ 0 - 25
doc/doxygen/chapters/api/openmp_runtime_support.doxy

@@ -1,25 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2014-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*! \ingroup API_OpenMP_Runtime_Support
-
-\def STARPU_OPENMP
-\ingroup API_OpenMP_Runtime_Support
-This macro is defined when StarPU has been installed with OpenMP Runtime
-support. It should be used in your code to detect the availability of
-the runtime support for OpenMP.
-
-*/

+ 0 - 25
doc/doxygen/chapters/api/scheduling_contexts.doxy

@@ -1,25 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
- * Copyright (C) 2016       Uppsala University
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*! \ingroup API_Scheduling_Contexts
-
-\def STARPU_NMAX_SCHED_CTXS
-\ingroup API_Scheduling_Policy
-Define the maximum number of scheduling contexts managed by StarPU. The default value can be
-modified at configure by using the option \ref enable-max-sched-ctxs "--enable-max-sched-ctxs".
-
-*/

+ 0 - 24
doc/doxygen/chapters/api/scheduling_policy.doxy

@@ -1,24 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*! \ingroup API_Scheduling_Policy Scheduling Policy
-
-\def STARPU_MAXIMPLEMENTATIONS
-\ingroup API_Scheduling_Policy
-Define the maximum number of implementations per architecture. The default value can be modified at
-configure by using the option \ref enable-maximplementations "--enable-maximplementations".
-
-*/

+ 0 - 12
doc/doxygen/chapters/api/versioning.doxy

@@ -16,18 +16,6 @@
 
 /*! \defgroup API_Versioning Versioning
 
-\def STARPU_MAJOR_VERSION
-\ingroup API_Versioning
-Define the major version of StarPU. This is the version used when compiling the application.
-
-\def STARPU_MINOR_VERSION
-\ingroup API_Versioning
-Define the minor version of StarPU. This is the version used when compiling the application.
-
-\def STARPU_RELEASE_VERSION
-\ingroup API_Versioning
-Define the release version of StarPU. This is the version used when compiling the application.
-
 \fn void starpu_get_version(int *major, int *minor, int *release)
 \ingroup API_Versioning
 Return as 3 integers the version of StarPU used when running the application.

+ 0 - 39
doc/doxygen/chapters/api/workers.doxy

@@ -1,39 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*! \ingroup API_Workers_Properties
-
-\def STARPU_NMAXWORKERS
-\ingroup API_Workers_Properties
-Define the maximum number of workers managed by StarPU.
-
-\def STARPU_MAXCPUS
-\ingroup API_Workers_Properties
-Define the maximum number of CPU workers managed by StarPU. The default value can be modified at
-configure by using the option \ref enable-maxcpus "--enable-maxcpus".
-
-\def STARPU_MAXNUMANODES
-\ingroup API_Workers_Properties
-Define the maximum number of NUMA nodes managed by StarPU. The default value can be modified at
-configure by using the option \ref enable-maxnumanodes "--enable-maxnumanodes".
-
-\def STARPU_MAXNODES
-\ingroup API_Workers_Properties
-Define the maximum number of memory nodes managed by StarPU. The default value can be modified at
-configure by using the option \ref enable-maxnodes "--enable-maxnodes". Reducing it allows to
-considerably reduce memory used by StarPU data structures.
-
-*/

+ 0 - 1
doc/doxygen/doxygen-config.cfg.in

@@ -17,7 +17,6 @@
 #
 INPUT                  = @top_srcdir@/doc/doxygen/chapters \
 		       	 @top_srcdir@/doc/doxygen/chapters/api \
-		       	 @top_srcdir@/doc/doxygen/chapters/api/sc_hypervisor \
                          @top_builddir@/doc/doxygen/starpu_config.h \
 			 @top_srcdir@/include/starpu_bitmap.h \
 	 		 @top_srcdir@/include/starpu_bound.h \

+ 87 - 5
doc/doxygen_dev/Makefile.am

@@ -59,8 +59,9 @@ chapters =	\
 images =
 
 if BUILD_DOC
-starpu_config.h: $(top_srcdir)/include/starpu_config.h.in
+config.h: $(top_srcdir)/src/common/config.h.in
 	@$(SED) 's/#undef \(.*\)/#define \1 1/' $< > $@
+	@$(SED) -i '1s/^/\/\*\* \@file \*\/\n/' $@
 
 chapters/version.sty: $(chapters)
 	@for f in $(chapters) ; do \
@@ -102,9 +103,92 @@ chapters/version.html: $(chapters)
 
 dox_inputs = $(DOX_CONFIG) 				\
 	$(chapters) 					\
-	starpu_config.h					\
+	config.h					\
 	chapters/version.sty				\
 	chapters/version.html				\
+	$(top_srcdir)/src/datawizard/data_request.h	\
+	$(top_srcdir)/src/datawizard/coherency.h	\
+	$(top_srcdir)/src/datawizard/sort_data_handles.h	\
+	$(top_srcdir)/src/datawizard/memalloc.h	\
+	$(top_srcdir)/src/datawizard/copy_driver.h	\
+	$(top_srcdir)/src/datawizard/filters.h	\
+	$(top_srcdir)/src/datawizard/datastats.h	\
+	$(top_srcdir)/src/datawizard/write_back.h	\
+	$(top_srcdir)/src/datawizard/interfaces/data_interface.h	\
+	$(top_srcdir)/src/datawizard/memory_manager.h	\
+	$(top_srcdir)/src/datawizard/node_ops.h	\
+	$(top_srcdir)/src/datawizard/memstats.h	\
+	$(top_srcdir)/src/datawizard/datawizard.h	\
+	$(top_srcdir)/src/datawizard/memory_nodes.h	\
+	$(top_srcdir)/src/datawizard/footprint.h	\
+	$(top_srcdir)/src/datawizard/malloc.h	\
+	$(top_srcdir)/src/drivers/cpu/driver_cpu.h	\
+	$(top_srcdir)/src/drivers/cuda/driver_cuda.h	\
+	$(top_srcdir)/src/drivers/opencl/driver_opencl_utils.h	\
+	$(top_srcdir)/src/drivers/opencl/driver_opencl.h	\
+	$(top_srcdir)/src/drivers/disk/driver_disk.h	\
+	$(top_srcdir)/src/drivers/mpi/driver_mpi_common.h	\
+	$(top_srcdir)/src/drivers/mpi/driver_mpi_sink.h	\
+	$(top_srcdir)/src/drivers/mpi/driver_mpi_source.h	\
+	$(top_srcdir)/src/drivers/mp_common/sink_common.h	\
+	$(top_srcdir)/src/drivers/mp_common/mp_common.h	\
+	$(top_srcdir)/src/drivers/mp_common/source_common.h	\
+	$(top_srcdir)/src/drivers/driver_common/driver_common.h	\
+	$(top_srcdir)/src/drivers/mic/driver_mic_sink.h	\
+	$(top_srcdir)/src/drivers/mic/driver_mic_source.h	\
+	$(top_srcdir)/src/drivers/mic/driver_mic_common.h	\
+	$(top_srcdir)/src/profiling/profiling.h	\
+	$(top_srcdir)/src/profiling/bound.h	\
+	$(top_srcdir)/src/util/starpu_data_cpy.h	\
+	$(top_srcdir)/src/util/openmp_runtime_support.h	\
+	$(top_srcdir)/src/util/starpu_clusters_create.h	\
+	$(top_srcdir)/src/util/starpu_task_insert_utils.h	\
+	$(top_srcdir)/src/common/graph.h	\
+	$(top_srcdir)/src/common/fxt.h	\
+	$(top_srcdir)/src/common/starpu_spinlock.h	\
+	$(top_srcdir)/src/common/rbtree_i.h	\
+	$(top_srcdir)/src/common/rbtree.h	\
+	$(top_srcdir)/src/common/timing.h	\
+	$(top_srcdir)/src/common/rwlock.h	\
+	$(top_srcdir)/src/common/barrier.h	\
+	$(top_srcdir)/src/common/prio_list.h	\
+	$(top_srcdir)/src/common/barrier_counter.h	\
+	$(top_srcdir)/src/common/uthash.h	\
+	$(top_srcdir)/src/common/knobs.h	\
+	$(top_srcdir)/src/common/utils.h	\
+	$(top_srcdir)/src/common/thread.h	\
+	$(top_srcdir)/src/common/list.h	\
+	$(top_srcdir)/src/debug/starpu_debug_helpers.h	\
+	$(top_srcdir)/src/debug/traces/starpu_fxt.h	\
+	$(top_srcdir)/src/starpu_parameters.h	\
+	$(top_srcdir)/src/sched_policies/fifo_queues.h	\
+	$(top_srcdir)/src/sched_policies/helper_mct.h	\
+	$(top_srcdir)/src/sched_policies/sched_component.h	\
+	$(top_srcdir)/src/sched_policies/prio_deque.h	\
+	$(top_srcdir)/src/core/jobs.h	\
+	$(top_srcdir)/src/core/disk_ops/unistd/disk_unistd_global.h	\
+	$(top_srcdir)/src/core/dependencies/tags.h	\
+	$(top_srcdir)/src/core/dependencies/data_concurrency.h	\
+	$(top_srcdir)/src/core/dependencies/implicit_data_deps.h	\
+	$(top_srcdir)/src/core/dependencies/cg.h	\
+	$(top_srcdir)/src/core/idle_hook.h	\
+	$(top_srcdir)/src/core/sched_ctx_list.h	\
+	$(top_srcdir)/src/core/perfmodel/multiple_regression.h	\
+	$(top_srcdir)/src/core/perfmodel/perfmodel.h	\
+	$(top_srcdir)/src/core/perfmodel/regression.h	\
+	$(top_srcdir)/src/core/debug.h	\
+	$(top_srcdir)/src/core/sched_ctx.h	\
+	$(top_srcdir)/src/core/simgrid.h	\
+	$(top_srcdir)/src/core/task_bundle.h	\
+	$(top_srcdir)/src/core/topology.h	\
+	$(top_srcdir)/src/core/combined_workers.h	\
+	$(top_srcdir)/src/core/detect_combined_workers.h	\
+	$(top_srcdir)/src/core/task.h	\
+	$(top_srcdir)/src/core/disk.h	\
+	$(top_srcdir)/src/core/sched_policy.h	\
+	$(top_srcdir)/src/core/errorcheck.h	\
+	$(top_srcdir)/src/core/progress_hook.h	\
+	$(top_srcdir)/src/core/drivers.h	\
 	$(top_srcdir)/src/core/workers.h
 
 $(DOX_HTML_DIR): $(DOX_TAG) refman.tex
@@ -127,8 +211,6 @@ $(DOX_PDF): $(DOX_TAG) refman.tex
 	@echo $(PDFLATEX) $(DOX_LATEX_DIR)/refman.tex
 	@cd $(DOX_LATEX_DIR) ;\
 	rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out ;\
-	$(SED) -i -e 's/__env__/\\_Environment Variables!/' -e 's/\\-\\_\\-\\-\\_\\-env\\-\\_\\-\\-\\_\\-//' ExecutionConfigurationThroughEnvironmentVariables.tex ;\
-	$(SED) -i -e 's/__configure__/\\_Configure Options!/' -e 's/\\-\\_\\-\\-\\_\\-configure\\-\\_\\-\\-\\_\\-//' CompilationConfiguration.tex ;\
 	$(SED) -i s'/\\item Module\\-Documentation/\\item \\hyperlink{ModuleDocumentation}{Module Documentation}/' index.tex ;\
 	$(SED) -i s'/\\item File\\-Documentation/\\item \\hyperlink{FileDocumentation}{File Documentation}/' index.tex ;\
 	max_print_line=1000000 $(PDFLATEX) -interaction batchmode refman.tex ;\
@@ -146,7 +228,7 @@ $(DOX_PDF): $(DOX_TAG) refman.tex
 	done
 	mv $(DOX_LATEX_DIR)/refman.pdf $(DOX_PDF)
 
-CLEANFILES = $(DOX_TAG) starpu_config.h \
+CLEANFILES = $(DOX_TAG) config.h \
     -r \
     $(DOX_HTML_DIR) \
     $(DOX_LATEX_DIR) \

+ 85 - 2
doc/doxygen_dev/doxygen-config.cfg.in

@@ -15,7 +15,90 @@
 # See the GNU Lesser General Public License in COPYING.LGPL for more details.
 #
 INPUT                  = @top_srcdir@/doc/doxygen_dev/chapters         \
-                         @top_builddir@/doc/doxygen_dev/starpu_config.h \
+                         @top_builddir@/doc/doxygen_dev/config.h \
+			 @top_srcdir@/src/datawizard/data_request.h \
+			 @top_srcdir@/src/datawizard/coherency.h \
+			 @top_srcdir@/src/datawizard/sort_data_handles.h \
+			 @top_srcdir@/src/datawizard/memalloc.h \
+			 @top_srcdir@/src/datawizard/copy_driver.h \
+			 @top_srcdir@/src/datawizard/filters.h \
+			 @top_srcdir@/src/datawizard/datastats.h \
+			 @top_srcdir@/src/datawizard/write_back.h \
+			 @top_srcdir@/src/datawizard/interfaces/data_interface.h \
+			 @top_srcdir@/src/datawizard/memory_manager.h \
+			 @top_srcdir@/src/datawizard/node_ops.h \
+			 @top_srcdir@/src/datawizard/memstats.h \
+			 @top_srcdir@/src/datawizard/datawizard.h \
+			 @top_srcdir@/src/datawizard/memory_nodes.h \
+			 @top_srcdir@/src/datawizard/footprint.h \
+			 @top_srcdir@/src/datawizard/malloc.h \
+			 @top_srcdir@/src/drivers/cpu/driver_cpu.h \
+			 @top_srcdir@/src/drivers/cuda/driver_cuda.h \
+			 @top_srcdir@/src/drivers/opencl/driver_opencl_utils.h \
+			 @top_srcdir@/src/drivers/opencl/driver_opencl.h \
+			 @top_srcdir@/src/drivers/disk/driver_disk.h \
+			 @top_srcdir@/src/drivers/mpi/driver_mpi_common.h \
+			 @top_srcdir@/src/drivers/mpi/driver_mpi_sink.h \
+			 @top_srcdir@/src/drivers/mpi/driver_mpi_source.h \
+			 @top_srcdir@/src/drivers/mp_common/sink_common.h \
+			 @top_srcdir@/src/drivers/mp_common/mp_common.h \
+			 @top_srcdir@/src/drivers/mp_common/source_common.h \
+			 @top_srcdir@/src/drivers/driver_common/driver_common.h \
+			 @top_srcdir@/src/drivers/mic/driver_mic_sink.h \
+			 @top_srcdir@/src/drivers/mic/driver_mic_source.h \
+			 @top_srcdir@/src/drivers/mic/driver_mic_common.h \
+			 @top_srcdir@/src/profiling/profiling.h \
+			 @top_srcdir@/src/profiling/bound.h \
+			 @top_srcdir@/src/util/starpu_data_cpy.h \
+			 @top_srcdir@/src/util/openmp_runtime_support.h \
+			 @top_srcdir@/src/util/starpu_clusters_create.h \
+			 @top_srcdir@/src/util/starpu_task_insert_utils.h \
+			 @top_srcdir@/src/common/graph.h \
+			 @top_srcdir@/src/common/fxt.h \
+			 @top_srcdir@/src/common/starpu_spinlock.h \
+			 @top_srcdir@/src/common/rbtree_i.h \
+			 @top_srcdir@/src/common/rbtree.h \
+			 @top_srcdir@/src/common/timing.h \
+			 @top_srcdir@/src/common/rwlock.h \
+			 @top_srcdir@/src/common/barrier.h \
+			 @top_srcdir@/src/common/prio_list.h \
+			 @top_srcdir@/src/common/barrier_counter.h \
+			 @top_srcdir@/src/common/uthash.h \
+			 @top_srcdir@/src/common/knobs.h \
+			 @top_srcdir@/src/common/utils.h \
+			 @top_srcdir@/src/common/thread.h \
+			 @top_srcdir@/src/common/list.h \
+			 @top_srcdir@/src/debug/starpu_debug_helpers.h \
+			 @top_srcdir@/src/debug/traces/starpu_fxt.h \
+			 @top_srcdir@/src/starpu_parameters.h \
+			 @top_srcdir@/src/sched_policies/fifo_queues.h \
+			 @top_srcdir@/src/sched_policies/helper_mct.h \
+			 @top_srcdir@/src/sched_policies/sched_component.h \
+			 @top_srcdir@/src/sched_policies/prio_deque.h \
+			 @top_srcdir@/src/core/jobs.h \
+			 @top_srcdir@/src/core/disk_ops/unistd/disk_unistd_global.h \
+			 @top_srcdir@/src/core/dependencies/tags.h \
+			 @top_srcdir@/src/core/dependencies/data_concurrency.h \
+			 @top_srcdir@/src/core/dependencies/implicit_data_deps.h \
+			 @top_srcdir@/src/core/dependencies/cg.h \
+			 @top_srcdir@/src/core/idle_hook.h \
+			 @top_srcdir@/src/core/sched_ctx_list.h \
+			 @top_srcdir@/src/core/perfmodel/multiple_regression.h \
+			 @top_srcdir@/src/core/perfmodel/perfmodel.h \
+			 @top_srcdir@/src/core/perfmodel/regression.h \
+			 @top_srcdir@/src/core/debug.h \
+			 @top_srcdir@/src/core/sched_ctx.h \
+			 @top_srcdir@/src/core/simgrid.h \
+			 @top_srcdir@/src/core/task_bundle.h \
+			 @top_srcdir@/src/core/topology.h \
+			 @top_srcdir@/src/core/combined_workers.h \
+			 @top_srcdir@/src/core/detect_combined_workers.h \
+			 @top_srcdir@/src/core/task.h \
+			 @top_srcdir@/src/core/disk.h \
+			 @top_srcdir@/src/core/sched_policy.h \
+			 @top_srcdir@/src/core/errorcheck.h \
+			 @top_srcdir@/src/core/progress_hook.h \
+			 @top_srcdir@/src/core/drivers.h \
 			 @top_srcdir@/src/core/workers.h
 
 EXAMPLE_PATH           = @top_srcdir@/doc/doxygen_dev \
@@ -23,7 +106,7 @@ EXAMPLE_PATH           = @top_srcdir@/doc/doxygen_dev \
 
 INPUT_FILTER           = @top_builddir@/doc/doxygen_dev/doxygen_filter.sh
 
-#LATEX_HEADER           = @top_srcdir@/doc/doxygen/refman.tex
+#LATEX_HEADER           = @top_srcdir@/doc/doxygen_dev/refman.tex
 
 #IMAGE_PATH             = @top_srcdir@/doc/doxygen_dev/chapters/images
 

+ 86 - 2
doc/doxygen_dev/refman.tex

@@ -73,7 +73,91 @@ Documentation License”.
 
 \input{group__workers}
 
-\chapter{Index}
-\printindex
+\chapter{File Index}
+\input{files}
+
+\chapter{File Documentation}
+\input{barrier_8h}
+\input{barrier__counter_8h}
+\input{bound_8h}
+\input{cg_8h}
+\input{coherency_8h}
+\input{combined__workers_8h}
+\input{config_8h}
+\input{copy__driver_8h}
+\input{data__concurrency_8h}
+\input{data__interface_8h}
+\input{data__request_8h}
+\input{datastats_8h}
+\input{datawizard_8h}
+\input{debug_8h}
+\input{detect__combined__workers_8h}
+\input{disk_8h}
+\input{disk__unistd__global_8h}
+\input{driver__common_8h}
+\input{driver__cpu_8h}
+\input{driver__cuda_8h}
+\input{driver__disk_8h}
+\input{driver__mic__common_8h}
+\input{driver__mic__sink_8h}
+\input{driver__mic__source_8h}
+\input{driver__mpi__common_8h}
+\input{driver__mpi__sink_8h}
+\input{driver__mpi__source_8h}
+\input{driver__opencl_8h}
+\input{driver__opencl__utils_8h}
+\input{drivers_8h}
+\input{errorcheck_8h}
+\input{fifo__queues_8h}
+\input{filters_8h}
+\input{footprint_8h}
+\input{fxt_8h}
+\input{graph_8h}
+\input{helper__mct_8h}
+\input{idle__hook_8h}
+\input{implicit__data__deps_8h}
+\input{jobs_8h}
+\input{knobs_8h}
+\input{malloc_8h}
+\input{memalloc_8h}
+\input{memory__manager_8h}
+\input{memory__nodes_8h}
+\input{memstats_8h}
+\input{mp__common_8h}
+\input{multiple__regression_8h}
+\input{node__ops_8h}
+\input{openmp__runtime__support_8h}
+\input{perfmodel_8h}
+\input{prio__deque_8h}
+\input{prio__list_8h}
+\input{profiling_8h}
+\input{progress__hook_8h}
+\input{rbtree_8h}
+\input{rbtree__i_8h}
+\input{regression_8h}
+\input{rwlock_8h}
+\input{sched__component_8h}
+\input{sched__ctx_8h}
+\input{sched__ctx__list_8h}
+\input{sched__policy_8h}
+\input{simgrid_8h}
+\input{sink__common_8h}
+\input{sort__data__handles_8h}
+\input{source__common_8h}
+\input{starpu__clusters__create_8h}
+\input{starpu__data__cpy_8h}
+\input{starpu__debug__helpers_8h}
+\input{starpu__fxt_8h}
+\input{starpu__parameters_8h}
+\input{starpu__spinlock_8h}
+\input{starpu__task__insert__utils_8h}
+\input{tags_8h}
+\input{task_8h}
+\input{task__bundle_8h}
+\input{thread_8h}
+\input{timing_8h}
+\input{topology_8h}
+\input{utils_8h}
+\input{write__back_8h}
 
 \end{document}

+ 3 - 3
examples/Makefile.am

@@ -75,7 +75,7 @@ EXTRA_DIST = 					\
 	lu/lu.sh
 
 
-CLEANFILES = *.gcno *.gcda *.linkinfo *.mod starpu_idle_microsec.log *.mps *.dot *.pl *.png *.output tasks.rec perfs.rec perfs2.rec
+CLEANFILES = *.gcno *.gcda *.linkinfo *.mod starpu_idle_microsec.log *.mps *.dot *.pl *.png *.output tasks.rec perfs.rec perfs2.rec fortran90/starpu_mod.f90 native_fortran/fstarpu_mod.f90
 
 if STARPU_USE_CUDA
 
@@ -1124,10 +1124,10 @@ endif
 # - link over source file to build our own object
 fortran90/starpu_mod.f90:
 	@$(MKDIR_P) $(dir $@)
-	$(LN_S) $(srcdir)/../../include/$(notdir $@) $@
+	$(LN_S) $(abs_top_srcdir)/include/$(notdir $@) $@
 native_fortran/fstarpu_mod.f90:
 	@$(MKDIR_P) $(dir $@)
-	$(LN_S) $(srcdir)/../../include/$(notdir $@) $@
+	$(LN_S) $(abs_top_srcdir)/include/$(notdir $@) $@
 
 if STARPU_HAVE_FC
 # Fortran90 example

+ 0 - 3
examples/Makefile.am.tmp.y

@@ -1,3 +0,0 @@
-# Copyright (C) 2016       Uppsala University
-# Copyright (C) 2011       Télécom-SudParis
-# Copyright (C) 2017       Erwan Leria

+ 64 - 58
examples/cpp/add_vectors_interface.cpp

@@ -450,7 +450,7 @@ static int pack_vector_cpp_handle(starpu_data_handle_t handle, unsigned node, vo
 
 	if (ptr != NULL)
 	{
-		starpu_malloc_flags(ptr, *count, 0);
+		*ptr = (void*) starpu_malloc_on_node_flags(node, *count, 0);
 		memcpy(*ptr, (void*)vector_interface->ptr, vector_interface->elemsize*vector_interface->nx);
 	}
 
@@ -467,6 +467,8 @@ static int unpack_vector_cpp_handle(starpu_data_handle_t handle, unsigned node,
 	STARPU_ASSERT(count == vector_interface->elemsize * vector_interface->nx);
 	memcpy((void*)vector_interface->ptr, ptr, count);
 
+	starpu_free_on_node_flags(node, (uintptr_t)ptr, count, 0);
+
 	return 0;
 }
 
@@ -574,6 +576,8 @@ void cpu_kernel_add_vectors(void *buffers[], void *cl_arg)
 int main(int argc, char **argv)
 {
 	struct starpu_conf conf;
+	bool fail;
+
 	starpu_conf_init(&conf);
 	conf.nmic = 0;
 	conf.nmpi_ms = 0;
@@ -584,74 +588,76 @@ int main(int argc, char **argv)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	/* Test data transfers between NUMA nodes if available */
-	unsigned last_numa_node = starpu_memory_nodes_get_numa_count() - 1;
-
-	const my_allocator<char> allocator_main_ram(STARPU_MAIN_RAM);
-	const my_allocator<char> allocator_last_numa(last_numa_node);
-	std::vector<MY_TYPE> vec_A(VEC_SIZE, 2, allocator_main_ram); // all the vector is initialized to 2
-	std::vector<MY_TYPE> vec_B(VEC_SIZE, 3, allocator_main_ram); // all the vector is initialized to 3
-	std::vector<MY_TYPE> vec_C(VEC_SIZE, 0, allocator_last_numa); // all the vector is initialized to 0
-
-	// StarPU data registering
-	starpu_data_handle_t spu_vec_A;
-	starpu_data_handle_t spu_vec_B;
-	starpu_data_handle_t spu_vec_C;
-
-	// give the data of the vector to StarPU (C array)
-	vector_cpp_data_register(&spu_vec_A, STARPU_MAIN_RAM, &vec_A, vec_A.size(), sizeof(char));
-	vector_cpp_data_register(&spu_vec_B, STARPU_MAIN_RAM, &vec_B, vec_B.size(), sizeof(char));
-	vector_cpp_data_register(&spu_vec_C, last_numa_node, &vec_C, vec_C.size(), sizeof(char));
-
-	// create the StarPU codelet
-	starpu_codelet cl;
-	starpu_codelet_init(&cl);
-	cl.cpu_funcs     [0] = cpu_kernel_add_vectors;
-	cl.cpu_funcs_name[0] = "cpu_kernel_add_vectors";
-	cl.nbuffers          = 3;
-	cl.modes         [0] = STARPU_R;
-	cl.modes         [1] = STARPU_R;
-	cl.modes         [2] = STARPU_W;
-	cl.name              = "add_vectors";
-
-	// submit a new StarPU task to execute
-	ret = starpu_task_insert(&cl,
-	                         STARPU_R, spu_vec_A,
-	                         STARPU_R, spu_vec_B,
-	                         STARPU_W, spu_vec_C,
-	                         0);
-	if (ret == -ENODEV)
 	{
+		/* Test data transfers between NUMA nodes if available */
+		unsigned last_numa_node = starpu_memory_nodes_get_numa_count() - 1;
+
+		const my_allocator<char> allocator_main_ram(STARPU_MAIN_RAM);
+		const my_allocator<char> allocator_last_numa(last_numa_node);
+		std::vector<MY_TYPE> vec_A(VEC_SIZE, 2, allocator_main_ram); // all the vector is initialized to 2
+		std::vector<MY_TYPE> vec_B(VEC_SIZE, 3, allocator_main_ram); // all the vector is initialized to 3
+		std::vector<MY_TYPE> vec_C(VEC_SIZE, 0, allocator_last_numa); // all the vector is initialized to 0
+
+		// StarPU data registering
+		starpu_data_handle_t spu_vec_A;
+		starpu_data_handle_t spu_vec_B;
+		starpu_data_handle_t spu_vec_C;
+
+		// give the data of the vector to StarPU (C array)
+		vector_cpp_data_register(&spu_vec_A, STARPU_MAIN_RAM, &vec_A, vec_A.size(), sizeof(char));
+		vector_cpp_data_register(&spu_vec_B, STARPU_MAIN_RAM, &vec_B, vec_B.size(), sizeof(char));
+		vector_cpp_data_register(&spu_vec_C, last_numa_node, &vec_C, vec_C.size(), sizeof(char));
+
+		// create the StarPU codelet
+		starpu_codelet cl;
+		starpu_codelet_init(&cl);
+		cl.cpu_funcs     [0] = cpu_kernel_add_vectors;
+		cl.cpu_funcs_name[0] = "cpu_kernel_add_vectors";
+		cl.nbuffers          = 3;
+		cl.modes         [0] = STARPU_R;
+		cl.modes         [1] = STARPU_R;
+		cl.modes         [2] = STARPU_W;
+		cl.name              = "add_vectors";
+
+		// submit a new StarPU task to execute
+		ret = starpu_task_insert(&cl,
+					 STARPU_R, spu_vec_A,
+					 STARPU_R, spu_vec_B,
+					 STARPU_W, spu_vec_C,
+					 0);
+		if (ret == -ENODEV)
+		{
+			// StarPU data unregistering
+			starpu_data_unregister(spu_vec_C);
+			starpu_data_unregister(spu_vec_B);
+			starpu_data_unregister(spu_vec_A);
+
+			// terminate StarPU, no task can be submitted after
+			starpu_shutdown();
+
+			return 77;
+		}
+
+		STARPU_CHECK_RETURN_VALUE(ret, "task_submit::add_vectors");
+
+		// wait for the task
+		starpu_task_wait_for_all();
+
 		// StarPU data unregistering
 		starpu_data_unregister(spu_vec_C);
 		starpu_data_unregister(spu_vec_B);
 		starpu_data_unregister(spu_vec_A);
 
-		// terminate StarPU, no task can be submitted after
-		starpu_shutdown();
-
-		return 77;
+		// check results
+		fail = false;
+		int i = 0;
+		while (!fail && i < VEC_SIZE)
+			fail = vec_C[i++] != 5;
 	}
 
-	STARPU_CHECK_RETURN_VALUE(ret, "task_submit::add_vectors");
-
-	// wait the task
-	starpu_task_wait_for_all();
-
-	// StarPU data unregistering
-	starpu_data_unregister(spu_vec_C);
-	starpu_data_unregister(spu_vec_B);
-	starpu_data_unregister(spu_vec_A);
-
 	// terminate StarPU, no task can be submitted after
 	starpu_shutdown();
 
-	// check results
-	bool fail = false;
-	int i = 0;
-	while (!fail && i < VEC_SIZE)
-		fail = vec_C[i++] != 5;
-
 	if (fail)
 	{
 #ifdef PRINT_OUTPUT

+ 2 - 2
examples/interface/complex_interface.c

@@ -138,7 +138,7 @@ static int complex_pack_data(starpu_data_handle_t handle, unsigned node, void **
 	if (ptr != NULL)
 	{
 		char *data;
-		starpu_malloc_flags((void**) &data, *count, 0);
+		data = (void*) starpu_malloc_on_node_flags(node, *count, 0);
 		*ptr = data;
 		memcpy(data, complex_interface->real, complex_interface->nx*sizeof(double));
 		memcpy(data+complex_interface->nx*sizeof(double), complex_interface->imaginary, complex_interface->nx*sizeof(double));
@@ -159,7 +159,7 @@ static int complex_unpack_data(starpu_data_handle_t handle, unsigned node, void
 	memcpy(complex_interface->real, data, complex_interface->nx*sizeof(double));
 	memcpy(complex_interface->imaginary, data+complex_interface->nx*sizeof(double), complex_interface->nx*sizeof(double));
 
-	starpu_free_flags(ptr, count, 0);
+	starpu_free_on_node_flags(node, (uintptr_t) ptr, count, 0);
 
 	return 0;
 }

+ 2 - 2
examples/mult/sgemm.sh

@@ -67,8 +67,8 @@ then
 	$PREFIX/../../tools/starpu_codelet_histo_profile distrib.data || true
 	[ -f distrib.data.starpu_sgemm_gemm.0.492beed5.33177600.pdf ] || true
 
-	if [ -x $PREFIX/../../tools/starpu_replay ]; then
-		$STARPU_LAUNCH $PREFIX/../../tools/starpu_replay tasks.rec
+	if [ -x $PREFIX/../../tools/replay/starpu_replay ]; then
+		$STARPU_LAUNCH $PREFIX/../../tools/replay/starpu_replay tasks.rec
 	fi
 
 	[ ! -x $PREFIX/../../tools/starpu_perfmodel_recdump ] || $STARPU_LAUNCH $PREFIX/../../tools/starpu_perfmodel_recdump tasks.rec -o perfs2.rec

+ 5 - 4
examples/scheduler/schedulers.sh

@@ -22,9 +22,10 @@ check_success()
     fi
 }
 
-if test ! -x ./cholesky/cholesky_tag
+basedir=$(dirname $0)
+if test ! -x $basedir/../cholesky/cholesky_tag
 then
-    echo "Application ./cholesky/cholesky_tag unavailable"
+    echo "Application $basedir/../cholesky/cholesky_tag unavailable"
     exit 77
 fi
 
@@ -32,12 +33,12 @@ if [ -n "$STARPU_SCHED" ]
 then
 	SCHEDULERS=$STARPU_SCHED
 else
-	SCHEDULERS=`../tools/starpu_sched_display | grep -v heteroprio`
+	SCHEDULERS=`$basedir/../../tools/starpu_sched_display | grep -v heteroprio`
 fi
 
 for sched in $SCHEDULERS
 do
     echo "cholesky.$sched"
-    STARPU_SCHED=$sched $STARPU_LAUNCH ./cholesky/cholesky_tag -size $((960*3)) -nblocks 3
+    STARPU_SCHED=$sched $STARPU_LAUNCH $basedir/../cholesky/cholesky_tag -size $((960*3)) -nblocks 3
     check_success $?
 done

+ 5 - 4
examples/scheduler/schedulers_context.sh

@@ -22,9 +22,10 @@ check_success()
     fi
 }
 
-if test ! -x ./sched_ctx/sched_ctx
+basedir=$(dirname $0)
+if test ! -x $basedir/../sched_ctx/sched_ctx
 then
-    echo "Application ./sched_ctx/sched_ctx unavailable"
+    echo "Application $basedir/../sched_ctx/sched_ctx unavailable"
     exit 77
 fi
 
@@ -32,12 +33,12 @@ if [ -n "$STARPU_SCHED" ]
 then
 	SCHEDULERS="$STARPU_SCHED"
 else
-	SCHEDULERS=`../tools/starpu_sched_display | grep -v pheft | grep -v peager | grep -v heteroprio | grep -v modular-gemm`
+	SCHEDULERS=`$basedir/../../tools/starpu_sched_display | grep -v pheft | grep -v peager | grep -v heteroprio | grep -v modular-gemm`
 fi
 
 for sched in $SCHEDULERS
 do
     echo "sched_ctx.$sched"
-    STARPU_SCHED=$sched $STARPU_LAUNCH ./sched_ctx/sched_ctx
+    STARPU_SCHED=$sched $STARPU_LAUNCH $basedir/../sched_ctx/sched_ctx
     check_success $?
 done

+ 17 - 2
examples/stencil/implicit-stencil.c

@@ -16,6 +16,10 @@
 
 #include "implicit-stencil.h"
 
+#ifdef STARPU_HAVE_VALGRIND_H
+#include <valgrind/valgrind.h>
+#endif
+
 /* Main application */
 
 /* default parameter values */
@@ -26,18 +30,20 @@ static unsigned ticks = 1000;
 #ifdef STARPU_QUICK_CHECK
 static unsigned niter = 4;
 #define SIZE 16
+#define NBZ 8
 #else
 static unsigned niter = 32;
 #define SIZE 128
+#define NBZ 64
 #endif
 
 /* Problem size */
 static unsigned sizex = SIZE;
 static unsigned sizey = SIZE;
-static unsigned sizez = 64*SIZE;
+static unsigned sizez = NBZ*SIZE;
 
 /* Number of blocks (scattered over the different MPI processes) */
-unsigned nbz = 64;
+unsigned nbz = NBZ;
 
 double start;
 double begin, end;
@@ -120,6 +126,15 @@ static void parse_args(int argc, char **argv)
 			 exit(0);
 		}
 	}
+
+#ifdef STARPU_HAVE_VALGRIND_H
+	if (RUNNING_ON_VALGRIND)
+	{
+		sizex = sizey = 3;
+		nbz = 10;
+		sizez = nbz*3;
+	}
+#endif
 }
 
 static void init_problem(int argc, char **argv, int rank, int world_size)

+ 17 - 2
examples/stencil/stencil.c

@@ -16,6 +16,10 @@
 
 #include "stencil.h"
 
+#ifdef STARPU_HAVE_VALGRIND_H
+#include <valgrind/valgrind.h>
+#endif
+
 /* Main application */
 
 /* default parameter values */
@@ -26,18 +30,20 @@ static unsigned ticks = 1000;
 #ifdef STARPU_QUICK_CHECK
 static unsigned niter = 4;
 #define SIZE 16
+#define NBZ 8
 #else
 static unsigned niter = 32;
 #define SIZE 128
+#define NBZ 64
 #endif
 
 /* Problem size */
 static unsigned sizex = SIZE;
 static unsigned sizey = SIZE;
-static unsigned sizez = 64*SIZE;
+static unsigned sizez = NBZ*SIZE;
 
 /* Number of blocks (scattered over the different MPI processes) */
-unsigned nbz = 64;
+unsigned nbz = NBZ;
 
 /*
  *	Initialization
@@ -116,6 +122,15 @@ static void parse_args(int argc, char **argv)
 			 exit(0);
 		}
 	}
+
+#ifdef STARPU_HAVE_VALGRIND_H
+	if (RUNNING_ON_VALGRIND)
+	{
+		sizex = sizey = 3;
+		nbz = 10;
+		sizez = nbz*3;
+	}
+#endif
 }
 
 static void init_problem(int argc, char **argv, int rank, int world_size)

+ 52 - 0
include/starpu_clusters.h

@@ -34,21 +34,73 @@ extern "C"
    @{
  */
 
+/**
+   Used when calling starpu_cluster_machine
+ */
 #define STARPU_CLUSTER_MIN_NB			(1<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_cluster_machine
+ */
 #define STARPU_CLUSTER_MAX_NB			(2<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_cluster_machine
+ */
 #define STARPU_CLUSTER_NB			(3<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_cluster_machine
+ */
 #define STARPU_CLUSTER_PREFERE_MIN		(4<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_cluster_machine
+ */
 #define STARPU_CLUSTER_KEEP_HOMOGENEOUS		(5<<STARPU_MODE_SHIFT)
 
+/**
+   Used when calling starpu_cluster_machine
+ */
 #define STARPU_CLUSTER_POLICY_NAME		(6<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_cluster_machine
+ */
 #define STARPU_CLUSTER_POLICY_STRUCT		(7<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_cluster_machine
+ */
 #define STARPU_CLUSTER_CREATE_FUNC		(8<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_cluster_machine
+ */
 #define STARPU_CLUSTER_CREATE_FUNC_ARG		(9<<STARPU_MODE_SHIFT)
+/**
+   Used when calling starpu_cluster_machine
+ */
 #define STARPU_CLUSTER_TYPE			(10<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_cluster_machine
+ */
 #define STARPU_CLUSTER_AWAKE_WORKERS		(11<<STARPU_MODE_SHIFT)
 
+/**
+   Used when calling starpu_cluster_machine
+ */
 #define STARPU_CLUSTER_PARTITION_ONE		(12<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_cluster_machine
+ */
 #define STARPU_CLUSTER_NEW			(13<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_cluster_machine
+ */
 #define STARPU_CLUSTER_NCORES			(14<<STARPU_MODE_SHIFT)
 
 /**

+ 128 - 3
include/starpu_config.h.in

@@ -25,17 +25,68 @@
 #ifndef __STARPU_CONFIG_PUBLIC_H__
 #define __STARPU_CONFIG_PUBLIC_H__
 
+/**
+   Define the major version of StarPU. This is the version used when
+   compiling the application.
+   @ingroup API_Versioning
+*/
 #undef STARPU_MAJOR_VERSION
+
+/**
+   @ingroup API_Versioning
+   Define the minor version of StarPU. This is the version used when
+   compiling the application.
+*/
 #undef STARPU_MINOR_VERSION
+
+/**
+   Define the release version of StarPU. This is the version used when
+   compiling the application.
+   @ingroup API_Versioning
+*/
 #undef STARPU_RELEASE_VERSION
 
 #undef STARPU_USE_CPU
+
+/**
+   Defined when StarPU has been installed with
+   CUDA support. It should be used in your code to detect the
+   availability of CUDA.
+   @ingroup API_CUDA_Extensions
+*/
 #undef STARPU_USE_CUDA
+
+/**
+   Defined when StarPU has been installed with OpenCL support. It
+   should be used in your code to detect the availability of OpenCL as
+   shown in \ref FullSourceCodeVectorScal.
+   @ingroup API_OpenCL_Extensions
+*/
 #undef STARPU_USE_OPENCL
+
+/**
+   Defined when StarPU has been installed with MIC support. It should
+   be used in your code to detect the availability of MIC.
+   @ingroup API_MIC_Extensions
+*/
 #undef STARPU_USE_MIC
+
+/**
+   Defined when StarPU has been installed with MPI Master Slave
+   support. It should be used in your code to detect the availability
+   of MPI Master Slave.
+   @ingroup API_MPI_Support
+*/
 #undef STARPU_USE_MPI_MASTER_SLAVE
 
+/**
+   Defined when StarPU has been installed with OpenMP Runtime support.
+   It should be used in your code to detect the availability of the
+   runtime support for OpenMP.
+   @ingroup API_OpenMP_Runtime_Support
+*/
 #undef STARPU_OPENMP
+
 #undef STARPU_CLUSTER
 
 #undef STARPU_SIMGRID
@@ -57,9 +108,13 @@
 /* workers must call callbacks on sleep/wake-up */
 #undef STARPU_WORKER_CALLBACKS
 
-
 #undef STARPU_HAVE_ICC
 
+/**
+   Defined when StarPU has been installed with MPI support. It should
+   be used in your code to detect the availability of MPI.
+   @ingroup API_MPI_Support
+*/
 #undef STARPU_USE_MPI
 #undef STARPU_USE_MPI_MPI
 #undef STARPU_USE_MPI_NMAD
@@ -72,6 +127,11 @@
 #undef STARPU_SYSTEM_BLAS
 #undef STARPU_HAVE_CBLAS_H
 
+/**
+   Define the directory in which the OpenCL codelets of the
+   applications provided with StarPU have been installed.
+   @ingroup API_OpenCL_Extensions
+*/
 #undef STARPU_OPENCL_DATADIR
 #undef STARPU_HAVE_MAGMA
 
@@ -107,19 +167,84 @@
 
 #undef STARPU_HAVE_CURAND
 
+/**
+   Define the maximum number of memory nodes managed by StarPU. The
+   default value can be modified at configure by using the option \ref
+   enable-maxnodes "--enable-maxnodes". Reducing it can
+   considerably reduce the memory used by StarPU data structures.
+   @ingroup API_Workers_Properties
+*/
 #undef STARPU_MAXNODES
+
+/**
+   Define the maximum number of buffers that tasks will be able to
+   take as parameters. The default value is 8, it can be changed by
+   using the configure option \ref enable-maxbuffers
+   "--enable-maxbuffers".
+   @ingroup API_Codelet_And_Tasks
+*/
 #undef STARPU_NMAXBUFS
+
+/**
+   Define the maximum number of CPU workers managed by StarPU. The
+   default value can be modified at configure by using the option \ref
+   enable-maxcpus "--enable-maxcpus".
+   @ingroup API_Workers_Properties
+*/
 #undef STARPU_MAXCPUS
+
+/**
+   Define the maximum number of NUMA nodes managed by StarPU. The
+   default value can be modified at configure by using the option \ref
+   enable-maxnumanodes "--enable-maxnumanodes".
+   @ingroup API_Workers_Properties
+*/
 #undef STARPU_MAXNUMANODES
+
+/**
+ * Define the maximum number of CUDA devices that are supported by StarPU.
+ * @ingroup API_CUDA_Extensions
+ */
 #undef STARPU_MAXCUDADEVS
-// BEGIN FPGA
+
 #undef STARPU_MAXFPGADEVS
-// END FPGA
+
+/**
+   Define the maximum number of OpenCL devices that are supported by
+   StarPU.
+   @ingroup API_OpenCL_Extensions
+*/
 #undef STARPU_MAXOPENCLDEVS
+
+/**
+   Define the maximum number of MIC devices that are supported by
+   StarPU.
+   @ingroup API_MIC_Extensions
+*/
 #undef STARPU_MAXMICDEVS
+
+/**
+   Define the maximum number of workers managed by StarPU.
+   @ingroup API_Workers_Properties
+*/
 #undef STARPU_NMAXWORKERS
+
+/**
+   Define the maximum number of scheduling contexts managed by StarPU.
+   The default value can be modified at configure by using the option
+   \ref enable-max-sched-ctxs "--enable-max-sched-ctxs".
+   @ingroup API_Scheduling_Policy
+*/
 #undef STARPU_NMAX_SCHED_CTXS
+
+/**
+   Define the maximum number of implementations per architecture. The
+   default value can be modified at configure by using the option \ref
+   enable-maximplementations "--enable-maximplementations".
+   @ingroup API_Scheduling_Policy
+*/
 #undef STARPU_MAXIMPLEMENTATIONS
+
 #undef STARPU_MAXMPKERNELS
 #undef STARPU_USE_SC_HYPERVISOR
 #undef STARPU_SC_HYPERVISOR_DEBUG

+ 3 - 3
include/starpu_openmp.h

@@ -27,7 +27,7 @@
 
 #if defined STARPU_OPENMP
 /**
-   Opaque Simple Lock object (\ref SimpleLock) for inter-task
+   Opaque Simple Lock object (\anchor SimpleLock) for inter-task
    synchronization operations.
    \sa starpu_omp_init_lock()
    \sa starpu_omp_destroy_lock()
@@ -38,7 +38,7 @@
 typedef struct { void *internal; /**< opaque pointer for internal use */ } starpu_omp_lock_t;
 
 /**
-   Opaque Nestable Lock object (\ref NestableLock) for inter-task
+   Opaque Nestable Lock object (\anchor NestableLock) for inter-task
    synchronization operations.
    \sa starpu_omp_init_nest_lock()
    \sa starpu_omp_destroy_nest_lock()
@@ -50,7 +50,7 @@ typedef struct { void *internal; /**< opaque pointer for internal use */  } star
 
 /**
    Set of constants for selecting the for loop iteration scheduling
-   algorithm (\ref OMPFor) as defined by the OpenMP specification.
+   algorithm (\anchor OMPFor) as defined by the OpenMP specification.
    \sa starpu_omp_for()
    \sa starpu_omp_for_inline_first()
    \sa starpu_omp_for_inline_next()

+ 12 - 0
include/starpu_sched_component.h

@@ -420,6 +420,7 @@ struct starpu_sched_component_fifo_data
 	unsigned ntasks_threshold;
 	double exp_len_threshold;
 	int ready;
+	int exp;
 };
 
 /**
@@ -446,6 +447,7 @@ struct starpu_sched_component_prio_data
 	unsigned ntasks_threshold;
 	double exp_len_threshold;
 	int ready;
+	int exp;
 };
 struct starpu_sched_component *starpu_sched_component_prio_create(struct starpu_sched_tree *tree, struct starpu_sched_component_prio_data *prio_data) STARPU_ATTRIBUTE_MALLOC;
 int starpu_sched_component_is_prio(struct starpu_sched_component *component);
@@ -727,6 +729,11 @@ struct starpu_sched_tree *starpu_sched_component_make_scheduler(unsigned sched_c
 #define STARPU_SCHED_SIMPLE_DECIDE_ARCHS	(3<<0)
 
 /**
+   Request to create the scheduling decision-making component even if there is only one available choice. This is useful for instance when the decision-making component will store tasks itself (and not use STARPU_SCHED_SIMPLE_FIFO_ABOVE) to decide in which order tasks should be passed below.
+*/
+#define STARPU_SCHED_SIMPLE_DECIDE_ALWAYS	(1<<3)
+
+/**
    Request to add a perfmodel selector above the scheduling decision-making component. That way, only tasks with a calibrated performance model will be given to the component, other tasks will go to an eager branch that will distributed tasks so that their performance models will get calibrated.
    In other words, this is needed when using a component which needs performance models for tasks.
 */
@@ -777,6 +784,11 @@ struct starpu_sched_tree *starpu_sched_component_make_scheduler(unsigned sched_c
 #define STARPU_SCHED_SIMPLE_COMBINED_WORKERS	(1<<12)
 
 /**
+   Request that the fifos below keep track of expected duration, start and end time of their elements
+*/
+#define STARPU_SCHED_SIMPLE_FIFOS_BELOW_EXP	(1<<13)
+
+/**
    Create a simple modular scheduler tree around a scheduling decision-making
    component \p component. The details of what should be built around \p component
    is described by \p flags. The different STARPU_SCHED_SIMPL_DECIDE_* flags are

+ 3 - 0
include/starpu_task_util.h

@@ -106,6 +106,9 @@ extern "C"
 */
 #define STARPU_TAG               (10<<STARPU_MODE_SHIFT)
 
+/**
+   Used when calling starpu_task_insert(), must be followed by a tag.
+*/
 #define STARPU_HYPERVISOR_TAG	 (11<<STARPU_MODE_SHIFT)
 
 /**

+ 5 - 1
include/starpu_thread.h

@@ -76,7 +76,9 @@ typedef sg_actor_t starpu_pthread_t;
 #else
 typedef msg_process_t starpu_pthread_t;
 #endif
-typedef int starpu_pthread_attr_t;
+typedef struct {
+	size_t stacksize;
+} starpu_pthread_attr_t;
 
 #ifdef STARPU_HAVE_SIMGRID_ACTOR_H
 typedef sg_host_t starpu_sg_host_t;
@@ -93,6 +95,7 @@ int starpu_pthread_exit(void *retval) STARPU_ATTRIBUTE_NORETURN;
 int starpu_pthread_attr_init(starpu_pthread_attr_t *attr);
 int starpu_pthread_attr_destroy(starpu_pthread_attr_t *attr);
 int starpu_pthread_attr_setdetachstate(starpu_pthread_attr_t *attr, int detachstate);
+int starpu_pthread_attr_setstacksize(starpu_pthread_attr_t *attr, size_t stacksize);
 
 #elif !defined(_MSC_VER) || defined(BUILDING_STARPU) /* STARPU_SIMGRID */
 
@@ -108,6 +111,7 @@ typedef pthread_attr_t starpu_pthread_attr_t;
 #define starpu_pthread_attr_init pthread_attr_init
 #define starpu_pthread_attr_destroy pthread_attr_destroy
 #define starpu_pthread_attr_setdetachstate pthread_attr_setdetachstate
+#define starpu_pthread_attr_setstacksize pthread_attr_setstacksize
 
 #endif /* STARPU_SIMGRID, _MSC_VER */
 

+ 10 - 4
mpi/examples/Makefile.am

@@ -59,7 +59,7 @@ starpu_mpi_EXAMPLES =
 
 BUILT_SOURCES =
 
-CLEANFILES = *.gcno *.gcda *.linkinfo starpu_idle_microsec.log *.mod
+CLEANFILES = *.gcno *.gcda *.linkinfo starpu_idle_microsec.log *.mod native_fortran/fstarpu_mod.f90 native_fortran/fstarpu_mpi_mod.f90
 
 EXTRA_DIST = 				\
 	mpi_lu/mpi_lu-float.h		\
@@ -342,14 +342,20 @@ endif
 
 if BUILD_EXAMPLES
 examplebin_PROGRAMS +=				\
-	user_datatype/user_datatype
+	user_datatype/user_datatype		\
+	user_datatype/user_datatype2
 
 user_datatype_user_datatype_SOURCES =		\
 	user_datatype/user_datatype.c		\
 	user_datatype/my_interface.c
 
+user_datatype_user_datatype2_SOURCES =		\
+	user_datatype/user_datatype2.c		\
+	user_datatype/my_interface.c
+
 if !STARPU_SIMGRID
 starpu_mpi_EXAMPLES	+=			\
+	user_datatype/user_datatype2		\
 	user_datatype/user_datatype
 endif
 endif
@@ -389,10 +395,10 @@ endif
 # - link over source file to build our own object
 native_fortran/fstarpu_mod.f90:
 	@$(MKDIR_P) $(dir $@)
-	$(LN_S) $(srcdir)/../../../include/$(notdir $@) $@
+	$(LN_S) $(abs_top_srcdir)/include/$(notdir $@) $@
 native_fortran/fstarpu_mpi_mod.f90:
 	@$(MKDIR_P) $(dir $@)
-	$(LN_S) $(srcdir)/../../../mpi/include/$(notdir $@) $@
+	$(LN_S) $(abs_top_srcdir)/mpi/include/$(notdir $@) $@
 
 if STARPU_HAVE_MPIFORT
 if BUILD_EXAMPLES

+ 1 - 1
mpi/examples/filters/filter.c

@@ -31,7 +31,7 @@ void cpu_func(void *buffers[], void *cl_arg)
 	int rank;
 
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
-	fprintf(stderr, "comuting on rank %d\n", rank);
+	fprintf(stderr, "computing on rank %d\n", rank);
         unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
         int *val = (int *)STARPU_VECTOR_GET_PTR(buffers[0]);
 	starpu_codelet_unpack_args(cl_arg, &factor);

+ 8 - 0
mpi/examples/helper.h

@@ -17,6 +17,14 @@
 #include <errno.h>
 #include <starpu_mpi.h>
 
+#ifdef STARPU_HAVE_VALGRIND_H
+#include <valgrind/valgrind.h>
+#endif
+
+#ifdef STARPU_HAVE_HELGRIND_H
+#include <valgrind/helgrind.h>
+#endif
+
 #define STARPU_TEST_SKIPPED 77
 
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)

+ 3 - 3
mpi/examples/matrix_decomposition/mpi_cholesky.c

@@ -55,9 +55,6 @@ int main(int argc, char **argv)
 
 	dw_cholesky(bmat, size/nblocks, rank, nodes, &timing, &flops);
 
-	starpu_cublas_shutdown();
-	starpu_mpi_shutdown();
-
 #ifndef STARPU_SIMGRID
 	matrix_display(bmat, rank);
 
@@ -67,6 +64,9 @@ int main(int argc, char **argv)
 
 	matrix_free(&bmat, rank, nodes, 1);
 
+	starpu_cublas_shutdown();
+	starpu_mpi_shutdown();
+
 #ifndef STARPU_SIMGRID
 	if (check)
 		assert(correctness);

+ 2 - 2
mpi/examples/matrix_decomposition/mpi_cholesky_distributed.c

@@ -53,11 +53,11 @@ int main(int argc, char **argv)
 
 	dw_cholesky(bmat, size/nblocks, rank, nodes, &timing, &flops);
 
+	matrix_free(&bmat, rank, nodes, 0);
+
 	starpu_cublas_shutdown();
 	starpu_mpi_shutdown();
 
-	matrix_free(&bmat, rank, nodes, 0);
-
 	if (rank == 0)
 	{
 		FPRINTF(stdout, "Computation time (in ms): %2.2f\n", timing/1000);

+ 10 - 0
mpi/examples/matrix_decomposition/mpi_decomposition_params.c

@@ -15,11 +15,16 @@
  */
 
 #include "mpi_cholesky.h"
+#include "helper.h"
 #include <string.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 
+#ifdef STARPU_HAVE_VALGRIND_H
+#include <valgrind/valgrind.h>
+#endif
+
 #ifdef STARPU_QUICK_CHECK
 unsigned size = 2*320;
 unsigned nblocks = 2;
@@ -95,6 +100,11 @@ void parse_args(int argc, char **argv, int nodes)
                 }
         }
 
+#ifdef STARPU_HAVE_VALGRIND_H
+	if (RUNNING_ON_VALGRIND)
+		size = 16;
+#endif
+
         if (nblocks > size)
 		nblocks = size;
 

+ 10 - 0
mpi/examples/mpi_lu/plu_example.c

@@ -15,6 +15,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include "helper.h"
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
@@ -29,6 +30,10 @@
 #include <numaif.h>
 #endif
 
+#ifdef STARPU_HAVE_VALGRIND_H
+#include <valgrind/valgrind.h>
+#endif
+
 static unsigned long size = 4096;
 static unsigned nblocks = 16;
 static unsigned check = 0;
@@ -125,6 +130,11 @@ static void parse_args(int rank, int argc, char **argv)
 			exit(0);
 		}
 	}
+
+#ifdef STARPU_HAVE_VALGRIND_H
+	if (RUNNING_ON_VALGRIND)
+		size = 16;
+#endif
 }
 
 unsigned STARPU_PLU(display_flag)(void)

+ 10 - 0
mpi/examples/mpi_lu/plu_implicit_example.c

@@ -15,6 +15,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include "helper.h"
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
@@ -29,6 +30,10 @@
 #include <numaif.h>
 #endif
 
+#ifdef STARPU_HAVE_VALGRIND_H
+#include <valgrind/valgrind.h>
+#endif
+
 static unsigned long size = 4096;
 static unsigned nblocks = 16;
 static unsigned check = 0;
@@ -104,6 +109,11 @@ static void parse_args(int argc, char **argv)
 			exit(0);
 		}
 	}
+
+#ifdef STARPU_HAVE_VALGRIND_H
+	if (RUNNING_ON_VALGRIND)
+		size = 16;
+#endif
 }
 
 unsigned STARPU_PLU(display_flag)(void)

+ 10 - 0
mpi/examples/mpi_lu/plu_outofcore_example.c

@@ -15,6 +15,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include "helper.h"
 #include <stdlib.h>
 #include <stdio.h>
 #include <unistd.h>
@@ -32,6 +33,10 @@
 #include <numaif.h>
 #endif
 
+#ifdef STARPU_HAVE_VALGRIND_H
+#include <valgrind/valgrind.h>
+#endif
+
 static unsigned long size = 4096;
 static unsigned nblocks = 16;
 static unsigned check = 0;
@@ -111,6 +116,11 @@ static void parse_args(int argc, char **argv)
 			exit(0);
 		}
 	}
+
+#ifdef STARPU_HAVE_VALGRIND_H
+	if (RUNNING_ON_VALGRIND)
+		size = 16;
+#endif
 }
 
 unsigned STARPU_PLU(display_flag)(void)

+ 1 - 1
mpi/examples/native_fortran/nf_basic_ring.f90

@@ -25,7 +25,7 @@ program nf_basic_ring
         integer(c_int),target :: token = 42
         integer(c_int) :: nloops = 32
         integer(c_int) :: loop
-        integer(c_int) :: tag
+        integer(c_int64_t) :: tag
         integer(c_int) :: world
         integer(c_int) :: src,dst
         type(c_ptr) :: token_dh, st

+ 2 - 1
mpi/examples/native_fortran/nf_mm.f90

@@ -31,7 +31,8 @@ program nf_mm
         integer(c_int) :: ret
         integer(c_int) :: row, col
         integer(c_int) :: b_row, b_col
-        integer(c_int) :: mr, tag, rank
+        integer(c_int) :: mr, rank
+        integer(c_int64_t) :: tag
 
         ret = fstarpu_init(C_NULL_PTR)
         if (ret == -19) then

+ 2 - 1
mpi/examples/native_fortran/nf_mm_task_build.f90

@@ -32,7 +32,8 @@ program nf_mm
         integer(c_int) :: ret
         integer(c_int) :: row, col
         integer(c_int) :: b_row, b_col
-        integer(c_int) :: mr, tag, rank
+        integer(c_int) :: mr, rank
+        integer(c_int64_t) :: tag
 
         ret = fstarpu_init(C_NULL_PTR)
         if (ret == -19) then

+ 87 - 3
mpi/examples/user_datatype/my_interface.c

@@ -67,10 +67,11 @@ void _starpu_my_data_datatype_allocate(MPI_Datatype *mpi_datatype)
 	free(myinterface);
 }
 
-void starpu_my_data_datatype_allocate(starpu_data_handle_t handle, MPI_Datatype *mpi_datatype)
+int starpu_my_data_datatype_allocate(starpu_data_handle_t handle, MPI_Datatype *mpi_datatype)
 {
 	(void)handle;
 	_starpu_my_data_datatype_allocate(mpi_datatype);
+	return 0;
 }
 
 void starpu_my_data_datatype_free(MPI_Datatype *mpi_datatype)
@@ -78,6 +79,18 @@ void starpu_my_data_datatype_free(MPI_Datatype *mpi_datatype)
 	MPI_Type_free(mpi_datatype);
 }
 
+/* Datatype-allocation hook that never builds an MPI datatype: both
+ * arguments are ignored and -1 is always returned. */
+int starpu_my_data2_datatype_allocate(starpu_data_handle_t handle, MPI_Datatype *mpi_datatype)
+{
+	/* Neither argument is used. */
+	(void)mpi_datatype;
+	(void)handle;
+
+	return -1;
+}
+
+/* Datatype-free hook paired with starpu_my_data2_datatype_allocate().
+ * It unconditionally trips an assertion: the example does not expect it
+ * to be invoked (presumably because the allocate hook always returns -1
+ * and so no datatype exists to free -- confirm against the MPI layer). */
+void starpu_my_data2_datatype_free(MPI_Datatype *mpi_datatype)
+{
+	/* Unused; voided to avoid an unused-parameter warning. */
+	(void)mpi_datatype;
+	STARPU_ASSERT_MSG(0, "should not be called\n");
+}
+
 char starpu_my_data_interface_get_char(void *interface)
 {
 	struct starpu_my_data_interface *my_data = (struct starpu_my_data_interface *) interface;
@@ -197,6 +210,43 @@ static int data_unpack_data(starpu_data_handle_t handle, unsigned node, void *pt
 	return 0;
 }
 
+/* Pack callback of the "data2" interface.
+ *
+ * Always stores the packed size (sizeof(int) + sizeof(char)) in *count.
+ * When ptr is non-NULL, additionally allocates a buffer of that size on
+ * `node`, writes the handle's int value followed by its char value into
+ * it, and returns the buffer through *ptr.  Always returns 0. */
+static int data_pack_data2(starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ssize_t *count)
+{
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	*count = sizeof(int) + sizeof(char);
+
+	/* Size-only query: nothing to pack. */
+	if (ptr == NULL)
+		return 0;
+
+	int value = starpu_my_data_get_int(handle);
+	char letter = starpu_my_data_get_char(handle);
+
+	char *packed = (char *) starpu_malloc_on_node_flags(node, *count, 0);
+	*ptr = packed;
+
+	/* Layout: [int][char], no padding. */
+	memcpy(packed, &value, sizeof(int));
+	memcpy(packed + sizeof(int), &letter, sizeof(char));
+
+	return 0;
+}
+
+/* Unpack callback of the "data2" interface.
+ *
+ * Expects `ptr` to hold an int immediately followed by a char (count must
+ * equal sizeof(int) + sizeof(char)), copies both into the struct
+ * starpu_my_data referenced by the handle's interface on `node`, then
+ * releases the packed buffer.  Always returns 0. */
+static int data_unpack_data2(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
+{
+	(void)count;
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+	STARPU_ASSERT(count == sizeof(int)+sizeof(char));
+
+	struct starpu_my_data_interface *iface = (struct starpu_my_data_interface *) starpu_data_get_interface_on_node(handle, node);
+	struct starpu_my_data *target = (struct starpu_my_data *)iface->ptr;
+	const char *packed = ptr;
+
+	/* Layout: [int][char], no padding. */
+	memcpy(&target->d, packed, sizeof(int));
+	memcpy(&target->c, packed + sizeof(int), sizeof(char));
+
+	/* The packed buffer is released here once its content is copied. */
+	starpu_free_on_node_flags(node, (uintptr_t)ptr, count, 0);
+	return 0;
+}
+
 static starpu_ssize_t data_describe(void *data_interface, char *buf, size_t size)
 {
 	struct starpu_my_data_interface *my_data = (struct starpu_my_data_interface *) data_interface;
@@ -219,7 +269,6 @@ static int copy_any_to_any(void *src_interface, unsigned src_node,
 			   void *dst_interface, unsigned dst_node,
 			   void *async_data)
 {
-	assert(0);
 	struct starpu_my_data *src_data = src_interface;
 	struct starpu_my_data *dst_data = dst_interface;
 	int ret = 0;
@@ -248,7 +297,7 @@ static struct starpu_data_interface_ops interface_data_ops =
 	.register_data_handle = data_register_data_handle,
 	.allocate_data_on_node = data_allocate_data_on_node,
 	.free_data_on_node = data_free_data_on_node,
-	//	.copy_methods = &data_copy_methods,
+	.copy_methods = &data_copy_methods,
 	.get_size = data_get_size,
 	.get_alloc_size = data_get_alloc_size,
 	.footprint = data_footprint,
@@ -277,3 +326,38 @@ void starpu_my_data_register(starpu_data_handle_t *handleptr, unsigned home_node
 
 	starpu_data_register(handleptr, home_node, &data, &interface_data_ops);
 }
+
+/* Operations table of the "data2" interface.  It uses data_pack_data2() /
+ * data_unpack_data2() as pack/unpack callbacks.  The interface id starts
+ * as STARPU_UNKNOWN_INTERFACE_ID and is assigned on first use by
+ * starpu_my_data2_register(). */
+static struct starpu_data_interface_ops interface_data2_ops =
+{
+	/* identity */
+	.interfaceid = STARPU_UNKNOWN_INTERFACE_ID,
+	.interface_size = sizeof(struct starpu_my_data_interface),
+
+	/* registration and per-node storage */
+	.register_data_handle = data_register_data_handle,
+	.allocate_data_on_node = data_allocate_data_on_node,
+	.free_data_on_node = data_free_data_on_node,
+	.to_pointer = data_to_pointer,
+
+	/* transfers */
+	.copy_methods = &data_copy_methods,
+	.pack_data = data_pack_data2,
+	.unpack_data = data_unpack_data2,
+
+	/* introspection */
+	.get_size = data_get_size,
+	.get_alloc_size = data_get_alloc_size,
+	.footprint = data_footprint,
+	.describe = data_describe
+};
+
+/* Register `xc` as a piece of data using the "data2" interface.
+ *
+ * handleptr: receives the newly created handle.
+ * home_node: memory node passed through to starpu_data_register().
+ * xc:        user structure backing the data; its address is stored as
+ *            both `ptr` and `dev_handle` of the interface descriptor.
+ *
+ * On first call, an interface id is allocated for interface_data2_ops. */
+void starpu_my_data2_register(starpu_data_handle_t *handleptr, unsigned home_node, struct starpu_my_data *xc)
+{
+	if (interface_data2_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
+	{
+		interface_data2_ops.interfaceid = starpu_data_interface_get_next_id();
+	}
+
+	struct starpu_my_data_interface data =
+	{
+		/* Use the id of *this* interface: interface_data_ops may never
+		 * have been assigned one if starpu_my_data_register() was not
+		 * called. */
+		.id = interface_data2_ops.interfaceid,
+		.ptr = (uintptr_t) xc,
+		.dev_handle = (uintptr_t) xc,
+		.offset = 0,
+	};
+
+	starpu_data_register(handleptr, home_node, &data, &interface_data2_ops);
+}

+ 4 - 1
mpi/examples/user_datatype/my_interface.h

@@ -36,6 +36,7 @@ struct starpu_my_data
 };
 
 void starpu_my_data_register(starpu_data_handle_t *handle, unsigned home_node, struct starpu_my_data *xc);
+void starpu_my_data2_register(starpu_data_handle_t *handle, unsigned home_node, struct starpu_my_data *xc);
 
 char starpu_my_data_get_char(starpu_data_handle_t handle);
 int starpu_my_data_get_int(starpu_data_handle_t handle);
@@ -47,8 +48,10 @@ int starpu_my_data_interface_get_int(void *interface);
 #define STARPU_MY_DATA_GET_INT(interface)	starpu_my_data_interface_get_int(interface)
 
 void _starpu_my_data_datatype_allocate(MPI_Datatype *mpi_datatype);
-void starpu_my_data_datatype_allocate(starpu_data_handle_t handle, MPI_Datatype *mpi_datatype);
+int starpu_my_data_datatype_allocate(starpu_data_handle_t handle, MPI_Datatype *mpi_datatype);
 void starpu_my_data_datatype_free(MPI_Datatype *mpi_datatype);
+int starpu_my_data2_datatype_allocate(starpu_data_handle_t handle, MPI_Datatype *mpi_datatype);
+void starpu_my_data2_datatype_free(MPI_Datatype *mpi_datatype);
 
 void starpu_my_data_display_codelet_cpu(void *descr[], void *_args);
 void starpu_my_data_compare_codelet_cpu(void *descr[], void *_args);

+ 98 - 0
mpi/examples/user_datatype/user_datatype2.c

@@ -0,0 +1,98 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2015-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu_mpi.h>
+#include "my_interface.h"
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+/* Two-process round trip of a "data2" handle.
+ *
+ * Rank 0 sends handle0 to rank 1 (tag 10) and receives the answer into
+ * handle1 (tag 20); rank 1 receives into handle0 and sends it back.
+ * Rank 0 then compares handle0 and handle1 through the compare codelet.
+ *
+ * The datatype hooks registered for the interface are
+ * starpu_my_data2_datatype_allocate / starpu_my_data2_datatype_free
+ * (presumably so that the transfer goes through the pack/unpack
+ * callbacks instead of an MPI datatype -- confirm against the StarPU-MPI
+ * documentation).
+ *
+ * Returns 77 when fewer than 2 processes or no CPU worker is available,
+ * otherwise 0 on success and non-zero on rank 0 if the comparison failed. */
+int main(int argc, char **argv)
+{
+	int rank, nodes;
+	int ret=0;
+	int compare=0;
+
+	ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &nodes);
+
+	/* The test needs a peer process and at least one CPU worker. */
+	if (nodes < 2 || (starpu_cpu_worker_get_count() == 0))
+	{
+		if (rank == 0)
+		{
+			if (nodes < 2)
+				fprintf(stderr, "We need at least 2 processes.\n");
+			else
+				fprintf(stderr, "We need at least 1 CPU.\n");
+		}
+		starpu_mpi_shutdown();
+		return 77;
+	}
+
+	struct starpu_my_data my0 = {.d = 42 , .c = 'n'};
+	struct starpu_my_data my1 = {.d = 98 , .c = 'z'};
+
+	starpu_data_handle_t handle0;
+	starpu_data_handle_t handle1;
+
+	/* Rank 1 starts with different content, overwritten by the receive. */
+	if (rank == 1)
+	{
+		my0.d = 0;
+		my0.c = 'z';
+	}
+
+	starpu_my_data2_register(&handle0, STARPU_MAIN_RAM, &my0);
+	starpu_my_data2_register(&handle1, -1, &my1);
+	starpu_mpi_datatype_register(handle1, starpu_my_data2_datatype_allocate, starpu_my_data2_datatype_free);
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+
+	if (rank == 0)
+	{
+		int *compare_ptr = &compare;
+
+		starpu_task_insert(&starpu_my_data_display_codelet, STARPU_VALUE, "node0 initial value", strlen("node0 initial value")+1, STARPU_R, handle0, 0);
+		starpu_mpi_isend_detached(handle0, 1, 10, MPI_COMM_WORLD, NULL, NULL);
+		starpu_mpi_irecv_detached(handle1, 1, 20, MPI_COMM_WORLD, NULL, NULL);
+
+		starpu_task_insert(&starpu_my_data_display_codelet, STARPU_VALUE, "node0 received value", strlen("node0 received value")+1, STARPU_R, handle1, 0);
+		starpu_task_insert(&starpu_my_data_compare_codelet, STARPU_R, handle0, STARPU_R, handle1, STARPU_VALUE, &compare_ptr, sizeof(compare_ptr), 0);
+	}
+	else if (rank == 1)
+	{
+		starpu_task_insert(&starpu_my_data_display_codelet, STARPU_VALUE, "node1 initial value", strlen("node1 initial value")+1, STARPU_R, handle0, 0);
+		starpu_mpi_irecv_detached(handle0, 0, 10, MPI_COMM_WORLD, NULL, NULL);
+		starpu_task_insert(&starpu_my_data_display_codelet, STARPU_VALUE, "node1 received value", strlen("node1 received value")+1, STARPU_R, handle0, 0);
+		starpu_mpi_isend_detached(handle0, 0, 20, MPI_COMM_WORLD, NULL, NULL);
+	}
+
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+	starpu_mpi_wait_for_all(MPI_COMM_WORLD);
+
+	/* Unregister through the same handle that was used to register. */
+	starpu_mpi_datatype_unregister(handle1);
+	starpu_data_unregister(handle0);
+	starpu_data_unregister(handle1);
+
+	starpu_mpi_shutdown();
+
+	if (rank == 0)
+	{
+		FPRINTF(stderr, "[node 0] %s\n", compare==1?"SUCCESS":"FAILURE");
+	}
+
+	return (rank == 0) ? !compare : 0;
+}

+ 25 - 25
mpi/include/fstarpu_mpi_mod.f90

@@ -28,7 +28,7 @@ module fstarpu_mpi_mod
                         type(c_ptr), value, intent(in) :: dh
                         type(c_ptr), value, intent(in) :: mpi_req
                         integer(c_int), value, intent(in) :: dst
-                        integer(c_int), value, intent(in) :: data_tag
+                        integer(c_int64_t), value, intent(in) :: data_tag
                         integer(c_int), value, intent(in) :: mpi_comm
                 end function fstarpu_mpi_isend
 
@@ -41,7 +41,7 @@ module fstarpu_mpi_mod
                         type(c_ptr), value, intent(in) :: dh
                         type(c_ptr), value, intent(in) :: mpi_req
                         integer(c_int), value, intent(in) :: dst
-                        integer(c_int), value, intent(in) :: data_tag
+                        integer(c_int64_t), value, intent(in) :: data_tag
                         integer(c_int), value, intent(in) :: prio
                         integer(c_int), value, intent(in) :: mpi_comm
                 end function fstarpu_mpi_isend_prio
@@ -54,7 +54,7 @@ module fstarpu_mpi_mod
                         type(c_ptr), value, intent(in) :: dh
                         type(c_ptr), value, intent(in) :: mpi_req
                         integer(c_int), value, intent(in) :: src
-                        integer(c_int), value, intent(in) :: data_tag
+                        integer(c_int64_t), value, intent(in) :: data_tag
                         integer(c_int), value, intent(in) :: mpi_comm
                 end function fstarpu_mpi_irecv
 
@@ -65,7 +65,7 @@ module fstarpu_mpi_mod
                         integer(c_int) :: fstarpu_mpi_send
                         type(c_ptr), value, intent(in) :: dh
                         integer(c_int), value, intent(in) :: dst
-                        integer(c_int), value, intent(in) :: data_tag
+                        integer(c_int64_t), value, intent(in) :: data_tag
                         integer(c_int), value, intent(in) :: mpi_comm
                 end function fstarpu_mpi_send
 
@@ -76,7 +76,7 @@ module fstarpu_mpi_mod
                         integer(c_int) :: fstarpu_mpi_send_prio
                         type(c_ptr), value, intent(in) :: dh
                         integer(c_int), value, intent(in) :: dst
-                        integer(c_int), value, intent(in) :: data_tag
+                        integer(c_int64_t), value, intent(in) :: data_tag
                         integer(c_int), value, intent(in) :: prio
                         integer(c_int), value, intent(in) :: mpi_comm
                 end function fstarpu_mpi_send_prio
@@ -88,7 +88,7 @@ module fstarpu_mpi_mod
                         integer(c_int) :: fstarpu_mpi_recv
                         type(c_ptr), value, intent(in) :: dh
                         integer(c_int), value, intent(in) :: src
-                        integer(c_int), value, intent(in) :: data_tag
+                        integer(c_int64_t), value, intent(in) :: data_tag
                         integer(c_int), value, intent(in) :: mpi_comm
                         type(c_ptr), value, intent(in) :: mpi_status
                 end function fstarpu_mpi_recv
@@ -100,7 +100,7 @@ module fstarpu_mpi_mod
                         integer(c_int) :: fstarpu_mpi_isend_detached
                         type(c_ptr), value, intent(in) :: dh
                         integer(c_int), value, intent(in) :: dst
-                        integer(c_int), value, intent(in) :: data_tag
+                        integer(c_int64_t), value, intent(in) :: data_tag
                         integer(c_int), value, intent(in) :: mpi_comm
                         type(c_funptr), value, intent(in) :: callback
                         type(c_ptr), value, intent(in) :: arg
@@ -113,7 +113,7 @@ module fstarpu_mpi_mod
                         integer(c_int) :: fstarpu_mpi_isend_detached_prio
                         type(c_ptr), value, intent(in) :: dh
                         integer(c_int), value, intent(in) :: dst
-                        integer(c_int), value, intent(in) :: data_tag
+                        integer(c_int64_t), value, intent(in) :: data_tag
                         integer(c_int), value, intent(in) :: prio
                         integer(c_int), value, intent(in) :: mpi_comm
                         type(c_funptr), value, intent(in) :: callback
@@ -127,7 +127,7 @@ module fstarpu_mpi_mod
                         integer(c_int) :: fstarpu_mpi_recv_detached
                         type(c_ptr), value, intent(in) :: dh
                         integer(c_int), value, intent(in) :: src
-                        integer(c_int), value, intent(in) :: data_tag
+                        integer(c_int64_t), value, intent(in) :: data_tag
                         integer(c_int), value, intent(in) :: mpi_comm
                         type(c_funptr), value, intent(in) :: callback
                         type(c_ptr), value, intent(in) :: arg
@@ -141,7 +141,7 @@ module fstarpu_mpi_mod
                         type(c_ptr), value, intent(in) :: dh
                         type(c_ptr), value, intent(in) :: mpi_req
                         integer(c_int), value, intent(in) :: dst
-                        integer(c_int), value, intent(in) :: data_tag
+                        integer(c_int64_t), value, intent(in) :: data_tag
                         integer(c_int), value, intent(in) :: mpi_comm
                 end function fstarpu_mpi_issend
 
@@ -153,7 +153,7 @@ module fstarpu_mpi_mod
                         type(c_ptr), value, intent(in) :: dh
                         type(c_ptr), value, intent(in) :: mpi_req
                         integer(c_int), value, intent(in) :: dst
-                        integer(c_int), value, intent(in) :: data_tag
+                        integer(c_int64_t), value, intent(in) :: data_tag
                         integer(c_int), value, intent(in) :: prio
                         integer(c_int), value, intent(in) :: mpi_comm
                 end function fstarpu_mpi_issend_prio
@@ -165,7 +165,7 @@ module fstarpu_mpi_mod
                         integer(c_int) :: fstarpu_mpi_issend_detached
                         type(c_ptr), value, intent(in) :: dh
                         integer(c_int), value, intent(in) :: dst
-                        integer(c_int), value, intent(in) :: data_tag
+                        integer(c_int64_t), value, intent(in) :: data_tag
                         integer(c_int), value, intent(in) :: mpi_comm
                         type(c_funptr), value, intent(in) :: callback
                         type(c_ptr), value, intent(in) :: arg
@@ -178,7 +178,7 @@ module fstarpu_mpi_mod
                         integer(c_int) :: fstarpu_mpi_issend_detached_prio
                         type(c_ptr), value, intent(in) :: dh
                         integer(c_int), value, intent(in) :: dst
-                        integer(c_int), value, intent(in) :: data_tag
+                        integer(c_int64_t), value, intent(in) :: data_tag
                         integer(c_int), value, intent(in) :: prio
                         integer(c_int), value, intent(in) :: mpi_comm
                         type(c_funptr), value, intent(in) :: callback
@@ -220,7 +220,7 @@ module fstarpu_mpi_mod
                         integer(c_int) :: fstarpu_mpi_recv_detached_sequential_consistency
                         type(c_ptr), value, intent(in) :: dh
                         integer(c_int), value, intent(in) :: src
-                        integer(c_int), value, intent(in) :: data_tag
+                        integer(c_int64_t), value, intent(in) :: data_tag
                         integer(c_int), value, intent(in) :: mpi_comm
                         type(c_funptr), value, intent(in) :: callback
                         type(c_ptr), value, intent(in) :: arg
@@ -342,7 +342,7 @@ module fstarpu_mpi_mod
                         integer(c_int) :: fstarpu_mpi_isend_detached_unlock_tag
                         type(c_ptr), value, intent(in) :: dh
                         integer(c_int), value, intent(in) :: dst
-                        integer(c_int), value, intent(in) :: data_tag
+                        integer(c_int64_t), value, intent(in) :: data_tag
                         integer(c_int), value, intent(in) :: mpi_comm
                         type(c_ptr), value, intent(in) :: starpu_tag
                 end function fstarpu_mpi_isend_detached_unlock_tag
@@ -354,7 +354,7 @@ module fstarpu_mpi_mod
                         integer(c_int) :: fstarpu_mpi_isend_detached_unlock_tag_prio
                         type(c_ptr), value, intent(in) :: dh
                         integer(c_int), value, intent(in) :: dst
-                        integer(c_int), value, intent(in) :: data_tag
+                        integer(c_int64_t), value, intent(in) :: data_tag
                         integer(c_int), value, intent(in) :: prio
                         integer(c_int), value, intent(in) :: mpi_comm
                         type(c_ptr), value, intent(in) :: starpu_tag
@@ -367,7 +367,7 @@ module fstarpu_mpi_mod
                         integer(c_int) :: fstarpu_mpi_recv_detached_unlock_tag
                         type(c_ptr), value, intent(in) :: dh
                         integer(c_int), value, intent(in) :: src
-                        integer(c_int), value, intent(in) :: data_tag
+                        integer(c_int64_t), value, intent(in) :: data_tag
                         integer(c_int), value, intent(in) :: mpi_comm
                         type(c_ptr), value, intent(in) :: starpu_tag
                 end function fstarpu_mpi_recv_detached_unlock_tag
@@ -381,7 +381,7 @@ module fstarpu_mpi_mod
                         integer(c_int), value, intent(in) :: array_size
                         type(c_ptr), intent(in) :: dhs(*)
                         integer(c_int), intent(in) :: dsts(*)
-                        integer(c_int), intent(in) :: data_tags(*)
+                        integer(c_int64_t), intent(in) :: data_tags(*)
                         integer(c_int), intent(in) :: mpi_comms(*)
                         type(c_ptr), value, intent(in) :: starpu_tag
                 end function fstarpu_mpi_isend_array_detached_unlock_tag
@@ -395,7 +395,7 @@ module fstarpu_mpi_mod
                         integer(c_int), value, intent(in) :: array_size
                         type(c_ptr), intent(in) :: dhs(*)
                         integer(c_int), intent(in) :: dsts(*)
-                        integer(c_int), intent(in) :: data_tags(*)
+                        integer(c_int64_t), intent(in) :: data_tags(*)
                         integer(c_int), intent(in) :: prio(*)
                         integer(c_int), intent(in) :: mpi_comms(*)
                         type(c_ptr), value, intent(in) :: starpu_tag
@@ -410,7 +410,7 @@ module fstarpu_mpi_mod
                         integer(c_int), value, intent(in) :: array_size
                         type(c_ptr), intent(in) :: dhs(*)
                         integer(c_int), intent(in) :: srcs(*)
-                        integer(c_int), intent(in) :: data_tags(*)
+                        integer(c_int64_t), intent(in) :: data_tags(*)
                         integer(c_int), intent(in) :: mpi_comms(*)
                         type(c_ptr), value, intent(in) :: starpu_tag
                 end function fstarpu_mpi_recv_array_detached_unlock_tag
@@ -489,7 +489,7 @@ module fstarpu_mpi_mod
                 subroutine fstarpu_mpi_set_communication_tag(tag) bind(C,name="starpu_mpi_set_communication_tag")
                         use iso_c_binding
                         implicit none
-                        integer(c_int), value, intent(in) :: tag
+                        integer(c_int64_t), value, intent(in) :: tag
                 end subroutine fstarpu_mpi_set_communication_tag
 
                 ! void starpu_mpi_data_register_comm(starpu_data_handle_t data_handle, starpu_mpi_tag_t tag, int rank, MPI_Comm comm);
@@ -497,7 +497,7 @@ module fstarpu_mpi_mod
                         use iso_c_binding
                         implicit none
                         type(c_ptr), value, intent(in) :: dh
-                        integer(c_int), value, intent(in) :: tag
+                        integer(c_int64_t), value, intent(in) :: tag
                         integer(c_int), value, intent(in) :: rank
                         integer(c_int), value, intent(in) :: mpi_comm
                 end subroutine fstarpu_mpi_data_register_comm
@@ -507,7 +507,7 @@ module fstarpu_mpi_mod
                         use iso_c_binding
                         implicit none
                         type(c_ptr), value, intent(in) :: dh
-                        integer(c_int), value, intent(in) :: tag
+                        integer(c_int64_t), value, intent(in) :: tag
                         integer(c_int), value, intent(in) :: rank
                 end subroutine fstarpu_mpi_data_register
 
@@ -533,7 +533,7 @@ module fstarpu_mpi_mod
                         use iso_c_binding
                         implicit none
                         type(c_ptr), value, intent(in) :: dh
-                        integer(c_int), value, intent(in) :: tag
+                        integer(c_int64_t), value, intent(in) :: tag
                 end subroutine fstarpu_mpi_data_set_tag
 
                 ! int starpu_mpi_data_get_rank(starpu_data_handle_t handle);
@@ -548,7 +548,7 @@ module fstarpu_mpi_mod
                 function fstarpu_mpi_data_get_tag(dh) bind(C,name="starpu_mpi_data_get_tag")
                         use iso_c_binding
                         implicit none
-                        integer(c_int) :: fstarpu_mpi_data_get_tag
+                        integer(c_int64_t) :: fstarpu_mpi_data_get_tag
                         type(c_ptr), value, intent(in) :: dh
                 end function fstarpu_mpi_data_get_tag
 

+ 28 - 3
mpi/include/starpu_mpi.h

@@ -149,7 +149,7 @@ void starpu_mpi_set_communication_tag(int tag);
 typedef void *starpu_mpi_req;
 
 /**
-   Define the type which can be used to set communication tag when exchanging data.
+   Type of the message tag.
 */
 typedef int64_t starpu_mpi_tag_t;
 
@@ -337,7 +337,7 @@ int starpu_mpi_isend_array_detached_unlock_tag_prio(unsigned array_size, starpu_
 */
 int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *source, starpu_mpi_tag_t *data_tag, MPI_Comm *comm, starpu_tag_t tag);
 
-typedef void (*starpu_mpi_datatype_allocate_func_t)(starpu_data_handle_t, MPI_Datatype *);
+typedef int (*starpu_mpi_datatype_allocate_func_t)(starpu_data_handle_t, MPI_Datatype *);
 typedef void (*starpu_mpi_datatype_free_func_t)(MPI_Datatype *);
 
 /**
@@ -350,11 +350,27 @@ typedef void (*starpu_mpi_datatype_free_func_t)(MPI_Datatype *);
 int starpu_mpi_datatype_register(starpu_data_handle_t handle, starpu_mpi_datatype_allocate_func_t allocate_datatype_func, starpu_mpi_datatype_free_func_t free_datatype_func);
 
 /**
+   Register functions to create and free a MPI datatype for the given
+   interface id.
+   Similar to starpu_mpi_datatype_register().
+   It is important that the function is called before any
+   communication can take place for a data with the given interface id. See
+   \ref ExchangingUserDefinedDataInterface for an example.
+*/
+int starpu_mpi_interface_datatype_register(enum starpu_data_interface_id id, starpu_mpi_datatype_allocate_func_t allocate_datatype_func, starpu_mpi_datatype_free_func_t free_datatype_func);
+
+/**
    Unregister the MPI datatype functions stored for the interface of
    the given handle.
 */
 int starpu_mpi_datatype_unregister(starpu_data_handle_t handle);
 
+/**
+   Unregister the MPI datatype functions stored for the given
+   interface id. Similar to starpu_mpi_datatype_unregister().
+*/
+int starpu_mpi_interface_datatype_unregister(enum starpu_data_interface_id id);
+
 /** @} */
 
 /**
@@ -409,6 +425,7 @@ int starpu_mpi_cached_send(starpu_data_handle_t data_handle, int dest);
 
 /**
    @name MPI Insert Task
+   \anchor MPIInsertTask
    @{
 */
 
@@ -517,7 +534,7 @@ starpu_mpi_tag_t starpu_mpi_data_get_tag(starpu_data_handle_t handle);
 	If there is several nodes owning data in ::STARPU_W mode, a
 	node will be selected according to a given node selection
 	policy (see ::STARPU_NODE_SELECTION_POLICY or
-	starpu_mpi_node_selection_set_current_policy()) 
+	starpu_mpi_node_selection_set_current_policy())
 	<li>
 	The argument ::STARPU_EXECUTE_ON_NODE followed by an integer
 	can be used to specify the node;
@@ -615,7 +632,15 @@ void starpu_mpi_data_migrate(MPI_Comm comm, starpu_data_handle_t handle, int new
    @{
 */
 
+
+/**
+   Define the current policy
+ */
 #define STARPU_MPI_NODE_SELECTION_CURRENT_POLICY -1
+/**
+   Define the policy in which the selected node is the one having the
+   most data in ::STARPU_R mode
+*/
 #define STARPU_MPI_NODE_SELECTION_MOST_R_DATA    0
 
 typedef int (*starpu_mpi_select_node_policy_func_t)(int me, int nb_nodes, struct starpu_data_descr *descr, int nb_data);

+ 8 - 3
mpi/src/mpi/starpu_mpi_mpi.c

@@ -137,11 +137,13 @@ void _starpu_mpi_submit_ready_request_inc(struct _starpu_mpi_req *req)
 	_starpu_mpi_submit_ready_request(req);
 }
 
+/* Compiled out: the function below is a stub whose body only voids its
+ * argument and carries a TODO. */
+#if 0
 void _starpu_mpi_coop_sends_build_tree(struct _starpu_mpi_coop_sends *coop_sends)
 {
 	(void)coop_sends;
 	/* TODO: turn them into redirects & forwards */
 }
+#endif
 
 void _starpu_mpi_submit_coop_sends(struct _starpu_mpi_coop_sends *coop_sends, int submit_control, int submit_data)
 {
@@ -353,7 +355,10 @@ void _starpu_mpi_simgrid_wait_req(MPI_Request *request, MPI_Status *status, star
 	sim_req->done = done;
 	*done = 0;
 
-	_starpu_simgrid_xbt_thread_create("wait for mpi transfer", _starpu_mpi_simgrid_wait_req_func, sim_req);
+	starpu_pthread_attr_t attr;
+	starpu_pthread_attr_init(&attr);
+	starpu_pthread_attr_setstacksize(&attr, 32786);
+	_starpu_simgrid_xbt_thread_create("wait for mpi transfer", &attr, _starpu_mpi_simgrid_wait_req_func, sim_req);
 }
 #endif
 
@@ -853,7 +858,7 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 					int ret;
 					ret = MPI_Wait(&req->backend->size_req, MPI_STATUS_IGNORE);
 					STARPU_MPI_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Wait returning %s", _starpu_mpi_get_mpi_error_code(ret));
-					free(req->ptr);
+					starpu_free_on_node_flags(STARPU_MAIN_RAM, (uintptr_t)req->ptr, req->count, 0);
 					req->ptr = NULL;
 				}
 				else if (req->request_type == RECV_REQ)
@@ -1117,7 +1122,7 @@ static void _starpu_mpi_receive_early_data(struct _starpu_mpi_envelope *envelope
 		 * to the application when it posts a receive for this tag
 		 */
 		_STARPU_MPI_DEBUG(3, "Posting a receive for a data of size %d which has not yet been registered\n", (int)early_data_handle->env->size);
-		_STARPU_MPI_MALLOC(early_data_handle->buffer, early_data_handle->env->size);
+		early_data_handle->buffer = (void *)starpu_malloc_on_node_flags(STARPU_MAIN_RAM, early_data_handle->env->size, 0);
 		starpu_variable_data_register(&early_data_handle->handle, STARPU_MAIN_RAM, (uintptr_t) early_data_handle->buffer, early_data_handle->env->size);
 		//_starpu_mpi_early_data_add(early_data_handle);
 	}

+ 4 - 4
mpi/src/mpi/starpu_mpi_mpi_backend.c

@@ -51,10 +51,10 @@ void _starpu_mpi_mpi_backend_request_init(struct _starpu_mpi_req *req)
 
 	//req->backend->data_request = 0;
 
-	STARPU_PTHREAD_MUTEX_INIT(&req->backend->req_mutex, NULL);
-	STARPU_PTHREAD_COND_INIT(&req->backend->req_cond, NULL);
-	STARPU_PTHREAD_MUTEX_INIT(&req->backend->posted_mutex, NULL);
-	STARPU_PTHREAD_COND_INIT(&req->backend->posted_cond, NULL);
+	STARPU_PTHREAD_MUTEX_INIT0(&req->backend->req_mutex, NULL);
+	STARPU_PTHREAD_COND_INIT0(&req->backend->req_cond, NULL);
+	STARPU_PTHREAD_MUTEX_INIT0(&req->backend->posted_mutex, NULL);
+	STARPU_PTHREAD_COND_INIT0(&req->backend->posted_cond, NULL);
 
 	//req->backend->other_request = NULL;
 

+ 23 - 22
mpi/src/nmad/starpu_mpi_nmad.c

@@ -44,11 +44,14 @@
 
 void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req,nm_sr_event_t event);
 #ifdef STARPU_VERBOSE
-static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type);
+char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type);
 #endif
 
 void _starpu_mpi_handle_pending_request(struct _starpu_mpi_req *req);
+
+#ifdef STARPU_USE_FXT
 static void _starpu_mpi_add_sync_point_in_fxt(void);
+#endif
 
 /* Condition to wake up waiting for all current MPI requests to finish */
 static starpu_pthread_t progress_thread;
@@ -283,7 +286,7 @@ int _starpu_mpi_barrier(MPI_Comm comm)
 /********************************************************/
 
 #ifdef STARPU_VERBOSE
-static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type)
+char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type)
 {
 	switch (request_type)
 	{
@@ -368,10 +371,12 @@ void _starpu_mpi_handle_pending_request(struct _starpu_mpi_req *req)
 	nm_sr_request_monitor(req->backend->session, &(req->backend->data_request), NM_SR_EVENT_FINALIZED,_starpu_mpi_handle_request_termination_callback);
 }
 
+#if 0
 void _starpu_mpi_coop_sends_build_tree(struct _starpu_mpi_coop_sends *coop_sends)
 {
 	/* TODO: turn them into redirects & forwards */
 }
+#endif
 
 void _starpu_mpi_submit_coop_sends(struct _starpu_mpi_coop_sends *coop_sends, int submit_control, int submit_data)
 {
@@ -415,25 +420,14 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 	struct _starpu_mpi_argc_argv *argc_argv = (struct _starpu_mpi_argc_argv *) arg;
 
 #ifndef STARPU_SIMGRID
-	if (_starpu_mpi_thread_cpuid < 0)
-	{
-		_starpu_mpi_thread_cpuid = starpu_get_next_bindid(0, NULL, 0);
-	}
-
 	if (starpu_bind_thread_on(_starpu_mpi_thread_cpuid, 0, "MPI") < 0)
 	{
 		char hostname[65];
 		gethostname(hostname, sizeof(hostname));
 		_STARPU_DISP("[%s] No core was available for the MPI thread. You should use STARPU_RESERVE_NCPU to leave one core available for MPI, or specify one core less in STARPU_NCPU\n", hostname);
 	}
-	_starpu_mpi_do_initialize(argc_argv);
-	if (_starpu_mpi_thread_cpuid >= 0)
-		/* In case MPI changed the binding */
-		starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI");
 #endif
 
-	_starpu_mpi_env_init();
-
 #ifdef STARPU_SIMGRID
 	/* Now that MPI is set up, let the rest of simgrid get initialized */
 	char **argv_cpy;
@@ -547,9 +541,9 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 // static int hookid = - 1;
 // #endif /* STARPU_MPI_ACTIVITY */
 
+#ifdef STARPU_USE_FXT
 static void _starpu_mpi_add_sync_point_in_fxt(void)
 {
-#ifdef STARPU_USE_FXT
 	int rank;
 	int worldsize;
 	int ret;
@@ -578,8 +572,8 @@ static void _starpu_mpi_add_sync_point_in_fxt(void)
 	_STARPU_MPI_TRACE_BARRIER(rank, worldsize, random_number);
 
 	_STARPU_MPI_DEBUG(3, "unique key %x\n", random_number);
-#endif
 }
+#endif
 
 int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 {
@@ -589,14 +583,21 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 	starpu_sem_init(&callback_sem, 0, 0);
 	running = 0;
 
-	/* Tell pioman to use a bound thread for communication progression */
-	unsigned piom_bindid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
-	int indexes[1] = {piom_bindid};
-	piom_ltask_set_bound_thread_indexes(HWLOC_OBJ_PU,indexes,1);
+	_starpu_mpi_env_init();
+
+	/* This function calls MPI_Init_thread if needed, and it initializes internal NMAD/Pioman variables,
+	 * required for piom_ltask_set_bound_thread_indexes() */
+	_starpu_mpi_do_initialize(argc_argv);
+
+	if (_starpu_mpi_thread_cpuid < 0)
+	{
+		_starpu_mpi_thread_cpuid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
+	}
 
-	/* We force the "MPI" thread to share the same core as the pioman thread
-	   to avoid binding it on the same core as a worker */
-	_starpu_mpi_thread_cpuid = piom_bindid;
+	/* Tell pioman to use a bound thread for communication progression:
+	 * share the same core as StarPU's MPI thread, since the MPI thread has very low activity with the NMAD backend */
+	int indexes[1] = { _starpu_mpi_thread_cpuid };
+	piom_ltask_set_bound_thread_indexes(HWLOC_OBJ_PU, indexes, 1);
 
 	/* Register some hooks for communication progress if needed */
 	int polling_point_prog, polling_point_idle;

+ 13 - 12
mpi/src/starpu_mpi_coop_sends.c

@@ -62,15 +62,18 @@ static int _starpu_mpi_reqs_prio_compare(const void *a, const void *b)
 {
 	const struct _starpu_mpi_req * const *ra = a;
 	const struct _starpu_mpi_req * const *rb = b;
-	return (*rb)->prio - (*ra)->prio;
+	if ((*rb)->prio < (*ra)->prio)
+		return -1;
+	else if ((*rb)->prio == (*ra)->prio)
+		return 0;
+	else
+		return 1;
 }
 
 /* Sort the requests by priority and build a diffusion tree. Actually does something only once per coop_sends bag. */
 static void _starpu_mpi_coop_sends_optimize(struct _starpu_mpi_coop_sends *coop_sends)
 {
-	if (coop_sends->n == 1)
-		/* Trivial case, don't optimize */
-		return;
+	STARPU_ASSERT(coop_sends->n > 1);
 
 	_starpu_spin_lock(&coop_sends->lock);
 	if (!coop_sends->reqs_array)
@@ -92,8 +95,10 @@ static void _starpu_mpi_coop_sends_optimize(struct _starpu_mpi_coop_sends *coop_
 		/* Sort them */
 		qsort(reqs, n, sizeof(*reqs), _starpu_mpi_reqs_prio_compare);
 
+#if 0
 		/* And build the diffusion tree */
 		_starpu_mpi_coop_sends_build_tree(coop_sends);
+#endif
 	}
 	_starpu_spin_unlock(&coop_sends->lock);
 }
@@ -114,9 +119,6 @@ static void _starpu_mpi_coop_sends_data_ready(void *arg)
 		_starpu_spin_unlock(&mpi_data->coop_lock);
 	}
 
-	/* Build diffusion tree */
-	_starpu_mpi_coop_sends_optimize(coop_sends);
-
 	if (coop_sends->n == 1)
 	{
 		/* Trivial case, just submit it */
@@ -124,6 +126,9 @@ static void _starpu_mpi_coop_sends_data_ready(void *arg)
 	}
 	else
 	{
+		/* Build diffusion tree */
+		_starpu_mpi_coop_sends_optimize(coop_sends);
+
 		/* And submit them */
 		if (STARPU_TEST_AND_SET(&coop_sends->redirects_sent, 1) == 0)
 			_starpu_mpi_submit_coop_sends(coop_sends, 1, 1);
@@ -138,16 +143,12 @@ static void _starpu_mpi_coop_sends_data_ready(void *arg)
  * or because the value has changed.  */
 static void _starpu_mpi_coop_send_flush(struct _starpu_mpi_coop_sends *coop_sends)
 {
-	if (!coop_sends)
+	if (!coop_sends || coop_sends->n == 1)
 		return;
 
 	/* Build diffusion tree */
 	_starpu_mpi_coop_sends_optimize(coop_sends);
 
-	if (coop_sends->n == 1)
-		/* Trivial case, we will just send the data */
-		return;
-
 	/* And submit them */
 	if (STARPU_TEST_AND_SET(&coop_sends->redirects_sent, 1) == 0)
 		_starpu_mpi_submit_coop_sends(coop_sends, 1, 0);

+ 45 - 13
mpi/src/starpu_mpi_datatype.c

@@ -43,7 +43,7 @@ void _starpu_mpi_datatype_shutdown(void)
  * 	Matrix
  */
 
-static void handle_to_datatype_matrix(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+static int handle_to_datatype_matrix(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
 {
 	int ret;
 
@@ -57,13 +57,15 @@ static void handle_to_datatype_matrix(starpu_data_handle_t data_handle, MPI_Data
 
 	ret = MPI_Type_commit(datatype);
 	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_commit failed");
+
+	return 0;
 }
 
 /*
  * 	Block
  */
 
-static void handle_to_datatype_block(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+static int handle_to_datatype_block(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
 {
 	int ret;
 
@@ -86,13 +88,15 @@ static void handle_to_datatype_block(starpu_data_handle_t data_handle, MPI_Datat
 
 	ret = MPI_Type_commit(datatype);
 	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_commit failed");
+
+	return 0;
 }
 
 /*
  * 	Tensor
  */
 
-static void handle_to_datatype_tensor(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+static int handle_to_datatype_tensor(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
 {
 	int ret;
 
@@ -124,13 +128,15 @@ static void handle_to_datatype_tensor(starpu_data_handle_t data_handle, MPI_Data
 
 	ret = MPI_Type_commit(datatype);
 	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_commit failed");
+
+	return 0;
 }
 
 /*
  * 	Vector
  */
 
-static void handle_to_datatype_vector(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+static int handle_to_datatype_vector(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
 {
 	int ret;
 
@@ -142,13 +148,15 @@ static void handle_to_datatype_vector(starpu_data_handle_t data_handle, MPI_Data
 
 	ret = MPI_Type_commit(datatype);
 	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_commit failed");
+
+	return 0;
 }
 
 /*
  * 	Variable
  */
 
-static void handle_to_datatype_variable(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+static int handle_to_datatype_variable(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
 {
 	int ret;
 
@@ -159,13 +167,15 @@ static void handle_to_datatype_variable(starpu_data_handle_t data_handle, MPI_Da
 
 	ret = MPI_Type_commit(datatype);
 	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_commit failed");
+
+	return 0;
 }
 
 /*
  * 	Void
  */
 
-static void handle_to_datatype_void(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+static int handle_to_datatype_void(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
 {
 	int ret;
 	(void)data_handle;
@@ -175,6 +185,8 @@ static void handle_to_datatype_void(starpu_data_handle_t data_handle, MPI_Dataty
 
 	ret = MPI_Type_commit(datatype);
 	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_commit failed");
+
+	return 0;
 }
 
 /*
@@ -225,8 +237,15 @@ void _starpu_mpi_datatype_allocate(starpu_data_handle_t data_handle, struct _sta
 		if (table)
 		{
 			STARPU_ASSERT_MSG(table->allocate_datatype_func, "Handle To Datatype Function not defined for StarPU data interface %d", id);
-			table->allocate_datatype_func(data_handle, &req->datatype);
-			req->registered_datatype = 1;
+			int ret = table->allocate_datatype_func(data_handle, &req->datatype);
+			if (ret == 0)
+				req->registered_datatype = 1;
+			else
+			{
+				/* Couldn't register, probably complex data which needs packing. */
+				req->datatype = MPI_BYTE;
+				req->registered_datatype = 0;
+			}
 		}
 		else
 		{
@@ -315,16 +334,16 @@ void _starpu_mpi_datatype_free(starpu_data_handle_t data_handle, MPI_Datatype *d
 		if (table)
 		{
 			STARPU_ASSERT_MSG(table->free_datatype_func, "Free Datatype Function not defined for StarPU data interface %d", id);
-			table->free_datatype_func(datatype);
+			if (*datatype != MPI_BYTE)
+				table->free_datatype_func(datatype);
 		}
 
 	}
 	/* else the datatype is not predefined by StarPU */
 }
 
-int starpu_mpi_datatype_register(starpu_data_handle_t handle, starpu_mpi_datatype_allocate_func_t allocate_datatype_func, starpu_mpi_datatype_free_func_t free_datatype_func)
+int starpu_mpi_interface_datatype_register(enum starpu_data_interface_id id, starpu_mpi_datatype_allocate_func_t allocate_datatype_func, starpu_mpi_datatype_free_func_t free_datatype_func)
 {
-	enum starpu_data_interface_id id = starpu_data_get_interface_id(handle);
 	struct _starpu_mpi_datatype_funcs *table;
 
 	STARPU_ASSERT_MSG(id >= STARPU_MAX_INTERFACE_ID, "Cannot redefine the MPI datatype for a predefined StarPU datatype");
@@ -344,14 +363,21 @@ int starpu_mpi_datatype_register(starpu_data_handle_t handle, starpu_mpi_datatyp
 		table->free_datatype_func = free_datatype_func;
 		HASH_ADD_INT(_starpu_mpi_datatype_funcs_table, id, table);
 	}
-	STARPU_ASSERT_MSG(handle->ops->handle_to_pointer || handle->ops->to_pointer, "The data interface must define the operation 'to_pointer'\n");
 	STARPU_PTHREAD_MUTEX_UNLOCK(&_starpu_mpi_datatype_funcs_table_mutex);
 	return 0;
 }
 
-int starpu_mpi_datatype_unregister(starpu_data_handle_t handle)
+int starpu_mpi_datatype_register(starpu_data_handle_t handle, starpu_mpi_datatype_allocate_func_t allocate_datatype_func, starpu_mpi_datatype_free_func_t free_datatype_func)
 {
 	enum starpu_data_interface_id id = starpu_data_get_interface_id(handle);
+	int ret;
+	ret = starpu_mpi_interface_datatype_register(id, allocate_datatype_func, free_datatype_func);
+	STARPU_ASSERT_MSG(handle->ops->handle_to_pointer || handle->ops->to_pointer, "The data interface must define the operation 'to_pointer'\n");
+	return ret;
+}
+
+int starpu_mpi_interface_datatype_unregister(enum starpu_data_interface_id id)
+{
 	struct _starpu_mpi_datatype_funcs *table;
 
 	STARPU_ASSERT_MSG(id >= STARPU_MAX_INTERFACE_ID, "Cannot redefine the MPI datatype for a predefined StarPU datatype");
@@ -366,3 +392,9 @@ int starpu_mpi_datatype_unregister(starpu_data_handle_t handle)
 	STARPU_PTHREAD_MUTEX_UNLOCK(&_starpu_mpi_datatype_funcs_table_mutex);
 	return 0;
 }
+
+int starpu_mpi_datatype_unregister(starpu_data_handle_t handle)
+{
+	enum starpu_data_interface_id id = starpu_data_get_interface_id(handle);
+	return starpu_mpi_interface_datatype_unregister(id);
+}

+ 2 - 0
mpi/src/starpu_mpi_init.c

@@ -220,6 +220,8 @@ int starpu_mpi_shutdown(void)
 	_starpu_mpi_comm_amounts_shutdown();
 	_starpu_mpi_cache_shutdown(world_size);
 
+	_mpi_backend._starpu_mpi_backend_shutdown();
+
 	if (_mpi_initialized_starpu)
 		starpu_shutdown();
 

+ 2 - 0
mpi/src/starpu_mpi_private.h

@@ -278,8 +278,10 @@ void _starpu_mpi_submit_ready_request(void *arg);
 /* To be called when request is completed */
 void _starpu_mpi_release_req_data(struct _starpu_mpi_req *req);
 
+#if 0
 /* Build a communication tree. Called before _starpu_mpi_coop_send is ever called. coop_sends->lock is held. */
 void _starpu_mpi_coop_sends_build_tree(struct _starpu_mpi_coop_sends *coop_sends);
+#endif
 /* Try to merge with send request with other send requests */
 void _starpu_mpi_coop_send(starpu_data_handle_t data_handle, struct _starpu_mpi_req *req, enum starpu_data_access_mode mode, int sequential_consistency);
 

+ 11 - 7
mpi/src/starpu_mpi_select_node.c

@@ -28,13 +28,13 @@ static int _current_policy = STARPU_MPI_NODE_SELECTION_MOST_R_DATA;
 static int _last_predefined_policy = STARPU_MPI_NODE_SELECTION_MOST_R_DATA;
 static starpu_mpi_select_node_policy_func_t _policies[_STARPU_MPI_NODE_SELECTION_MAX_POLICY];
 
-int _starpu_mpi_select_node_with_most_R_data(int me, int nb_nodes, struct starpu_data_descr *descr, int nb_data);
+int _starpu_mpi_select_node_with_most_data(int me, int nb_nodes, struct starpu_data_descr *descr, int nb_data);
 
 void _starpu_mpi_select_node_init()
 {
 	int i;
 
-	_policies[STARPU_MPI_NODE_SELECTION_MOST_R_DATA] = _starpu_mpi_select_node_with_most_R_data;
+	_policies[STARPU_MPI_NODE_SELECTION_MOST_R_DATA] = _starpu_mpi_select_node_with_most_data;
 	for(i=_last_predefined_policy+1 ; i<_STARPU_MPI_NODE_SELECTION_MAX_POLICY ; i++)
 		_policies[i] = NULL;
 }
@@ -73,7 +73,7 @@ int starpu_mpi_node_selection_unregister_policy(int policy)
 	return 0;
 }
 
-int _starpu_mpi_select_node_with_most_R_data(int me, int nb_nodes, struct starpu_data_descr *descr, int nb_data)
+int _starpu_mpi_select_node_with_most_data(int me, int nb_nodes, struct starpu_data_descr *descr, int nb_data)
 {
 	size_t *size_on_nodes;
 	size_t max_size;
@@ -87,11 +87,15 @@ int _starpu_mpi_select_node_with_most_R_data(int me, int nb_nodes, struct starpu
 	{
 		starpu_data_handle_t data = descr[i].handle;
 		enum starpu_data_access_mode mode = descr[i].mode;
+		int rank = starpu_data_get_rank(data);
+		size_t size = data->ops->get_size(data);
+
 		if (mode & STARPU_R)
-		{
-			int rank = starpu_data_get_rank(data);
-			size_on_nodes[rank] += data->ops->get_size(data);
-		}
+			size_on_nodes[rank] += size;
+
+		if (mode & STARPU_W)
+			/* Would have to transfer it back */
+			size_on_nodes[rank] += size;
 	}
 
 	max_size = 0;

+ 1 - 1
mpi/src/starpu_mpi_task_insert_fortran.c

@@ -227,8 +227,8 @@ int _fstarpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_no
 		}
 		else if (arg_type==STARPU_PRIORITY)
 		{
-			prio = *(int *)arglist[arg_i];
 			arg_i++;
+			prio = *(int *)arglist[arg_i];
 			/* int* */
 		}
 		/* STARPU_EXECUTE_ON_NODE handled above */

+ 2 - 2
mpi/tests/insert_task_owner2.c

@@ -37,7 +37,7 @@ struct starpu_codelet mycodelet =
 {
 	.cpu_funcs = {func_cpu},
 	.nbuffers = 4,
-	.modes = {STARPU_R, STARPU_RW, STARPU_W, STARPU_W},
+	.modes = {STARPU_R, STARPU_RW, STARPU_W, STARPU_RW},
 	.model = &starpu_perfmodel_nop,
 };
 
@@ -94,7 +94,7 @@ int main(int argc, char **argv)
 	err = starpu_mpi_task_insert(MPI_COMM_WORLD, &mycodelet,
 				     STARPU_R, data_handles[0], STARPU_RW, data_handles[1],
 				     STARPU_W, data_handles[2],
-				     STARPU_W, data_handles[3],
+				     STARPU_RW, data_handles[3],
 				     STARPU_EXECUTE_ON_NODE, 1, 0);
 	STARPU_CHECK_RETURN_VALUE(err, "starpu_mpi_task_insert");
 	starpu_task_wait_for_all();

+ 9 - 9
mpi/tests/policy_selection.c

@@ -106,13 +106,13 @@ int main(int argc, char **argv)
 	FPRINTF_MPI(stderr, "Task %p\n", task);
 	if (rank == 1)
 	{
-		STARPU_ASSERT_MSG(task, "Task should be executed by rank 1\n");
+		STARPU_ASSERT_MSG(task, "Task should be executed by rank 1");
 		task->destroy = 0;
 		starpu_task_destroy(task);
 	}
 	else
 	{
-		STARPU_ASSERT_MSG(task == NULL, "Task should be executed by rank 1\n");
+		STARPU_ASSERT_MSG(task == NULL, "Task should be executed by rank 1");
 	}
 
 	// Force the execution on node 1
@@ -123,13 +123,13 @@ int main(int argc, char **argv)
 	FPRINTF_MPI(stderr, "Task %p\n", task);
 	if (rank == 1)
 	{
-		STARPU_ASSERT_MSG(task, "Task should be executed by rank 1\n");
+		STARPU_ASSERT_MSG(task, "Task should be executed by rank 1");
 		task->destroy = 0;
 		starpu_task_destroy(task);
 	}
 	else
 	{
-		STARPU_ASSERT_MSG(task == NULL, "Task should be executed by rank 1\n");
+		STARPU_ASSERT_MSG(task == NULL, "Task should be executed by rank 1");
 	}
 
 	// Let StarPU choose the node
@@ -138,15 +138,15 @@ int main(int argc, char **argv)
 				     STARPU_W, handles[0], STARPU_W, handles[1],
 				     0);
 	FPRINTF_MPI(stderr, "Task %p\n", task);
-	if (rank == 2)
+	if (rank == 0)
 	{
-		STARPU_ASSERT_MSG(task, "Task should be executed by rank 2\n");
+		STARPU_ASSERT_MSG(task, "Task should be executed by rank 0");
 		task->destroy = 0;
 		starpu_task_destroy(task);
 	}
 	else
 	{
-		STARPU_ASSERT_MSG(task == NULL, "Task should be executed by rank 2\n");
+		STARPU_ASSERT_MSG(task == NULL, "Task should be executed by rank 2");
 	}
 
 	// Let StarPU choose the node
@@ -156,13 +156,13 @@ int main(int argc, char **argv)
 	FPRINTF_MPI(stderr, "Task %p\n", task);
 	if (rank == 0)
 	{
-		STARPU_ASSERT_MSG(task, "Task should be executed by rank 0\n");
+		STARPU_ASSERT_MSG(task, "Task should be executed by rank 0");
 		task->destroy = 0;
 		starpu_task_destroy(task);
 	}
 	else
 	{
-		STARPU_ASSERT_MSG(task == NULL, "Task should be executed by rank 0\n");
+		STARPU_ASSERT_MSG(task == NULL, "Task should be executed by rank 0");
 	}
 
 	starpu_data_unregister(handles[0]);

+ 7 - 7
mpi/tests/policy_selection2.c

@@ -24,8 +24,8 @@ void func_cpu(void *descr[], void *_args)
 	int *data0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	int *data1 = (int *)STARPU_VARIABLE_GET_PTR(descr[1]);
 	int *data2 = (int *)STARPU_VARIABLE_GET_PTR(descr[2]);
-	*data1 += *data0;
-	*data2 += *data0;
+	*data1 = *data0;
+	*data2 = *data0;
 }
 
 struct starpu_codelet mycodelet =
@@ -69,11 +69,11 @@ int main(int argc, char **argv)
 		return STARPU_TEST_SKIPPED;
 	}
 
-	data[0] = 12;
+	data[0] = 42;
 	starpu_variable_data_register(&handles[0], STARPU_MAIN_RAM, (uintptr_t)&data[0], sizeof(int));
 	starpu_mpi_data_register(handles[0], 10, 0);
 
-	data[1] = 12;
+	data[1] = 42;
 	starpu_variable_data_register(&handles[1], STARPU_MAIN_RAM, (uintptr_t)&data[1], sizeof(int));
 	starpu_mpi_data_register(handles[1], 20, 1);
 
@@ -88,9 +88,9 @@ int main(int argc, char **argv)
 	FPRINTF_MPI(stderr, "data[%d,%d,%d] = %d,%d,%d\n", 0, 1, 2, data[0], data[1], data[2]);
 	for(i=0 ; i<2 ; i++) starpu_data_release(handles[i]);
 #ifndef STARPU_SIMGRID
-	if (rank == 2)
+	if (rank == 0)
 	{
-		STARPU_ASSERT_MSG(data[0] == 2*data[2] && data[1] == 2*data[2], "Computation incorrect. data[%d] (%d) != 2*data[%d] (%d) && data[%d] (%d) != 2*data[%d] (%d)\n",
+		STARPU_ASSERT_MSG(data[0] == data[2] && data[1] == data[2], "Computation incorrect. data[%d] (%d) != data[%d] (%d) && data[%d] (%d) != data[%d] (%d)\n",
 				  0, data[0], 2, data[2], 1, data[1], 2, data[2]);
 	}
 #endif
@@ -110,7 +110,7 @@ int main(int argc, char **argv)
 #ifndef STARPU_SIMGRID
 	if (rank == 1)
 	{
-		STARPU_ASSERT_MSG(data[0] == 2*data[2] && data[1] == 2*data[2], "Computation incorrect. data[%d] (%d) != 2*data[%d] (%d) && data[%d] (%d) != 2*data[%d] (%d)\n",
+		STARPU_ASSERT_MSG(data[0] == data[2] && data[1] == data[2], "Computation incorrect. data[%d] (%d) != data[%d] (%d) && data[%d] (%d) != data[%d] (%d)\n",
 				  0, data[0], 2, data[2], 1, data[1], 2, data[2]);
 	}
 #endif

+ 33 - 1
mpi/tests/sendrecv_bench.c

@@ -24,10 +24,18 @@
 
 #define NX_MAX (512 * 1024 * 1024) // kB
 #define NX_MIN 0
+#ifdef STARPU_QUICK_CHECK
+#define MULT_DEFAULT 4
+#else
 #define MULT_DEFAULT 2
+#endif
 #define INCR_DEFAULT 0
 #define NX_STEP 1.4 // multiplication
+#ifdef STARPU_QUICK_CHECK
+#define LOOPS_DEFAULT 100
+#else
 #define LOOPS_DEFAULT 10000
+#endif
 
 int times_nb_nodes;
 int times_size;
@@ -96,7 +104,7 @@ int main(int argc, char **argv)
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 	starpu_mpi_comm_size(MPI_COMM_WORLD, &worldsize);
 
-	if (worldsize != 2)
+	if (worldsize < 2)
 	{
 		if (rank == 0)
 			FPRINTF(stderr, "We need 2 processes.\n");
@@ -107,6 +115,27 @@ int main(int argc, char **argv)
 		return STARPU_TEST_SKIPPED;
 	}
 
+	if (rank >= 2)
+	{
+		starpu_pause();
+		for (uint64_t s = NX_MIN; s <= NX_MAX; s = _next(s, multiplier, increment))
+		{
+			iterations = _iterations(iterations, s);
+
+			starpu_mpi_barrier(MPI_COMM_WORLD);
+
+			for (uint64_t j = 0; j < iterations; j++)
+			{
+				starpu_mpi_barrier(MPI_COMM_WORLD);
+			}
+		}
+		starpu_resume();
+
+		starpu_mpi_shutdown();
+		if (!mpi_init)
+			MPI_Finalize();
+		return 0;
+	}
 
 	if (rank == 0)
 	{
@@ -185,6 +214,9 @@ int main(int argc, char **argv)
 	}
 
 	starpu_mpi_shutdown();
+	if (!mpi_init)
+		MPI_Finalize();
 
+	free(lats);
 	return 0;
 }

+ 3 - 1
mpi/tests/user_defined_datatype_value.h

@@ -98,7 +98,7 @@ static int value_pack_data(starpu_data_handle_t handle, unsigned node, void **pt
 	*count = sizeof(int);
 	if (ptr != NULL)
 	{
-		*ptr = malloc(*count);
+		*ptr = (void*) starpu_malloc_on_node_flags(node, *count, 0);
 		memcpy(*ptr, value_interface->value, sizeof(int));
 	}
 
@@ -117,6 +117,8 @@ static int value_unpack_data(starpu_data_handle_t handle, unsigned node, void *p
 
 	assert(value_interface->value[0] == 36);
 
+	starpu_free_on_node_flags(node, (uintptr_t)ptr, count, 0);
+
 	return 0;
 }
 

+ 2 - 0
src/common/barrier.h

@@ -19,6 +19,8 @@
 
 #include <starpu_thread.h>
 
+/** @file */
+
 struct _starpu_barrier
 {
 	unsigned count;

+ 2 - 0
src/common/barrier_counter.h

@@ -18,6 +18,8 @@
 #ifndef __BARRIER_COUNTER_H__
 #define __BARRIER_COUNTER_H__
 
+/** @file */
+
 #include <common/utils.h>
 #include <common/barrier.h>
 

+ 100 - 38
src/common/fxt.h

@@ -20,6 +20,8 @@
 #define __FXT_H__
 
 
+/** @file */
+
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE  1 /* or _BSD_SOURCE or _SVID_SOURCE */
 #endif
@@ -311,13 +313,13 @@ static inline unsigned long _starpu_fxt_get_submit_order(void)
 
 long _starpu_gettid(void);
 
-/* Initialize the FxT library. */
+/** Initialize the FxT library. */
 void _starpu_fxt_init_profiling(uint64_t trace_buffer_size);
 
-/* Stop the FxT library, and generate the trace file. */
+/** Stop the FxT library, and generate the trace file. */
 void _starpu_stop_fxt_profiling(void);
 
-/* Generate the trace file. Used when catching signals SIGINT and SIGSEGV */
+/** Generate the trace file. Used when catching signals SIGINT and SIGSEGV */
 void _starpu_fxt_dump_file(void);
 
 #ifdef FUT_NEEDS_COMMIT
@@ -326,15 +328,11 @@ void _starpu_fxt_dump_file(void);
 #define _STARPU_FUT_COMMIT(size) do { } while (0)
 #endif
 
-#ifdef FUT_FULL_PROBE1STR
-#define _STARPU_FUT_FULL_PROBE1STR(KEYMASK, CODE, P1, str) FUT_FULL_PROBE1STR(CODE, P1, str)
+#ifdef FUT_ALWAYS_PROBE1STR
+#define _STARPU_FUT_ALWAYS_PROBE1STR(CODE, P1, str) FUT_RAW_ALWAYS_PROBE1STR(CODE, P1, str)
 #else
-/* Sometimes we need something a little more specific than the wrappers from
- * FxT: these macro permit to put add an event with 3 (or 4) numbers followed
- * by a string. */
-#define _STARPU_FUT_FULL_PROBE1STR(KEYMASK, CODE, P1, str)			\
+#define _STARPU_FUT_ALWAYS_PROBE1STR(CODE, P1, str)	\
 do {									\
-    if(KEYMASK & fut_active) {							\
 	/* No more than FXT_MAX_PARAMS args are allowed */		\
 	/* we add a \0 just in case ... */				\
 	size_t len = STARPU_MIN(strlen(str)+1, (FXT_MAX_PARAMS - 1)*sizeof(unsigned long));\
@@ -347,19 +345,28 @@ do {									\
 	snprintf((char *)futargs, len, "%s", str);			\
 	((char *)futargs)[len - 1] = '\0';				\
 	_STARPU_FUT_COMMIT(total_len);					\
-    }									\
 } while (0);
 #endif
 
-#ifdef FUT_FULL_PROBE2STR
-#define _STARPU_FUT_FULL_PROBE2STR(KEYMASK, CODE, P1, P2, str) FUT_FULL_PROBE2STR(CODE, P1, P2, str)
+#ifdef FUT_FULL_PROBE1STR
+#define _STARPU_FUT_FULL_PROBE1STR(KEYMASK, CODE, P1, str) FUT_FULL_PROBE1STR(CODE, P1, str)
 #else
-/* Sometimes we need something a little more specific than the wrappers from
+/** Sometimes we need something a little more specific than the wrappers from
  * FxT: these macros make it possible to add an event with 3 (or 4) numbers followed
  * by a string. */
-#define _STARPU_FUT_FULL_PROBE2STR(KEYMASK, CODE, P1, P2, str)			\
+#define _STARPU_FUT_FULL_PROBE1STR(KEYMASK, CODE, P1, str)		\
+do {									\
+    if(KEYMASK & fut_active) {						\
+	_STARPU_FUT_ALWAYS_PROBE1STR(CODE, P1, str);		\
+    }									\
+} while (0);
+#endif
+
+#ifdef FUT_ALWAYS_PROBE2STR
+#define _STARPU_FUT_ALWAYS_PROBE2STR(CODE, P1, P2, str) FUT_RAW_ALWAYS_PROBE2STR(CODE, P1, P2, str)
+#else
+#define _STARPU_FUT_ALWAYS_PROBE2STR(CODE, P1, P2, str)			\
 do {									\
-    if(KEYMASK & fut_active) {							\
 	/* No more than FXT_MAX_PARAMS args are allowed */		\
 	/* we add a \0 just in case ... */				\
 	size_t len = STARPU_MIN(strlen(str)+1, (FXT_MAX_PARAMS - 2)*sizeof(unsigned long));\
@@ -373,16 +380,25 @@ do {									\
 	snprintf((char *)futargs, len, "%s", str);			\
 	((char *)futargs)[len - 1] = '\0';				\
 	_STARPU_FUT_COMMIT(total_len);					\
+} while (0);
+#endif
+
+#ifdef FUT_FULL_PROBE2STR
+#define _STARPU_FUT_FULL_PROBE2STR(KEYMASK, CODE, P1, P2, str) FUT_FULL_PROBE2STR(CODE, P1, P2, str)
+#else
+#define _STARPU_FUT_FULL_PROBE2STR(KEYMASK, CODE, P1, P2, str)		\
+do {									\
+    if(KEYMASK & fut_active) {						\
+	_STARPU_FUT_ALWAYS_PROBE2STR(CODE, P1, P2, str);		\
     }									\
 } while (0);
 #endif
 
-#ifdef FUT_FULL_PROBE3STR
-#define _STARPU_FUT_FULL_PROBE3STR(KEYMASK, CODE, P1, P2, P3, str) FUT_FULL_PROBE3STR(CODE, P1, P2, P3, str)
+#ifdef FUT_ALWAYS_PROBE3STR
+#define _STARPU_FUT_ALWAYS_PROBE3STR(CODE, P1, P2, P3, str) FUT_RAW_ALWAYS_PROBE3STR(CODE, P1, P2, P3, str)
 #else
-#define _STARPU_FUT_FULL_PROBE3STR(KEYMASK, CODE, P1, P2, P3, str)			\
+#define _STARPU_FUT_ALWAYS_PROBE3STR(CODE, P1, P2, P3, str)			\
 do {									\
-    if(KEYMASK & fut_active) {							\
 	/* No more than FXT_MAX_PARAMS args are allowed */		\
 	/* we add a \0 just in case ... */				\
 	size_t len = STARPU_MIN(strlen(str)+1, (FXT_MAX_PARAMS - 3)*sizeof(unsigned long));\
@@ -397,16 +413,25 @@ do {									\
 	snprintf((char *)futargs, len, "%s", str);			\
 	((char *)futargs)[len - 1] = '\0';				\
 	_STARPU_FUT_COMMIT(total_len);					\
+} while (0);
+#endif
+
+#ifdef FUT_FULL_PROBE3STR
+#define _STARPU_FUT_FULL_PROBE3STR(KEYMASK, CODE, P1, P2, P3, str) FUT_FULL_PROBE3STR(CODE, P1, P2, P3, str)
+#else
+#define _STARPU_FUT_FULL_PROBE3STR(KEYMASK, CODE, P1, P2, P3, str)		\
+do {									\
+    if(KEYMASK & fut_active) {						\
+	_STARPU_FUT_ALWAYS_PROBE3STR(CODE, P1, P2, P3, str);	\
     }									\
 } while (0);
 #endif
 
-#ifdef FUT_FULL_PROBE4STR
-#define _STARPU_FUT_FULL_PROBE4STR(KEYMASK, CODE, P1, P2, P3, P4, str) FUT_FULL_PROBE4STR(CODE, P1, P2, P3, P4, str)
+#ifdef FUT_ALWAYS_PROBE4STR
+#define _STARPU_FUT_ALWAYS_PROBE4STR(CODE, P1, P2, P3, P4, str) FUT_RAW_ALWAYS_PROBE4STR(CODE, P1, P2, P3, P4, str)
 #else
-#define _STARPU_FUT_FULL_PROBE4STR(KEYMASK, CODE, P1, P2, P3, P4, str)		\
+#define _STARPU_FUT_ALWAYS_PROBE4STR(CODE, P1, P2, P3, P4, str)		\
 do {									\
-    if(KEYMASK & fut_active) {							\
 	/* No more than FXT_MAX_PARAMS args are allowed */		\
 	/* we add a \0 just in case ... */				\
 	size_t len = STARPU_MIN(strlen(str)+1, (FXT_MAX_PARAMS - 4)*sizeof(unsigned long));\
@@ -422,16 +447,25 @@ do {									\
 	snprintf((char *)futargs, len, "%s", str);			\
 	((char *)futargs)[len - 1] = '\0';				\
 	_STARPU_FUT_COMMIT(total_len);					\
+} while (0);
+#endif
+
+#ifdef FUT_FULL_PROBE4STR
+#define _STARPU_FUT_FULL_PROBE4STR(KEYMASK, CODE, P1, P2, P3, P4, str) FUT_FULL_PROBE4STR(CODE, P1, P2, P3, P4, str)
+#else
+#define _STARPU_FUT_FULL_PROBE4STR(KEYMASK, CODE, P1, P2, P3, P4, str)		\
+do {									\
+    if(KEYMASK & fut_active) {						\
+	_STARPU_FUT_ALWAYS_PROBE4STR(CODE, P1, P2, P3, P4, str);	\
     }									\
 } while (0);
 #endif
 
-#ifdef FUT_FULL_PROBE5STR
-#define _STARPU_FUT_FULL_PROBE5STR(KEYMASK, CODE, P1, P2, P3, P4, P5, str) FUT_FULL_PROBE5STR(CODE, P1, P2, P3, P4, P5, str)
+#ifdef FUT_ALWAYS_PROBE5STR
+#define _STARPU_FUT_ALWAYS_PROBE5STR(CODE, P1, P2, P3, P4, P5, str) FUT_RAW_ALWAYS_PROBE5STR(CODE, P1, P2, P3, P4, P5, str)
 #else
-#define _STARPU_FUT_FULL_PROBE5STR(KEYMASK, CODE, P1, P2, P3, P4, P5, str)		\
+#define _STARPU_FUT_ALWAYS_PROBE5STR(CODE, P1, P2, P3, P4, P5, str)		\
 do {									\
-    if(KEYMASK & fut_active) {							\
 	/* No more than FXT_MAX_PARAMS args are allowed */		\
 	/* we add a \0 just in case ... */				\
 	size_t len = STARPU_MIN(strlen(str)+1, (FXT_MAX_PARAMS - 5)*sizeof(unsigned long));\
@@ -448,16 +482,25 @@ do {									\
 	snprintf((char *)futargs, len, "%s", str);			\
 	((char *)futargs)[len - 1] = '\0';				\
 	_STARPU_FUT_COMMIT(total_len);					\
+} while (0);
+#endif
+
+#ifdef FUT_FULL_PROBE5STR
+#define _STARPU_FUT_FULL_PROBE5STR(KEYMASK, CODE, P1, P2, P3, P4, P5, str) FUT_FULL_PROBE5STR(CODE, P1, P2, P3, P4, P5, str)
+#else
+#define _STARPU_FUT_FULL_PROBE5STR(KEYMASK, CODE, P1, P2, P3, P4, P5, str)		\
+do {									\
+    if(KEYMASK & fut_active) {						\
+	_STARPU_FUT_ALWAYS_PROBE5STR(CODE, P1, P2, P3, P4, P5, str);	\
     }									\
 } while (0);
 #endif
 
-#ifdef FUT_FULL_PROBE6STR
-#define _STARPU_FUT_FULL_PROBE6STR(KEYMASK, CODE, P1, P2, P3, P4, P5, P6, str) FUT_FULL_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str)
+#ifdef FUT_ALWAYS_PROBE6STR
+#define _STARPU_FUT_ALWAYS_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str) FUT_RAW_ALWAYS_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str)
 #else
-#define _STARPU_FUT_FULL_PROBE6STR(KEYMASK, CODE, P1, P2, P3, P4, P5, P6, str)	\
+#define _STARPU_FUT_ALWAYS_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str)	\
 do {									\
-    if(KEYMASK & fut_active) {							\
 	/* No more than FXT_MAX_PARAMS args are allowed */		\
 	/* we add a \0 just in case ... */				\
 	size_t len = STARPU_MIN(strlen(str)+1, (FXT_MAX_PARAMS - 6)*sizeof(unsigned long));\
@@ -475,16 +518,25 @@ do {									\
 	snprintf((char *)futargs, len, "%s", str);			\
 	((char *)futargs)[len - 1] = '\0';				\
 	_STARPU_FUT_COMMIT(total_len);					\
+} while (0);
+#endif
+
+#ifdef FUT_FULL_PROBE6STR
+#define _STARPU_FUT_FULL_PROBE6STR(KEYMASK, CODE, P1, P2, P3, P4, P5, P6, str) FUT_FULL_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str)
+#else
+#define _STARPU_FUT_FULL_PROBE6STR(KEYMASK, CODE, P1, P2, P3, P4, P5, P6, str)		\
+do {									\
+    if(KEYMASK & fut_active) {						\
+	_STARPU_FUT_ALWAYS_PROBE6STR(CODE, P1, P2, P3, P4, P5, P6, str);	\
     }									\
 } while (0);
 #endif
 
-#ifdef FUT_FULL_PROBE7STR
-#define _STARPU_FUT_FULL_PROBE7STR(KEYMASK, CODE, P1, P2, P3, P4, P5, P6, P7, str) FUT_FULL_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str)
+#ifdef FUT_ALWAYS_PROBE7STR
+#define _STARPU_FUT_ALWAYS_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str) FUT_RAW_ALWAYS_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str)
 #else
-#define _STARPU_FUT_FULL_PROBE7STR(KEYMASK, CODE, P1, P2, P3, P4, P5, P6, P7, str)	\
+#define _STARPU_FUT_ALWAYS_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str)	\
 do {									\
-    if(KEYMASK & fut_active) {							\
 	/* No more than FXT_MAX_PARAMS args are allowed */		\
 	/* we add a \0 just in case ... */				\
 	size_t len = STARPU_MIN(strlen(str)+1, (FXT_MAX_PARAMS - 7)*sizeof(unsigned long));\
@@ -503,6 +555,16 @@ do {									\
 	snprintf((char *)futargs, len, "%s", str);			\
 	((char *)futargs)[len - 1] = '\0';				\
 	_STARPU_FUT_COMMIT(total_len);					\
+} while (0);
+#endif
+
+#ifdef FUT_FULL_PROBE7STR
+#define _STARPU_FUT_FULL_PROBE7STR(KEYMASK, CODE, P1, P2, P3, P4, P5, P6, P7, str) FUT_FULL_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str)
+#else
+#define _STARPU_FUT_FULL_PROBE7STR(KEYMASK, CODE, P1, P2, P3, P4, P5, P6, P7, str)		\
+do {									\
+    if(KEYMASK & fut_active) {						\
+	_STARPU_FUT_ALWAYS_PROBE7STR(CODE, P1, P2, P3, P4, P5, P6, P7, str);	\
     }									\
 } while (0);
 #endif
@@ -1203,10 +1265,10 @@ do {										\
 	FUT_FULL_PROBE4(_STARPU_FUT_KEYMASK_SCHED, _STARPU_FUT_SCHED_COMPONENT_POP_PRIO, _starpu_gettid(), workerid, ntasks, exp_len);
 
 #define _STARPU_TRACE_SCHED_COMPONENT_NEW(component)		\
-	_STARPU_FUT_FULL_PROBE1STR(_STARPU_FUT_KEYMASK_SCHED, _STARPU_FUT_SCHED_COMPONENT_NEW, component, (component)->name);
+	_STARPU_FUT_ALWAYS_PROBE1STR(_STARPU_FUT_SCHED_COMPONENT_NEW, component, (component)->name);
 
 #define _STARPU_TRACE_SCHED_COMPONENT_CONNECT(parent, child)		\
-	FUT_FULL_PROBE2(_STARPU_FUT_KEYMASK_SCHED, _STARPU_FUT_SCHED_COMPONENT_CONNECT, parent, child);
+	FUT_RAW_ALWAYS_PROBE2(FUT_CODE(_STARPU_FUT_SCHED_COMPONENT_CONNECT,2), parent, child);
 
 #define _STARPU_TRACE_SCHED_COMPONENT_PUSH(from, to, task)		\
 	FUT_FULL_PROBE5(_STARPU_FUT_KEYMASK_SCHED, _STARPU_FUT_SCHED_COMPONENT_PUSH, _starpu_gettid(), from, to, task, (task)->priority);

+ 25 - 16
src/common/graph.h

@@ -18,6 +18,9 @@
 #define __GRAPH_H__
 
 #include <common/list.h>
+
+/** @file */
+
 MULTILIST_CREATE_TYPE(_starpu_graph_node, all)
 MULTILIST_CREATE_TYPE(_starpu_graph_node, top)
 MULTILIST_CREATE_TYPE(_starpu_graph_node, bottom)
@@ -28,24 +31,24 @@ struct _starpu_graph_node
 	starpu_pthread_mutex_t mutex;	/* protects access to the job */
 	struct _starpu_job *job;	/* pointer to the job, if it is still alive, NULL otherwise */
 
-	/*
+	/**
 	 * Fields for graph analysis for scheduling heuristics
 	 */
-	/* Member of list of all jobs without incoming dependency */
+	/** Member of list of all jobs without incoming dependency */
 	struct _starpu_graph_node_multilist_top top;
-	/* Member of list of all jobs without outgoing dependency */
+	/** Member of list of all jobs without outgoing dependency */
 	struct _starpu_graph_node_multilist_bottom bottom;
-	/* Member of list of all jobs */
+	/** Member of list of all jobs */
 	struct _starpu_graph_node_multilist_all all;
-	/* Member of list of dropped jobs */
+	/** Member of list of dropped jobs */
 	struct _starpu_graph_node_multilist_dropped dropped;
 
-	/* set of incoming dependencies */
+	/** set of incoming dependencies */
 	struct _starpu_graph_node **incoming;	/* May contain NULLs for terminated jobs */
 	unsigned *incoming_slot;	/* Index within corresponding outgoing array */
 	unsigned n_incoming;		/* Number of slots used */
 	unsigned alloc_incoming;	/* Size of incoming */
-	/* set of outgoing dependencies */
+	/** set of outgoing dependencies */
 	struct _starpu_graph_node **outgoing;
 	unsigned *outgoing_slot;	/* Index within corresponding incoming array */
 	unsigned n_outgoing;		/* Number of slots used */
@@ -71,27 +74,33 @@ void _starpu_graph_rdlock(void);
 void _starpu_graph_wrunlock(void);
 void _starpu_graph_rdunlock(void);
 
-/* Add a job to the graph, called before any _starpu_graph_add_job_dep call */
+/** Add a job to the graph, called before any _starpu_graph_add_job_dep call */
 void _starpu_graph_add_job(struct _starpu_job *job);
 
-/* Add a dependency between jobs */
+/** Add a dependency between jobs */
 void _starpu_graph_add_job_dep(struct _starpu_job *job, struct _starpu_job *prev_job);
 
-/* Remove a job from the graph */
+/** Remove a job from the graph */
 void _starpu_graph_drop_job(struct _starpu_job *job);
 
-/* Really drop the nodes from the graph now */
+/** Really drop the nodes from the graph now */
 void _starpu_graph_drop_dropped_nodes(void);
 
-/* This make StarPU compute for each task the depth, i.e. the length of the longest path to a task without outgoing dependencies. */
-/* This does not take job duration into account, just the number */
+/**
+ * This makes StarPU compute for each task the depth, i.e. the length
+ * of the longest path to a task without outgoing dependencies.
+ * This does not take job duration into account, just the number
+*/
 void _starpu_graph_compute_depths(void);
 
-/* Compute the descendants of jobs in the graph */
+/** Compute the descendants of jobs in the graph */
 void _starpu_graph_compute_descendants(void);
 
-/* This calls \e func for each node of the task graph, passing also \e data as it */
-/* Apply func on each job of the graph */
+/**
+ * This calls \e func for each node of the task graph, also passing \e
+ * data as is.
+ * Apply func on each job of the graph.
+*/
 void _starpu_graph_foreach(void (*func)(void *data, struct _starpu_graph_node *node), void *data);
 
 #endif /* __GRAPH_H__ */

+ 3 - 1
src/common/knobs.h

@@ -19,11 +19,13 @@
 #ifndef __KNOBS_H__
 #define __KNOBS_H__
 
+/** @file */
+
 #include <stdint.h>
 #include <starpu.h>
 #include <common/config.h>
 
-/* Performance Monitoring */
+/** Performance Monitoring */
 #define STARPU_ASSERT_PERF_COUNTER_SCOPE_DEFINED(t) STARPU_ASSERT( \
 		(t == starpu_perf_counter_scope_global ) \
 		|| (t == starpu_perf_counter_scope_per_worker ) \

+ 0 - 5
src/common/list.h

@@ -20,11 +20,6 @@
 
 #include <starpu_util.h>
 
-/** @file
- * @brief Doubly-linked lists
- */
-
-
 /** @remarks list how-to
  * *********************************************************
  * LIST_TYPE(FOO, content);

+ 2 - 0
src/common/prio_list.h

@@ -14,6 +14,8 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+/** @file */
+
 /*
  * This implements a list with priorities (as an int), by using two stages:
  * - an RB tree stage sorted by priority, whose leaves are...

+ 23 - 21
src/common/rbtree.h

@@ -29,6 +29,8 @@
 #ifndef _KERN_RBTREE_H
 #define _KERN_RBTREE_H
 
+/** @file */
+
 #include <stddef.h>
 #include <assert.h>
 #include <stdint.h>
@@ -44,24 +46,24 @@
 #define STARPU_RBTREE_LEFT     0
 #define STARPU_RBTREE_RIGHT    1
 
-/*
+/**
  * Red-black node.
  */
 struct starpu_rbtree_node;
 
-/*
+/**
  * Red-black tree.
  */
 struct starpu_rbtree;
 
-/*
+/**
  * Static tree initializer.
  */
 #define STARPU_RBTREE_INITIALIZER { NULL }
 
 #include "rbtree_i.h"
 
-/*
+/**
  * Initialize a tree.
  */
 static inline void starpu_rbtree_init(struct starpu_rbtree *tree)
@@ -69,14 +71,14 @@ static inline void starpu_rbtree_init(struct starpu_rbtree *tree)
     tree->root = NULL;
 }
 
-/*
+/**
  * This version assumes that the content of tree was already zeroed
  */
 static inline void starpu_rbtree_init0(struct starpu_rbtree *tree STARPU_ATTRIBUTE_UNUSED)
 {
 }
 
-/*
+/**
  * Initialize a node.
  *
  * A node is in no tree when its parent points to itself.
@@ -90,7 +92,7 @@ static inline void starpu_rbtree_node_init(struct starpu_rbtree_node *node)
     node->children[STARPU_RBTREE_RIGHT] = NULL;
 }
 
-/*
+/**
  * This version assumes that the content of node was already zeroed
  */
 static inline void starpu_rbtree_node_init0(struct starpu_rbtree_node *node)
@@ -102,7 +104,7 @@ static inline void starpu_rbtree_node_init0(struct starpu_rbtree_node *node)
     //node->children[STARPU_RBTREE_RIGHT] = NULL;
 }
 
-/*
+/**
  * Return true if node is in no tree.
  */
 static inline int starpu_rbtree_node_unlinked(const struct starpu_rbtree_node *node)
@@ -110,13 +112,13 @@ static inline int starpu_rbtree_node_unlinked(const struct starpu_rbtree_node *n
     return starpu_rbtree_parent(node) == node;
 }
 
-/*
+/**
  * Macro that evaluates to the address of the structure containing the
  * given node based on the given type and member.
  */
 #define starpu_rbtree_entry(node, type, member) structof(node, type, member)
 
-/*
+/**
  * Return true if tree is empty.
  */
 static inline int starpu_rbtree_empty(const struct starpu_rbtree *tree)
@@ -124,7 +126,7 @@ static inline int starpu_rbtree_empty(const struct starpu_rbtree *tree)
     return tree->root == NULL;
 }
 
-/*
+/**
  * Look up a node in a tree.
  *
  * Note that implementing the lookup algorithm as a macro gives two benefits:
@@ -155,7 +157,7 @@ MACRO_BEGIN                                             \
     ___cur;                                             \
 MACRO_END
 
-/*
+/**
  * Look up a node or one of its nearest nodes in a tree.
  *
  * This macro essentially acts as starpu_rbtree_lookup() but if no entry matched
@@ -191,7 +193,7 @@ MACRO_BEGIN                                                 \
     ___cur;                                                 \
 MACRO_END
 
-/*
+/**
  * Insert a node in a tree.
  *
  * This macro performs a standard lookup to obtain the insertion point of
@@ -227,7 +229,7 @@ MACRO_BEGIN                                                 \
     starpu_rbtree_insert_rebalance(tree, ___prev, ___index, node); \
 MACRO_END
 
-/*
+/**
  * Look up a node/slot pair in a tree.
  *
  * This macro essentially acts as starpu_rbtree_lookup() but in addition to a node,
@@ -263,7 +265,7 @@ MACRO_BEGIN                                         \
     ___cur;                                         \
 MACRO_END
 
-/*
+/**
  * Insert a node at an insertion point in a tree.
  *
  * This macro essentially acts as starpu_rbtree_insert() except that it doesn't
@@ -283,20 +285,20 @@ static inline void starpu_rbtree_insert_slot(struct starpu_rbtree *tree, uintptr
     starpu_rbtree_insert_rebalance(tree, parent, index, node);
 }
 
-/*
+/**
  * Remove a node from a tree.
  *
  * After completion, the node is stale.
  */
 void starpu_rbtree_remove(struct starpu_rbtree *tree, struct starpu_rbtree_node *node);
 
-/*
+/**
  * Return the first node of a tree.
  */
 /* TODO: optimize by maintaining the first node of the tree */
 #define starpu_rbtree_first(tree) starpu_rbtree_firstlast(tree, STARPU_RBTREE_LEFT)
 
-/*
+/**
  * Return the last node of a tree.
  */
 /* TODO: optimize by maintaining the last node of the tree */
@@ -304,17 +306,17 @@ void starpu_rbtree_remove(struct starpu_rbtree *tree, struct starpu_rbtree_node
  * bigger than the biggest node */
 #define starpu_rbtree_last(tree) starpu_rbtree_firstlast(tree, STARPU_RBTREE_RIGHT)
 
-/*
+/**
  * Return the node previous to the given node.
  */
 #define starpu_rbtree_prev(node) starpu_rbtree_walk(node, STARPU_RBTREE_LEFT)
 
-/*
+/**
  * Return the node next to the given node.
  */
 #define starpu_rbtree_next(node) starpu_rbtree_walk(node, STARPU_RBTREE_RIGHT)
 
-/*
+/**
  * Forge a loop to process all nodes of a tree, removing them when visited.
  *
  * This macro can only be used to destroy a tree, so that the resources used

+ 20 - 18
src/common/rbtree_i.h

@@ -28,7 +28,9 @@
 
 #include <assert.h>
 
-/*
+/** @file */
+
+/**
  * Red-black node structure.
  *
  * To reduce the number of branches and the instruction cache footprint,
@@ -47,34 +49,34 @@ struct starpu_rbtree_node {
     struct starpu_rbtree_node *children[2];
 };
 
-/*
+/**
  * Red-black tree structure.
  */
 struct starpu_rbtree {
     struct starpu_rbtree_node *root;
 };
 
-/*
+/**
  * Masks applied on the parent member of a node to obtain either the
  * color or the parent address.
  */
 #define STARPU_RBTREE_COLOR_MASK   ((uintptr_t) 0x1)
 #define STARPU_RBTREE_PARENT_MASK  (~((uintptr_t) 0x3))
 
-/*
+/**
  * Node colors.
  */
 #define STARPU_RBTREE_COLOR_RED    0
 #define STARPU_RBTREE_COLOR_BLACK  1
 
-/*
+/**
  * Masks applied on slots to obtain either the child index or the parent
  * address.
  */
 #define STARPU_RBTREE_SLOT_INDEX_MASK  ((uintptr_t) 0x1)
 #define STARPU_RBTREE_SLOT_PARENT_MASK (~STARPU_RBTREE_SLOT_INDEX_MASK)
 
-/*
+/**
  * Return true if the given pointer is suitably aligned.
  */
 static inline int starpu_rbtree_check_alignment(const struct starpu_rbtree_node *node)
@@ -82,7 +84,7 @@ static inline int starpu_rbtree_check_alignment(const struct starpu_rbtree_node
     return ((uintptr_t)node & (~STARPU_RBTREE_PARENT_MASK)) == 0;
 }
 
-/*
+/**
  * Return true if the given index is a valid child index.
  */
 static inline int starpu_rbtree_check_index(int index)
@@ -90,7 +92,7 @@ static inline int starpu_rbtree_check_index(int index)
     return index == (index & 1);
 }
 
-/*
+/**
  * Convert the result of a comparison into an index in the children array
  * (0 or 1).
  *
@@ -101,7 +103,7 @@ static inline int starpu_rbtree_d2i(int diff)
     return !(diff <= 0);
 }
 
-/*
+/**
  * Return the parent of a node.
  */
 static inline struct starpu_rbtree_node * starpu_rbtree_parent(const struct starpu_rbtree_node *node)
@@ -109,7 +111,7 @@ static inline struct starpu_rbtree_node * starpu_rbtree_parent(const struct star
     return (struct starpu_rbtree_node *)(node->parent & STARPU_RBTREE_PARENT_MASK);
 }
 
-/*
+/**
  * Translate an insertion point into a slot.
  */
 static inline uintptr_t starpu_rbtree_slot(struct starpu_rbtree_node *parent, int index)
@@ -119,7 +121,7 @@ static inline uintptr_t starpu_rbtree_slot(struct starpu_rbtree_node *parent, in
     return (uintptr_t)parent | index;
 }
 
-/*
+/**
  * Extract the parent address from a slot.
  */
 static inline struct starpu_rbtree_node * starpu_rbtree_slot_parent(uintptr_t slot)
@@ -127,7 +129,7 @@ static inline struct starpu_rbtree_node * starpu_rbtree_slot_parent(uintptr_t sl
     return (struct starpu_rbtree_node *)(slot & STARPU_RBTREE_SLOT_PARENT_MASK);
 }
 
-/*
+/**
  * Extract the index from a slot.
  */
 static inline int starpu_rbtree_slot_index(uintptr_t slot)
@@ -135,7 +137,7 @@ static inline int starpu_rbtree_slot_index(uintptr_t slot)
     return slot & STARPU_RBTREE_SLOT_INDEX_MASK;
 }
 
-/*
+/**
  * Insert a node in a tree, rebalancing it if necessary.
  *
  * The index parameter is the index in the children array of the parent where
@@ -146,7 +148,7 @@ static inline int starpu_rbtree_slot_index(uintptr_t slot)
 void starpu_rbtree_insert_rebalance(struct starpu_rbtree *tree, struct starpu_rbtree_node *parent,
                              int index, struct starpu_rbtree_node *node);
 
-/*
+/**
  * Return the previous or next node relative to a location in a tree.
  *
  * The parent and index parameters define the location, which can be empty.
@@ -156,7 +158,7 @@ void starpu_rbtree_insert_rebalance(struct starpu_rbtree *tree, struct starpu_rb
 struct starpu_rbtree_node * starpu_rbtree_nearest(struct starpu_rbtree_node *parent, int index,
                                     int direction);
 
-/*
+/**
  * Return the first or last node of a tree.
  *
  * The direction parameter is either STARPU_RBTREE_LEFT (to obtain the first node)
@@ -164,7 +166,7 @@ struct starpu_rbtree_node * starpu_rbtree_nearest(struct starpu_rbtree_node *par
  */
 struct starpu_rbtree_node * starpu_rbtree_firstlast(const struct starpu_rbtree *tree, int direction);
 
-/*
+/**
  * Return the node next to, or previous to the given node.
  *
  * The direction parameter is either STARPU_RBTREE_LEFT (to obtain the previous node)
@@ -172,13 +174,13 @@ struct starpu_rbtree_node * starpu_rbtree_firstlast(const struct starpu_rbtree *
  */
 struct starpu_rbtree_node * starpu_rbtree_walk(struct starpu_rbtree_node *node, int direction);
 
-/*
+/**
  * Return the left-most deepest node of a tree, which is the starting point of
  * the postorder traversal performed by starpu_rbtree_for_each_remove().
  */
 struct starpu_rbtree_node * starpu_rbtree_postwalk_deepest(const struct starpu_rbtree *tree);
 
-/*
+/**
  * Unlink a node from its tree and return the next (right) node in postorder.
  */
 struct starpu_rbtree_node * starpu_rbtree_postwalk_unlink(struct starpu_rbtree_node *node);

+ 9 - 7
src/common/rwlock.h

@@ -20,7 +20,9 @@
 #include <stdint.h>
 #include <starpu.h>
 
-/* Dummy implementation of a RW-lock using a spinlock. */
+/** @file */
+
+/** Dummy implementation of a RW-lock using a spinlock. */
 struct _starpu_rw_lock
 {
 	uint32_t busy;
@@ -28,24 +30,24 @@ struct _starpu_rw_lock
 	uint16_t readercnt;
 };
 
-/* Initialize the RW-lock */
+/** Initialize the RW-lock */
 void _starpu_init_rw_lock(struct _starpu_rw_lock *lock);
 
-/* Grab the RW-lock in a write mode */
+/** Grab the RW-lock in a write mode */
 void _starpu_take_rw_lock_write(struct _starpu_rw_lock *lock);
 
-/* Grab the RW-lock in a read mode */
+/** Grab the RW-lock in a read mode */
 void _starpu_take_rw_lock_read(struct _starpu_rw_lock *lock);
 
-/* Try to grab the RW-lock in a write mode. Returns 0 in case of success, -1
+/** Try to grab the RW-lock in a write mode. Returns 0 in case of success, -1
  * otherwise. */
 int _starpu_take_rw_lock_write_try(struct _starpu_rw_lock *lock);
 
-/* Try to grab the RW-lock in a read mode. Returns 0 in case of success, -1
+/** Try to grab the RW-lock in a read mode. Returns 0 in case of success, -1
  * otherwise. */
 int _starpu_take_rw_lock_read_try(struct _starpu_rw_lock *lock);
 
-/* Unlock the RW-lock. */
+/** Unlock the RW-lock. */
 void _starpu_release_rw_lock(struct _starpu_rw_lock *lock);
 
 #endif

+ 2 - 0
src/common/starpu_spinlock.h

@@ -16,6 +16,8 @@
 #ifndef __STARPU_SPINLOCK_H__
 #define __STARPU_SPINLOCK_H__
 
+/** @file */
+
 #include <errno.h>
 #include <stdint.h>
 #include <common/config.h>

+ 20 - 1
src/common/thread.c

@@ -90,8 +90,16 @@ int starpu_pthread_create_on(const char *name, starpu_pthread_t *thread, const s
 	void *tsd;
 	_STARPU_CALLOC(tsd, MAX_TSD+1, sizeof(void*));
 
+#ifndef HAVE_SG_ACTOR_SET_STACKSIZE
+	if (attr && attr->stacksize)
+		_starpu_simgrid_set_stack_size(attr->stacksize);
+#endif
 #ifdef HAVE_SG_ACTOR_INIT
 	*thread= sg_actor_init(name, host);
+#ifdef HAVE_SG_ACTOR_SET_STACKSIZE
+	if (attr && attr->stacksize)
+		sg_actor_set_stacksize(*thread, attr->stacksize);
+#endif
 	sg_actor_data_set(*thread, tsd);
 	sg_actor_start(*thread, _starpu_simgrid_thread_start, 2, _args);
 #else
@@ -100,6 +108,10 @@ int starpu_pthread_create_on(const char *name, starpu_pthread_t *thread, const s
 	sg_actor_data_set(*thread, tsd);
 #endif
 #endif
+#ifndef HAVE_SG_ACTOR_SET_STACKSIZE
+	if (attr && attr->stacksize)
+		_starpu_simgrid_set_stack_size(_starpu_default_stack_size);
+#endif
 
 #if SIMGRID_VERSION >= 31500 && SIMGRID_VERSION != 31559
 #  ifdef HAVE_SG_ACTOR_REF
@@ -148,8 +160,9 @@ int starpu_pthread_exit(void *retval STARPU_ATTRIBUTE_UNUSED)
 }
 
 
-int starpu_pthread_attr_init(starpu_pthread_attr_t *attr STARPU_ATTRIBUTE_UNUSED)
+int starpu_pthread_attr_init(starpu_pthread_attr_t *attr)
 {
+	attr->stacksize = 0;
 	return 0;
 }
 
@@ -158,6 +171,12 @@ int starpu_pthread_attr_destroy(starpu_pthread_attr_t *attr STARPU_ATTRIBUTE_UNU
 	return 0;
 }
 
+int starpu_pthread_attr_setstacksize(starpu_pthread_attr_t *attr, size_t stacksize)
+{
+	attr->stacksize = stacksize;
+	return 0;
+}
+
 int starpu_pthread_attr_setdetachstate(starpu_pthread_attr_t *attr STARPU_ATTRIBUTE_UNUSED, int detachstate STARPU_ATTRIBUTE_UNUSED)
 {
 	return 0;

+ 2 - 0
src/common/thread.h

@@ -17,6 +17,8 @@
 #ifndef __COMMON_THREAD_H__
 #define __COMMON_THREAD_H__
 
+/** @file */
+
 #include <common/utils.h>
 
 #if defined(STARPU_LINUX_SYS) && defined(STARPU_HAVE_XCHG)

+ 2 - 0
src/common/timing.h

@@ -17,6 +17,8 @@
 #ifndef TIMING_H
 #define TIMING_H
 
+/** @file */
+
 /*
  * _starpu_timing_init must be called prior to using any of these timing
  * functions.

+ 6 - 4
src/common/utils.h

@@ -17,6 +17,8 @@
 #ifndef __COMMON_UTILS_H__
 #define __COMMON_UTILS_H__
 
+/** @file */
+
 #include <common/config.h>
 #include <starpu.h>
 #include <sys/stat.h>
@@ -162,17 +164,17 @@ int _starpu_fwrunlock(FILE *file);
 char *_starpu_get_home_path(void);
 void _starpu_gethostname(char *hostname, size_t size);
 
-/* If FILE is currently on a comment line, eat it.  */
+/** If FILE is currently on a comment line, eat it.  */
 void _starpu_drop_comments(FILE *f);
 
 struct _starpu_job;
-/* Returns the symbol associated to that job if any. */
+/** Returns the symbol associated to that job if any. */
 const char *_starpu_job_get_model_name(struct _starpu_job *j);
-/* Returns the name associated to that job if any. */
+/** Returns the name associated to that job if any. */
 const char *_starpu_job_get_task_name(struct _starpu_job *j);
 
 struct starpu_codelet;
-/* Returns the symbol associated to that job if any. */
+/** Returns the symbol associated to that job if any. */
 const char *_starpu_codelet_get_model_name(struct starpu_codelet *cl);
 
 int _starpu_check_mutex_deadlock(starpu_pthread_mutex_t *mutex);

+ 2 - 0
src/core/combined_workers.h

@@ -17,6 +17,8 @@
 #ifndef __COMBINED_WORKERS_H__
 #define __COMBINED_WORKERS_H__
 
+/** @file */
+
 #include <starpu.h>
 #include <common/config.h>
 

+ 2 - 0
src/core/debug.h

@@ -17,6 +17,8 @@
 #ifndef __DEBUG_H__
 #define __DEBUG_H__
 
+/** @file */
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <stdarg.h>

+ 4 - 0
src/core/dependencies/cg.c

@@ -286,6 +286,8 @@ void _starpu_notify_cg(void *pred STARPU_ATTRIBUTE_UNUSED, struct _starpu_cg *cg
 	}
 }
 
+/* Called when a job has just started, so we can notify tasks which were waiting
+ * only for this one when they can expect to start */
 /* Note: in case of a tag, it must be already locked */
 void _starpu_notify_job_ready_soon_cg(void *pred STARPU_ATTRIBUTE_UNUSED, struct _starpu_cg *cg, _starpu_notify_job_start_data *data)
 {
@@ -389,6 +391,8 @@ void _starpu_notify_cg_list(void *pred, struct _starpu_cg_list *successors)
 	_starpu_spin_unlock(&successors->lock);
 }
 
+/* Called when a job has just started, so we can notify tasks which were waiting
+ * only for this one when they can expect to start */
 /* Caller just has to promise that the list will not disappear.
  * _starpu_notify_cg_list protects the list itself.
  * No job lock should be held, since we might want to immediately call the callback of an empty task.

+ 11 - 9
src/core/dependencies/cg.h

@@ -17,10 +17,12 @@
 #ifndef __CG_H__
 #define __CG_H__
 
+/** @file */
+
 #include <starpu.h>
 #include <common/config.h>
 
-/* we do not necessarily want to allocate room for 256 dependencies, but we
+/** we do not necessarily want to allocate room for 256 dependencies, but we
    want to handle the few situations where there are a lot of dependencies as
    well */
 #define STARPU_DYNAMIC_DEPS_SIZE	1
@@ -32,30 +34,30 @@
 
 struct _starpu_job;
 
-/* Completion Group list, records both the number of expected notifications
+/** Completion Group list, records both the number of expected notifications
  * before the completion can start, and the list of successors when the
  * completion is finished. */
 struct _starpu_cg_list
 {
-	/* Protects atomicity of the list and the terminated flag */
+	/** Protects atomicity of the list and the terminated flag */
 	struct _starpu_spinlock lock;
 
-	/* Number of notifications to be waited for */
+	/** Number of notifications to be waited for */
 	unsigned ndeps; /* how many deps ? */
 	unsigned ndeps_completed; /* how many deps are done ? */
 #ifdef STARPU_DEBUG
-	/* Array of the notifications, size ndeps */
+	/** Array of the notifications, size ndeps */
 	struct _starpu_cg **deps;
-	/* Which ones have notified, size ndeps */
+	/** Which ones have notified, size ndeps */
 	char *done;
 #endif
 
-	/* Whether the completion is finished.
+	/** Whether the completion is finished.
 	 * For restartable/restarted tasks, only the first iteration is taken into account here.
 	 */
 	unsigned terminated;
 
-	/* List of successors */
+	/** List of successors */
 	unsigned nsuccs; /* how many successors ? */
 #ifdef STARPU_DYNAMIC_DEPS_SIZE
 	unsigned succ_list_size; /* How many allocated items in succ */
@@ -72,7 +74,7 @@ enum _starpu_cg_type
 	STARPU_CG_TASK=(1<<2)
 };
 
-/* Completion Group */
+/** Completion Group */
 struct _starpu_cg
 {
 	unsigned ntags; /* number of tags depended on */

+ 2 - 0
src/core/dependencies/data_concurrency.h

@@ -17,6 +17,8 @@
 #ifndef __DATA_CONCURRENCY_H__
 #define __DATA_CONCURRENCY_H__
 
+/** @file */
+
 #include <core/jobs.h>
 
 void _starpu_job_set_ordered_buffers(struct _starpu_job *j);

+ 8 - 0
src/core/dependencies/dependencies.c

@@ -81,10 +81,18 @@ static void __starpu_job_notify_start(struct _starpu_job *j, double delay)
 	/* TODO: check data notification */
 }
 
+/* Called when the last dependency of this job has just started, so we know that
+ * this job will be released after the given delay. */
 void _starpu_job_notify_ready_soon(struct _starpu_job *j, _starpu_notify_job_start_data *data)
 {
 	struct starpu_task *task = j->task;
+
+	/* Notify that this task will start after the given delay */
 	notify_ready_soon_func(notify_ready_soon_func_data, task, data->delay);
+
+
+	/* Notify some known transitions as well */
+
 	if (!task->cl || task->cl->where == STARPU_NOWHERE || task->where == STARPU_NOWHERE)
 		/* This task will immediately terminate, so transition this */
 		__starpu_job_notify_start(_starpu_get_job_associated_to_task(task), data->delay);

+ 4 - 2
src/core/dependencies/implicit_data_deps.h

@@ -17,6 +17,8 @@
 #ifndef __IMPLICIT_DATA_DEPS_H__
 #define __IMPLICIT_DATA_DEPS_H__
 
+/** @file */
+
 #include <starpu.h>
 #include <common/config.h>
 
@@ -30,10 +32,10 @@ void _starpu_release_task_enforce_sequential_consistency(struct _starpu_job *j);
 void _starpu_add_post_sync_tasks(struct starpu_task *post_sync_task, starpu_data_handle_t handle);
 void _starpu_unlock_post_sync_tasks(starpu_data_handle_t handle);
 
-/* Register a hook to be called when a write is submitted */
+/** Register a hook to be called when a write is submitted */
 void _starpu_implicit_data_deps_write_hook(void (*func)(starpu_data_handle_t));
 
-/* This function blocks until the handle is available in the requested mode */
+/** This function blocks until the handle is available in the requested mode */
 int _starpu_data_wait_until_available(starpu_data_handle_t handle, enum starpu_data_access_mode mode, const char *sync_name);
 
 void _starpu_data_clear_implicit(starpu_data_handle_t handle);

+ 2 - 0
src/core/dependencies/tags.c

@@ -266,6 +266,8 @@ void _starpu_notify_tag_dependencies(struct _starpu_tag *tag)
 	_starpu_spin_unlock(&tag->lock);
 }
 
+/* Called when a job has just started, so we can notify tasks which were waiting
+ * only for this one when they can expect to start */
 void _starpu_notify_job_start_tag_dependencies(struct _starpu_tag *tag, _starpu_notify_job_start_data *data)
 {
 	_starpu_notify_job_start_cg_list(tag, &tag->tag_successors, data);

+ 2 - 0
src/core/dependencies/tags.h

@@ -17,6 +17,8 @@
 #ifndef __TAGS_H__
 #define __TAGS_H__
 
+/** @file */
+
 #include <starpu.h>
 #include <common/config.h>
 #include <common/starpu_spinlock.h>

+ 2 - 0
src/core/dependencies/task_deps.c

@@ -66,6 +66,8 @@ void _starpu_notify_task_dependencies(struct _starpu_job *j)
 	_starpu_notify_cg_list(j, &j->job_successors);
 }
 
+/* Called when a job has just started, so we can notify tasks which were waiting
+ * only for this one when they can expect to start */
 void _starpu_notify_job_start_tasks(struct _starpu_job *j, _starpu_notify_job_start_data *data)
 {
 	_starpu_notify_job_start_cg_list(j, &j->job_successors, data);

+ 3 - 1
src/core/detect_combined_workers.h

@@ -16,7 +16,9 @@
 
 #include <starpu.h>
 
-/* Initialize combined workers */
+/** @file */
+
+/** Initialize combined workers */
 void _starpu_sched_find_worker_combinations(int *workerids, int nworkers);
 
 extern int _starpu_initialized_combined_workers;

+ 10 - 10
src/core/disk.h

@@ -18,6 +18,8 @@
 #ifndef __DISK_H__
 #define __DISK_H__
 
+/** @file */
+
 #define STARPU_DISK_ALL 1
 #define STARPU_DISK_NO_RECLAIM 2
 
@@ -29,13 +31,13 @@ extern "C"
 #include <datawizard/copy_driver.h>
 #include <datawizard/malloc.h>
 
-/* interface to manipulate memory disk */
+/** interface to manipulate memory disk */
 void * _starpu_disk_alloc (unsigned node, size_t size) STARPU_ATTRIBUTE_MALLOC;
 
 void _starpu_disk_free (unsigned node, void *obj, size_t size);
-/* src_node is a disk node, dst_node is for the moment the STARPU_MAIN_RAM */
+/** src_node is a disk node, dst_node is for the moment the STARPU_MAIN_RAM */
 int _starpu_disk_read(unsigned src_node, unsigned dst_node, void *obj, void *buf, off_t offset, size_t size, struct _starpu_async_channel * async_channel);
-/* src_node is for the moment the STARU_MAIN_RAM, dst_node is a disk node */ 
+/** src_node is for the moment the STARPU_MAIN_RAM, dst_node is a disk node */
 int _starpu_disk_write(unsigned src_node, unsigned dst_node, void *obj, void *buf, off_t offset, size_t size, struct _starpu_async_channel * async_channel);
 
 int _starpu_disk_full_read(unsigned src_node, unsigned dst_node, void * obj, void ** ptr, size_t * size, struct _starpu_async_channel * async_channel);
@@ -43,22 +45,20 @@ int _starpu_disk_full_write(unsigned src_node, unsigned dst_node, void * obj, vo
 
 int _starpu_disk_copy(unsigned node_src, void* obj_src, off_t offset_src, unsigned node_dst, void* obj_dst, off_t offset_dst, size_t size, struct _starpu_async_channel * async_channel);
 
-/* force the request to compute */
+/** force the request to compute */
 void starpu_disk_wait_request(struct _starpu_async_channel *async_channel);
-/* return 1 if the request is finished, 0 if not finished */
+/** return 1 if the request is finished, 0 if not finished */
 int starpu_disk_test_request(struct _starpu_async_channel *async_channel);
 void starpu_disk_free_request(struct _starpu_async_channel *async_channel);
 
-/* interface to compare memory disk */
+/** interface to compare memory disk */
 int _starpu_disk_can_copy(unsigned node1, unsigned node2);
 
-/* change disk flag */
-
+/** change disk flag */
 void _starpu_set_disk_flag(unsigned node, int flag);
 int _starpu_get_disk_flag(unsigned node);
 
-/* unregister disk */
-
+/** unregister disk */
 void _starpu_disk_unregister(void);
 
 void _starpu_swap_init(void);

+ 2 - 0
src/core/disk_ops/unistd/disk_unistd_global.h

@@ -18,6 +18,8 @@
 #ifndef __DISK_UNISTD_GLOBAL_H__
 #define __DISK_UNISTD_GLOBAL_H__
 
+/** @file */
+
 #include <fcntl.h>
 #ifdef __linux__
 #include <sys/syscall.h>

+ 2 - 0
src/core/drivers.h

@@ -18,6 +18,8 @@
 #ifndef __DRIVERS_H__
 #define __DRIVERS_H__
 
+/** @file */
+
 struct _starpu_driver_ops
 {
 	int (*init)(struct _starpu_worker *worker);

+ 15 - 13
src/core/errorcheck.h

@@ -17,43 +17,45 @@
 #ifndef __ERRORCHECK_H__
 #define __ERRORCHECK_H__
 
+/** @file */
+
 #include <starpu.h>
 
-/* This type describes in which state a worker may be. */
+/** This type describes in which state a worker may be. */
 enum _starpu_worker_status
 {
-	/* invalid status (for instance if we request the status of some thread
+	/** invalid status (for instance if we request the status of some thread
 	 * that is not controlled by StarPU) */
 	STATUS_INVALID,
-	/* everything that does not fit the other status */
+	/** everything that does not fit the other status */
 	STATUS_UNKNOWN,
-	/* during the initialization */
+	/** during the initialization */
 	STATUS_INITIALIZING,
-	/* during the execution of a codelet */
+	/** during the execution of a codelet */
 	STATUS_EXECUTING,
-	/* during the execution of the callback */
+	/** during the execution of the callback */
 	STATUS_CALLBACK,
-	/* while executing the scheduler code */
+	/** while executing the scheduler code */
 	STATUS_SCHEDULING,
-	/* while waiting for a data transfer */
+	/** while waiting for a data transfer */
 	STATUS_WAITING,
-	/* while sleeping because there is nothing to do, but looking for tasks to do */
+	/** while sleeping because there is nothing to do, but looking for tasks to do */
 	STATUS_SLEEPING_SCHEDULING,
-	/* while sleeping because there is nothing to do, and not even scheduling */
+	/** while sleeping because there is nothing to do, and not even scheduling */
 	STATUS_SLEEPING
 };
 
 struct _starpu_worker;
-/* Specify what the local worker is currently doing (eg. executing a callback).
+/** Specify what the local worker is currently doing (eg. executing a callback).
  * This permits to detect if this is legal to do a blocking call for instance.
  * */
 void _starpu_set_worker_status(struct _starpu_worker *worker, enum _starpu_worker_status st);
 void _starpu_set_local_worker_status(enum _starpu_worker_status st);
 
-/* Indicate what type of operation the worker is currently doing. */
+/** Indicate what type of operation the worker is currently doing. */
 enum _starpu_worker_status _starpu_get_local_worker_status(void);
 
-/* It is forbidden to do blocking calls during some operations such as callback
+/** It is forbidden to do blocking calls during some operations such as callback
  * or during the execution of a task. This function indicates whether it is
  * legal to call a blocking operation in the current context. */
 unsigned _starpu_worker_may_perform_blocking_calls(void);

+ 2 - 0
src/core/idle_hook.h

@@ -17,6 +17,8 @@
 #ifndef __IDLE_HOOK_H__
 #define __IDLE_HOOK_H__
 
+/** @file */
+
 void _starpu_init_idle_hooks(void);
 
 unsigned _starpu_execute_registered_idle_hooks(void);

+ 3 - 1
src/core/jobs.c

@@ -344,6 +344,9 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 				_starpu_spin_unlock(&handle->header_lock);
 		}
 	}
+	/* Check nowhere before releasing the sequential consistency (which may
+	 * unregister the handle and free its switch_cl, and thus task->cl here). */
+	unsigned nowhere = !task->cl || task->cl->where == STARPU_NOWHERE || task->where == STARPU_NOWHERE;
 	/* If this is a continuation, we do not release task dependencies now.
 	 * Task dependencies will be released only when the continued task
 	 * fully completes */
@@ -358,7 +361,6 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 	 * scheduler to process it : the task structure doesn't contain any valuable
 	 * data as it's not linked to an actual worker */
 	/* control task should not execute post_exec_hook */
-	unsigned nowhere = !task->cl || task->cl->where == STARPU_NOWHERE || task->where == STARPU_NOWHERE;
 	if(j->task_size == 1 && !nowhere && !j->internal
 #ifdef STARPU_OPENMP
 	/* If this is a continuation, we do not execute the post_exec_hook. The

+ 0 - 0
src/core/jobs.h


Some files were not shown because too many files changed in this diff