Browse Source

Merge branch 'master' of gitlab.inria.fr:starpu/starpu

Nathalie Furmento 4 years ago
parent
commit
a89348bcbc
100 changed files with 5218 additions and 2689 deletions
  1. 10 1
      .gitlab-ci.yml
  2. 1 0
      AUTHORS
  3. 20 0
      ChangeLog
  4. 3 2
      Makefile.am
  5. 73 308
      configure.ac
  6. 1 1
      contrib/ci.inria.fr/job-0-tarball.sh
  7. 19 5
      contrib/ci.inria.fr/job-1-check.sh
  8. 10 0
      contrib/gitlab/build.sh
  9. 22 0
      contrib/gitlab/simgrid.sh
  10. 0 3
      doc/doxygen/Makefile.am
  11. 0 1
      doc/doxygen/chapters/000_introduction.doxy
  12. 2 2
      doc/doxygen/chapters/101_building.doxy
  13. 1 1
      doc/doxygen/chapters/110_basic_examples.doxy
  14. 21 14
      doc/doxygen/chapters/210_check_list_performance.doxy
  15. 4 4
      doc/doxygen/chapters/301_tasks.doxy
  16. 230 46
      doc/doxygen/chapters/310_data_management.doxy
  17. 51 59
      doc/doxygen/chapters/320_scheduling.doxy
  18. 6 6
      doc/doxygen/chapters/380_offline_performance_tools.doxy
  19. 1 1
      doc/doxygen/chapters/400_python.doxy
  20. 62 3
      doc/doxygen/chapters/410_mpi_support.doxy
  21. 0 97
      doc/doxygen/chapters/430_mic_support.doxy
  22. 1 1
      doc/doxygen/chapters/470_simgrid.doxy
  23. 0 1
      doc/doxygen/chapters/495_interoperability.doxy
  24. 325 304
      doc/doxygen/chapters/501_environment_variables.doxy
  25. 10 33
      doc/doxygen/chapters/510_configure_options.doxy
  26. 0 1
      doc/doxygen/chapters/520_files.doxy
  27. 0 24
      doc/doxygen/chapters/api/versioning.doxy
  28. 1 1
      doc/doxygen/chapters/code/disk_copy.c
  29. 0 0
      doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.eps
  30. 0 0
      doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.pdf
  31. 0 0
      doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.png
  32. 3446 1032
      doc/doxygen/chapters/images/tasks_size_overhead.eps
  33. BIN
      doc/doxygen/chapters/images/tasks_size_overhead.pdf
  34. BIN
      doc/doxygen/chapters/images/tasks_size_overhead.png
  35. 0 1
      doc/doxygen/doxygen-config.cfg.in
  36. 0 1
      doc/doxygen/doxygen.cfg
  37. 1 1
      doc/doxygen/doxygen_filter.sh.in
  38. 0 7
      doc/doxygen/refman.tex
  39. 1 3
      doc/doxygen_dev/Makefile.am
  40. 0 3
      doc/doxygen_dev/doxygen-config.cfg.in
  41. 0 1
      doc/doxygen_dev/doxygen.cfg
  42. 4 0
      doc/doxygen_dev/modules.tex
  43. 0 3
      doc/doxygen_dev/refman.tex
  44. 1 1
      examples/Makefile.am
  45. 6 2
      examples/basic_examples/multiformat_conversion_codelets.c
  46. 55 159
      examples/cg/cg.c
  47. 0 27
      examples/cg/cg.h
  48. 237 51
      examples/cg/cg_kernels.c
  49. 0 1
      examples/cpp/add_vectors.cpp
  50. 0 1
      examples/cpp/add_vectors_cpp11.cpp
  51. 11 8
      examples/cpp/add_vectors_interface.cpp
  52. 0 1
      examples/cpp/incrementer_cpp.cpp
  53. 0 1
      examples/dependency/sequential_consistency.c
  54. 0 1
      examples/dependency/task_end_dep.c
  55. 0 1
      examples/dependency/task_end_dep_add.c
  56. 1 1
      examples/filters/fmultiple_manual.c
  57. 1 1
      examples/filters/fmultiple_submit.c
  58. 1 1
      examples/filters/fmultiple_submit_implicit.c
  59. 1 1
      examples/filters/fplan_notautomatic.c
  60. 2 2
      examples/filters/frecursive.c
  61. 0 6
      examples/heat/heat.sh
  62. 1 1
      examples/interface/complex_codelet.h
  63. 14 6
      examples/interface/complex_interface.c
  64. 0 1
      examples/loader-cross.sh.in
  65. 0 12
      examples/lu/lu.sh
  66. 1 1
      examples/lu/xlu_implicit_pivot.c
  67. 1 1
      examples/lu/xlu_kernels.c
  68. 1 1
      examples/lu/xlu_pivot.c
  69. 2 4
      examples/matvecmult/matvecmult.c
  70. 0 6
      examples/mult/sgemm.sh
  71. 5 1
      examples/native_fortran/nf_vector.f90
  72. 1 1
      examples/perf_steering/perf_knobs_03.c
  73. 1 2
      examples/pi/pi_redux.c
  74. 3 4
      examples/reductions/dot_product.c
  75. 1 1
      examples/reductions/minmax_reduction.c
  76. 2 2
      examples/sched_ctx/parallel_tasks_reuse_handle.c
  77. 0 1
      examples/sched_ctx/parallel_tasks_with_cluster_api.c
  78. 6 1
      examples/sched_ctx/two_cpu_contexts.c
  79. 1 1
      examples/scheduler/heteroprio_test.c
  80. 2 0
      examples/spmv/dw_block_spmv.c
  81. 0 1
      examples/stencil/loader-cross.sh.in
  82. 13 29
      include/fstarpu_mod.f90
  83. 78 80
      include/starpu.h
  84. 14 14
      include/starpu_clusters.h
  85. 9 14
      include/starpu_config.h.in
  86. 12 0
      include/starpu_cuda.h
  87. 53 6
      include/starpu_data.h
  88. 24 44
      include/starpu_data_interfaces.h
  89. 8 0
      include/starpu_hash.h
  90. 0 61
      include/starpu_mic.h
  91. 0 7
      include/starpu_mod.f90
  92. 9 9
      include/starpu_perf_monitoring.h
  93. 109 84
      include/starpu_perf_steering.h
  94. 3 5
      include/starpu_perfmodel.h
  95. 11 0
      include/starpu_sched_component.h
  96. 6 1
      include/starpu_sched_ctx.h
  97. 113 43
      include/starpu_task.h
  98. 3 0
      include/starpu_task_dep.h
  99. 69 7
      include/starpu_task_util.h
  100. 0 0
      include/starpu_util.h

+ 10 - 1
.gitlab-ci.yml

@@ -30,7 +30,7 @@ build:
       when: never  # Prevent pipeline run for push event
     - when: always # Run pipeline for all other cases
 
-deploy:
+check:
   stage: deploy
   script:
     - ./contrib/gitlab/deploy.sh
@@ -38,3 +38,12 @@ deploy:
     - if: '$CI_PIPELINE_SOURCE == "push"'
       when: never  # Prevent pipeline run for push event
     - when: always # Run pipeline for all other cases
+
+simgrid:
+  stage: deploy
+  script:
+    - ./contrib/gitlab/simgrid.sh
+  rules:
+    - if: '$CI_PIPELINE_SOURCE == "push"'
+      when: never  # Prevent pipeline run for push event
+    - when: always # Run pipeline for all other cases

+ 1 - 0
AUTHORS

@@ -17,6 +17,7 @@ Guilbaud Adrien, Inria, <adrien.guilbaud@inria.fr>
 He Kun, Inria, <kun.he@inria.fr>
 Henry Sylvain, Université de Bordeaux, <sylvain.henry@inria.fr>
 Hugo Andra, Université de Bordeaux/Inria, <andra.hugo@inria.fr>
+Jego Antoine, Enseeiht, <antoine.jego@etu.enseeiht.fr>
 Juhoor Mehdi, Université de Bordeaux, <mjuhoor@gmail.com>
 Juven Alexis, Inria, <alexis.juven@inria.fr>
 Keryell-Even Maël, Inria, <mael.keryell@inria.fr>

+ 20 - 0
ChangeLog

@@ -50,9 +50,23 @@ New features:
   * Add starpu_mpi_datatype_node_register and
     starpu_mpi_interface_datatype_node_register which will be needed for
     MPI/NUMA/GPUDirect.
+  * Add peek_data interface method.
+  * Add STARPU_MPI_REDUX
+  * Add starpu_data_query_status2 function.
+
+Small features:
+  * New configure option --with-check-cflags to define flags for C,
+    CXX and Fortran compilers
 
 Small changes:
   * Add a synthetic energy efficiency testcase.
+  * Make reduction methods want the commute flag.
+  * Delete old MIC driver code
+  * Rename
+    - starpu_conf::sched_policy_init to starpu_conf::sched_policy_callback
+    and
+   - starpu_sched_ctx_get_sched_policy_init() to starpu_sched_ctx_get_sched_policy_callback()
+   as the callback function may not only be used for init purposes
 
 StarPU 1.3.8
 ====================================================================
@@ -66,6 +80,12 @@ Small features:
     STARPU_MPI_THREAD_COREID environment variables to bind threads to cores
     instead of hyperthreads.
   * New STARPU_TASK_PROGRESS environment variable to show task progression.
+  * Add STARPU_SIMGRID environment variable guard against native builds.
+  * Add starpu_cuda_get_nvmldev function.
+  * Add starpu_sched_tree_deinitialize function.
+  * Add STARPU_SCHED_SORTED_ABOVE and STARPU_SCHED_SORTED_BELOW environment
+    variables.
+  * Add STARPU_SCHED_SIMPLE_PRE_DECISION.
 
 StarPU 1.3.7
 ====================================================================

+ 3 - 2
Makefile.am

@@ -53,9 +53,11 @@ if STARPU_BUILD_STARPURM
 SUBDIRS += starpurm
 endif
 
+if STARPU_USE_CPU
 if STARPU_BUILD_STARPUPY
 SUBDIRS += starpupy
 endif
+endif
 
 if STARPU_BUILD_SC_HYPERVISOR
 SUBDIRS += sc_hypervisor
@@ -89,7 +91,6 @@ versinclude_HEADERS = 				\
 	include/starpu_opencl.h			\
 	include/starpu_openmp.h			\
 	include/starpu_sink.h			\
-	include/starpu_mic.h			\
 	include/starpu_mpi_ms.h			\
 	include/starpu_expert.h			\
 	include/starpu_profiling.h		\
@@ -160,7 +161,7 @@ else
 txtdir = ${docdir}
 endif
 txt_DATA = AUTHORS COPYING.LGPL README README.dev STARPU-REVISION
-EXTRA_DIST = autogen.sh AUTHORS COPYING.LGPL README README.dev STARPU-VERSION STARPU-REVISION build-aux/svn2cl.xsl mic-configure
+EXTRA_DIST = autogen.sh AUTHORS COPYING.LGPL README README.dev STARPU-VERSION STARPU-REVISION build-aux/svn2cl.xsl
 
 DISTCLEANFILES = STARPU-REVISION
 

+ 73 - 308
configure.ac

@@ -167,9 +167,8 @@ if test x$enable_simgrid = xyes ; then
 	   	CXXFLAGS="$SIMGRID_CFLAGS $CXXFLAGS"
 	   	NVCCFLAGS="$SIMGRID_CFLAGS $NVCCFLAGS"
 	fi
-	if test -n "$SIMGRID_LIBS" ; then
-		LDFLAGS="$SIMGRID_LIBS $LDFLAGS"
-	fi
+	SAVED_LIBS="${LIBS}"
+	LIBS="$SIMGRID_LIBS $LIBS"
 	AC_HAVE_LIBRARY([simgrid], [],
 		[
 			AC_MSG_ERROR(Simgrid support needs simgrid installed)
@@ -207,6 +206,7 @@ if test x$enable_simgrid = xyes ; then
 
 	# Oldies for compatibility with older simgrid
 	AC_CHECK_FUNCS([MSG_get_as_by_name MSG_zone_get_by_name MSG_environment_get_routing_root MSG_host_get_speed])
+	LIBS="${SAVED_LIBS}"
 
 	AC_DEFINE(STARPU_SIMGRID, [1], [Define this to enable simgrid execution])
 	# We won't bind or detect anything
@@ -225,6 +225,7 @@ if test x$enable_simgrid = xyes ; then
 		SIMGRID_LIBS="$SIMGRID_LIBS -lstdc++"
 		LIBS="$LIBS -lstdc++"
 	fi
+	SIMGRID_LDFLAGS="$SIMGRID_LIBS -lsimgrid"
 
 	# Simgrid 3.12 & 3.13 need -std=c++11 to be able to build anything in C++...
 	case \ $CXXFLAGS\  in
@@ -267,13 +268,13 @@ if test x$enable_simgrid = xyes ; then
 		AC_PATH_PROG([SIMGRID_MC], [simgrid-mc], [no], [$simgrid_dir/bin:$PATH])
 		LDFLAGS="$LDFLAGS -Wl,-znorelro -Wl,-znoseparate-code"
 		# libsimgrid needs to be linked from binaries themselves for MC to work
-		STARPU_EXPORTED_LIBS="$STARPU_EXPORTED_LIBS -lsimgrid"
+		STARPU_EXPORTED_LIBS="$STARPU_EXPORTED_LIBS $SIMGRID_LDFLAGS"
 	fi
 fi
 AM_CONDITIONAL(STARPU_SIMGRID_MC, test x$enable_simgrid_mc = xyes)
 AM_CONDITIONAL(STARPU_SIMGRID, test x$enable_simgrid = xyes)
 AC_SUBST(SIMGRID_CFLAGS)
-AC_SUBST(SIMGRID_LIBS)
+AC_SUBST(SIMGRID_LDFLAGS)
 AC_MSG_CHECKING(whether SimGrid is enabled)
 AC_MSG_RESULT($enable_simgrid)
 
@@ -486,6 +487,12 @@ AC_ARG_ENABLE(mpi-master-slave, [AS_HELP_STRING([--enable-mpi-master-slave],
                               [Enable StarPU to run with the master-slave mode])],
               use_mpi_master_slave=$enableval,
               use_mpi_master_slave=no)
+
+# in case it is explicitely required, but mpicc is not available, this is an error
+if test x$use_mpi_master_slave = xyes -a ! -x "$mpicc_path"; then
+   AC_MSG_ERROR([Compiler MPI '$mpicc_path' not valid])
+fi
+
 #We can only build MPI Master Slave if User wants it and MPI compiler are available
 if test x$use_mpi_master_slave = xyes -a x$mpicc_path != xno -a x${mpicxx_path} != xno ; then
     build_mpi_master_slave=yes
@@ -590,8 +597,16 @@ if test x$enable_mpi = xyes ; then
 	    FCLAGS="$FFLAGS -fPIC"
         fi
     fi
+
+    enable_mpi_sync_clocks=no
+    PKG_CHECK_MODULES([MPI_SYNC_CLOCKS],[mpi_sync_clocks],[enable_mpi_sync_clocks=yes],[enable_mpi_sync_clocks=no])
+    if test x$enable_mpi_sync_clocks = xyes ; then
+	MPI_SYNC_CLOCKS_LDFLAGS="$(pkg-config --libs mpi_sync_clocks)"
+	MPI_SYNC_CLOCKS_CFLAGS="$(pkg-config --cflags mpi_sync_clocks)"
+    fi
 fi
 
+AM_CONDITIONAL(STARPU_MPI_SYNC_CLOCKS, test x$enable_mpi_sync_clocks = xyes)
 AM_CONDITIONAL(STARPU_USE_MPI_MPI, test x$build_mpi_lib = xyes)
 AM_CONDITIONAL(STARPU_USE_MPI_NMAD, test x$build_nmad_lib = xyes)
 AM_CONDITIONAL(STARPU_USE_MPI, test x$build_nmad_lib = xyes -o x$build_mpi_lib = xyes)
@@ -640,29 +655,6 @@ AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
 
 ###############################################################################
 #                                                                             #
-#                           MIC device compilation                            #
-#   (Must be done in beginning to change prefix in the whole configuration)   #
-#                                                                             #
-###############################################################################
-
-AC_ARG_ENABLE(mic, [AS_HELP_STRING([--enable-mic],
-	      [use MIC device(s)])], [enable_mic=$enableval], [enable_mic=no])
-AC_ARG_ENABLE(mic-rma, [AS_HELP_STRING([--disable-mic-rma],
-	      [disable MIC RMA transfer])], [enable_mic_rma=$enableval], [enable_mic_rma=yes])
-
-if test x$enable_mic = xyes ; then
-	AC_DEFINE(STARPU_USE_MIC, [1], [MIC workers support is enabled])
-fi
-if test x$enable_mic_rma = xyes ; then
-	AC_DEFINE([STARPU_MIC_USE_RMA], [1], [MIC RMA transfer is enable])
-fi
-
-AM_CONDITIONAL([STARPU_USE_MIC], [test "x$enable_mic" = "xyes"])
-
-###############################################################################
-
-###############################################################################
-#                                                                             #
 #                           NUMA memory nodes                                 #
 #                                                                             #
 ###############################################################################
@@ -1375,7 +1367,7 @@ if test x$enable_cuda = xyes; then
 		      STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lcusparse"])
 	AC_CHECK_DECLS([cusparseSetStream], [], [], [[#include <cusparse.h>]])
 
-	LDFLAGS="${LDFLAGS} -lnvidia-ml"
+	LIBS="${LIBS} -lnvidia-ml"
 	AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
 		[[#include <nvml.h>]],
 		[[nvmlInit();]]
@@ -1400,7 +1392,7 @@ if test x$enable_cuda = xyes; then
 		AC_CHECK_DECLS([nvmlDeviceGetTotalEnergyConsumption], [
 			AC_CHECK_FUNCS([nvmlDeviceGetTotalEnergyConsumption])
 			], [], [[#include <nvml.h>]])
-		AC_DEFINE([HAVE_LIBNVIDIA_ML], [1], [Define to 1 if you have the nvidia-ml library])
+		AC_DEFINE([STARPU_HAVE_LIBNVIDIA_ML], [1], [Define to 1 if you have the nvidia-ml library])
 		STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lnvidia-ml"
 	fi
 	AC_MSG_CHECKING(whether nvidia-ml should be used)
@@ -1703,19 +1695,6 @@ if test x$disable_asynchronous_opencl_copy = xyes ; then
    AC_DEFINE([STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY], [1], [Define to 1 to disable asynchronous copy between CPU and OpenCL devices])
 fi
 
-AC_MSG_CHECKING(whether asynchronous MIC copy should be disabled)
-AC_ARG_ENABLE(asynchronous-mic-copy, [AS_HELP_STRING([--disable-asynchronous-mic-copy],
-			[disable asynchronous copy between CPU and MIC devices])],
-			enable_asynchronous_mic_copy=$enableval, enable_asynchronous_mic_copy=yes)
-disable_asynchronous_mic_copy=no
-if test x$enable_asynchronous_mic_copy = xno ; then
-   disable_asynchronous_mic_copy=yes
-fi
-AC_MSG_RESULT($disable_asynchronous_mic_copy)
-if test x$disable_asynchronous_mic_copy = xyes ; then
-   AC_DEFINE([STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY], [1], [Define to 1 to disable asynchronous copy between CPU and MIC devices])
-fi
-
 AC_MSG_CHECKING(whether asynchronous MPI Master Slave copy should be disabled)
 AC_ARG_ENABLE(asynchronous-mpi-master-slave-copy, [AS_HELP_STRING([--disable-asynchronous-mpi-master-slave-copy],
 			[disable asynchronous copy between MPI Master and MPI Slave devices])],
@@ -1731,224 +1710,6 @@ fi
 
 ###############################################################################
 #                                                                             #
-#                                 MIC settings                                #
-#                                                                             #
-###############################################################################
-
-# ignore these otions, only meant for mic-configure, but also passed here.
-AC_ARG_ENABLE(native-mic)
-AC_ARG_WITH(compiler)
-AC_ARG_WITH(mic-param)
-AC_ARG_WITH(host-param)
-
-AC_MSG_CHECKING(maximum number of MIC devices)
-AC_ARG_ENABLE(maxmicdev, [AS_HELP_STRING([--enable-maxmicdev=<number>],
-			[maximum number of MIC devices])],
-			nmaxmicdev=$enableval,
-            [
-             if test x$enable_mic = xyes; then
-                 nmaxmicdev=4
-             else
-                 nmaxmicdev=0
-             fi
-            ])
-AC_MSG_RESULT($nmaxmicdev)
-
-AC_DEFINE_UNQUOTED(STARPU_MAXMICDEVS, [$nmaxmicdev],
-	[maximum number of MIC devices])
-
-AC_MSG_CHECKING(maximum number of MIC threads)
-AC_ARG_ENABLE(maxmicthreads, [AS_HELP_STRING([--enable-maxmicthreads=<number>],
-			[maximum number of MIC threads])],
-			nmaxmicthreads=$enableval, nmaxmicthreads=120)
-AC_MSG_RESULT($nmaxmicthread)
-
-AC_ARG_WITH(coi-dir,
-	[AS_HELP_STRING([--with-coi-dir=<path>],
-	[specify the MIC's COI installation directory])],
-	[coi_dir="$withval"],
-	[coi_dir=no])
-
-AC_ARG_WITH(coi-include-dir,
-	[AS_HELP_STRING([--with-coi-include-dir=<path>],
-	[specify where the MIC's COI headers are installed])],
-	[coi_include_dir="$withval"],
-	[coi_include_dir=no])
-
-AC_ARG_WITH(coi-lib-dir,
-	[AS_HELP_STRING([--with-coi-lib-dir=<path>],
-	[specify where the MIC's COI libraries are installed])],
-	[coi_lib_dir="$withval"],
-	[coi_lib_dir=no])
-
-AC_DEFUN([STARPU_CHECK_COI_RUNTIME],
-[
-    __coi_dir=$1
-    __coi_include_dir=$2
-    __coi_lib_dir=$3
-
-    if test "$__coi_dir" != "no" -a "$__coi_dir" != "" ; then
-	AC_MSG_CHECKING(whether MIC's COI runtime is available in $__coi_dir)
-    else
-	AC_MSG_CHECKING(whether MIC's COI runtime is available)
-    fi
-    AC_MSG_RESULT()
-
-    if test "$__coi_include_dir" = "no" -a "$__coi_dir" != "no" ; then
-        __coi_include_dir="${__coi_dir}/include"
-    fi
-    if test "$__coi_lib_dir" = "no" -a "$__coi_dir" != "no" ; then
-        __coi_lib_dir="${__coi_dir}/lib"
-    fi
-
-    SAVED_CPPFLAGS="$CPPFLAGS"
-    SAVED_LDFLAGS="$LDFLAGS"
-
-    if test "$__coi_include_dir" != "no" ; then
-        CPPFLAGS="${CPPFLAGS} -I$__coi_include_dir"
-    fi
-    if test "$__coi_lib_dir" != "no" ; then
-	LDFLAGS="${LDFLAGS} -L$__coi_lib_dir ${STARPU_SCIF_LDFLAGS}"
-    fi
-
-    AC_CHECK_HEADER([source/COIEngine_source.h],[have_valid_coi=yes],[have_valid_coi=no])
-
-    if test "$have_valid_coi" = "yes" ; then
-	AC_HAVE_LIBRARY([$4],[have_valid_coi=yes],[have_valid_coi=no])
-
-        if test "$have_valid_coi" = "no" ; then
-            if test "$3" = "no" -a "$__coi_dir" != "no" ; then
-		# ${__coi_dir}/lib didn't work, let's try with lib64
-                __coi_lib_dir="$__coi_dir/lib64"
-		LDFLAGS="${SAVED_LDFLAGS} -L$__coi_lib_dir"
-	        AC_HAVE_LIBRARY([$4],[have_valid_coi=yes],[have_valid_coi=no])
-            fi
-        fi
-    fi
-
-    if test "$have_valid_coi" = "yes" -a "$__coi_include_dir" != "no"; then
-        STARPU_COI_CPPFLAGS="-I$__coi_include_dir"
-    fi
-
-    if test "$have_valid_coi" = "yes" ; then
-        if test "$__coi_lib_dir" != "no"; then
-	    STARPU_COI_LDFLAGS="-L$__coi_lib_dir"
-        fi
-	STARPU_COI_LDFLAGS="${STARPU_COI_LDFLAGS} -l$4"
-    fi
-
-    CPPFLAGS="${SAVED_CPPFLAGS}"
-    LDFLAGS="${SAVED_LDFLAGS}"
-])
-
-AC_ARG_WITH(scif-dir,
-	[AS_HELP_STRING([--with-scif-dir=<path>],
-	[specify the MIC's SCIF installation directory])],
-	[scif_dir="$withval"],
-	[scif_dir=no])
-
-AC_ARG_WITH(scif-include-dir,
-	[AS_HELP_STRING([--with-scif-include-dir=<path>],
-	[specify where the MIC's SCIF headers are installed])],
-	[scif_include_dir="$withval"],
-	[scif_include_dir=no])
-
-AC_ARG_WITH(scif-lib-dir,
-	[AS_HELP_STRING([--with-scif-lib-dir=<path>],
-	[specify where the MIC's SCIF libraries are installed])],
-	[scif_lib_dir="$withval"],
-	[scif_lib_dir=no])
-
-AC_DEFUN([STARPU_CHECK_SCIF_RUNTIME],
-[
-    __scif_dir=$1
-    __scif_include_dir=$2
-    __scif_lib_dir=$3
-
-    if test "$__scif_dir" != "no" -a "$__scif_dir" != "" ; then
-	AC_MSG_CHECKING(whether MIC's SCIF runtime is available in $__scif_dir)
-    else
-	AC_MSG_CHECKING(whether MIC's SCIF runtime is available)
-    fi
-    AC_MSG_RESULT()
-
-    if test "$__scif_include_dir" = "no" -a "$__scif_dir" != "no" ; then
-        __scif_include_dir="${__scif_dir}/include"
-    fi
-    if test "$__scif_lib_dir" = "no" -a "$__scif_dir" != "no" ; then
-        __scif_lib_dir="${__scif_dir}/lib"
-    fi
-
-    SAVED_CPPFLAGS="$CPPFLAGS"
-    SAVED_LDFLAGS="$LDFLAGS"
-
-    if test "$__scif_include_dir" != "no" ; then
-        CPPFLAGS="${CPPFLAGS} -I$__scif_include_dir"
-    fi
-    if test "$__scif_lib_dir" != "no" ; then
-	LDFLAGS="${LDFLAGS} -L$__scif_lib_dir"
-    fi
-
-#    AC_CHECK_HEADER([source/SCIFEngine_source.h],[have_valid_scif=yes],[have_valid_scif=no])
-
-#    if test "$have_valid_scif" = "yes" ; then
-	AC_HAVE_LIBRARY([scif],[have_valid_scif=yes],[have_valid_scif=no])
-
-        if test "$have_valid_scif" = "no" ; then
-            if test "$3" = "no" -a "$__scif_dir" != "no" ; then
-		# ${__scif_dir}/lib didn't work, let's try with lib64
-                __scif_lib_dir="$__scif_dir/lib64"
-		LDFLAGS="${SAVED_LDFLAGS} -L$__scif_lib_dir"
-	        AC_HAVE_LIBRARY([scif],[have_valid_scif=yes],[have_valid_scif=no])
-            fi
-        fi
-#    fi
-
-    if test "$have_valid_scif" = "yes" -a "$__scif_include_dir" != "no"; then
-        STARPU_SCIF_CPPFLAGS="-I$__scif_include_dir"
-    fi
-
-    if test "$have_valid_scif" = "yes" ; then
-        if test "$__scif_lib_dir" != "no"; then
-	    STARPU_SCIF_LDFLAGS="-L$__scif_lib_dir"
-        fi
-	STARPU_SCIF_LDFLAGS="${STARPU_SCIF_LDFLAGS} -lscif"
-    fi
-
-    CPPFLAGS="${SAVED_CPPFLAGS}"
-    LDFLAGS="${SAVED_LDFLAGS}"
-])
-
-if test x$enable_mic = xyes ; then
-
-    STARPU_CHECK_SCIF_RUNTIME($scif_dir, $scif_include_dir, $scif_lib_dir)
-    if test "$have_valid_scif" = "no" ; then
-	AC_MSG_ERROR([cannot find MIC's SCIF runtime])
-    fi
-
-    case $host_vendor in
-	*1om)
-	    # We are cross-compiling.
-	    # Let's have a look for the device runtime which lib has a different name
-	    STARPU_CHECK_COI_RUNTIME($coi_dir, $coi_include_dir, $coi_lib_dir, coi_device)
-	    ;;
-	*)
-	    STARPU_CHECK_COI_RUNTIME($coi_dir, $coi_include_dir, $coi_lib_dir, coi_host)
-	    ;;
-    esac
-
-    if test "$have_valid_coi" = "no" ; then
-	AC_MSG_ERROR([cannot find MIC's COI runtime])
-    fi
-
-    AC_SUBST(STARPU_COI_CPPFLAGS)
-    AC_SUBST(STARPU_COI_LDFLAGS)
-    AC_SUBST(STARPU_SCIF_CPPFLAGS)
-    AC_SUBST(STARPU_SCIF_LDFLAGS)
-fi
-
-###############################################################################
-#                                                                             #
 #                   Debug and Performance analysis tools                      #
 #                                                                             #
 ###############################################################################
@@ -2030,11 +1791,11 @@ if test x$enable_fast = xyes; then
 	AC_DEFINE(STARPU_NO_ASSERT, [1], [disable assertions])
 else
         # fortify gets really enabled only with optimizations, avoid enabling it
-        # when they optimizations are not enabled, because with some glibc it
+        # when optimizations are not enabled, because with some glibc it
         # spews a lot of warnings.
 	if test x$enable_debug != xyes; then
 		if test x$GCC = xyes; then
-			CPPFLAGS="$CPPFLAGS -D_FORTIFY_SOURCE=1"
+			CPPFLAGS="-D_FORTIFY_SOURCE=1 $CPPFLAGS"
 		fi
 	fi
 fi
@@ -2261,7 +2022,6 @@ fi
 AM_CONDITIONAL([STARPU_USE_AYUDAME1], [test "x$enable_ayudame1" = "xyes"])
 AM_CONDITIONAL([STARPU_USE_AYUDAME2], [test "x$enable_ayudame2" = "xyes"])
 
-
 STARPU_FXT_EVENT_DEFINES="`grep -E '#define\s+_STARPU_(MPI_)?FUT_' ${srcdir}/src/common/fxt.h ${srcdir}/mpi/src/starpu_mpi_fxt.h | grep 0x | grep -v 0x1 | cut -d : -f 2`"
 AC_SUBST([STARPU_FXT_EVENT_DEFINES])
 
@@ -2295,13 +2055,9 @@ AC_ARG_ENABLE(maxnodes, [AS_HELP_STRING([--enable-maxnodes=<nnodes>],
 if test x$maxnodes = x0 ; then
 	if test x$enable_simgrid = xyes ; then
 		# We need the room for the virtual CUDA/OpenCL devices
-		nodes=`expr 4 + $nmaxcudadev + $nmaxopencldev + $nmaxmicdev + 1 + $nmaxmpidev`
-		if test $nodes -gt 32 ; then
-			nodes=32
-		fi
+		nodes=`expr 4 + $nmaxcudadev + $nmaxopencldev + 1 + $nmaxmpidev`
 	else
 		# We have one memory node shared by all CPU workers, one node per GPU
-		# and per MIC device
 		# we add nodes to use 2 memory disks
 		nodes=`expr $nmaxnumanodes + 2`
 		if test x$enable_cuda = xyes ; then
@@ -2314,13 +2070,6 @@ if test x$maxnodes = x0 ; then
 			# odd number.
 			nodes=`expr $nodes + $nmaxopencldev`
 		fi
-		if test x$enable_mic = xyes ; then
-			nodes=`expr $nodes + $nmaxmicdev`
-		fi
-		if test x$enable_rcce = xyes ; then
-			# Only 1 memory node for the shared memory.
-			nodes=`expr $nodes + 1`
-		fi
 
 		#nmaxmpidev = 0 if mpi master-slave is disabled
 		nodes=`expr $nodes + $nmaxmpidev`
@@ -2334,8 +2083,7 @@ if test x$maxnodes = x0 ; then
 	done
 fi
 if test $maxnodes -gt 32 ; then
-	# FIXME: at least use uint64 so we can have 64 memory nodes
-	AC_MSG_ERROR([selected number of nodes ($maxnodes) can not be greater than 32])
+	AC_MSG_WARN([Note: the wt_mask feature only supports 32 memory nodes])
 fi
 
 AC_MSG_CHECKING(maximum number of memory nodes)
@@ -2391,16 +2139,21 @@ if test x$enable_simgrid != xyes; then
 	if test x$enable_opencl != xyes; then
 		nmaxopencldev=0
 	fi
-	if test x$enable_mic != xyes; then
-		nmaxmicthreads=0
-	fi
     #By default, if we cannot build mpi master-slave nmaxmpidev is set to zero.
     #But with the multiplication with maxcpus, we need to put it to one.
     if test x$build_mpi_master_slave != xyes; then
         nmaxmpidev=1
     fi
 fi
-nmaxworkers=`expr 16 \* \( \( \( $nmaxmpidev \* $maxcpus \) + $nmaxcudadev + $nmaxopencldev + $nmaxmicthreads + 15 \) / 16 \) `
+if test $maxcpus == 0
+then
+	nmaxworkers=`expr 16 \* \( \( \( $nmaxmpidev \* 64 \) + $nmaxcudadev + $nmaxopencldev + 15 \) / 16 \) `
+elif test $nmaxmpidev == 0
+then
+	nmaxworkers=`expr 16 \* \( \( $maxcpus + $nmaxcudadev + $nmaxopencldev + 15 \) / 16 \) `
+else
+	nmaxworkers=`expr 16 \* \( \( \( $nmaxmpidev \* $maxcpus \) + $nmaxcudadev + $nmaxopencldev + 15 \) / 16 \) `
+fi
 AC_MSG_CHECKING(Maximum number of workers)
 AC_MSG_RESULT($nmaxworkers)
 AC_DEFINE_UNQUOTED(STARPU_NMAXWORKERS, [$nmaxworkers], [Maximum number of workers])
@@ -2411,16 +2164,13 @@ fi
 if test $nmaxdevs -lt $nmaxopencldev; then
 	nmaxdevs=$nmaxopencldev
 fi
-if test $nmaxdevs -lt $nmaxmicdev; then
-	nmaxdevs=$nmaxmicdev
-fi
 if test $nmaxdevs -lt $nmaxmpidev; then
 	nmaxdevs=$nmaxmpidev
 fi
 AC_DEFINE_UNQUOTED(STARPU_NMAXDEVS, [$nmaxdevs], [Maximum number of device per device arch])
 
 # Computes the maximun number of combined worker
-nmaxcombinedworkers=`expr $maxcpus + $nmaxmicthreads`
+nmaxcombinedworkers=$maxcpus
 AC_MSG_CHECKING(Maximum number of workers combinations)
 AC_MSG_RESULT($nmaxcombinedworkers)
 AC_DEFINE_UNQUOTED(STARPU_NMAX_COMBINEDWORKERS,
@@ -2464,12 +2214,12 @@ AC_DEFINE_UNQUOTED(STARPU_HISTORYMAXERROR, [$calibration_heuristic], [calibratio
 #                                                                             #
 ###############################################################################
 
-AM_CONDITIONAL([STARPU_USE_MP], [test "x$enable_mic" = "xyes" -o "x$build_mpi_master_slave" = "xyes" -o "x$enable_rcce" = "xyes"])
+AM_CONDITIONAL([STARPU_USE_MP], [test "x$build_mpi_master_slave" = "xyes"])
 
 AC_ARG_ENABLE([export-dynamic], [AS_HELP_STRING([--disable-export-dynamic],
 			  [Prevent the linker from adding all symbols to the dynamic symbol table])], [], [])
 
-if test x$enable_mic = xyes -o x$build_mpi_master_slave = xyes -o x$enable_rcce = xyes ; then
+if test x$build_mpi_master_slave = xyes ; then
 	AC_DEFINE(STARPU_USE_MP, [1], [Message-passing SINKs support
 		  is enabled])
 
@@ -2532,6 +2282,16 @@ AC_SUBST(GLOBAL_AM_FCFLAGS)
 pkglibdir="\${libdir}/$PACKAGE"
 AC_SUBST([pkglibdir])
 
+AC_ARG_WITH(check-flags, [AS_HELP_STRING([--with-check-flags],
+			 [Specify flags for C and Fortran compilers])],
+			 check_flags=$withval, check_flags="")
+if test "x$check_flags" != "x" ; then
+	for xflag in $check_flags
+	do
+		IS_SUPPORTED_FLAG($xflag)
+	done
+fi
+
 ###############################################################################
 #                                                                             #
 #                               Fortran                                       #
@@ -2691,7 +2451,7 @@ if test x$starpu_windows = xyes ; then
    enable_openmp=no
    openmp_msg="disabled on windows"
 fi
-if test "x$use_mpi_master_slave" = "xyes" ; then
+if test "x$build_mpi_master_slave" = "xyes" ; then
    enable_openmp=no
    openmp_msg="incompatibility with MPI master slave support"
 fi
@@ -2737,7 +2497,7 @@ if test "x$enable_socl" = "xyes" -a "$have_valid_opencl" = "no" ; then
 fi
 
 # MPI Master Slave and SOCL are not compatible
-if test "x$use_mpi_master_slave" = "xyes" ; then
+if test "x$build_mpi_master_slave" = "xyes" ; then
    if test "x$enable_socl" = "xyes" ; then
       AC_MSG_ERROR([MPI Master-Slave and SOCL can not be used at the same time !])
    fi
@@ -3440,11 +3200,25 @@ then
 		AC_MSG_ERROR([python3 missing, cannot build StarPU python interface])
 	fi
 	AC_SUBST(PYTHON)
+	PYTHON_INCLUDE_DIRS="`$PYTHON -c "from sysconfig import get_paths as gp; print(gp()@<:@'include'@:>@)"`"
+	SAVED_CPPFLAGS="${CPPFLAGS}"
+	CPPFLAGS="$CPPFLAGS -I$PYTHON_INCLUDE_DIRS"
+	AC_CHECK_HEADERS([Python.h],[have_python_h=yes],[have_python_h=no])
+	if test "$have_python_h" = "no" ; then
+		AC_MSG_ERROR([Python.h missing, cannot build StarPU python interface (consider installing python-dev)])
+	fi
+	CPPFLAGS=${SAVED_CPPFLAGS}
 	AC_MSG_CHECKING(for python3 module joblib)
 	AC_PYTHON_MODULE(joblib,[joblib_avail=yes],[joblib_avail=no])
 	AC_MSG_RESULT($joblib_avail)
 	if test "$joblib_avail" = "no" ; then
-		AC_MSG_ERROR([python3 module joblib missing, cannot build StarPU python interface])
+		AC_MSG_ERROR([python3 module joblib missing, cannot build StarPU python interface (consider running 'pip3 install joblib')])
+	fi
+	AC_MSG_CHECKING(for python3 module cloudpickle)
+	AC_PYTHON_MODULE(cloudpickle,[cloudpickle_avail=yes],[cloudpickle_avail=no])
+	AC_MSG_RESULT($cloudpickle_avail)
+	if test "$cloudpickle_avail" = "no" ; then
+		AC_MSG_ERROR([python3 module cloudpickle missing, cannot build StarPU python interface (consider running 'pip3 install cloudpickle')])
 	fi
 	AC_MSG_CHECKING(for python3 module numpy)
 	AC_PYTHON_MODULE(numpy,[numpy_avail=yes],[numpy_avail=no])
@@ -3551,7 +3325,7 @@ STARPU_H_CPPFLAGS="$HWLOC_CFLAGS $STARPU_CUDA_CPPFLAGS $STARPU_OPENCL_CPPFLAGS $
 AC_SUBST([STARPU_H_CPPFLAGS])
 
 # these are the flags needed for linking libstarpu (and thus also for static linking)
-LIBSTARPU_LDFLAGS="$STARPU_OPENCL_LDFLAGS $STARPU_CUDA_LDFLAGS $HWLOC_LIBS $FXT_LDFLAGS $FXT_LIBS $PAPI_LIBS $STARPU_COI_LDFLAGS $STARPU_SCIF_LDFLAGS $STARPU_RCCE_LDFLAGS $STARPU_LEVELDB_LDFLAGS $STARPU_GLPK_LDFLAGS $STARPU_LEVELDB_LDFLAGS $SIMGRID_LIBS $STARPU_BLAS_LDFLAGS $STARPU_OMP_LDFLAGS $DGELS_LIBS"
+LIBSTARPU_LDFLAGS="$STARPU_OPENCL_LDFLAGS $STARPU_CUDA_LDFLAGS $HWLOC_LIBS $FXT_LDFLAGS $FXT_LIBS $PAPI_LIBS $STARPU_LEVELDB_LDFLAGS $STARPU_GLPK_LDFLAGS $STARPU_LEVELDB_LDFLAGS $SIMGRID_LDFLAGS $STARPU_BLAS_LDFLAGS $STARPU_OMP_LDFLAGS $DGELS_LIBS"
 AC_SUBST([LIBSTARPU_LDFLAGS])
 
 # these are the flags needed for linking against libstarpu (because starpu.h makes its includer use pthread_*, simgrid, etc.)
@@ -3570,10 +3344,7 @@ AC_SUBST([LIBSTARPU_LINK])
 # File configuration
 AC_CONFIG_COMMANDS([executable-scripts], [
   chmod +x tests/regression/regression.sh
-  chmod +x tests/loader-cross.sh
   chmod +x tests/model-checking/starpu-mc.sh
-  chmod +x examples/loader-cross.sh
-  chmod +x examples/stencil/loader-cross.sh
   chmod +x tools/starpu_env
   chmod +x tools/starpu_codelet_profile
   chmod +x tools/starpu_codelet_histo_profile
@@ -3716,11 +3487,8 @@ AC_OUTPUT([
 	examples/Makefile
 	examples/stencil/Makefile
 	tests/Makefile
-	tests/loader-cross.sh
 	tests/model-checking/Makefile
 	tests/model-checking/starpu-mc.sh
-	examples/loader-cross.sh
-	examples/stencil/loader-cross.sh
 	mpi/Makefile
 	mpi/src/Makefile
 	mpi/tests/Makefile
@@ -3751,19 +3519,16 @@ AC_MSG_NOTICE([
 	CPUs   enabled: $enable_cpu
 	CUDA   enabled: $enable_cuda
 	OpenCL enabled: $enable_opencl
-	MIC    enabled: $enable_mic
 
 	Compile-time limits
 	(change these with --enable-maxcpus, --enable-maxcudadev,
-	--enable-maxopencldev, --enable-maxmicdev, --enable-maxnodes,
-        --enable-maxbuffers)
+	--enable-maxopencldev, --enable-maxnodes, --enable-maxbuffers)
         (Note these numbers do not represent the number of detected
 	devices, but the maximum number of devices StarPU can manage)
 
 	Maximum number of CPUs:                     $maxcpus
 	Maximum number of CUDA devices:             $nmaxcudadev
 	Maximum number of OpenCL devices:           $nmaxopencldev
-	Maximum number of MIC threads:              $nmaxmicthreads
 	Maximum number of MPI master-slave devices: $nmaxmpidev
 	Maximum number of memory nodes:             $maxnodes
 	Maximum number of task buffers:             $nmaxbuffers
@@ -3784,18 +3549,18 @@ AC_MSG_NOTICE([
 	       StarPU MPI enabled:                            $build_mpi_lib
 	       StarPU MPI(nmad) enabled:                      $build_nmad_lib
 	       MPI test suite:                                $running_mpi_check
-	       Master-Slave MPI enabled:                      $use_mpi_master_slave
+	       Master-Slave MPI enabled:                      $build_mpi_master_slave
 	       FFT Support:                                   $fft_support
 	       Resource Management enabled:                   $starpurm_support
 	       Python Interface enabled:                      $starpupy_support
 	       OpenMP runtime support enabled:                $enable_openmp
 	       Cluster support enabled:                       $enable_cluster
 	       SOCL enabled:                                  $build_socl
-               SOCL test suite:                               $run_socl_check
-               Scheduler Hypervisor:                          $build_sc_hypervisor
-               simgrid enabled:                               $enable_simgrid
-               ayudame enabled:                               $ayu_msg
-               HDF5 enabled:                                  $enable_hdf5
+	       SOCL test suite:                               $run_socl_check
+	       Scheduler Hypervisor:                          $build_sc_hypervisor
+	       simgrid enabled:                               $enable_simgrid
+	       ayudame enabled:                               $ayu_msg
+	       HDF5 enabled:                                  $enable_hdf5
 	       Native fortran support:                        $enable_build_fortran
 	       Native MPI fortran support:                    $use_mpi_fort
 	       Support for multiple linear regression models: $support_mlr

+ 1 - 1
contrib/ci.inria.fr/job-0-tarball.sh

@@ -22,7 +22,7 @@ export LD_LIBRARY_PATH=/home/ci/usr/local/lib:$LD_LIBRARY_PATH
 if test -d build ; then chmod -R 777 build && rm -rf build ; fi
 mkdir build && cd build
 ../configure --enable-build-doc-pdf
-make V=1
+make -j4
 make dist
 cp *gz ..
 cp doc/doxygen/starpu.pdf ..

+ 19 - 5
contrib/ci.inria.fr/job-1-check.sh

@@ -37,7 +37,11 @@ basename=$(basename $tarball .tar.gz)
 export STARPU_HOME=$PWD/$basename/home
 mkdir -p $basename
 cd $basename
-env > $PWD/env
+(
+    echo "oldPWD=\${PWD}"
+    env|grep -v LS_COLORS | grep '^[A-Z]'|grep -v BASH_FUNC | grep '=' | sed 's/=/=\"/'| sed 's/$/\"/' | sed 's/^/export /'
+    echo "cd \$oldPWD"
+) > ${PWD}/env
 
 test -d $basename && chmod -R u+rwX $basename && rm -rf $basename
 tar xfz ../$tarball
@@ -63,7 +67,17 @@ fi
 
 export CC=gcc
 
-CONFIGURE_OPTIONS="--enable-debug --enable-verbose --enable-mpi-check --disable-build-doc"
+set +e
+mpiexec -oversubscribe pwd 2>/dev/null
+ret=$?
+set -e
+ARGS=""
+if test "$ret" = "0"
+then
+    ARGS="--with-mpiexec-args=-oversubscribe"
+fi
+
+CONFIGURE_OPTIONS="--enable-debug --enable-verbose --enable-mpi-check --disable-build-doc $ARGS"
 CONFIGURE_CHECK=""
 day=$(date +%u)
 if test $day -le 5
@@ -72,12 +86,12 @@ then
 #else
     # we do a normal check, a long check takes too long on VM nodes
 fi
-../configure $CONFIGURE_OPTIONS $CONFIGURE_CHECK  $STARPU_CONFIGURE_OPTIONS
+../configure $CONFIGURE_OPTIONS $CONFIGURE_CHECK  $STARPU_CONFIGURE_OPTIONS $STARPU_USER_CONFIGURE_OPTIONS
 
 export STARPU_TIMEOUT_ENV=1800
 export MPIEXEC_TIMEOUT=1800
-make
-#make check
+
+make -j4
 (make -k check || true) 2>&1 | tee  ../check_$$
 make showsuite
 

+ 10 - 0
contrib/gitlab/build.sh

@@ -17,5 +17,15 @@
 
 ./contrib/ci.inria.fr/job-0-tarball.sh
 
+tarball=$(ls -tr starpu-*.tar.gz | tail -1)
+
+if test -z "$tarball"
+then
+    echo Error. No tar.gz file
+    ls
+    pwd
+    exit 1
+fi
+
 
 

+ 22 - 0
contrib/gitlab/simgrid.sh

@@ -0,0 +1,22 @@
+#!/bin/sh
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2021       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+STARPU_USER_CONFIGURE_OPTIONS="--enable-simgrid --disable-mpi --disable-mpi-check" ./contrib/ci.inria.fr/job-1-check.sh
+
+
+
+

+ 0 - 3
doc/doxygen/Makefile.am

@@ -81,7 +81,6 @@ chapters =	\
 	chapters/410_mpi_support.doxy		\
 	chapters/415_fault_tolerance.doxy	\
 	chapters/420_fft_support.doxy		\
-	chapters/430_mic_support.doxy		\
 	chapters/450_native_fortran_support.doxy		\
 	chapters/460_socl_opencl_extensions.doxy		\
 	chapters/470_simgrid.doxy		\
@@ -106,7 +105,6 @@ chapters =	\
 	chapters/code/disk_compute.c \
 	chapters/code/nf_initexit.f90 \
 	chapters/api/fft_support.doxy \
-	chapters/api/versioning.doxy \
 	chapters/api/threads.doxy
 
 images = 	\
@@ -214,7 +212,6 @@ dox_inputs = $(DOX_CONFIG) 				\
 	$(top_srcdir)/include/starpu_fxt.h		\
 	$(top_srcdir)/include/starpu_hash.h		\
 	$(top_srcdir)/include/starpu_helper.h		\
-	$(top_srcdir)/include/starpu_mic.h		\
 	$(top_srcdir)/include/starpu_mpi_ms.h		\
 	$(top_srcdir)/include/starpu_mod.f90		\
 	$(top_srcdir)/include/starpu_opencl.h		\

+ 0 - 1
doc/doxygen/chapters/000_introduction.doxy

@@ -295,7 +295,6 @@ The documentation chapters include
 <li> \ref MPISupport
 <li> \ref FaultTolerance
 <li> \ref FFTSupport
-<li> \ref MICSupport
 <li> \ref NativeFortranSupport
 <li> \ref SOCLOpenclExtensions
 <li> \ref SimGridSupport

+ 2 - 2
doc/doxygen/chapters/101_building.doxy

@@ -46,7 +46,7 @@ recommended.  It allows for topology aware scheduling, which improves
 performance. <c>hwloc</c> is available in major free operating system
 distributions, and for most operating systems. Make sure to not only install
 a <c>hwloc</c> or <c>libhwloc</c> package, but also <c>hwloc-devel</c> or
-<c>libhwloc-dev</c> so as to have hwloc headers etc.
+<c>libhwloc-dev</c> so as to have \c hwloc headers etc.
 
 If <c>libhwloc</c> is installed in a standard
 location, no option is required, it will be detected automatically,
@@ -520,7 +520,7 @@ It can also be convenient to try simulated benchmarks, if you want to give a try
 at CPU-GPU scheduling without actually having a GPU at hand. This can be done by
 using the SimGrid version of StarPU: first install the SimGrid simulator from
 http://simgrid.gforge.inria.fr/ (we tested with SimGrid from 3.11 to 3.16, and
-3.18 to 3.25. SimGrid versions 3.25 and above need to be configured with -Denable_msg=ON.
+3.18 to 3.25. SimGrid versions 3.25 and above need to be configured with \c -Denable_msg=ON.
 Other versions may have compatibility issues, 3.17 notably does
 not build at all. MPI simulation does not work with version 3.22).
 Then configure StarPU with \ref enable-simgrid

+ 1 - 1
doc/doxygen/chapters/110_basic_examples.doxy

@@ -214,7 +214,7 @@ Once a task has been executed, an optional callback function
 starpu_task::callback_func is called when defined.
 While the computational kernel could be offloaded on various architectures, the
 callback function is always executed on a CPU. The pointer
-starpu_task::callback_arg is passed as an argument of the callback
+starpu_task::callback_arg is passed as an argument to the callback
 function. The prototype of a callback function must be:
 
 \code{.c}

+ 21 - 14
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -224,7 +224,7 @@ starpu_malloc_on_node() which are used by the data interfaces
 (matrix, vector, etc.).  This does not include allocations performed by
 the application through e.g. malloc(). It does not include allocations
 performed through starpu_malloc() either, only allocations
-performed explicitly with the \ref STARPU_MALLOC_COUNT flag, i.e. by calling
+performed explicitly with the flag \ref STARPU_MALLOC_COUNT, i.e. by calling
 
 \code{.c}
 starpu_malloc_flags(STARPU_MALLOC_COUNT)
@@ -258,18 +258,18 @@ structures of StarPU by describing the shape of your machine and/or your
 application when calling \c configure.
 
 To reduce the memory footprint of the data internal structures of StarPU, one
-can set the
+can set the \c configure parameters 
 \ref enable-maxcpus "--enable-maxcpus",
 \ref enable-maxnumanodes "--enable-maxnumanodes",
 \ref enable-maxcudadev "--enable-maxcudadev",
 \ref enable-maxopencldev "--enable-maxopencldev" and
 \ref enable-maxnodes "--enable-maxnodes"
-\c configure parameters to give StarPU
+to give StarPU
 the architecture of the machine it will run on, thus tuning the size of the
 structures to the machine.
 
 To reduce the memory footprint of the task internal structures of StarPU, one
-can set the \ref enable-maxbuffers "--enable-maxbuffers" \c configure parameter to
+can set the \c configure parameter \ref enable-maxbuffers "--enable-maxbuffers" to
 give StarPU the maximum number of buffers that a task can use during an
 execution. For example, in the Cholesky factorization (dense linear algebra
 application), the GEMM task uses up to 3 buffers, so it is possible to set the
@@ -278,15 +278,15 @@ maximum number of task buffers to 3 to run a Cholesky factorization on StarPU.
 The size of the various structures of StarPU can be printed by
 <c>tests/microbenchs/display_structures_size</c>.
 
-It is also often useless to submit *all* the tasks at the same time.
+It is also often useless to submit \b all the tasks at the same time.
 Task submission can be blocked when a reasonable given number of
 tasks have been submitted, by setting the environment variables \ref
 STARPU_LIMIT_MIN_SUBMITTED_TASKS and \ref STARPU_LIMIT_MAX_SUBMITTED_TASKS.
 
-<c>
+\code{.sh}
 export STARPU_LIMIT_MAX_SUBMITTED_TASKS=10000
 export STARPU_LIMIT_MIN_SUBMITTED_TASKS=9000
-</c>
+\endcode
 
 will make StarPU block submission when 10000 tasks are submitted, and unblock
 submission when only 9000 tasks are still submitted, i.e. 1000 tasks have
@@ -341,11 +341,17 @@ overrides the hostname of the system.
 
 By default, StarPU stores separate performance models for each GPU. To avoid
 having to calibrate performance models for each GPU of a homogeneous set of GPU
-devices for instance, the model can be shared by setting
-<c>export STARPU_PERF_MODEL_HOMOGENEOUS_CUDA=1</c> (\ref STARPU_PERF_MODEL_HOMOGENEOUS_CUDA),
-<c>export STARPU_PERF_MODEL_HOMOGENEOUS_OPENCL=1</c> (\ref STARPU_PERF_MODEL_HOMOGENEOUS_OPENCL),
-<c>export STARPU_PERF_MODEL_HOMOGENEOUS_MIC=1</c> (\ref STARPU_PERF_MODEL_HOMOGENEOUS_MIC),
-<c>export STARPU_PERF_MODEL_HOMOGENEOUS_MPI_MS=1</c> (\ref STARPU_PERF_MODEL_HOMOGENEOUS_MPI_MS) depending on your GPU device type.
+devices for instance, the model can be shared by using the environment
+variables \ref STARPU_PERF_MODEL_HOMOGENEOUS_CUDA, \ref
+STARPU_PERF_MODEL_HOMOGENEOUS_OPENCL and \ref
+STARPU_PERF_MODEL_HOMOGENEOUS_MPI_MS depending on your GPU device
+type.
+
+\code{.sh}
+export STARPU_PERF_MODEL_HOMOGENEOUS_CUDA=1
+export STARPU_PERF_MODEL_HOMOGENEOUS_OPENCL=1
+export STARPU_PERF_MODEL_HOMOGENEOUS_MPI_MS=1
+\endcode
 
 To force continuing calibration,
 use <c>export STARPU_CALIBRATE=1</c> (\ref STARPU_CALIBRATE). This may be necessary if your application
@@ -453,12 +459,13 @@ and in Joules for the energy consumption models.
 A quick view of how many tasks each worker has executed can be obtained by setting
 <c>export STARPU_WORKER_STATS=1</c> (\ref STARPU_WORKER_STATS). This is a convenient way to check that
 execution did happen on accelerators, without penalizing performance with
-the profiling overhead. \ref STARPU_WORKER_STATS_FILE can be defined
+the profiling overhead. The environment variable \ref STARPU_WORKER_STATS_FILE can be defined
 to specify a filename in which to display statistics, by default
 statistics are printed on the standard error stream.
 
 A quick view of how much data transfers have been issued can be obtained by setting
-<c>export STARPU_BUS_STATS=1</c> (\ref STARPU_BUS_STATS). \ref
+<c>export STARPU_BUS_STATS=1</c> (\ref STARPU_BUS_STATS). The
+environment variable \ref
 STARPU_BUS_STATS_FILE can be defined to specify a filename in which to
 display statistics, by default statistics are printed on the standard error stream.
 

+ 4 - 4
doc/doxygen/chapters/301_tasks.doxy

@@ -612,13 +612,13 @@ tasks or tags, or to be submitted in callbacks, etc.
 The obvious way is of course to make kernel functions empty, but such task will
 thus have to wait for a worker to become ready, transfer data, etc.
 
-A much lighter way to define a synchronization task is to set its starpu_task::cl
-field to <c>NULL</c>. The task will thus be a mere synchronization point,
+A much lighter way to define a synchronization task is to set its field starpu_task::cl
+to <c>NULL</c>. The task will thus be a mere synchronization point,
 without any data access or execution content: as soon as its dependencies become
 available, it will terminate, call the callbacks, and release dependencies.
 
-An intermediate solution is to define a codelet with its
-starpu_codelet::where field set to \ref STARPU_NOWHERE, for instance:
+An intermediate solution is to define a codelet with its field
+starpu_codelet::where set to \ref STARPU_NOWHERE, for instance:
 
 \code{.c}
 struct starpu_codelet cl =

+ 230 - 46
doc/doxygen/chapters/310_data_management.doxy

@@ -93,8 +93,8 @@ starpu_block_data_register(&block_handle, STARPU_MAIN_RAM, (uintptr_t)block, nx,
 
 3D matrices can be partitioned along the x dimension by
 using starpu_block_filter_block(), or along the y dimension
-by using starpu_block_filter_vertical_block, or along the
-z dimension by using starpu_block_filter_depth_block. They
+by using starpu_block_filter_vertical_block(), or along the
+z dimension by using starpu_block_filter_depth_block(). They
 can also be partitioned with some overlapping by using
 starpu_block_filter_block_shadow(), starpu_block_filter_vertical_block_shadow(),
 or starpu_block_filter_depth_block_shadow().
@@ -206,23 +206,23 @@ need to be set within the original allocation. To reserve room for increasing
 the NX/NY values, one can use starpu_matrix_data_register_allocsize() instead of
 starpu_matrix_data_register(), to specify the allocation size to be used instead
 of the default NX*NY*ELEMSIZE. To support this, the data interface
-has to implement the starpu_data_interface_ops::alloc_footprint and
-starpu_data_interface_ops::alloc_compare methods, for proper StarPU allocation
+has to implement the functions starpu_data_interface_ops::alloc_footprint and
+starpu_data_interface_ops::alloc_compare, for proper StarPU allocation
 management.
 
 A more involved case is changing the amount of allocated data.
 The task implementation can just reallocate the buffer during its execution, and
 set the proper new values in the interface structure, e.g. nx, ny, ld, etc. so
-that the StarPU core knows the new data layout. The starpu_data_interface_ops
-structure however then needs to have the starpu_data_interface_ops::dontcache
-field set to 1, to prevent StarPU from trying to perform any cached allocation,
+that the StarPU core knows the new data layout. The structure starpu_data_interface_ops
+however then needs to have the field starpu_data_interface_ops::dontcache
+set to 1, to prevent StarPU from trying to perform any cached allocation,
 since the allocated size will vary. An example is available in
 <c>tests/datawizard/variable_size.c</c>. The example uses its own data
 interface so as to contain some simulation information for data growth, but the
 principle can be applied for any data interface.
 
-The principle is to use <c>starpu_malloc_on_node_flags</c> to make the new
-allocation, and use <c>starpu_free_on_node_flags</c> to release any previous
+The principle is to use starpu_malloc_on_node_flags() to make the new
+allocation, and use starpu_free_on_node_flags() to release any previous
 allocation. The flags have to be precisely like in the example:
 
 \code{.c}
@@ -244,7 +244,7 @@ stored in the data and the actually allocated buffer. For instance, the vector
 interface uses the <c>nx</c> field for the former, and the <c>allocsize</c> for
 the latter. This allows for lazy reallocation to avoid reallocating the buffer
 everytime to exactly match the actual number of elements. Computations and data
-transfers will use <c>nx</c> field, while allocation functions will use the
+transfers will use the field <c>nx</c>, while allocation functions will use the field
 <c>allocsize</c>. One just has to make sure that <c>allocsize</c> is always
 bigger or equal to <c>nx</c>.
 
@@ -254,12 +254,12 @@ Important note: one can not change the size of a partitioned data.
 \section DataManagement Data Management
 
 When the application allocates data, whenever possible it should use
-the starpu_malloc() function, which will ask CUDA or OpenCL to make
-the allocation itself and pin the corresponding allocated memory, or to use the
-starpu_memory_pin() function to pin memory allocated by other ways, such as local arrays. This
+the function starpu_malloc(), which will ask CUDA or OpenCL to make
+the allocation itself and pin the corresponding allocated memory, or to use the function
+starpu_memory_pin() to pin memory allocated by other ways, such as local arrays. This
 is needed to permit asynchronous data transfer, i.e. permit data
 transfer to overlap with computations. Otherwise, the trace will show
-that the <c>DriverCopyAsync</c> state takes a lot of time, this is
+that the state <c>DriverCopyAsync</c> takes a lot of time, this is
 because CUDA or OpenCL then reverts to synchronous transfers.
 
 The application can provide its own allocation function by calling
@@ -351,8 +351,8 @@ before submitting tasks, which will thus guide StarPU toward an initial task
 distribution (since StarPU will try to avoid further transfers).
 
 This can be achieved by giving the function starpu_data_prefetch_on_node() the
-handle and the desired target memory node. The
-starpu_data_idle_prefetch_on_node() variant can be used to issue the transfer
+handle and the desired target memory node. The variant
+starpu_data_idle_prefetch_on_node() can be used to issue the transfer
 only when the bus is idle.
 
 Conversely, one can advise StarPU that some data will not be useful in the
@@ -434,8 +434,8 @@ __kernel void opencl_kernel(__global int *vector, unsigned offset)
 }
 \endcode
 
-When the sub-data is not of the same type as the original data, the
-starpu_data_filter::get_child_ops field needs to be set appropriately for StarPU
+When the sub-data is not of the same type as the original data, the field
+starpu_data_filter::get_child_ops needs to be set appropriately for StarPU
 to know which type should be used.
 
 StarPU provides various interfaces and filters for matrices, vectors, etc.,
@@ -476,9 +476,8 @@ starpu_data_partition_plan(handle, &f_vert, vert_handle);
 
 starpu_data_partition_plan() returns the handles for the partition in <c>vert_handle</c>.
 
-One can then submit tasks working on the main handle, and tasks working on
-<c>vert_handle</c> handles. Between using the main handle and <c>vert_handle</c>
-handles, StarPU will automatically call starpu_data_partition_submit() and
+One can then submit tasks working on the main handle, and tasks working on the handles
+<c>vert_handle</c>. Between using the main handle and the handles <c>vert_handle</c>, StarPU will automatically call starpu_data_partition_submit() and
 starpu_data_unpartition_submit().
 
 All this code is asynchronous, just submitting which tasks, partitioning and
@@ -544,7 +543,7 @@ ret = starpu_task_insert(&cl_switch, STARPU_RW, handle,
 			0);
 \endcode
 
-The execution of the <c>switch</c> task will get back the matrix data into the
+The execution of the task <c>switch</c> will get back the matrix data into the
 main memory, and thus the vertical slices will get the updated value there.
 
 Again, we prefer to make sure that we don't accidentally access the matrix through the whole-matrix handle:
@@ -557,7 +556,7 @@ And now we can start using vertical slices, etc.
 
 \section DataPointers Handles data buffer pointers
 
-A simple understanding of starpu handles is that it's a collection of buffers on
+A simple understanding of StarPU handles is that it's a collection of buffers on
 each memory node of the machine, which contain the same data.  The picture is
 however made more complex with the OpenCL support and with partitioning.
 
@@ -565,21 +564,28 @@ When partitioning a handle, the data buffers of the subhandles will indeed
 be inside the data buffers of the main handle (to save transferring data
 back and forth between the main handle and the subhandles). But in OpenCL,
 a <c>cl_mem</c> is not a pointer, but an opaque value on which pointer
-arithmetic can not be used. That is why data interfaces contain three members:
-<c>dev_handle</c>, <c>offset</c>, and <c>ptr</c>. The <c>dev_handle</c> member
-is what the allocation function returned, and one can not do arithmetic on
-it. The <c>offset</c> member is the offset inside the allocated area, most often
-it will be 0 because data start at the beginning of the allocated area, but
-when the handle is partitioned, the subhandles will have varying <c>offset</c>
-values, for each subpiece. The <c>ptr</c> member, in the non-OpenCL case, i.e.
-when pointer arithmetic can be used on <c>dev_handle</c>, is just the sum of
+arithmetic can not be used. That is why data interfaces contain three fields:
+<c>dev_handle</c>, <c>offset</c>, and <c>ptr</c>.
+<ul>
+<li> The field <c>dev_handle</c> is what the allocation function
+returned, and one can not do arithmetic on it.
+</li>
+<li> The field <c>offset</c> is the offset inside the allocated area,
+most often it will be 0 because data start at the beginning of the
+allocated area, but when the handle is partitioned, the subhandles
+will have varying <c>offset</c> values, for each subpiece.
+</li>
+<li> The field <c>ptr</c>, in the non-OpenCL case, i.e. when pointer
+arithmetic can be used on <c>dev_handle</c>, is just the sum of
 <c>dev_handle</c> and <c>offset</c>, provided for convenience.
+</li>
+</ul>
 
 This means that:
 <ul>
 <li>computation kernels can use <c>ptr</c> in non-OpenCL implementations.</li>
 <li>computation kernels have to use <c>dev_handle</c> and <c>offset</c> in the OpenCL implementation.</li>
-<li>allocation methods of data interfaces have to store the value returned by starpu_malloc_on_node in <c>dev_handle</c> and <c>ptr</c>, and set <c>offset</c> to 0.</li>
+<li>allocation methods of data interfaces have to store the value returned by starpu_malloc_on_node() in <c>dev_handle</c> and <c>ptr</c>, and set <c>offset</c> to 0.</li>
 <li>partitioning filters have to copy over <c>dev_handle</c> without modifying it, set in the child different values of <c>offset</c>, and set <c>ptr</c> accordingly as the sum of <c>dev_handle</c> and <c>offset</c>.</li>
 </ul>
 
@@ -589,8 +595,8 @@ StarPU provides a series of predefined filters in \ref API_Data_Partition, but
 additional filters can be defined by the application. The principle is that the
 filter function just fills the memory location of the <c>i-th</c> subpart of a data.
 Examples are provided in <c>src/datawizard/interfaces/*_filters.c</c>,
-and see \ref starpu_data_filter::filter_func for the details.
-The starpu_filter_nparts_compute_chunk_size_and_offset() helper can be used to
+check \ref starpu_data_filter::filter_func for further details.
+The helper function starpu_filter_nparts_compute_chunk_size_and_offset() can be used to
 compute the division of pieces of data.
 
 \section DataReduction Data Reduction
@@ -643,7 +649,8 @@ struct starpu_codelet accumulate_variable_cl =
         .cpu_funcs = { accumulate_variable_cpu },
         .cpu_funcs_name = { "accumulate_variable_cpu" },
         .cuda_funcs = { accumulate_variable_cuda },
-        .nbuffers = 1,
+        .nbuffers = 2,
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 }
 \endcode
 
@@ -708,7 +715,7 @@ of submission. In some applicative cases, the write contributions can actually
 be performed in any order without affecting the eventual result. In this case
 it is useful to drop the strictly sequential semantic, to improve parallelism
 by allowing StarPU to reorder the write accesses. This can be done by using
-the ::STARPU_COMMUTE data access flag. Accesses without this flag will however
+the data access flag ::STARPU_COMMUTE. Accesses without this flag will however
 properly be serialized against accesses with this flag. For instance:
 
 \code{.c}
@@ -738,7 +745,7 @@ already serialized, and thus by default StarPU uses the Dijkstra solution which
 scales very well in terms of overhead: tasks will just acquire data one by one
 by data handle pointer value order.
 
-When sequential ordering is disabled or the ::STARPU_COMMUTE flag is used, there
+When sequential ordering is disabled or the flag ::STARPU_COMMUTE is used, there
 may be a lot of concurrent accesses to the same data, and the Dijkstra solution
 gets only poor parallelism, typically in some pathological cases which do happen
 in various applications. In this case, one can use a data access arbiter, which
@@ -751,7 +758,7 @@ will acquire them arbiter by arbiter, in arbiter pointer value order.
 
 See the <c>tests/datawizard/test_arbiter.cpp</c> example.
 
-Arbiters however do not support the ::STARPU_REDUX flag yet.
+Arbiters however do not support the flag ::STARPU_REDUX yet.
 
 \section TemporaryBuffers Temporary Buffers
 
@@ -884,19 +891,21 @@ struct starpu_complex_interface
 That structure stores enough to describe <b>one</b> buffer of such kind of
 data. It is used for the buffer stored in the main memory, another instance
 is used for the buffer stored in a GPU, etc. A <i>data handle</i> is thus a
-collection of such structures, to remember each buffer on each memory node.
+collection of such structures, to describe each buffer on each memory node.
 
 Note: one should not take pointers into such structures, because StarPU needs
 to be able to copy over the content of it to various places, for instance to
 efficiently migrate a data buffer from one data handle to another data handle.
 
+\subsection DefiningANewDataInterface_registration Data registration
+
 Registering such a data to StarPU is easily done using the function
 starpu_data_register(). The last
 parameter of the function, <c>interface_complex_ops</c>, will be
 described below.
 
 \code{.c}
-void starpu_complex_data_register(starpu_data_handle_t *handle,
+void starpu_complex_data_register(starpu_data_handle_t *handleptr,
      unsigned home_node, double *real, double *imaginary, int nx)
 {
         struct starpu_complex_interface complex =
@@ -918,8 +927,8 @@ void starpu_complex_data_register(starpu_data_handle_t *handle,
 The <c>struct starpu_complex_interface complex</c> is here used just to store the
 parameters that the user provided to <c>starpu_complex_data_register</c>.
 starpu_data_register() will first allocate the handle, and
-then pass the <c>starpu_complex_interface</c> structure to the
-starpu_data_interface_ops::register_data_handle method, which records them
+then pass the structure <c>starpu_complex_interface</c> to the method
+starpu_data_interface_ops::register_data_handle, which records them
 within the data handle (it is called once per node by starpu_data_register()):
 
 \code{.c}
@@ -971,8 +980,8 @@ static struct starpu_data_interface_ops interface_complex_ops =
 };
 \endcode
 
-Functions need to be defined to access the different fields of the
-complex interface from a StarPU data handle.
+Convenience functions can be defined to access the different fields of the
+complex interface from a StarPU data handle after a call to starpu_data_acquire():
 
 \code{.c}
 double *starpu_complex_get_real(starpu_data_handle_t handle)
@@ -1022,14 +1031,189 @@ The whole code for this complex data interface is available in the
 directory <c>examples/interface/</c>.
 
 
+\subsection DefiningANewDataInterface_allocation Data allocation
+
+To be able to run tasks on GPUs etc. StarPU needs to know how to allocate a
+buffer for the interface. In our example, two allocations are needed in the
+allocation method \c complex_allocate_data_on_node(): one for the real part and one
+for the imaginary part.
+
+\code{.c}
+static starpu_ssize_t complex_allocate_data_on_node(void *data_interface, unsigned node)
+{
+	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) data_interface;
+
+	double *addr_real = NULL;
+	double *addr_imaginary = NULL;
+	starpu_ssize_t requested_memory = complex_interface->nx * sizeof(complex_interface->real[0]);
+
+	addr_real = (double*) starpu_malloc_on_node(node, requested_memory);
+	if (!addr_real)
+		goto fail_real;
+	addr_imaginary = (double*) starpu_malloc_on_node(node, requested_memory);
+	if (!addr_imaginary)
+		goto fail_imaginary;
+
+	/* update the data properly in consequence */
+	complex_interface->real = addr_real;
+	complex_interface->imaginary = addr_imaginary;
+
+	return 2*requested_memory;
+
+fail_imaginary:
+	starpu_free_on_node(node, (uintptr_t) addr_real, requested_memory);
+fail_real:
+	return -ENOMEM;
+}
+\endcode
+
+Here we try to allocate the two parts. If either of them fails, we return
+\c -ENOMEM. If they succeed, we record the obtained pointers and return the
+amount of allocated memory (for memory usage accounting).
+
+Conversely, \c complex_free_data_on_node() frees the two parts:
+
+\code{.c}
+static void complex_free_data_on_node(void *data_interface, unsigned node)
+{
+	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) data_interface;
+	starpu_ssize_t requested_memory = complex_interface->nx * sizeof(complex_interface->real[0]);
+
+	starpu_free_on_node(node, (uintptr_t) complex_interface->real, requested_memory);
+	starpu_free_on_node(node, (uintptr_t) complex_interface->imaginary, requested_memory);
+}
+\endcode
+
+Note that we have not done anything particular for GPUs or whatsoever: it is
+starpu_malloc_on_node() which knows how to actually perform the allocation, and
+returns the resulting pointer, be it in main memory, in GPU memory, etc.
+
+\subsection DefiningANewDataInterface_copy Data copy
+
+Now that StarPU knows how to allocate/free a buffer, it needs to be able to
+copy over data into/from it. Defining a method \c copy_any_to_any() allows StarPU to
+perform direct transfers between main memory and GPU memory.
+
+\code{.c}
+static int copy_any_to_any(void *src_interface, unsigned src_node,
+			   void *dst_interface, unsigned dst_node,
+			   void *async_data)
+{
+	struct starpu_complex_interface *src_complex = src_interface;
+	struct starpu_complex_interface *dst_complex = dst_interface;
+	int ret = 0;
+
+
+	if (starpu_interface_copy((uintptr_t) src_complex->real, 0, src_node,
+				    (uintptr_t) dst_complex->real, 0, dst_node,
+				     src_complex->nx*sizeof(src_complex->real[0]),
+				     async_data))
+		ret = -EAGAIN;
+	if (starpu_interface_copy((uintptr_t) src_complex->imaginary, 0, src_node,
+				    (uintptr_t) dst_complex->imaginary, 0, dst_node,
+				     src_complex->nx*sizeof(src_complex->imaginary[0]),
+				     async_data))
+		ret = -EAGAIN;
+	return ret;
+}
+\endcode
+
+We here again have no idea what is main memory or GPU memory, or even if the
+copy is synchronous or asynchronous: we just call starpu_interface_copy()
+according to the interface, passing it the pointers, and checking whether it
+returned \c -EAGAIN, which means the copy is asynchronous, and StarPU will
+appropriately wait for it thanks to the pointer \c async_data.
+
+This copy method is referenced in a structure \ref starpu_data_copy_methods
+
+\code{.c}
+static const struct starpu_data_copy_methods complex_copy_methods =
+{
+	.any_to_any = copy_any_to_any
+};
+\endcode
+
+which was referenced in the structure \ref starpu_data_interface_ops above.
+
+Other fields of \ref starpu_data_copy_methods allow to provide optimized
+variants, notably for the case of 2D or 3D matrix tiles with non-trivial ld.
+
+\subsection DefiningANewDataInterface_pack Data pack/peek/unpack
+
+The copy methods allow for RAM/GPU transfers, but are not enough for e.g.
+transferring over MPI. That requires defining the pack/peek/unpack methods. The
+principle is that the method starpu_data_interface_ops::pack_data concatenates
+the buffer data into a newly-allocated contiguous bytes array, conversely
+starpu_data_interface_ops::peek_data extracts from a bytes array into the
+buffer data, and starpu_data_interface_ops::unpack_data does the same as
+starpu_data_interface_ops::peek_data but also frees the bytes array.
+
+\code{.c}
+static int complex_pack_data(starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ssize_t *count)
+{
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *)
+		starpu_data_get_interface_on_node(handle, node);
+
+	*count = complex_get_size(handle);
+	if (ptr != NULL)
+	{
+		char *data;
+		data = (void*) starpu_malloc_on_node_flags(node, *count, 0);
+		*ptr = data;
+		memcpy(data, complex_interface->real, complex_interface->nx*sizeof(double));
+		memcpy(data+complex_interface->nx*sizeof(double), complex_interface->imaginary, complex_interface->nx*sizeof(double));
+	}
+
+	return 0;
+}
+\endcode
+
+\c complex_pack_data() first computes the size to be allocated, then allocates it,
+and copies over into it the content of the two real and imaginary arrays.
+
+\code{.c}
+static int complex_peek_data(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
+{
+	char *data = ptr;
+	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+
+	struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *)
+		starpu_data_get_interface_on_node(handle, node);
+
+	STARPU_ASSERT(count == 2 * complex_interface->nx * sizeof(double));
+	memcpy(complex_interface->real, data, complex_interface->nx*sizeof(double));
+	memcpy(complex_interface->imaginary, data+complex_interface->nx*sizeof(double), complex_interface->nx*sizeof(double));
+
+	return 0;
+}
+\endcode
+
+\c complex_peek_data() simply uses \c memcpy() to copy over from the bytes array into the data buffer.
+
+\code{.c}
+static int complex_unpack_data(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
+{
+	complex_peek_data(handle, node, ptr, count);
+
+	starpu_free_on_node_flags(node, (uintptr_t) ptr, count, 0);
+
+	return 0;
+}
+\endcode
+
+And \c complex_unpack_data() just calls \c complex_peek_data() and releases the bytes array.
+
+
 \section SpecifyingATargetNode Specifying A Target Node For Task Data
 
 When executing a task on a GPU for instance, StarPU would normally copy all the
 needed data for the tasks on the embedded memory of the GPU.  It may however
 happen that the task kernel would rather have some of the data kept in the
 main memory instead of copied in the GPU, a pivoting vector for instance.
-This can be achieved by setting the starpu_codelet::specific_nodes flag to
-<c>1</c>, and then fill the starpu_codelet::nodes array (or starpu_codelet::dyn_nodes when
+This can be achieved by setting the flag starpu_codelet::specific_nodes to
+<c>1</c>, and then fill the array starpu_codelet::nodes (or starpu_codelet::dyn_nodes when
 starpu_codelet::nbuffers is greater than \ref STARPU_NMAXBUFS) with the node numbers
 where data should be copied to, or ::STARPU_SPECIFIC_NODE_LOCAL to let
 StarPU copy it to the memory node where the task will be executed.

File diff suppressed because it is too large
+ 51 - 59
doc/doxygen/chapters/320_scheduling.doxy


+ 6 - 6
doc/doxygen/chapters/380_offline_performance_tools.doxy

@@ -155,8 +155,8 @@ starpu_task::name or by using \ref STARPU_NAME when calling
 starpu_task_insert().
 
 Tasks are assigned default colors based on the worker which executed
-them (green for CPUs, yellow/orange/red for CUDAs, blue for OpenCLs,
-red for MICs, ...). To use a different color for every type of task,
+them (green for CPUs, yellow/orange/red for CUDAs, blue for OpenCLs, ...).
+To use a different color for every type of task,
 one can specify the option <c>-c</c> to <c>starpu_fxt_tool</c> or in
 \ref STARPU_GENERATE_TRACE_OPTIONS. Tasks can also be given a specific
 color by setting the field starpu_codelet::color or the
@@ -515,12 +515,12 @@ The <c>-f</c> option can also be used to display the performance in terms of GFl
 
 \verbatim
 $ tools/starpu_perfmodel_plot -f -e -s non_linear_memset_regression_based_energy
-$ gnuplot starpu_non_linear_memset_regression_based_energy.gp
-$ gv starpu_non_linear_memset_regression_based_energy.eps
+$ gnuplot starpu_gflops_non_linear_memset_regression_based_energy.gp
+$ gv starpu_gflops_non_linear_memset_regression_based_energy.eps
 \endverbatim
 
-\image html starpu_non_linear_memset_regression_based_energy_flops.png
-\image latex starpu_non_linear_memset_regression_based_energy_flops.eps "" width=\textwidth
+\image html starpu_gflops_non_linear_memset_regression_based_energy.png
+\image latex starpu_gflops_non_linear_memset_regression_based_energy.eps "" width=\textwidth
 
 We clearly see here that it is much more energy-efficient to stay in the L3 cache.
 

File diff suppressed because it is too large
+ 1 - 1
doc/doxygen/chapters/400_python.doxy


+ 62 - 3
doc/doxygen/chapters/410_mpi_support.doxy

@@ -355,17 +355,44 @@ static int complex_unpack_data(starpu_data_handle_t handle, unsigned node, void
     memcpy(complex_interface->real, ptr, complex_interface->nx*sizeof(double));
     memcpy(complex_interface->imaginary, ptr+complex_interface->nx*sizeof(double), complex_interface->nx*sizeof(double));
 
+    starpu_free_on_node_flags(node, (uintptr_t) ptr, count, 0);
+
     return 0;
 }
+\endcode
+
+And the starpu_data_interface_ops::peek_data operation does
+the same, but without freeing the buffer. Of course one can
+implement starpu_data_interface_ops::unpack_data as merely calling
+starpu_data_interface_ops::peek_data and do the free:
+
+\code{.c}
+static int complex_peek_data(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
+{
+    STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
+    STARPU_ASSERT(count == complex_get_size(handle));
+
+    struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, node);
 
+    memcpy(complex_interface->real, ptr, complex_interface->nx*sizeof(double));
+    memcpy(complex_interface->imaginary, ptr+complex_interface->nx*sizeof(double), complex_interface->nx*sizeof(double));
+
+    return 0;
+}
+\endcode
+
+\code{.c}
 static struct starpu_data_interface_ops interface_complex_ops =
 {
     ...
     .pack_data = complex_pack_data,
+    .peek_data = complex_peek_data,
     .unpack_data = complex_unpack_data
 };
 \endcode
 
+
+
 Instead of defining pack and unpack operations, users may want to
 attach a MPI type to their user-defined data interface. The function
 starpu_mpi_interface_datatype_register() allows to do so. This function takes 3
@@ -443,7 +470,7 @@ starpu_mpi_barrier(MPI_COMM_WORLD);
 \section MPIInsertTaskUtility MPI Insert Task Utility
 
 To save the programmer from having to explicit all communications, StarPU
-provides an "MPI Insert Task Utility". The principe is that the application
+provides an "MPI Insert Task Utility". The principle is that the application
 decides a distribution of the data over the MPI nodes by allocating it and
 notifying StarPU of this decision, i.e. tell StarPU which MPI node "owns"
 which data. It also decides, for each handle, an MPI tag which will be used to
@@ -544,7 +571,7 @@ to provide a dynamic policy.
 
 A function starpu_mpi_task_build() is also provided with the aim to
 only construct the task structure. All MPI nodes need to call the
-function, which posts the required send/recv on the various nodes which have to.
+function, which posts the required send/recv on the various nodes as needed.
 Only the node which is to execute the task will then return a
 valid task structure, others will return <c>NULL</c>. This node must submit the task.
 All nodes then need to call the function starpu_mpi_task_post_build() -- with the same
@@ -610,7 +637,7 @@ saves, a quick and easy way is to measure the submission time of just one of the
 MPI nodes. This can be achieved by running the application on just one MPI node
 with the following environment variables:
 
-\code
+\code{.sh}
 export STARPU_DISABLE_KERNELS=1
 export STARPU_MPI_FAKE_RANK=2
 export STARPU_MPI_FAKE_SIZE=1024
@@ -717,6 +744,37 @@ starpu_mpi_data_set_rank(data, STARPU_MPI_PER_NODE);
 
 The data can then be used just like pernode above.
 
+\section MPIMpiRedux Inter-node reduction
+
+One might want to leverage a reduction pattern across several nodes.
+Using \c STARPU_REDUX, one can obtain reduction patterns across several nodes,
+however each core across the contributing nodes will spawn their own
+contribution to work with. In the case that these allocations or the
+required reductions are too expensive to execute for each contribution,
+the access mode \c STARPU_MPI_REDUX tells StarPU to spawn only one contribution
+per node executing tasks partaking in the reduction.
+
+Tasks producing a result in the inter-node reduction should be registered as
+accessing the contribution through \c STARPU_RW|STARPU_COMMUTE mode.
+
+\code{.c}
+static struct starpu_codelet contrib_cl =
+{
+	.cpu_funcs = {cpu_contrib}, /* cpu implementation(s) of the routine */
+	.nbuffers = 1, /* number of data handles referenced by this routine */
+	.modes = {STARPU_RW | STARPU_COMMUTE}, /* access modes for the contribution */
+	.name = "contribution"
+};
+\endcode
+
+When inserting these tasks, the access mode handed out to the StarPU-MPI layer
+should be \c STARPU_MPI_REDUX. Assuming \c data is owned by node 0 and we want node
+1 to compute the contribution, we could do the following.
+
+\code{.c}
+starpu_mpi_task_insert(MPI_COMM_WORLD, &contrib_cl, STARPU_MPI_REDUX, data, EXECUTE_ON_NODE, 1); /* Node 1 computes it */
+\endcode
+
 \section MPIPriorities Priorities
 
 All send functions have a <c>_prio</c> variant which takes an additional
@@ -1037,6 +1095,7 @@ disabled in NewMadeleine by compiling it with the profile
 
 To build NewMadeleine, download the latest version from the website (or,
 better, use the Git version to use the most recent version), then:
+
 \code{.sh}
 cd pm2/scripts
 ./pm2-build-packages ./<the profile you chose> --prefix=<installation prefix>

+ 0 - 97
doc/doxygen/chapters/430_mic_support.doxy

@@ -1,97 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*! \page MICSupport MIC Xeon Phi Support
-
-\section MICCompilation Compilation
-
-MIC Xeon Phi support actually needs two compilations of StarPU, one for the host and one for
-the device. The <c>PATH</c> environment variable has to include the path to the
-cross-compilation toolchain, for instance <c>/usr/linux-k1om-4.7/bin</c> .
-The <c>SINK_PKG_CONFIG_PATH</c> environment variable should include the path to the
-cross-compiled <c>hwloc.pc</c>.
-The script <c>mic-configure</c> can then be used to achieve the two compilations: it basically
-calls <c>configure</c> as appropriate from two new directories: <c>build_mic</c> and
-<c>build_host</c>. <c>make</c> and <c>make install</c> can then be used as usual and will
-recurse into both directories. If different \c configure options are needed
-for the host and for the mic, one can use <c>--with-host-param=--with-fxt</c>
-for instance to specify the <c>--with-fxt</c> option for the host only, or
-<c>--with-mic-param=--with-fxt</c> for the mic only.
-
-One can also run StarPU just natively on the Xeon Phi, i.e. it will only run
-directly on the Phi without any exchange with the host CPU. The binaries in
-<c>build_mic</c> can be run that way.
-
-For MPI support, you will probably have to specify different MPI compiler path
-or option for the host and the device builds, for instance:
-
-\verbatim
-./mic-configure --with-mic-param=--with-mpicc="/.../mpiicc -mmic" \
-    --with-host-param=--with-mpicc=/.../mpiicc
-\endverbatim
-
-In case you have troubles with the \c coi or \c scif libraries (the Intel paths are
-really not standard, it seems...), you can still make a build in native mode
-only, by using <c>mic-configure --enable-native-mic</c> (and notably without
-<c>--enable-mic</c> since in that case we don't need \c mic offloading support).
-
-\section PortingApplicationsToMIC Porting Applications To MIC Xeon Phi
-
-The simplest way to port an application to MIC Xeon Phi is to set the field
-starpu_codelet::cpu_funcs_name, to provide StarPU with the function
-name of the CPU implementation, so for instance:
-
-\verbatim
-struct starpu_codelet cl =
-{
-    .cpu_funcs = {myfunc},
-    .cpu_funcs_name = {"myfunc"},
-    .nbuffers = 1,
-}
-\endverbatim
-
-StarPU will thus simply use the
-existing CPU implementation (cross-rebuilt in the MIC Xeon Phi case). The
-functions have to be globally-visible (i.e. not <c>static</c>) for
-StarPU to be able to look them up, and \c -rdynamic must be passed to \c gcc (or
-\c -export-dynamic to \c ld) so that symbols of the main program are visible.
-
-If you have used the starpu_codelet::where field, you additionally need to add in it
-::STARPU_MIC for the Xeon Phi.
-
-For non-native MIC Xeon Phi execution, the 'main' function of the application, on the sink, should call starpu_init() immediately upon start-up; the starpu_init() function never returns. On the host, the 'main' function may freely perform application related initialization calls as usual, before calling starpu_init().
-
-For MIC Xeon Phi, the application may programmatically detect whether executing on the sink or on the host, by checking whether the \ref STARPU_SINK environment variable is defined (on the sink) or not (on the host).
-
-\section LaunchingPrograms Launching Programs
-
-MIC programs are started from the host. StarPU automatically
-starts the same program on MIC devices. It however needs to get
-the MIC-cross-built binary. It will look for the file given by the
-environment variable \ref STARPU_MIC_SINK_PROGRAM_NAME or in the
-directory given by the environment variable \ref STARPU_MIC_SINK_PROGRAM_PATH,
-or in the field
-starpu_conf::mic_sink_program_path. It will also look in the current
-directory for the same binary name plus the suffix <c>-mic</c> or
-<c>_mic</c>.
-
-The testsuite can be started by simply running <c>make check</c> from the
-top directory. It will recurse into both <c>build_host</c> to run tests with only
-the host, and into <c>build_mic</c> to run tests with both the host and the MIC
-devices. Single tests with the host and the MIC can be run by starting
-<c>./loader-cross.sh ./the_test</c> from <c>build_mic/tests</c>.
-
-*/

+ 1 - 1
doc/doxygen/chapters/470_simgrid.doxy

@@ -23,7 +23,7 @@
 
 StarPU can use Simgrid in order to simulate execution on an arbitrary
 platform. This was tested with SimGrid from 3.11 to 3.16, and 3.18 to
-3.26. SimGrid version 3.25 needs to be configured with -Denable_msg=ON .
+3.27. SimGrid version 3.25 needs to be configured with -Denable_msg=ON .
 Other versions may have compatibility issues. 3.17 notably does not build at
 all. MPI simulation does not work with version 3.22.
 

+ 0 - 1
doc/doxygen/chapters/495_interoperability.doxy

@@ -65,7 +65,6 @@ of the following strings as argument and returns the corresponding ID:
 <li><c>"cpu"</c></li>
 <li><c>"opencl"</c></li>
 <li><c>"cuda"</c></li>
-<li><c>"mic"</c></li>
 </ul>
 The \c cpu pseudo device type is defined for convenience and designates CPU
 cores. The number of units of each type available for computation can be

+ 325 - 304
doc/doxygen/chapters/501_environment_variables.doxy

@@ -23,161 +23,8 @@ the following environment variables.
 
 \section EnvConfiguringWorkers Configuring Workers
 
+\subsection Basic General Configuration
 <dl>
-
-<dt>STARPU_NCPU</dt>
-<dd>
-\anchor STARPU_NCPU
-\addindex __env__STARPU_NCPU
-Specify the number of CPU workers (thus not including workers
-dedicated to control accelerators). Note that by default, StarPU will
-not allocate more CPU workers than there are physical CPUs, and that
-some CPUs are used to control the accelerators.
-</dd>
-
-<dt>STARPU_RESERVE_NCPU</dt>
-<dd>
-\anchor STARPU_RESERVE_NCPU
-\addindex __env__STARPU_RESERVE_NCPU
-Specify the number of CPU cores that should not be used by StarPU, so the
-application can use starpu_get_next_bindid() and starpu_bind_thread_on() to bind
-its own threads.
-
-This option is ignored if \ref STARPU_NCPU or starpu_conf::ncpus is set.
-</dd>
-
-<dt>STARPU_NCPUS</dt>
-<dd>
-\anchor STARPU_NCPUS
-\addindex __env__STARPU_NCPUS
-This variable is deprecated. You should use \ref STARPU_NCPU.
-</dd>
-
-<dt>STARPU_NCUDA</dt>
-<dd>
-\anchor STARPU_NCUDA
-\addindex __env__STARPU_NCUDA
-Specify the number of CUDA devices that StarPU can use. If
-\ref STARPU_NCUDA is lower than the number of physical devices, it is
-possible to select which CUDA devices should be used by the means of the
-environment variable \ref STARPU_WORKERS_CUDAID. By default, StarPU will
-create as many CUDA workers as there are CUDA devices.
-</dd>
-
-<dt>STARPU_NWORKER_PER_CUDA</dt>
-<dd>
-\anchor STARPU_NWORKER_PER_CUDA
-\addindex __env__STARPU_NWORKER_PER_CUDA
-Specify the number of workers per CUDA device, and thus the number of kernels
-which will be concurrently running on the devices, i.e. the number of CUDA
-streams. The default value is 1.
-</dd>
-
-<dt>STARPU_CUDA_THREAD_PER_WORKER</dt>
-<dd>
-\anchor STARPU_CUDA_THREAD_PER_WORKER
-\addindex __env__STARPU_CUDA_THREAD_PER_WORKER
-Specify whether the cuda driver should use one thread per stream (1) or to use
-a single thread to drive all the streams of the device or all devices (0), and
-\ref STARPU_CUDA_THREAD_PER_DEV determines whether is it one thread per device or one
-thread for all devices. The default value is 0. Setting it to 1 is contradictory
-with setting \ref STARPU_CUDA_THREAD_PER_DEV.
-</dd>
-
-<dt>STARPU_CUDA_THREAD_PER_DEV</dt>
-<dd>
-\anchor STARPU_CUDA_THREAD_PER_DEV
-\addindex __env__STARPU_CUDA_THREAD_PER_DEV
-Specify whether the cuda driver should use one thread per device (1) or to use a
-single thread to drive all the devices (0). The default value is 1.  It does not
-make sense to set this variable if \ref STARPU_CUDA_THREAD_PER_WORKER is set to to 1
-(since \ref STARPU_CUDA_THREAD_PER_DEV is then meaningless).
-</dd>
-
-<dt>STARPU_CUDA_PIPELINE</dt>
-<dd>
-\anchor STARPU_CUDA_PIPELINE
-\addindex __env__STARPU_CUDA_PIPELINE
-Specify how many asynchronous tasks are submitted in advance on CUDA
-devices. This for instance permits to overlap task management with the execution
-of previous tasks, but it also allows concurrent execution on Fermi cards, which
-otherwise bring spurious synchronizations. The default is 2. Setting the value to 0 forces a synchronous
-execution of all tasks.
-</dd>
-
-<dt>STARPU_NOPENCL</dt>
-<dd>
-\anchor STARPU_NOPENCL
-\addindex __env__STARPU_NOPENCL
-OpenCL equivalent of the environment variable \ref STARPU_NCUDA.
-</dd>
-
-<dt>STARPU_OPENCL_PIPELINE</dt>
-<dd>
-\anchor STARPU_OPENCL_PIPELINE
-\addindex __env__STARPU_OPENCL_PIPELINE
-Specify how many asynchronous tasks are submitted in advance on OpenCL
-devices. This for instance permits to overlap task management with the execution
-of previous tasks, but it also allows concurrent execution on Fermi cards, which
-otherwise bring spurious synchronizations. The default is 2. Setting the value to 0 forces a synchronous
-execution of all tasks.
-</dd>
-
-<dt>STARPU_OPENCL_ON_CPUS</dt>
-<dd>
-\anchor STARPU_OPENCL_ON_CPUS
-\addindex __env__STARPU_OPENCL_ON_CPUS
-By default, the OpenCL driver only enables GPU and accelerator
-devices. By setting the environment variable \ref STARPU_OPENCL_ON_CPUS
-to 1, the OpenCL driver will also enable CPU devices.
-</dd>
-
-<dt>STARPU_OPENCL_ONLY_ON_CPUS</dt>
-<dd>
-\anchor STARPU_OPENCL_ONLY_ON_CPUS
-\addindex __env__STARPU_OPENCL_ONLY_ON_CPUS
-By default, the OpenCL driver enables GPU and accelerator
-devices. By setting the environment variable \ref STARPU_OPENCL_ONLY_ON_CPUS
-to 1, the OpenCL driver will ONLY enable CPU devices.
-</dd>
-
-<dt>STARPU_NMIC</dt>
-<dd>
-\anchor STARPU_NMIC
-\addindex __env__STARPU_NMIC
-MIC equivalent of the environment variable \ref STARPU_NCUDA, i.e. the number of
-MIC devices to use.
-</dd>
-
-<dt>STARPU_NMICTHREADS</dt>
-<dd>
-\anchor STARPU_NMICTHREADS
-\addindex __env__STARPU_NMICTHREADS
-Number of threads to use on the MIC devices.
-</dd>
-
-<dt>STARPU_NMPI_MS</dt>
-<dd>
-\anchor STARPU_NMPI_MS
-\addindex __env__STARPU_NMPI_MS
-MPI Master Slave equivalent of the environment variable \ref STARPU_NCUDA, i.e. the number of
-MPI Master Slave devices to use.
-</dd>
-
-<dt>STARPU_NMPIMSTHREADS</dt>
-<dd>
-\anchor STARPU_NMPIMSTHREADS
-\addindex __env__STARPU_NMPIMSTHREADS
-Number of threads to use on the MPI Slave devices.
-</dd>
-
-<dt>STARPU_MPI_MASTER_NODE</dt>
-<dd>
-\anchor STARPU_MPI_MASTER_NODE
-\addindex __env__STARPU_MPI_MASTER_NODE
-This variable allows to chose which MPI node (with the MPI ID) will be the master.
-</dd>
-
 <dt>STARPU_WORKERS_NOBIND</dt>
 <dd>
 \anchor STARPU_WORKERS_NOBIND
@@ -262,69 +109,6 @@ Same as \ref STARPU_MAIN_THREAD_CPUID, but bind the thread that calls
 starpu_initialize() to the given core, instead of the PU (hyperthread).
 </dd>
 
-<dt>STARPU_MPI_THREAD_CPUID</dt>
-<dd>
-\anchor STARPU_MPI_THREAD_CPUID
-\addindex __env__STARPU_MPI_THREAD_CPUID
-When defined, this make StarPU bind its MPI thread to the given CPU ID. Setting
-it to -1 (the default value) will use a reserved CPU, subtracted from the CPU
-workers.
-</dd>
-
-<dt>STARPU_MPI_THREAD_COREID</dt>
-<dd>
-\anchor STARPU_MPI_THREAD_COREID
-\addindex __env__STARPU_MPI_THREAD_COREID
-Same as \ref STARPU_MPI_THREAD_CPUID, but bind the MPI thread to the given core
-ID, instead of the PU (hyperthread).
-</dd>
-
-<dt>STARPU_MPI_NOBIND</dt>
-<dd>
-\anchor STARPU_MPI_NOBIND
-\addindex __env__STARPU_MPI_NOBIND
-Setting it to non-zero will prevent StarPU from binding the MPI to
-a separate core. This is for instance useful when running the testsuite on a single system.
-</dd>
-
-<dt>STARPU_WORKERS_CUDAID</dt>
-<dd>
-\anchor STARPU_WORKERS_CUDAID
-\addindex __env__STARPU_WORKERS_CUDAID
-Similarly to the \ref STARPU_WORKERS_CPUID environment variable, it is
-possible to select which CUDA devices should be used by StarPU. On a machine
-equipped with 4 GPUs, setting <c>STARPU_WORKERS_CUDAID = "1 3"</c> and
-<c>STARPU_NCUDA=2</c> specifies that 2 CUDA workers should be created, and that
-they should use CUDA devices #1 and #3 (the logical ordering of the devices is
-the one reported by CUDA).
-
-This variable is ignored if the field
-starpu_conf::use_explicit_workers_cuda_gpuid passed to starpu_init()
-is set.
-</dd>
-
-<dt>STARPU_WORKERS_OPENCLID</dt>
-<dd>
-\anchor STARPU_WORKERS_OPENCLID
-\addindex __env__STARPU_WORKERS_OPENCLID
-OpenCL equivalent of the \ref STARPU_WORKERS_CUDAID environment variable.
-
-This variable is ignored if the field
-starpu_conf::use_explicit_workers_opencl_gpuid passed to starpu_init()
-is set.
-</dd>
-
-<dt>STARPU_WORKERS_MICID</dt>
-<dd>
-\anchor STARPU_WORKERS_MICID
-\addindex __env__STARPU_WORKERS_MICID
-MIC equivalent of the \ref STARPU_WORKERS_CUDAID environment variable.
-
-This variable is ignored if the field
-starpu_conf::use_explicit_workers_mic_deviceid passed to starpu_init()
-is set.
-</dd>
-
 <dt>STARPU_WORKER_TREE</dt>
 <dd>
 \anchor STARPU_WORKER_TREE
@@ -346,25 +130,23 @@ and \ref STARPU_MAX_WORKERSIZE can be used to change this default.
 <dd>
 \anchor STARPU_MIN_WORKERSIZE
 \addindex __env__STARPU_MIN_WORKERSIZE
-\ref STARPU_MIN_WORKERSIZE
-permits to specify the minimum size of the combined workers (instead of the default 2)
+Specify the minimum size of the combined workers. Default value is 2.
 </dd>
 
 <dt>STARPU_MAX_WORKERSIZE</dt>
 <dd>
 \anchor STARPU_MAX_WORKERSIZE
 \addindex __env__STARPU_MAX_WORKERSIZE
-\ref STARPU_MAX_WORKERSIZE
-permits to specify the minimum size of the combined workers (instead of the
-number of CPU workers in the system)
+Specify the maximum size of the combined workers. Default value is the
+number of CPU workers in the system.
 </dd>
 
 <dt>STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER</dt>
 <dd>
 \anchor STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER
 \addindex __env__STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER
-Let the user decide how many elements are allowed between combined workers
-created from hwloc information. For instance, in the case of sockets with 6
+Specify how many elements are allowed between combined workers
+created from \c hwloc information. For instance, in the case of sockets with 6
 cores without shared L2 caches, if \ref STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER is
 set to 6, no combined worker will be synthesized beyond one for the socket
 and one per core. If it is set to 3, 3 intermediate combined workers will be
@@ -387,6 +169,143 @@ Disable asynchronous copies between CPU and GPU devices.
 The AMD implementation of OpenCL is known to
 fail when copying data asynchronously. When using this implementation,
 it is therefore necessary to disable asynchronous data transfers.
+
+See also \ref STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY and \ref
+STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY.
+</dd>
+
+<dt>STARPU_DISABLE_PINNING</dt>
+<dd>
+\anchor STARPU_DISABLE_PINNING
+\addindex __env__STARPU_DISABLE_PINNING
+Disable (1) or Enable (0) pinning host memory allocated through starpu_malloc(), starpu_memory_pin()
+and friends.  The default is Enabled.
+This permits to test the performance effect of memory pinning.
+</dd>
+
+<dt>STARPU_BACKOFF_MIN</dt>
+<dd>
+\anchor STARPU_BACKOFF_MIN
+\addindex __env__STARPU_BACKOFF_MIN
+Set minimum exponential backoff of number of cycles to pause when spinning. Default value is 1.
+</dd>
+
+<dt>STARPU_BACKOFF_MAX</dt>
+<dd>
+\anchor STARPU_BACKOFF_MAX
+\addindex __env__STARPU_BACKOFF_MAX
+Set maximum exponential backoff of number of cycles to pause when spinning. Default value is 32.
+</dd>
+
+<dt>STARPU_SINK</dt>
+<dd>
+\anchor STARPU_SINK
+\addindex __env__STARPU_SINK
+Defined internally by StarPU when running in master slave mode.
+</dd>
+
+</dl>
+
+\subsection cpuWorkers CPU Workers
+<dl>
+<dt>STARPU_NCPU</dt>
+<dd>
+\anchor STARPU_NCPU
+\addindex __env__STARPU_NCPU
+Specify the number of CPU workers (thus not including workers
+dedicated to control accelerators). Note that by default, StarPU will
+not allocate more CPU workers than there are physical CPUs, and that
+some CPUs are used to control the accelerators.
+</dd>
+
+<dt>STARPU_RESERVE_NCPU</dt>
+<dd>
+\anchor STARPU_RESERVE_NCPU
+\addindex __env__STARPU_RESERVE_NCPU
+Specify the number of CPU cores that should not be used by StarPU, so the
+application can use starpu_get_next_bindid() and starpu_bind_thread_on() to bind
+its own threads.
+
+This option is ignored if \ref STARPU_NCPU or starpu_conf::ncpus is set.
+</dd>
+
+<dt>STARPU_NCPUS</dt>
+<dd>
+\anchor STARPU_NCPUS
+\addindex __env__STARPU_NCPUS
+This variable is deprecated. You should use \ref STARPU_NCPU.
+</dd>
+
+</dl>
+
+\subsection cudaWorkers CUDA Workers
+<dl>
+<dt>STARPU_NCUDA</dt>
+<dd>
+\anchor STARPU_NCUDA
+\addindex __env__STARPU_NCUDA
+Specify the number of CUDA devices that StarPU can use. If
+\ref STARPU_NCUDA is lower than the number of physical devices, it is
+possible to select which GPU devices should be used by the means of the
+environment variable \ref STARPU_WORKERS_CUDAID. By default, StarPU will
+create as many CUDA workers as there are GPU devices.
+</dd>
+
+<dt>STARPU_NWORKER_PER_CUDA</dt>
+<dd>
+\anchor STARPU_NWORKER_PER_CUDA
+\addindex __env__STARPU_NWORKER_PER_CUDA
+Specify the number of workers per CUDA device, and thus the number of kernels
+which will be concurrently running on the devices, i.e. the number of CUDA
+streams. The default value is 1.
+</dd>
+
+<dt>STARPU_CUDA_THREAD_PER_WORKER</dt>
+<dd>
+\anchor STARPU_CUDA_THREAD_PER_WORKER
+\addindex __env__STARPU_CUDA_THREAD_PER_WORKER
+Specify whether the cuda driver should use one thread per stream (1) or to use
+a single thread to drive all the streams of the device or all devices (0), and
+\ref STARPU_CUDA_THREAD_PER_DEV determines whether is it one thread per device or one
+thread for all devices. The default value is 0. Setting it to 1 is contradictory
+with setting \ref STARPU_CUDA_THREAD_PER_DEV.
+</dd>
+
+<dt>STARPU_CUDA_THREAD_PER_DEV</dt>
+<dd>
+\anchor STARPU_CUDA_THREAD_PER_DEV
+\addindex __env__STARPU_CUDA_THREAD_PER_DEV
+Specify whether the cuda driver should use one thread per device (1) or to use a
+single thread to drive all the devices (0). The default value is 1.  It does not
+make sense to set this variable if \ref STARPU_CUDA_THREAD_PER_WORKER is set to 1
+(since \ref STARPU_CUDA_THREAD_PER_DEV is then meaningless).
+</dd>
+
+<dt>STARPU_CUDA_PIPELINE</dt>
+<dd>
+\anchor STARPU_CUDA_PIPELINE
+\addindex __env__STARPU_CUDA_PIPELINE
+Specify how many asynchronous tasks are submitted in advance on CUDA
+devices. This for instance permits to overlap task management with the execution
+of previous tasks, but it also allows concurrent execution on Fermi cards, which
+otherwise bring spurious synchronizations. The default is 2. Setting the value to 0 forces a synchronous
+execution of all tasks.
+</dd>
+
+<dt>STARPU_WORKERS_CUDAID</dt>
+<dd>
+\anchor STARPU_WORKERS_CUDAID
+\addindex __env__STARPU_WORKERS_CUDAID
+Similarly to the \ref STARPU_WORKERS_CPUID environment variable, it is
+possible to select which CUDA devices should be used by StarPU. On a machine
+equipped with 4 GPUs, setting <c>STARPU_WORKERS_CUDAID = "1 3"</c> and
+<c>STARPU_NCUDA=2</c> specifies that 2 CUDA workers should be created, and that
+they should use CUDA devices #1 and #3 (the logical ordering of the devices is
+the one reported by CUDA).
+
+This variable is ignored if the field
+starpu_conf::use_explicit_workers_cuda_gpuid passed to starpu_init()
+is set.
 </dd>
 
 <dt>STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY</dt>
@@ -394,6 +313,93 @@ it is therefore necessary to disable asynchronous data transfers.
 \anchor STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY
 \addindex __env__STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY
 Disable asynchronous copies between CPU and CUDA devices.
+
+See also \ref STARPU_DISABLE_ASYNCHRONOUS_COPY and \ref
+STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY.
+</dd>
+
+<dt>STARPU_ENABLE_CUDA_GPU_GPU_DIRECT</dt>
+<dd>
+\anchor STARPU_ENABLE_CUDA_GPU_GPU_DIRECT
+\addindex __env__STARPU_ENABLE_CUDA_GPU_GPU_DIRECT
+Enable (1) or Disable (0) direct CUDA transfers from GPU to GPU, without copying
+through RAM. The default is Enabled.
+This permits to test the performance effect of GPU-Direct.
+</dd>
+
+<dt>STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES</dt>
+<dd>
+\anchor STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES
+\addindex __env__STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES
+Specify if CUDA workers should do only fast allocations
+when running the datawizard progress of
+other memory nodes. This will pass the internal value
+_STARPU_DATAWIZARD_ONLY_FAST_ALLOC to allocation methods.
+Default value is 0, allowing CUDA workers to do slow allocations.
+
+This can also be specified with starpu_conf::cuda_only_fast_alloc_other_memnodes.
+</dd>
+
+</dl>
+
+\subsection openclWorkers OpenCL Workers
+<dl>
+<dt>STARPU_NOPENCL</dt>
+<dd>
+\anchor STARPU_NOPENCL
+\addindex __env__STARPU_NOPENCL
+Specify the number of OpenCL devices that StarPU can use. If
+\ref STARPU_NOPENCL is lower than the number of physical devices, it is
+possible to select which GPU devices should be used by the means of the
+environment variable \ref STARPU_WORKERS_OPENCLID. By default, StarPU will
+create as many OpenCL workers as there are GPU devices.
+
+Note that by default StarPU will launch CUDA workers on GPU devices.
+You need to disable CUDA to allow the creation of OpenCL workers.
+</dd>
+
+<dt>STARPU_WORKERS_OPENCLID</dt>
+<dd>
+\anchor STARPU_WORKERS_OPENCLID
+\addindex __env__STARPU_WORKERS_OPENCLID
+Similarly to the \ref STARPU_WORKERS_CPUID environment variable, it is
+possible to select which GPU devices should be used by StarPU. On a machine
+equipped with 4 GPUs, setting <c>STARPU_WORKERS_OPENCLID = "1 3"</c> and
+<c>STARPU_NOPENCL=2</c> specifies that 2 OpenCL workers should be
+created, and that they should use GPU devices #1 and #3.
+
+This variable is ignored if the field
+starpu_conf::use_explicit_workers_opencl_gpuid passed to starpu_init()
+is set.
+</dd>
+
+<dt>STARPU_OPENCL_PIPELINE</dt>
+<dd>
+\anchor STARPU_OPENCL_PIPELINE
+\addindex __env__STARPU_OPENCL_PIPELINE
+Specify how many asynchronous tasks are submitted in advance on OpenCL
+devices. This for instance permits to overlap task management with the execution
+of previous tasks, but it also allows concurrent execution on Fermi cards, which
+otherwise bring spurious synchronizations. The default is 2. Setting the value to 0 forces a synchronous
+execution of all tasks.
+</dd>
+
+<dt>STARPU_OPENCL_ON_CPUS</dt>
+<dd>
+\anchor STARPU_OPENCL_ON_CPUS
+\addindex __env__STARPU_OPENCL_ON_CPUS
+By default, the OpenCL driver only enables GPU and accelerator
+devices. By setting the environment variable \ref STARPU_OPENCL_ON_CPUS
+to 1, the OpenCL driver will also enable CPU devices.
+</dd>
+
+<dt>STARPU_OPENCL_ONLY_ON_CPUS</dt>
+<dd>
+\anchor STARPU_OPENCL_ONLY_ON_CPUS
+\addindex __env__STARPU_OPENCL_ONLY_ON_CPUS
+By default, the OpenCL driver enables GPU and accelerator
+devices. By setting the environment variable \ref STARPU_OPENCL_ONLY_ON_CPUS
+to 1, the OpenCL driver will ONLY enable CPU devices.
 </dd>
 
 <dt>STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY</dt>
@@ -404,77 +410,76 @@ Disable asynchronous copies between CPU and OpenCL devices.
 The AMD implementation of OpenCL is known to
 fail when copying data asynchronously. When using this implementation,
 it is therefore necessary to disable asynchronous data transfers.
-</dd>
 
-<dt>STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY</dt>
-<dd>
-\anchor STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY
-\addindex __env__STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY
-Disable asynchronous copies between CPU and MIC devices.
+See also \ref STARPU_DISABLE_ASYNCHRONOUS_COPY and \ref
+STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY.
 </dd>
+</dl>
 
-<dt>STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY</dt>
-<dd>
-\anchor STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY
-\addindex __env__STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY
-Disable asynchronous copies between CPU and MPI Slave devices.
-</dd>
 
-<dt>STARPU_ENABLE_CUDA_GPU_GPU_DIRECT</dt>
+\subsection mpimsWorkers MPI Master Slave Workers
+<dl>
+<dt>STARPU_NMPI_MS</dt>
 <dd>
-\anchor STARPU_ENABLE_CUDA_GPU_GPU_DIRECT
-\addindex __env__STARPU_ENABLE_CUDA_GPU_GPU_DIRECT
-Enable (1) or Disable (0) direct CUDA transfers from GPU to GPU, without copying
-through RAM. The default is Enabled.
-This permits to test the performance effect of GPU-Direct.
+\anchor STARPU_NMPI_MS
+\addindex __env__STARPU_NMPI_MS
+Specify the number of MPI master slave devices that StarPU can use.
 </dd>
 
-<dt>STARPU_DISABLE_PINNING</dt>
+<dt>STARPU_NMPIMSTHREADS</dt>
 <dd>
-\anchor STARPU_DISABLE_PINNING
-\addindex __env__STARPU_DISABLE_PINNING
-Disable (1) or Enable (0) pinning host memory allocated through starpu_malloc, starpu_memory_pin
-and friends.  The default is Enabled.
-This permits to test the performance effect of memory pinning.
+\anchor STARPU_NMPIMSTHREADS
+\addindex __env__STARPU_NMPIMSTHREADS
+Number of threads to use on the MPI Slave devices.
 </dd>
 
-<dt>STARPU_BACKOFF_MIN</dt>
+<dt>STARPU_MPI_MASTER_NODE</dt>
 <dd>
-\anchor STARPU_BACKOFF_MIN
-\addindex __env__STARPU_BACKOFF_MIN
-Set minimum exponential backoff of number of cycles to pause when spinning. Default value is 1.
+\anchor STARPU_MPI_MASTER_NODE
+\addindex __env__STARPU_MPI_MASTER_NODE
+This variable allows to choose which MPI node (with the MPI ID) will be the master.
 </dd>
 
-<dt>STARPU_BACKOFF_MAX</dt>
+<dt>STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY</dt>
 <dd>
-\anchor STARPU_BACKOFF_MAX
-\addindex __env__STARPU_BACKOFF_MAX
-Set maximum exponential backoff of number of cycles to pause when spinning. Default value is 32.
+\anchor STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY
+\addindex __env__STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY
+Disable asynchronous copies between CPU and MPI Slave devices.
 </dd>
 
-<dt>STARPU_MIC_SINK_PROGRAM_NAME</dt>
+</dl>
+
+\subsection mpiConf MPI Configuration
+<dl>
+
+<dt>STARPU_MPI_THREAD_CPUID</dt>
 <dd>
-\anchor STARPU_MIC_SINK_PROGRAM_NAME
-\addindex __env__STARPU_MIC_SINK_PROGRAM_NAME
-todo
+\anchor STARPU_MPI_THREAD_CPUID
+\addindex __env__STARPU_MPI_THREAD_CPUID
+When defined, this makes StarPU bind its MPI thread to the given CPU ID. Setting
+it to -1 (the default value) will use a reserved CPU, subtracted from the CPU
+workers.
 </dd>
 
-<dt>STARPU_MIC_SINK_PROGRAM_PATH</dt>
+<dt>STARPU_MPI_THREAD_COREID</dt>
 <dd>
-\anchor STARPU_MIC_SINK_PROGRAM_PATH
-\addindex __env__STARPU_MIC_SINK_PROGRAM_PATH
-todo
+\anchor STARPU_MPI_THREAD_COREID
+\addindex __env__STARPU_MPI_THREAD_COREID
+Same as \ref STARPU_MPI_THREAD_CPUID, but bind the MPI thread to the given core
+ID, instead of the PU (hyperthread).
 </dd>
 
-<dt>STARPU_MIC_PROGRAM_PATH</dt>
+<dt>STARPU_MPI_NOBIND</dt>
 <dd>
-\anchor STARPU_MIC_PROGRAM_PATH
-\addindex __env__STARPU_MIC_PROGRAM_PATH
-todo
+\anchor STARPU_MPI_NOBIND
+\addindex __env__STARPU_MPI_NOBIND
+Setting it to non-zero will prevent StarPU from binding the MPI thread to
+a separate core. This is for instance useful when running the testsuite on a single system.
 </dd>
 
 </dl>
 
+
 \section ConfiguringTheSchedulingEngine Configuring The Scheduling Engine
 
 <dl>
@@ -494,6 +499,7 @@ Use <c>STARPU_SCHED=help</c> to get the list of available schedulers.
 \anchor STARPU_MIN_PRIO_env
 \addindex __env__STARPU_MIN_PRIO
 Set the minimum priority used by priorities-aware schedulers.
+The flag can also be set through the field starpu_conf::global_sched_ctx_min_priority.
 </dd>
 
 <dt>STARPU_MAX_PRIO</dt>
@@ -501,6 +507,7 @@ Set the mininum priority used by priorities-aware schedulers.
 \anchor STARPU_MAX_PRIO_env
 \addindex __env__STARPU_MAX_PRIO
 Set the maximum priority used by priorities-aware schedulers.
+The flag can also be set through the field starpu_conf::global_sched_ctx_max_priority.
 </dd>
 
 <dt>STARPU_CALIBRATE</dt>
@@ -576,6 +583,22 @@ pick up a task which has most of its data already available. Setting this to 0
 disables this.
 </dd>
 
+<dt>STARPU_SCHED_SORTED_ABOVE</dt>
+<dd>
+\anchor STARPU_SCHED_SORTED_ABOVE
+\addindex __env__STARPU_SCHED_SORTED_ABOVE
+For a modular scheduler with queues above the decision component, they are
+usually sorted by priority. Setting this to 0 disables this.
+</dd>
+
+<dt>STARPU_SCHED_SORTED_BELOW</dt>
+<dd>
+\anchor STARPU_SCHED_SORTED_BELOW
+\addindex __env__STARPU_SCHED_SORTED_BELOW
+For a modular scheduler with queues below the decision component, they are
+usually sorted by priority. Setting this to 0 disables this.
+</dd>
+
 <dt>STARPU_IDLE_POWER</dt>
 <dd>
 \anchor STARPU_IDLE_POWER
@@ -738,6 +761,27 @@ block when the memory allocation required for network reception overflows the
 available main memory (as typically set by \ref STARPU_LIMIT_CPU_MEM)
 </dd>
 
+<dt>STARPU_MPI_EARLYDATA_ALLOCATE</dt>
+<dd>
+\anchor STARPU_MPI_EARLYDATA_ALLOCATE
+\addindex __env__STARPU_MPI_EARLYDATA_ALLOCATE
+When set to 1, the MPI Driver will immediately allocate the data for early
+requests instead of issuing a data request and blocking. The default value is 0,
+issuing a data request. Because it is an early request and we do not know its
+real priority, the data request will assume \ref STARPU_DEFAULT_PRIO. In cases
+where there are many data requests with priorities greater than
+\ref STARPU_DEFAULT_PRIO the MPI driver could be blocked for long periods.
+</dd>
+
+<dt>STARPU_SIMGRID</dt>
+<dd>
+\anchor STARPU_SIMGRID
+\addindex __env__STARPU_SIMGRID
+When set to 1 (the default is 0), this makes StarPU check that it was really
+built with simulation support. This is convenient in scripts to avoid using a
+native version, which would try to update performance models...
+</dd>
+
 <dt>STARPU_SIMGRID_TRANSFER_COST</dt>
 <dd>
 \anchor STARPU_SIMGRID_TRANSFER_COST
@@ -814,13 +858,6 @@ and allows studying scheduling overhead of the runtime system. However,
 it also makes simulation non-deterministic.
 </dd>
 
-<dt>STARPU_SINK</dt>
-<dd>
-\anchor STARPU_SINK
-\addindex __env__STARPU_SINK
-Variable defined by StarPU when running MPI Xeon PHI on the sink.
-</dd>
-
 </dl>
 
 \section MiscellaneousAndDebug Miscellaneous And Debug
@@ -857,7 +894,7 @@ performance model files. The default is <c>$STARPU_HOME/.starpu/sampling</c>.
 <dd>
 \anchor STARPU_PERF_MODEL_HOMOGENEOUS_CPU
 \addindex __env__STARPU_PERF_MODEL_HOMOGENEOUS_CPU
-When this is set to 0, StarPU will assume that CPU devices do not have the same
+When set to 0, StarPU will assume that CPU devices do not have the same
 performance, and thus use different performance models for them, thus making
 kernel calibration much longer, since measurements have to be made for each CPU
 core.
@@ -867,7 +904,7 @@ core.
 <dd>
 \anchor STARPU_PERF_MODEL_HOMOGENEOUS_CUDA
 \addindex __env__STARPU_PERF_MODEL_HOMOGENEOUS_CUDA
-When this is set to 1, StarPU will assume that all CUDA devices have the same
+When set to 1, StarPU will assume that all CUDA devices have the same
 performance, and thus share performance models for them, thus allowing kernel
 calibration to be much faster, since measurements only have to be once for all
 CUDA GPUs.
@@ -877,27 +914,17 @@ CUDA GPUs.
 <dd>
 \anchor STARPU_PERF_MODEL_HOMOGENEOUS_OPENCL
 \addindex __env__STARPU_PERF_MODEL_HOMOGENEOUS_OPENCL
-When this is set to 1, StarPU will assume that all OPENCL devices have the same
+When set to 1, StarPU will assume that all OPENCL devices have the same
 performance, and thus share performance models for them, thus allowing kernel
 calibration to be much faster, since measurements only have to be once for all
 OPENCL GPUs.
 </dd>
 
-<dt>STARPU_PERF_MODEL_HOMOGENEOUS_MIC</dt>
-<dd>
-\anchor STARPU_PERF_MODEL_HOMOGENEOUS_MIC
-\addindex __env__STARPU_PERF_MODEL_HOMOGENEOUS_MIC
-When this is set to 1, StarPU will assume that all MIC devices have the same
-performance, and thus share performance models for them, thus allowing kernel
-calibration to be much faster, since measurements only have to be once for all
-MIC GPUs.
-</dd>
-
 <dt>STARPU_PERF_MODEL_HOMOGENEOUS_MPI_MS</dt>
 <dd>
 \anchor STARPU_PERF_MODEL_HOMOGENEOUS_MPI_MS
 \addindex __env__STARPU_PERF_MODEL_HOMOGENEOUS_MPI_MS
-When this is set to 1, StarPU will assume that all MPI Slave devices have the same
+When set to 1, StarPU will assume that all MPI Slave devices have the same
 performance, and thus share performance models for them, thus allowing kernel
 calibration to be much faster, since measurements only have to be once for all
 MPI Slaves.
@@ -1123,15 +1150,6 @@ GPUs (or in main memory, when using out of core), when performing an asynchronou
 writeback pass. The default is 10%.
 </dd>
 
-<dt>STARPU_DIDUSE_BARRIER</dt>
-<dd>
-\anchor STARPU_DIDUSE_BARRIER
-\addindex __env__STARPU_DIDUSE_BARRIER
-When set to 1, StarPU will never evict a piece of data if it has not been used
-by at least one task. This avoids odd behaviors under high memory pressure, but
-can lead to deadlocks, so is to be considered experimental only.
-</dd>
-
 <dt>STARPU_DISK_SWAP</dt>
 <dd>
 \anchor STARPU_DISK_SWAP
@@ -1396,9 +1414,9 @@ accesses (see \ref ConcurrentDataAccess).
 When defined, NUMA nodes are taking into account by StarPU. Otherwise, memory
 is considered as only one node. This is experimental for now.
 
-When enabled, STARPU_MAIN_MEMORY is a pointer to the NUMA node associated to the
+When enabled, ::STARPU_MAIN_RAM is a pointer to the NUMA node associated to the
 first CPU worker if it exists, the NUMA node associated to the first GPU discovered otherwise.
-If StarPU doesn't find any NUMA node after these step, STARPU_MAIN_MEMORY is the first NUMA node
+If StarPU doesn't find any NUMA node after these steps, ::STARPU_MAIN_RAM is the first NUMA node
 discovered by StarPU.
 </dd>
 
@@ -1406,15 +1424,18 @@ discovered by StarPU.
 <dd>
 \anchor STARPU_IDLE_FILE
 \addindex __env__STARPU_IDLE_FILE
-If the environment variable STARPU_IDLE_FILE is defined, a file named after its contents will be created at the end of the execution.
-The file will contain the sum of the idle times of all the workers.
+When defined, a file named after its contents will be created at the
+end of the execution. This file will contain the sum of the idle times
+of all the workers.
 </dd>
 
 <dt>STARPU_HWLOC_INPUT</dt>
 <dd>
 \anchor STARPU_HWLOC_INPUT
 \addindex __env__STARPU_HWLOC_INPUT
-If the environment variable STARPU_HWLOC_INPUT is defined to the path of an XML file, hwloc will be made to use it as input instead of detecting the current platform topology, which can save significant initialization time.
+When defined to the path of an XML file, \c hwloc will use this file
+as input instead of detecting the current platform topology, which can
+save significant initialization time.
 
 To produce this XML file, use <c>lstopo file.xml</c>
 </dd>
@@ -1423,7 +1444,7 @@ To produce this XML file, use <c>lstopo file.xml</c>
 <dd>
 \anchor STARPU_CATCH_SIGNALS
 \addindex __env__STARPU_CATCH_SIGNALS
-By default, StarPU catch signals SIGINT, SIGSEGV and SIGTRAP to
+By default, StarPU catches signals \c SIGINT, \c SIGSEGV and \c SIGTRAP to
 perform final actions such as dumping FxT trace files even though the
 application has crashed. Setting this variable to a value other than 1
 will disable this behaviour. This should be done on JVM systems which

+ 10 - 33
doc/doxygen/chapters/510_configure_options.doxy

@@ -124,7 +124,7 @@ machine which does not have the tools <c>doxygen</c> and <c>latex</c>
 <dd>
 \anchor enable-build-doc-pdf
 \addindex __configure__--enable-build-doc-pdf
-By default, ontly the HTML documentation is generated. Use this option
+By default, only the HTML documentation is generated. Use this option
 to also enable the generation of the PDF documentation. This should be
 done on a machine which does have the tools <c>doxygen</c> and <c>latex</c>
 (plus the packages <c>latex-xcolor</c> and
@@ -139,6 +139,13 @@ Disable the usage of the ICC compiler. When found, some specific ICC
 examples are compiled.
 </dd>
 
+<dt>--with-check-flags</dt>
+<dd>
+\anchor with-check-flags
+\addindex __configure__--with-check-flags
+Specify flags which will be given to the C, CXX and Fortran compilers, when those flags are valid for the compiler.
+</dd>
+
 </dl>
 
 Additionally, the script <c>configure</c> recognize many variables, which
@@ -340,20 +347,6 @@ fail when copying data asynchronously. When using this implementation,
 it is therefore necessary to disable asynchronous data transfers.
 </dd>
 
-<dt>--enable-maxmicthreads</dt>
-<dd>
-\anchor enable-maxmicthreads
-\addindex __configure__--enable-maxmicthreads
-Specify the maximum number of MIC threads
-</dd>
-
-<dt>--disable-asynchronous-mic-copy</dt>
-<dd>
-\anchor disable-asynchronous-mic-copy
-\addindex __configure__--disable-asynchronous-mic-copy
-Disable asynchronous copies between CPU and MIC devices.
-</dd>
-
 <dt>--disable-asynchronous-mpi-master-slave-copy</dt>
 <dd>
 \anchor disable-asynchronous-mpi-master-slave-copy
@@ -461,22 +454,6 @@ Disable the SOCL extension (\ref SOCLOpenclExtensions).  By
 default, it is enabled when an OpenCL implementation is found.
 </dd>
 
-<dt>--with-coi-dir</dt>
-<dd>
-\anchor with-coi-dir
-\addindex __configure__--with-coi-dir
-Specify the directory to the COI library for MIC support.
-The default value is <c>/opt/intel/mic/coi</c>
-</dd>
-
-<dt>--mic-host</dt>
-<dd>
-\anchor mic-host
-\addindex __configure__--mic-host
-Specify the precise MIC architecture host identifier.
-The default value is <c>x86_64-k1om-linux</c>
-</dd>
-
 <dt>--enable-openmp</dt>
 <dd>
 \anchor enable-openmp
@@ -582,14 +559,14 @@ Enable building HDF5 support.
 <dd>
 \anchor with-hdf5-include-dir
 \addindex __configure__--with-hdf5-include-dir
-Specify the directory where is stored the header hdf5.h.
+Specify the directory where the header file \c hdf5.h is stored.
 </dd>
 
 <dt>--with-hdf5-lib-dir=<c>path</c></dt>
 <dd>
 \anchor with-hdf5-lib-dir
 \addindex __configure__--with-hdf5-lib-dir
-Specify the directory where is stored the hdf5 library.
+Specify the directory where the library \c hdf5 is stored.
 </dd>
 
 <dt>--disable-starpufft</dt>

+ 0 - 1
doc/doxygen/chapters/520_files.doxy

@@ -37,7 +37,6 @@
 \file starpu_hash.h
 \file starpu_helper.h
 \file starpu_heteroprio.h
-\file starpu_mic.h
 \file starpu_mpi_ms.h
 \file starpu_mod.f90
 \file starpu_opencl.h

+ 0 - 24
doc/doxygen/chapters/api/versioning.doxy

@@ -1,24 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*! \defgroup API_Versioning Versioning
-
-\fn void starpu_get_version(int *major, int *minor, int *release)
-\ingroup API_Versioning
-Return as 3 integers the version of StarPU used when running the application.
-
-*/
-

+ 1 - 1
doc/doxygen/chapters/code/disk_copy.c

@@ -33,7 +33,7 @@
 
 int main(int argc, char **argv)
 {
-	double * A,*B,*C,*D,*E,*F;
+	double *A, *F;
 
 	/* limit main ram to force to push in disk */
 	setenv("STARPU_LIMIT_CPU_MEM", "160", 1);

doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy_flops.eps → doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.eps


doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy_flops.pdf → doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.pdf


doc/doxygen/chapters/images/starpu_non_linear_memset_regression_based_energy_flops.png → doc/doxygen/chapters/images/starpu_gflops_non_linear_memset_regression_based_energy.png


File diff suppressed because it is too large
+ 3446 - 1032
doc/doxygen/chapters/images/tasks_size_overhead.eps


BIN
doc/doxygen/chapters/images/tasks_size_overhead.pdf


BIN
doc/doxygen/chapters/images/tasks_size_overhead.png


+ 0 - 1
doc/doxygen/doxygen-config.cfg.in

@@ -36,7 +36,6 @@ INPUT                  = @top_srcdir@/doc/doxygen/chapters \
 			 @top_srcdir@/include/starpu.h \
 			 @top_srcdir@/include/starpu_hash.h \
 			 @top_srcdir@/include/starpu_helper.h \
-			 @top_srcdir@/include/starpu_mic.h \
 			 @top_srcdir@/include/starpu_mpi_ms.h \
 			 @top_srcdir@/include/starpu_mod.f90 \
 			 @top_srcdir@/include/starpu_opencl.h \

+ 0 - 1
doc/doxygen/doxygen.cfg

@@ -1615,7 +1615,6 @@ INCLUDE_FILE_PATTERNS  =
 
 PREDEFINED             = STARPU_USE_OPENCL=1 \
                          STARPU_USE_CUDA=1 \
-                         STARPU_USE_MIC=1 \
 			 STARPU_USE_MPI=1 \
 			 STARPU_HAVE_HWLOC=1 \
 			 STARPU_USE_SC_HYPERVISOR=1 \

+ 1 - 1
doc/doxygen/doxygen_filter.sh.in

@@ -19,5 +19,5 @@ if [ "$(basename $1)" == "starpufft.h" ] ; then
 else
     # the macro STARPU_DEPRECATED needs to be removed as it is not properly processed by doxygen
     # lines starting with // in the doxygen input files are considered as comments to be removed
-    sed -e 's/STARPU_DEPRECATED//' $1 | sed -e 's/^\/\/.*//' | sed -e 's/STARPU_TASK_LIST_INLINE//'
+    sed -e 's/STARPU_DEPRECATED//' $1 | sed -e 's/^\/\/.*//' | sed -e 's/STARPU_TASK_LIST_INLINE//' | sed -e 's/STARPU_WARN_UNUSED_RESULT//'
 fi

+ 0 - 7
doc/doxygen/refman.tex

@@ -163,11 +163,6 @@ Documentation License”.
 \hypertarget{FFTSupport}{}
 \input{FFTSupport}
 
-\chapter{MIC Xeon Phi Support}
-\label{MICSupport}
-\hypertarget{MICSupport}{}
-\input{MICSupport}
-
 \chapter{Native Fortran Support}
 \label{NativeFortranSupport}
 \hypertarget{NativeFortranSupport}{}
@@ -242,7 +237,6 @@ Documentation License”.
 \input{group__API__CUDA__Extensions}
 \input{group__API__OpenCL__Extensions}
 \input{group__API__OpenMP__Runtime__Support}
-\input{group__API__MIC__Extensions}
 \input{group__API__Miscellaneous__Helpers}
 \input{group__API__FxT__Support}
 \input{group__API__FFT__Support}
@@ -292,7 +286,6 @@ Documentation License”.
 \input{starpu__hash_8h}
 \input{starpu__helper_8h}
 \input{starpu__heteroprio_8h}
-\input{starpu__mic_8h}
 \input{starpu__mod_8f90}
 \input{starpu__mpi_8h}
 \input{starpu__mpi__lb_8h}

+ 1 - 3
doc/doxygen_dev/Makefile.am

@@ -146,9 +146,6 @@ dox_inputs = $(DOX_CONFIG) 				\
 	$(top_srcdir)/src/drivers/mp_common/mp_common.h	\
 	$(top_srcdir)/src/drivers/mp_common/source_common.h	\
 	$(top_srcdir)/src/drivers/driver_common/driver_common.h	\
-	$(top_srcdir)/src/drivers/mic/driver_mic_sink.h	\
-	$(top_srcdir)/src/drivers/mic/driver_mic_source.h	\
-	$(top_srcdir)/src/drivers/mic/driver_mic_common.h	\
 	$(top_srcdir)/src/profiling/profiling.h	\
 	$(top_srcdir)/src/profiling/bound.h	\
 	$(top_srcdir)/src/util/starpu_data_cpy.h	\
@@ -219,6 +216,7 @@ $(DOX_TAG): $(dox_inputs)
 
 $(DOX_PDF): $(DOX_TAG) refman.tex
 	@cp $(top_srcdir)/doc/doxygen_dev/chapters/version.sty $(DOX_LATEX_DIR)
+	@cp $(top_srcdir)/doc/doxygen_dev/modules.tex $(DOX_LATEX_DIR)
 	@echo $(PDFLATEX) $(DOX_LATEX_DIR)/refman.tex
 	@cd $(DOX_LATEX_DIR) ;\
 	rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out ;\

+ 0 - 3
doc/doxygen_dev/doxygen-config.cfg.in

@@ -47,9 +47,6 @@ INPUT                  = @top_srcdir@/doc/doxygen_dev/chapters         \
 			 @top_srcdir@/src/drivers/mp_common/mp_common.h \
 			 @top_srcdir@/src/drivers/mp_common/source_common.h \
 			 @top_srcdir@/src/drivers/driver_common/driver_common.h \
-			 @top_srcdir@/src/drivers/mic/driver_mic_sink.h \
-			 @top_srcdir@/src/drivers/mic/driver_mic_source.h \
-			 @top_srcdir@/src/drivers/mic/driver_mic_common.h \
 			 @top_srcdir@/src/profiling/profiling.h \
 			 @top_srcdir@/src/profiling/bound.h \
 			 @top_srcdir@/src/util/starpu_data_cpy.h \

+ 0 - 1
doc/doxygen_dev/doxygen.cfg

@@ -1615,7 +1615,6 @@ INCLUDE_FILE_PATTERNS  =
 
 PREDEFINED             = STARPU_USE_OPENCL=1 \
                          STARPU_USE_CUDA=1 \
-                         STARPU_USE_MIC=1 \
 			 STARPU_USE_MPI=1 \
 			 STARPU_HAVE_HWLOC=1 \
 			 STARPU_USE_SC_HYPERVISOR=1 \

+ 4 - 0
doc/doxygen_dev/modules.tex

@@ -0,0 +1,4 @@
+\section{Modules}
+Here is a list of all modules\+:\begin{DoxyCompactList}
+\item \contentsline{section}{Workers}{\pageref{group__workers}}{}
+\end{DoxyCompactList}

+ 0 - 3
doc/doxygen_dev/refman.tex

@@ -98,9 +98,6 @@ Documentation License”.
 \input{driver__cpu_8h}
 \input{driver__cuda_8h}
 \input{driver__disk_8h}
-\input{driver__mic__common_8h}
-\input{driver__mic__sink_8h}
-\input{driver__mic__source_8h}
 \input{driver__mpi__common_8h}
 \input{driver__mpi__sink_8h}
 \input{driver__mpi__source_8h}

+ 1 - 1
examples/Makefile.am

@@ -106,6 +106,7 @@ examplebin_PROGRAMS =
 noinst_HEADERS = 				\
 	axpy/axpy.h                             \
 	cg/cg.h					\
+	cg/cg_kernels.c				\
 	heat/lu_kernels_model.h			\
 	heat/dw_sparse_cg.h			\
 	heat/heat.h				\
@@ -869,7 +870,6 @@ if !STARPU_NO_BLAS_LIB
 
 cg_cg_SOURCES =					\
 	cg/cg.c					\
-	cg/cg_kernels.c				\
 	common/blas.c
 
 cg_cg_LDADD =					\

+ 6 - 2
examples/basic_examples/multiformat_conversion_codelets.c

@@ -41,6 +41,7 @@ struct starpu_codelet cpu_to_cuda_cl =
 	.cuda_funcs = {cpu_to_cuda_cuda_func},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 1,
+	.modes = {STARPU_RW},
 	.name = "codelet_cpu_to_cuda"
 };
 
@@ -48,6 +49,7 @@ struct starpu_codelet cuda_to_cpu_cl =
 {
 	.cpu_funcs = {cuda_to_cpu},
 	.nbuffers = 1,
+	.modes = {STARPU_RW},
 	.name = "codelet_cude_to_cpu"
 };
 #endif
@@ -73,12 +75,14 @@ struct starpu_codelet cpu_to_opencl_cl =
 {
 	.opencl_funcs = {cpu_to_opencl_opencl_func},
 	.opencl_flags = {STARPU_OPENCL_ASYNC},
-	.nbuffers = 1
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
 };
 
 struct starpu_codelet opencl_to_cpu_cl =
 {
 	.cpu_funcs = {opencl_to_cpu},
-	.nbuffers = 1
+	.nbuffers = 1,
+	.modes = {STARPU_RW},
 };
 #endif

+ 55 - 159
examples/cg/cg.c

@@ -19,11 +19,6 @@
 #include <starpu.h>
 #include <common/blas.h>
 
-#ifdef STARPU_USE_CUDA
-#include <cuda.h>
-#endif
-
-#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
 
 /*
  *	Conjugate Gradient
@@ -68,37 +63,34 @@
 
 #include "cg.h"
 
-static int long long n = 4096;
-static int nblocks = 8;
-static int use_reduction = 1;
+static int copy_handle(starpu_data_handle_t dst, starpu_data_handle_t src, unsigned nblocks);
 
-static starpu_data_handle_t A_handle, b_handle, x_handle;
-static TYPE *A, *b, *x;
+#define HANDLE_TYPE_VECTOR starpu_data_handle_t
+#define HANDLE_TYPE_MATRIX starpu_data_handle_t
+#define TASK_INSERT(cl, ...) starpu_task_insert(cl, ##__VA_ARGS__)
+#define GET_VECTOR_BLOCK(v, i) starpu_data_get_sub_data(v, 1, i)
+#define GET_MATRIX_BLOCK(m, i, j) starpu_data_get_sub_data(m, 2, i, j)
+#define BARRIER()
+#define GET_DATA_HANDLE(handle)
+#define FPRINTF_SERVER FPRINTF
 
-#ifdef STARPU_QUICK_CHECK
-static int i_max = 5;
-#elif !defined(STARPU_LONG_CHECK)
-static int i_max = 100;
-#else
-static int i_max = 1000;
-#endif
-static double eps = (10e-14);
+#include "cg_kernels.c"
 
-static starpu_data_handle_t r_handle, d_handle, q_handle;
+static TYPE *A, *b, *x;
 static TYPE *r, *d, *q;
 
-static starpu_data_handle_t dtq_handle, rtr_handle;
-static TYPE dtq, rtr;
+static int copy_handle(starpu_data_handle_t dst, starpu_data_handle_t src, unsigned nblocks)
+{
+	unsigned b;
 
-extern struct starpu_codelet accumulate_variable_cl;
-extern struct starpu_codelet accumulate_vector_cl;
-extern struct starpu_codelet bzero_variable_cl;
-extern struct starpu_codelet bzero_vector_cl;
+	for (b = 0; b < nblocks; b++)
+		starpu_data_cpy(starpu_data_get_sub_data(dst, 1, b), starpu_data_get_sub_data(src, 1, b), 1, NULL, NULL);
+	return 0;
+}
 
 /*
  *	Generate Input data
  */
-
 static void generate_random_problem(void)
 {
 	int i, j;
@@ -264,162 +256,48 @@ static void display_matrix(void)
 }
 #endif
 
-/*
- *	Main loop
- */
-
-static int cg(void)
+static void display_x_result(void)
 {
-	double delta_new, delta_0;
-
-	int i = 0;
-	int ret;
-
-	/* r <- b */
-	ret = copy_handle(r_handle, b_handle, nblocks);
-	if (ret == -ENODEV) return ret;
-
-	/* r <- r - A x */
-	ret = gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks, use_reduction);
-	if (ret == -ENODEV) return ret;
-
-	/* d <- r */
-	ret = copy_handle(d_handle, r_handle, nblocks);
-	if (ret == -ENODEV) return ret;
+	int j, i;
+	starpu_data_handle_t sub;
 
-	/* delta_new = dot(r,r) */
-	ret = dot_kernel(r_handle, r_handle, rtr_handle, nblocks, use_reduction);
-	if (ret == -ENODEV) return ret;
+	FPRINTF(stderr, "Computed X vector:\n");
 
-	starpu_data_acquire(rtr_handle, STARPU_R);
-	delta_new = rtr;
-	delta_0 = delta_new;
-	starpu_data_release(rtr_handle);
+	int block_size = n / nblocks;
 
-	FPRINTF(stderr, "*************** INITIAL ************ \n");
-	FPRINTF(stderr, "Delta 0: %e\n", delta_new);
-
-	double start;
-	double end;
-	start = starpu_timing_now();
-
-	while ((i < i_max) && ((double)delta_new > (double)(eps*eps*delta_0)))
+	for (j = 0; j < nblocks; j++)
 	{
-		double delta_old;
-		double alpha, beta;
-
-		starpu_iteration_push(i);
-
-		/* q <- A d */
-		gemv_kernel(q_handle, A_handle, d_handle, 0.0, 1.0, nblocks, use_reduction);
-
-		/* dtq <- dot(d,q) */
-		dot_kernel(d_handle, q_handle, dtq_handle, nblocks, use_reduction);
-
-		/* alpha = delta_new / dtq */
-		starpu_data_acquire(dtq_handle, STARPU_R);
-		alpha = delta_new/dtq;
-		starpu_data_release(dtq_handle);
-
-		/* x <- x + alpha d */
-		axpy_kernel(x_handle, d_handle, alpha, nblocks);
-
-		if ((i % 50) == 0)
-		{
-			/* r <- b */
-			copy_handle(r_handle, b_handle, nblocks);
-
-			/* r <- r - A x */
-			gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks, use_reduction);
-		}
-		else
-		{
-			/* r <- r - alpha q */
-			axpy_kernel(r_handle, q_handle, -alpha, nblocks);
-		}
-
-		/* delta_new = dot(r,r) */
-		dot_kernel(r_handle, r_handle, rtr_handle, nblocks, use_reduction);
-
-		starpu_data_acquire(rtr_handle, STARPU_R);
-		delta_old = delta_new;
-		delta_new = rtr;
-		beta = delta_new / delta_old;
-		starpu_data_release(rtr_handle);
-
-		/* d <- beta d + r */
-		scal_axpy_kernel(d_handle, beta, r_handle, 1.0, nblocks);
-
-		if ((i % 10) == 0)
+		sub = starpu_data_get_sub_data(x_handle, 1, j);
+		starpu_data_acquire(sub, STARPU_R);
+		for (i = 0; i < block_size; i++)
 		{
-			/* We here take the error as ||r||_2 / (n||b||_2) */
-			double error = sqrt(delta_new/delta_0)/(1.0*n);
-			FPRINTF(stderr, "*****************************************\n");
-			FPRINTF(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
+			FPRINTF(stderr, "% 02.2e\n", x[j*block_size + i]);
 		}
-
-		starpu_iteration_pop();
-		i++;
+		starpu_data_release(sub);
 	}
-
-	end = starpu_timing_now();
-
-	double timing = end - start;
-	FPRINTF(stderr, "Total timing : %2.2f seconds\n", timing/10e6);
-	FPRINTF(stderr, "Seconds per iteration : %2.2e\n", timing/10e6/i);
-	return 0;
 }
 
-static int check(void)
-{
-	return 0;
-}
 
 static void parse_args(int argc, char **argv)
 {
 	int i;
 	for (i = 1; i < argc; i++)
 	{
-	        if (strcmp(argv[i], "-n") == 0)
-		{
-			n = (int long long)atoi(argv[++i]);
-			continue;
-		}
-
-	        if (strcmp(argv[i], "-maxiter") == 0)
-		{
-			i_max = atoi(argv[++i]);
-			if (i_max <= 0)
-			{
-				FPRINTF(stderr, "the number of iterations must be positive, not %d\n", i_max);
-				exit(EXIT_FAILURE);
-			}
-			continue;
-		}
-
-	        if (strcmp(argv[i], "-nblocks") == 0)
-		{
-			nblocks = atoi(argv[++i]);
-			continue;
-		}
-
-	        if (strcmp(argv[i], "-no-reduction") == 0)
-		{
-			use_reduction = 0;
-			continue;
-		}
-
 		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-help") == 0)
 		{
-			FPRINTF(stderr, "usage: %s [-h] [-nblocks #blocks] [-n problem_size] [-no-reduction] [-maxiter i]\n", argv[0]);
+			FPRINTF_SERVER(stderr, "usage: %s [-h] [-nblocks #blocks] [-display-result] [-n problem_size] [-no-reduction] [-maxiter i]\n", argv[0]);
 			exit(-1);
 		}
-        }
+	}
+
+	parse_common_args(argc, argv);
 }
 
+
 int main(int argc, char **argv)
 {
 	int ret;
+	double start, end;
 
 	/* Not supported yet */
 	if (starpu_get_env_number_default("STARPU_GLOBAL_ARBITER", 0) > 0)
@@ -431,12 +309,27 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	if (starpu_cpu_worker_get_count() + starpu_cuda_worker_get_count() + starpu_opencl_worker_get_count() == 0)
+	{
+		starpu_shutdown();
+		return 77;
+	}
 
 	starpu_cublas_init();
 
+	FPRINTF(stderr, "************** PARAMETERS ***************\n");
+	FPRINTF(stderr, "Problem size (-n): %lld\n", n);
+	FPRINTF(stderr, "Maximum number of iterations (-maxiter): %d\n", i_max);
+	FPRINTF(stderr, "Number of blocks (-nblocks): %d\n", nblocks);
+	FPRINTF(stderr, "Reduction (-no-reduction): %s\n", use_reduction ? "enabled" : "disabled");
+
+	start = starpu_timing_now();
 	generate_random_problem();
 	register_data();
 	partition_data();
+	end = starpu_timing_now();
+
+	FPRINTF(stderr, "Problem initialization timing : %2.2f seconds\n", (end-start)/1e6);
 
 	ret = cg();
 	if (ret == -ENODEV)
@@ -445,10 +338,13 @@ int main(int argc, char **argv)
 		goto enodev;
 	}
 
-	ret = check();
-
 	starpu_task_wait_for_all();
 
+	if (display_result)
+	{
+		display_x_result();
+	}
+
 enodev:
 	unregister_data();
 	free_data();

+ 0 - 27
examples/cg/cg.h

@@ -32,7 +32,6 @@
 #define TYPE	double
 #define GEMV	STARPU_DGEMV
 #define DOT	STARPU_DDOT
-#define GEMV	STARPU_DGEMV
 #define AXPY	STARPU_DAXPY
 #define SCAL	STARPU_DSCAL
 #define cublasdot	cublasDdot
@@ -44,7 +43,6 @@
 #define TYPE	float
 #define GEMV	STARPU_SGEMV
 #define DOT	STARPU_SDOT
-#define GEMV	STARPU_SGEMV
 #define AXPY	STARPU_SAXPY
 #define SCAL	STARPU_SSCAL
 #define cublasdot	cublasSdot
@@ -54,29 +52,4 @@
 #define cublasscal	cublasSscal
 #endif
 
-int dot_kernel(starpu_data_handle_t v1,
-	       starpu_data_handle_t v2,
-	       starpu_data_handle_t s,
-	       unsigned nblocks,
-	       int use_reduction);
-
-int gemv_kernel(starpu_data_handle_t v1,
-                starpu_data_handle_t matrix, 
-                starpu_data_handle_t v2,
-                TYPE p1, TYPE p2,
-		unsigned nblocks,
-		int use_reduction);
-
-int axpy_kernel(starpu_data_handle_t v1,
-		starpu_data_handle_t v2, TYPE p1,
-		unsigned nblocks);
-
-int scal_axpy_kernel(starpu_data_handle_t v1, TYPE p1,
-		     starpu_data_handle_t v2, TYPE p2,
-		     unsigned nblocks);
-
-int copy_handle(starpu_data_handle_t dst,
-		starpu_data_handle_t src,
-		unsigned nblocks);
-
 #endif /* __STARPU_EXAMPLE_CG_H__ */

+ 237 - 51
examples/cg/cg_kernels.c

@@ -23,11 +23,43 @@
 #include <limits.h>
 
 #ifdef STARPU_USE_CUDA
+#include <cuda.h>
 #include <starpu_cublas_v2.h>
 static const TYPE gp1 = 1.0;
 static const TYPE gm1 = -1.0;
 #endif
 
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+static int nblocks = 8;
+
+#ifdef STARPU_QUICK_CHECK
+static int i_max = 5;
+static int long long n = 2048;
+#elif !defined(STARPU_LONG_CHECK)
+static int long long n = 4096;
+static int i_max = 100;
+#else
+static int long long n = 4096;
+static int i_max = 1000;
+#endif
+static double eps = (10e-14);
+
+int use_reduction = 1;
+int display_result = 0;
+
+HANDLE_TYPE_MATRIX A_handle;
+HANDLE_TYPE_VECTOR b_handle;
+HANDLE_TYPE_VECTOR x_handle;
+
+HANDLE_TYPE_VECTOR r_handle;
+HANDLE_TYPE_VECTOR d_handle;
+HANDLE_TYPE_VECTOR q_handle;
+
+starpu_data_handle_t dtq_handle;
+starpu_data_handle_t rtr_handle;
+TYPE dtq, rtr;
+
 #if 0
 static void print_vector_from_descr(unsigned nx, TYPE *v)
 {
@@ -59,7 +91,7 @@ static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nim
 	(void)task;
 	(void)nimpl;
 	enum starpu_worker_archtype type = starpu_worker_get_type(workerid);
-	if (type == STARPU_CPU_WORKER || type == STARPU_OPENCL_WORKER || type == STARPU_MIC_WORKER)
+	if (type == STARPU_CPU_WORKER || type == STARPU_OPENCL_WORKER)
 		return 1;
 
 #ifdef STARPU_USE_CUDA
@@ -120,9 +152,10 @@ struct starpu_codelet accumulate_variable_cl =
 	.cuda_funcs = {accumulate_variable_cuda},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2,
-	.model = &accumulate_variable_model
+	.model = &accumulate_variable_model,
+	.name = "accumulate_variable"
 };
 
 #ifdef STARPU_USE_CUDA
@@ -164,9 +197,10 @@ struct starpu_codelet accumulate_vector_cl =
 	.cuda_funcs = {accumulate_vector_cuda},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2,
-	.model = &accumulate_vector_model
+	.model = &accumulate_vector_model,
+	.name = "accumulate_vector"
 };
 
 /*
@@ -210,7 +244,8 @@ struct starpu_codelet bzero_variable_cl =
 #endif
 	.modes = {STARPU_W},
 	.nbuffers = 1,
-	.model = &bzero_variable_model
+	.model = &bzero_variable_model,
+	.name = "bzero_variable"
 };
 
 #ifdef STARPU_USE_CUDA
@@ -251,7 +286,8 @@ struct starpu_codelet bzero_vector_cl =
 #endif
 	.modes = {STARPU_W},
 	.nbuffers = 1,
-	.model = &bzero_vector_model
+	.model = &bzero_vector_model,
+	.name = "bzero_vector"
 };
 
 /*
@@ -311,14 +347,14 @@ static struct starpu_codelet dot_kernel_cl =
 #endif
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 3,
-	.model = &dot_kernel_model
+	.model = &dot_kernel_model,
+	.name = "dot_kernel"
 };
 
-int dot_kernel(starpu_data_handle_t v1,
-	       starpu_data_handle_t v2,
+int dot_kernel(HANDLE_TYPE_VECTOR v1,
+	       HANDLE_TYPE_VECTOR v2,
 	       starpu_data_handle_t s,
-	       unsigned nblocks,
-	       int use_reduction)
+	       unsigned nblocks)
 {
 	int ret;
 
@@ -327,21 +363,21 @@ int dot_kernel(starpu_data_handle_t v1,
 		starpu_data_invalidate_submit(s);
 	else
 	{
-		ret = starpu_task_insert(&bzero_variable_cl, STARPU_W, s, 0);
+		ret = TASK_INSERT(&bzero_variable_cl, STARPU_W, s, 0);
 		if (ret == -ENODEV) return ret;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
 	}
 
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	{
-		ret = starpu_task_insert(&dot_kernel_cl,
+		ret = TASK_INSERT(&dot_kernel_cl,
 					 use_reduction?STARPU_REDUX:STARPU_RW, s,
-					 STARPU_R, starpu_data_get_sub_data(v1, 1, b),
-					 STARPU_R, starpu_data_get_sub_data(v2, 1, b),
+					 STARPU_R, GET_VECTOR_BLOCK(v1, b),
+					 STARPU_R, GET_VECTOR_BLOCK(v2, b),
 					 STARPU_TAG_ONLY, (starpu_tag_t) b,
 					 0);
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
 	}
 	return 0;
 }
@@ -395,7 +431,8 @@ static struct starpu_codelet scal_kernel_cl =
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 1,
-	.model = &scal_kernel_model
+	.model = &scal_kernel_model,
+	.name = "scal_kernel"
 };
 
 /*
@@ -474,28 +511,28 @@ static struct starpu_codelet gemv_kernel_cl =
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 3,
-	.model = &gemv_kernel_model
+	.model = &gemv_kernel_model,
+	.name = "gemv_kernel"
 };
 
-int gemv_kernel(starpu_data_handle_t v1,
-		starpu_data_handle_t matrix,
-		starpu_data_handle_t v2,
+int gemv_kernel(HANDLE_TYPE_VECTOR v1,
+		HANDLE_TYPE_MATRIX matrix,
+		HANDLE_TYPE_VECTOR v2,
 		TYPE p1, TYPE p2,
-		unsigned nblocks,
-		int use_reduction)
+		unsigned nblocks)
 {
 	unsigned b1, b2;
 	int ret;
 
 	for (b2 = 0; b2 < nblocks; b2++)
 	{
-		ret = starpu_task_insert(&scal_kernel_cl,
-					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b2),
+		ret = TASK_INSERT(&scal_kernel_cl,
+					 STARPU_RW, GET_VECTOR_BLOCK(v1, b2),
 					 STARPU_VALUE, &p1, sizeof(p1),
 					 STARPU_TAG_ONLY, (starpu_tag_t) b2,
 					 0);
 		if (ret == -ENODEV) return ret;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
 	}
 
 	for (b2 = 0; b2 < nblocks; b2++)
@@ -503,15 +540,15 @@ int gemv_kernel(starpu_data_handle_t v1,
 		for (b1 = 0; b1 < nblocks; b1++)
 		{
 			TYPE one = 1.0;
-			ret = starpu_task_insert(&gemv_kernel_cl,
-						 use_reduction?STARPU_REDUX:STARPU_RW,	starpu_data_get_sub_data(v1, 1, b2),
-						 STARPU_R,	starpu_data_get_sub_data(matrix, 2, b2, b1),
-						 STARPU_R,	starpu_data_get_sub_data(v2, 1, b1),
+			ret = TASK_INSERT(&gemv_kernel_cl,
+						 use_reduction?STARPU_REDUX:STARPU_RW,	GET_VECTOR_BLOCK(v1, b2),
+						 STARPU_R,	GET_MATRIX_BLOCK(matrix, b2, b1),
+						 STARPU_R,	GET_VECTOR_BLOCK(v2, b1),
 						 STARPU_VALUE,	&one,	sizeof(one),
 						 STARPU_VALUE,	&p2,	sizeof(p2),
 						 STARPU_TAG_ONLY, ((starpu_tag_t)b2) * nblocks + b1,
 						 0);
-			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+			STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
 		}
 	}
 	return 0;
@@ -579,26 +616,27 @@ static struct starpu_codelet scal_axpy_kernel_cl =
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 2,
-	.model = &scal_axpy_kernel_model
+	.model = &scal_axpy_kernel_model,
+	.name = "scal_axpy_kernel"
 };
 
-int scal_axpy_kernel(starpu_data_handle_t v1, TYPE p1,
-		     starpu_data_handle_t v2, TYPE p2,
+int scal_axpy_kernel(HANDLE_TYPE_VECTOR v1, TYPE p1,
+		     HANDLE_TYPE_VECTOR v2, TYPE p2,
 		     unsigned nblocks)
 {
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	{
 		int ret;
-		ret = starpu_task_insert(&scal_axpy_kernel_cl,
-					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
-					 STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
+		ret = TASK_INSERT(&scal_axpy_kernel_cl,
+					 STARPU_RW, GET_VECTOR_BLOCK(v1, b),
+					 STARPU_R,  GET_VECTOR_BLOCK(v2, b),
 					 STARPU_VALUE, &p1, sizeof(p1),
 					 STARPU_VALUE, &p2, sizeof(p2),
 					 STARPU_TAG_ONLY, (starpu_tag_t) b,
 					 0);
 		if (ret == -ENODEV) return ret;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
 	}
 	return 0;
 }
@@ -658,33 +696,181 @@ static struct starpu_codelet axpy_kernel_cl =
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
 	.nbuffers = 2,
-	.model = &axpy_kernel_model
+	.model = &axpy_kernel_model,
+	.name = "axpy_kernel"
 };
 
-int axpy_kernel(starpu_data_handle_t v1,
-		starpu_data_handle_t v2, TYPE p1,
+int axpy_kernel(HANDLE_TYPE_VECTOR v1,
+		HANDLE_TYPE_VECTOR v2, TYPE p1,
 		unsigned nblocks)
 {
 	unsigned b;
 	for (b = 0; b < nblocks; b++)
 	{
 		int ret;
-		ret = starpu_task_insert(&axpy_kernel_cl,
-					 STARPU_RW, starpu_data_get_sub_data(v1, 1, b),
-					 STARPU_R,  starpu_data_get_sub_data(v2, 1, b),
+		ret = TASK_INSERT(&axpy_kernel_cl,
+					 STARPU_RW, GET_VECTOR_BLOCK(v1, b),
+					 STARPU_R,  GET_VECTOR_BLOCK(v2, b),
 					 STARPU_VALUE, &p1, sizeof(p1),
 					 STARPU_TAG_ONLY, (starpu_tag_t) b,
 					 0);
 		if (ret == -ENODEV) return ret;
-		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+		STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
 	}
 	return 0;
 }
 
-int copy_handle(starpu_data_handle_t dst, starpu_data_handle_t src, unsigned nblocks)
+
+/*
+ *	Main loop
+ */
+int cg(void)
 {
-	unsigned b;
-	for (b = 0; b < nblocks; b++)
-		starpu_data_cpy(starpu_data_get_sub_data(dst, 1, b), starpu_data_get_sub_data(src, 1, b), 1, NULL, NULL);
+	TYPE delta_new, delta_0, error, delta_old, alpha, beta;
+	double start, end, timing;
+	int i = 0, ret;
+
+	/* r <- b */
+	ret = copy_handle(r_handle, b_handle, nblocks);
+	if (ret == -ENODEV) return ret;
+
+	/* r <- r - A x */
+	ret = gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks);
+	if (ret == -ENODEV) return ret;
+
+	/* d <- r */
+	ret = copy_handle(d_handle, r_handle, nblocks);
+	if (ret == -ENODEV) return ret;
+
+	/* delta_new = dot(r,r) */
+	ret = dot_kernel(r_handle, r_handle, rtr_handle, nblocks);
+	if (ret == -ENODEV) return ret;
+
+	GET_DATA_HANDLE(rtr_handle);
+	starpu_data_acquire(rtr_handle, STARPU_R);
+	delta_new = rtr;
+	delta_0 = delta_new;
+	starpu_data_release(rtr_handle);
+
+	FPRINTF_SERVER(stderr, "Delta limit: %e\n", (double) (eps*eps*delta_0));
+
+	FPRINTF_SERVER(stderr, "**************** INITIAL ****************\n");
+	FPRINTF_SERVER(stderr, "Delta 0: %e\n", delta_new);
+
+	BARRIER();
+	start = starpu_timing_now();
+
+	while ((i < i_max) && ((double)delta_new > (double)(eps*eps*delta_0)))
+	{
+		starpu_iteration_push(i);
+
+		/* q <- A d */
+		gemv_kernel(q_handle, A_handle, d_handle, 0.0, 1.0, nblocks);
+
+		/* dtq <- dot(d,q) */
+		dot_kernel(d_handle, q_handle, dtq_handle, nblocks);
+
+		/* alpha = delta_new / dtq */
+		GET_DATA_HANDLE(dtq_handle);
+		starpu_data_acquire(dtq_handle, STARPU_R);
+		alpha = delta_new / dtq;
+		starpu_data_release(dtq_handle);
+
+		/* x <- x + alpha d */
+		axpy_kernel(x_handle, d_handle, alpha, nblocks);
+
+		if ((i % 50) == 0)
+		{
+			/* r <- b */
+			copy_handle(r_handle, b_handle, nblocks);
+
+			/* r <- r - A x */
+			gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks);
+		}
+		else
+		{
+			/* r <- r - alpha q */
+			axpy_kernel(r_handle, q_handle, -alpha, nblocks);
+		}
+
+		/* delta_new = dot(r,r) */
+		dot_kernel(r_handle, r_handle, rtr_handle, nblocks);
+
+		GET_DATA_HANDLE(rtr_handle);
+		starpu_data_acquire(rtr_handle, STARPU_R);
+		delta_old = delta_new;
+		delta_new = rtr;
+		beta = delta_new / delta_old;
+		starpu_data_release(rtr_handle);
+
+		/* d <- beta d + r */
+		scal_axpy_kernel(d_handle, beta, r_handle, 1.0, nblocks);
+
+		if ((i % 10) == 0)
+		{
+			/* We here take the error as ||r||_2 / (n||b||_2) */
+			error = sqrt(delta_new/delta_0)/(1.0*n);
+			FPRINTF_SERVER(stderr, "*****************************************\n");
+			FPRINTF_SERVER(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
+		}
+
+		starpu_iteration_pop();
+		i++;
+	}
+
+	BARRIER();
+	end = starpu_timing_now();
+	timing = end - start;
+
+	error = sqrt(delta_new/delta_0)/(1.0*n);
+	FPRINTF_SERVER(stderr, "*****************************************\n");
+	FPRINTF_SERVER(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
+	FPRINTF_SERVER(stderr, "Total timing : %2.2f seconds\n", timing/1e6);
+	FPRINTF_SERVER(stderr, "Seconds per iteration : %2.2e seconds\n", timing/1e6/i);
+	FPRINTF_SERVER(stderr, "Number of iterations per second : %2.2e it/s\n", i/(timing/1e6));
+
 	return 0;
 }
+
+
+void parse_common_args(int argc, char **argv)
+{
+	int i;
+	for (i = 1; i < argc; i++)
+	{
+		if (strcmp(argv[i], "-n") == 0)
+		{
+			n = (int long long)atoi(argv[++i]);
+			continue;
+		}
+
+		if (strcmp(argv[i], "-display-result") == 0)
+		{
+			display_result = 1;
+			continue;
+		}
+
+		if (strcmp(argv[i], "-maxiter") == 0)
+		{
+			i_max = atoi(argv[++i]);
+			if (i_max <= 0)
+			{
+				FPRINTF_SERVER(stderr, "the number of iterations must be positive, not %d\n", i_max);
+				exit(EXIT_FAILURE);
+			}
+			continue;
+		}
+
+		if (strcmp(argv[i], "-nblocks") == 0)
+		{
+			nblocks = atoi(argv[++i]);
+			continue;
+		}
+
+		if (strcmp(argv[i], "-no-reduction") == 0)
+		{
+			use_reduction = 0;
+			continue;
+		}
+	}
+}

+ 0 - 1
examples/cpp/add_vectors.cpp

@@ -61,7 +61,6 @@ int main(int argc, char **argv)
 
 	struct starpu_conf conf;
 	starpu_conf_init(&conf);
-	conf.nmic = 0;
 	conf.nmpi_ms = 0;
 
 	// initialize StarPU with default configuration

+ 0 - 1
examples/cpp/add_vectors_cpp11.cpp

@@ -66,7 +66,6 @@ int main(int argc, char **argv)
 
 	struct starpu_conf conf;
 	starpu_conf_init(&conf);
-	conf.nmic = 0;
 	conf.nmpi_ms = 0;
 
 	// initialize StarPU with default configuration

+ 11 - 8
examples/cpp/add_vectors_interface.cpp

@@ -171,7 +171,6 @@ static const struct starpu_data_copy_methods vector_cpp_copy_data_methods_s =
 	.ram_to_ram = NULL,
 	.ram_to_cuda = NULL,
 	.ram_to_opencl = NULL,
-	.ram_to_mic = NULL,
 
 	.cuda_to_ram = NULL,
 	.cuda_to_cuda = NULL,
@@ -179,8 +178,6 @@ static const struct starpu_data_copy_methods vector_cpp_copy_data_methods_s =
 	.opencl_to_ram = NULL,
 	.opencl_to_opencl = NULL,
 
-	.mic_to_ram = NULL,
-
 	.ram_to_mpi_ms = NULL,
 	.mpi_ms_to_ram = NULL,
 	.mpi_ms_to_mpi_ms = NULL,
@@ -197,9 +194,6 @@ static const struct starpu_data_copy_methods vector_cpp_copy_data_methods_s =
 	.mpi_ms_to_ram_async = NULL,
 	.mpi_ms_to_mpi_ms_async = NULL,
 
-	.ram_to_mic_async = NULL,
-	.mic_to_ram_async = NULL,
-
 	.any_to_any = vector_interface_copy_any_to_any,
 };
 #else
@@ -258,6 +252,7 @@ static uint32_t footprint_vector_cpp_interface_crc32(starpu_data_handle_t handle
 static int vector_cpp_compare(void *data_interface_a, void *data_interface_b);
 static void display_vector_cpp_interface(starpu_data_handle_t handle, FILE *f);
 static int pack_vector_cpp_handle(starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ssize_t *count);
+static int peek_vector_cpp_handle(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count);
 static int unpack_vector_cpp_handle(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count);
 static starpu_ssize_t vector_cpp_describe(void *data_interface, char *buf, size_t size);
 
@@ -287,6 +282,7 @@ static struct starpu_data_interface_ops interface_vector_cpp_ops =
 	.dontcache = 0,
 	.get_mf_ops = NULL,
 	.pack_data = pack_vector_cpp_handle,
+	.peek_data = peek_vector_cpp_handle,
 	.unpack_data = unpack_vector_cpp_handle,
 	.name = (char *) "VECTOR_CPP_INTERFACE"
 };
@@ -315,6 +311,7 @@ static struct starpu_data_interface_ops interface_vector_cpp_ops =
 	0,
 	NULL,
 	pack_vector_cpp_handle,
+	peek_vector_cpp_handle,
 	unpack_vector_cpp_handle,
 	(char *) "VECTOR_CPP_INTERFACE"
 };
@@ -458,7 +455,7 @@ static int pack_vector_cpp_handle(starpu_data_handle_t handle, unsigned node, vo
 	return 0;
 }
 
-static int unpack_vector_cpp_handle(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
+static int peek_vector_cpp_handle(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
 {
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
 
@@ -468,6 +465,13 @@ static int unpack_vector_cpp_handle(starpu_data_handle_t handle, unsigned node,
 	STARPU_ASSERT(count == vector_interface->elemsize * vector_interface->nx);
 	memcpy((void*)vector_interface->ptr, ptr, count);
 
+	return 0;
+}
+
+static int unpack_vector_cpp_handle(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
+{
+	peek_vector_cpp_handle(handle, node, ptr, count);
+
 	starpu_free_on_node_flags(node, (uintptr_t)ptr, count, 0);
 
 	return 0;
@@ -580,7 +584,6 @@ int main(int argc, char **argv)
 	bool fail;
 
 	starpu_conf_init(&conf);
-	conf.nmic = 0;
 	conf.nmpi_ms = 0;
 
 	// initialize StarPU with default configuration

+ 0 - 1
examples/cpp/incrementer_cpp.cpp

@@ -51,7 +51,6 @@ int main(int argc, char **argv)
 
 	struct starpu_conf conf;
 	starpu_conf_init(&conf);
-	conf.nmic = 0;
 	conf.nmpi_ms = 0;
 
 	ret = starpu_init(&conf);

+ 0 - 1
examples/dependency/sequential_consistency.c

@@ -128,7 +128,6 @@ int main(void)
 	}
 
 	starpu_conf_init(&conf);
-	conf.nmic = 0;
 	conf.nmpi_ms = 0;
 
         ret = starpu_init(&conf);

+ 0 - 1
examples/dependency/task_end_dep.c

@@ -76,7 +76,6 @@ int main(void)
 	struct starpu_task *task;
 
 	starpu_conf_init(&conf);
-	conf.nmic = 0;
 	conf.nmpi_ms = 0;
 
         ret = starpu_init(&conf);

+ 0 - 1
examples/dependency/task_end_dep_add.c

@@ -69,7 +69,6 @@ int main(void)
 	struct starpu_conf conf;
 
 	starpu_conf_init(&conf);
-	conf.nmic = 0;
 	conf.nmpi_ms = 0;
 
         ret = starpu_init(&conf);

+ 1 - 1
examples/filters/fmultiple_manual.c

@@ -124,7 +124,7 @@ struct starpu_codelet cl_switch =
 
 int main(void)
 {
-	unsigned j, n=1;
+	unsigned n=1;
 	int matrix[NX][NY];
 	int ret, i;
 

+ 1 - 1
examples/filters/fmultiple_submit.c

@@ -98,7 +98,7 @@ struct starpu_codelet cl_check_scale =
 
 int main(void)
 {
-	unsigned j, n=1;
+	unsigned n=1;
 	int matrix[NX][NY];
 	int ret, i;
 

+ 1 - 1
examples/filters/fmultiple_submit_implicit.c

@@ -145,7 +145,7 @@ struct starpu_codelet cl_check =
 int main(void)
 {
 	int start, factor;
-	unsigned j, n=1;
+	unsigned n=1;
 	int matrix[NX][NY];
 	int ret, i;
 

+ 1 - 1
examples/filters/fplan_notautomatic.c

@@ -45,7 +45,7 @@ void task_cpu(void *descr[], void *args)
 
 void split_callback(void *arg)
 {
-	(arg);
+	(void)arg;
 	struct starpu_task *task = starpu_task_get_current();
 	starpu_data_handle_t value_handle, sub_handles[PARTS];
 

+ 2 - 2
examples/filters/frecursive.c

@@ -41,9 +41,9 @@ void cpu_codelet(void *buffers[], void *cl_arg)
 
 static struct starpu_codelet cl =
 {
-        .cpu_funcs[0] = cpu_codelet,
+        .cpu_funcs = {cpu_codelet},
         .nbuffers = 1,
-	.modes[0] = STARPU_RW,
+	.modes = {STARPU_RW},
 };
 
 #define NX 400

+ 0 - 6
examples/heat/heat.sh

@@ -20,12 +20,6 @@ set -e
 
 PREFIX=$(dirname $0)
 
-if [ -n "$STARPU_MIC_SINK_PROGRAM_PATH" ] ; then
-	STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/heat
-	# in case libtool got into play
-	[ -x "$STARPU_MIC_SINK_PROGRAM_PATH/.libs/heat" ] && STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/.libs/heat
-fi
-
 $STARPU_LAUNCH $PREFIX/heat -shape 0
 $STARPU_LAUNCH $PREFIX/heat -shape 1
 # sometimes lead to pivot being 0

+ 1 - 1
examples/interface/complex_codelet.h

@@ -68,7 +68,7 @@ void compare_complex_codelet(void *descr[], void *_args)
 struct starpu_codelet cl_compare =
 {
 	.cpu_funcs = {compare_complex_codelet},
-	/* dereferencing compare won't work on MIC */
+	/* dereferencing compare won't work on MPI Master Slave */
 	/* .cpu_funcs_name = {"compare_complex_codelet"}, */
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_R},

+ 14 - 6
examples/interface/complex_interface.c

@@ -147,7 +147,7 @@ static int complex_pack_data(starpu_data_handle_t handle, unsigned node, void **
 	return 0;
 }
 
-static int complex_unpack_data(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
+static int complex_peek_data(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
 {
 	char *data = ptr;
 	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
@@ -159,6 +159,13 @@ static int complex_unpack_data(starpu_data_handle_t handle, unsigned node, void
 	memcpy(complex_interface->real, data, complex_interface->nx*sizeof(double));
 	memcpy(complex_interface->imaginary, data+complex_interface->nx*sizeof(double), complex_interface->nx*sizeof(double));
 
+	return 0;
+}
+
+static int complex_unpack_data(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
+{
+	complex_peek_data(handle, node, ptr, count);
+
 	starpu_free_on_node_flags(node, (uintptr_t) ptr, count, 0);
 
 	return 0;
@@ -178,9 +185,9 @@ static int complex_compare(void *data_interface_a, void *data_interface_b)
 	return (complex_a->nx == complex_b->nx);
 }
 
-static int copy_any_to_any(void *src_interface, unsigned src_node,
-			   void *dst_interface, unsigned dst_node,
-			   void *async_data)
+int copy_any_to_any(void *src_interface, unsigned src_node,
+		    void *dst_interface, unsigned dst_node,
+		    void *async_data)
 {
 	struct starpu_complex_interface *src_complex = src_interface;
 	struct starpu_complex_interface *dst_complex = dst_interface;
@@ -199,12 +206,12 @@ static int copy_any_to_any(void *src_interface, unsigned src_node,
 	return ret;
 }
 
-static const struct starpu_data_copy_methods complex_copy_methods =
+const struct starpu_data_copy_methods complex_copy_methods =
 {
 	.any_to_any = copy_any_to_any
 };
 
-static struct starpu_data_interface_ops interface_complex_ops =
+struct starpu_data_interface_ops interface_complex_ops =
 {
 	.register_data_handle = complex_register_data_handle,
 	.allocate_data_on_node = complex_allocate_data_on_node,
@@ -217,6 +224,7 @@ static struct starpu_data_interface_ops interface_complex_ops =
 	.to_pointer = NULL,
 	.pointer_is_inside = complex_pointer_is_inside,
 	.pack_data = complex_pack_data,
+	.peek_data = complex_peek_data,
 	.unpack_data = complex_unpack_data,
 	.describe = complex_describe,
 	.compare = complex_compare

+ 0 - 1
examples/loader-cross.sh.in

@@ -1 +0,0 @@
-../tests/loader-cross.sh.in

+ 0 - 12
examples/lu/lu.sh

@@ -22,12 +22,6 @@ PREFIX=$(dirname $0)
 rm -rf $PREFIX/lu.traces
 mkdir -p $PREFIX/lu.traces
 
-if [ -n "$STARPU_MIC_SINK_PROGRAM_PATH" ] ; then
-	STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/lu_implicit_example_float
-	# in case libtool got into play
-	[ -x "$STARPU_MIC_SINK_PROGRAM_PATH/.libs/lu_implicit_example_float" ] && STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/.libs/lu_implicit_example_float
-fi
-
 export STARPU_FXT_PREFIX=$PREFIX/lu.traces
 
 $STARPU_LAUNCH $PREFIX/lu_implicit_example_float -size $((160 * 4)) -nblocks 4 -piv
@@ -36,12 +30,6 @@ $STARPU_LAUNCH $PREFIX/lu_implicit_example_float -size $((160 * 4)) -nblocks 4 -
 $STARPU_LAUNCH $PREFIX/lu_implicit_example_float -size $((160 * 2)) -nblocks 2 -bounddeps -directory $STARPU_FXT_PREFIX
 $STARPU_LAUNCH $PREFIX/lu_implicit_example_float -size $((160 * 2)) -nblocks 2 -bound -bounddeps -bounddepsprio -directory $STARPU_FXT_PREFIX
 
-if [ -n "$STARPU_MIC_SINK_PROGRAM_PATH" ] ; then
-	STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/lu_example_float
-	# in case libtool got into play
-	[ -x "$STARPU_MIC_SINK_PROGRAM_PATH/.libs/lu_example_float" ] && STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/.libs/lu_example_float
-fi
-
 $STARPU_LAUNCH $PREFIX/lu_example_float -size $((160 * 4)) -nblocks 4 -piv
 $STARPU_LAUNCH $PREFIX/lu_example_float -size $((160 * 4)) -nblocks 4 -no-stride
 $STARPU_LAUNCH $PREFIX/lu_example_float -size $((160 * 4)) -nblocks 4 -bound

+ 1 - 1
examples/lu/xlu_implicit_pivot.c

@@ -234,7 +234,7 @@ starpu_data_handle_t get_block_with_striding(starpu_data_handle_t *dataAp, unsig
 
 int STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks, unsigned no_prio)
 {
-	if (starpu_mic_worker_get_count() || starpu_mpi_ms_worker_get_count())
+	if (starpu_mpi_ms_worker_get_count())
 		/* These won't work with pivoting: we pass a pointer in cl_args */
 		return -ENODEV;
 

+ 1 - 1
examples/lu/xlu_kernels.c

@@ -118,7 +118,7 @@ static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nim
 	(void)task;
 	(void)nimpl;
 	enum starpu_worker_archtype type = starpu_worker_get_type(workerid);
-	if (type == STARPU_CPU_WORKER || type == STARPU_MIC_WORKER)
+	if (type == STARPU_CPU_WORKER)
 		return 1;
 
 #ifdef STARPU_SIMGRID

+ 1 - 1
examples/lu/xlu_pivot.c

@@ -417,7 +417,7 @@ starpu_data_handle_t get_block_with_no_striding(starpu_data_handle_t *dataAp, un
 int STARPU_LU(lu_decomposition_pivot_no_stride)(TYPE **matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks, unsigned no_prio)
 {
 	(void)ld;
-	if (starpu_mic_worker_get_count() || starpu_mpi_ms_worker_get_count())
+	if (starpu_mpi_ms_worker_get_count())
 		/* These won't work with pivoting: we pass a pointer in cl_args */
 		return -ENODEV;
 

+ 2 - 4
examples/matvecmult/matvecmult.c

@@ -127,13 +127,11 @@ static struct starpu_perfmodel starpu_matvecmult_model =
 static struct starpu_codelet cl =
 {
 #ifdef STARPU_USE_OPENCL
-        .opencl_funcs[0] = opencl_codelet,
+        .opencl_funcs = {opencl_codelet},
 	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
         .nbuffers = 3,
-	.modes[0] = STARPU_R,
-	.modes[1] = STARPU_R,
-	.modes[2] = STARPU_RW,
+	.modes = {STARPU_R, STARPU_R, STARPU_RW},
 	.model = &starpu_matvecmult_model
 };
 

+ 0 - 6
examples/mult/sgemm.sh

@@ -28,12 +28,6 @@ PREFIX=$(dirname $0)
 rm -rf $PREFIX/sgemm.traces
 mkdir -p $PREFIX/sgemm.traces
 
-if [ -n "$STARPU_MIC_SINK_PROGRAM_PATH" ] ; then
-	STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/sgemm
-	# in case libtool got into play
-	[ -x "$STARPU_MIC_SINK_PROGRAM_PATH/.libs/sgemm" ] && STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/.libs/sgemm
-fi
-
 export STARPU_FXT_PREFIX=$PREFIX/sgemm.traces
 
 STARPU_SCHED=dmdas $PREFIX/sgemm -check

+ 5 - 1
examples/native_fortran/nf_vector.f90

@@ -94,7 +94,11 @@ program nf_vector
         !     . . .
         !     C_NULL_PTR
         !   )/
-        call fstarpu_insert_task((/ cl_vec, FSTARPU_R, dh_va, FSTARPU_RW.ior.FSTARPU_LOCALITY, dh_vb, C_NULL_PTR /))
+        call fstarpu_insert_task((/ cl_vec, &
+                FSTARPU_R, dh_va, &
+                FSTARPU_RW.ior.FSTARPU_LOCALITY, dh_vb, &
+                FSTARPU_EXECUTE_WHERE, FSTARPU_CPU, & ! for illustration, not required here
+                C_NULL_PTR /))
 
         ! wait for task completion
         call fstarpu_task_wait_for_all()

+ 1 - 1
examples/perf_steering/perf_knobs_03.c

@@ -58,9 +58,9 @@ int main(int argc, char **argv)
 	if (starpu_cpu_worker_get_count() != 2
 		|| starpu_cuda_worker_get_count() != 0
 		|| starpu_opencl_worker_get_count() != 0
-		|| starpu_mic_worker_get_count() != 0
 		|| starpu_mpi_ms_worker_get_count() != 0)
 	{
+		starpu_shutdown();
 		fprintf(stderr, "example needs exactly two cpu cores.\n");
 		return 77;
 	}

+ 1 - 2
examples/pi/pi_redux.c

@@ -76,7 +76,6 @@ static void init_rng(void *arg)
 	switch (starpu_worker_get_type(workerid))
 	{
 		case STARPU_CPU_WORKER:
-		case STARPU_MIC_WORKER:
 			/* create a seed */
 			starpu_srand48_r((long int)workerid, &randbuffer[PADDING*workerid]);
 
@@ -322,7 +321,7 @@ static struct starpu_codelet redux_codelet =
 	.cuda_funcs = {redux_cuda_func},
 	.cuda_flags = {STARPU_CUDA_ASYNC},
 #endif
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2
 };
 

+ 3 - 4
examples/reductions/dot_product.c

@@ -59,7 +59,7 @@ static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nim
 	(void)task;
 	(void)nimpl;
 	enum starpu_worker_archtype type = starpu_worker_get_type(workerid);
-	if (type == STARPU_CPU_WORKER || type == STARPU_OPENCL_WORKER || type == STARPU_MIC_WORKER)
+	if (type == STARPU_CPU_WORKER || type == STARPU_OPENCL_WORKER)
 		return 1;
 
 #ifdef STARPU_USE_CUDA
@@ -186,7 +186,6 @@ void redux_opencl_func(void *buffers[], void *args)
 	{
 		size_t global=1;
                 size_t local=1;
-                size_t s;
                 cl_device_id device;
 
                 starpu_opencl_get_device(devid, &device);
@@ -212,7 +211,7 @@ static struct starpu_codelet redux_codelet =
 	.opencl_funcs = {redux_opencl_func},
 	.opencl_flags = {STARPU_OPENCL_ASYNC},
 #endif
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2,
 	.name = "redux"
 };
@@ -301,7 +300,6 @@ void dot_opencl_func(void *buffers[], void *cl_arg)
 	{
 		size_t global=1;
                 size_t local=1;
-                size_t s;
                 cl_device_id device;
 
                 starpu_opencl_get_device(devid, &device);
@@ -461,6 +459,7 @@ int main(void)
 	}
 
 enodev:
+	starpu_shutdown();
 	FPRINTF(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */

+ 1 - 1
examples/reductions/minmax_reduction.c

@@ -95,7 +95,7 @@ static struct starpu_codelet minmax_redux_codelet =
 {
 	.cpu_funcs = {minmax_redux_cpu_func},
 	.cpu_funcs_name = {"minmax_redux_cpu_func"},
-	.modes = {STARPU_RW, STARPU_R},
+	.modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
 	.nbuffers = 2,
 	.name = "redux"
 };

+ 2 - 2
examples/sched_ctx/parallel_tasks_reuse_handle.c

@@ -172,7 +172,7 @@ static struct starpu_codelet sum_cl =
 int main(void)
 {
 	int ntasks = NTASKS;
-	int ret, j, k;
+	int ret;
 	unsigned ncpus = 0;
 
 	ret = starpu_init(NULL);
@@ -207,7 +207,7 @@ int main(void)
 
 	for (i = 0; i < ntasks; i++)
 	{
-		struct starpu_task * t;
+		struct starpu_task *t;
 		t=starpu_task_build(&sum_cl,
 				    STARPU_RW,handle1,
 				    STARPU_R,handle2,

+ 0 - 1
examples/sched_ctx/parallel_tasks_with_cluster_api.c

@@ -67,7 +67,6 @@ int main(void)
 	int ret, i;
 	struct starpu_cluster_machine *clusters;
 
-	setenv("STARPU_NMIC","0",1);
 	setenv("STARPU_NMPI_MS","0",1);
 
 	ret = starpu_init(NULL);

+ 6 - 1
examples/sched_ctx/two_cpu_contexts.c

@@ -44,16 +44,21 @@ int main(void)
 	int *procs2 = NULL;
 	int i;
 	int n = 20;
+
 	int ret = starpu_init(NULL);
 	if (ret == -ENODEV)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ncpu = starpu_cpu_worker_get_count();
 
+	ncpu = starpu_cpu_worker_get_count();
 	/* actually we really need at least 2 CPU workers such to allocate 2
 	 * non overlapping contexts */
 	if (ncpu < 2)
+	{
+		starpu_shutdown();
 		return 77;
+	}
+
 	procs = calloc(ncpu, sizeof(int));
 	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs, ncpu);
 

+ 1 - 1
examples/scheduler/heteroprio_test.c

@@ -121,7 +121,7 @@ int main(void)
 	assert(ret == 0);
 
 	conf.sched_policy_name = "heteroprio";
-	conf.sched_policy_init = &initSchedulerCallback;
+	conf.sched_policy_callback = &initSchedulerCallback;
 	ret = starpu_init(&conf);
 	if (ret == -ENODEV)
 		return 77;

+ 2 - 0
examples/spmv/dw_block_spmv.c

@@ -338,5 +338,7 @@ int main(int argc, char *argv[])
 	FPRINTF(stderr, "Flop %e\n", totalflop);
 	FPRINTF(stderr, "GFlops : %2.2f\n", totalflop/timing/1000);
 
+	starpu_shutdown();
+
 	return 0;
 }

+ 0 - 1
examples/stencil/loader-cross.sh.in

@@ -1 +0,0 @@
-../../tests/loader-cross.sh.in

+ 13 - 29
include/fstarpu_mod.f90

@@ -25,6 +25,7 @@ module fstarpu_mod
         type(c_ptr), bind(C) :: FSTARPU_RW
         type(c_ptr), bind(C) :: FSTARPU_SCRATCH
         type(c_ptr), bind(C) :: FSTARPU_REDUX
+        type(c_ptr), bind(C) :: FSTARPU_MPI_REDUX
         type(c_ptr), bind(C) :: FSTARPU_COMMUTE
         type(c_ptr), bind(C) :: FSTARPU_SSEND
         type(c_ptr), bind(C) :: FSTARPU_LOCALITY
@@ -36,11 +37,15 @@ module fstarpu_mod
         type(c_ptr), bind(C) :: FSTARPU_TASK_DEPS_ARRAY
         type(c_ptr), bind(C) :: FSTARPU_CALLBACK
         type(c_ptr), bind(C) :: FSTARPU_CALLBACK_WITH_ARG
+        type(c_ptr), bind(C) :: FSTARPU_CALLBACK_WITH_ARG_NFREE
         type(c_ptr), bind(C) :: FSTARPU_CALLBACK_ARG
+        type(c_ptr), bind(C) :: FSTARPU_CALLBACK_ARG_NFREE
         type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK
         type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK_ARG
+        type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK_ARG_NFREE
         type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK_POP
         type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK_POP_ARG
+        type(c_ptr), bind(C) :: FSTARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE
         type(c_ptr), bind(C) :: FSTARPU_PRIORITY
         type(c_ptr), bind(C) :: FSTARPU_EXECUTE_ON_NODE
         type(c_ptr), bind(C) :: FSTARPU_EXECUTE_ON_DATA
@@ -66,7 +71,6 @@ module fstarpu_mod
         type(c_ptr), bind(C) :: FSTARPU_CPU_WORKER
         type(c_ptr), bind(C) :: FSTARPU_CUDA_WORKER
         type(c_ptr), bind(C) :: FSTARPU_OPENCL_WORKER
-        type(c_ptr), bind(C) :: FSTARPU_MIC_WORKER
         type(c_ptr), bind(C) :: FSTARPU_ANY_WORKER
 
         integer(c_int), bind(C) :: FSTARPU_NMAXBUFS
@@ -85,7 +89,6 @@ module fstarpu_mod
         type(c_ptr), bind(C) :: FSTARPU_CPU
         type(c_ptr), bind(C) :: FSTARPU_CUDA
         type(c_ptr), bind(C) :: FSTARPU_OPENCL
-        type(c_ptr), bind(C) :: FSTARPU_MIC
 
         type(c_ptr), bind(C) :: FSTARPU_CODELET_SIMGRID_EXECUTE
         type(c_ptr), bind(C) :: FSTARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT
@@ -185,12 +188,6 @@ module fstarpu_mod
                         integer(c_int), value, intent(in) :: nopencl
                 end subroutine fstarpu_conf_set_nopencl
 
-                subroutine fstarpu_conf_set_nmic (conf, nmic) bind(C)
-                        use iso_c_binding, only: c_ptr, c_int
-                        type(c_ptr), value, intent(in) :: conf
-                        integer(c_int), value, intent(in) :: nmic
-                end subroutine fstarpu_conf_set_nmic
-
                 ! starpu_init: see fstarpu_init
                 ! starpu_initialize: see fstarpu_init
 
@@ -228,12 +225,6 @@ module fstarpu_mod
                         integer(c_int) :: fstarpu_asynchronous_opencl_copy_disabled
                 end function fstarpu_asynchronous_opencl_copy_disabled
 
-                ! int starpu_asynchronous_mic_copy_disabled(void);
-                function fstarpu_asynchronous_mic_copy_disabled() bind(C,name="starpu_asynchronous_mic_copy_disabled")
-                        use iso_c_binding, only: c_int
-                        integer(c_int) :: fstarpu_asynchronous_mic_copy_disabled
-                end function fstarpu_asynchronous_mic_copy_disabled
-
                 ! void starpu_display_stats();
                 subroutine fstarpu_display_stats() bind(C,name="starpu_display_stats")
                 end subroutine fstarpu_display_stats
@@ -284,12 +275,6 @@ module fstarpu_mod
                         integer(c_int)              :: fstarpu_opencl_worker_get_count
                 end function fstarpu_opencl_worker_get_count
 
-                ! unsigned starpu_mic_worker_get_count(void);
-                function fstarpu_mic_worker_get_count() bind(C,name="starpu_mic_worker_get_count")
-                        use iso_c_binding, only: c_int
-                        integer(c_int)              :: fstarpu_mic_worker_get_count
-                end function fstarpu_mic_worker_get_count
-
                 ! int starpu_worker_get_id(void);
                 function fstarpu_worker_get_id() bind(C,name="starpu_worker_get_id")
                         use iso_c_binding, only: c_int
@@ -699,12 +684,6 @@ module fstarpu_mod
                         type(c_ptr), value, intent(in) :: flags ! C function expects an intptr_t
                 end subroutine fstarpu_codelet_add_opencl_flags
 
-                subroutine fstarpu_codelet_add_mic_func (cl, f_ptr) bind(C)
-                        use iso_c_binding, only: c_ptr, c_funptr
-                        type(c_ptr), value, intent(in) :: cl
-                        type(c_funptr), value, intent(in) :: f_ptr
-                end subroutine fstarpu_codelet_add_mic_func
-
                 subroutine fstarpu_codelet_add_buffer (cl, mode) bind(C)
                         use iso_c_binding, only: c_ptr
                         type(c_ptr), value, intent(in) :: cl
@@ -2395,6 +2374,7 @@ module fstarpu_mod
                         FSTARPU_RW      = fstarpu_get_constant(C_CHAR_"FSTARPU_RW"//C_NULL_CHAR)
                         FSTARPU_SCRATCH = fstarpu_get_constant(C_CHAR_"FSTARPU_SCRATCH"//C_NULL_CHAR)
                         FSTARPU_REDUX   = fstarpu_get_constant(C_CHAR_"FSTARPU_REDUX"//C_NULL_CHAR)
+                        FSTARPU_MPI_REDUX   = fstarpu_get_constant(C_CHAR_"FSTARPU_MPI_REDUX"//C_NULL_CHAR)
                         FSTARPU_COMMUTE   = fstarpu_get_constant(C_CHAR_"FSTARPU_COMMUTE"//C_NULL_CHAR)
                         FSTARPU_SSEND   = fstarpu_get_constant(C_CHAR_"FSTARPU_SSEND"//C_NULL_CHAR)
                         FSTARPU_LOCALITY   = fstarpu_get_constant(C_CHAR_"FSTARPU_LOCALITY"//C_NULL_CHAR)
@@ -2406,12 +2386,19 @@ module fstarpu_mod
                         FSTARPU_TASK_DEPS_ARRAY = fstarpu_get_constant(C_CHAR_"FSTARPU_TASK_DEPS_ARRAY"//C_NULL_CHAR)
                         FSTARPU_CALLBACK        = fstarpu_get_constant(C_CHAR_"FSTARPU_CALLBACK"//C_NULL_CHAR)
                         FSTARPU_CALLBACK_WITH_ARG       = fstarpu_get_constant(C_CHAR_"FSTARPU_CALLBACK_WITH_ARG"//C_NULL_CHAR)
+                        FSTARPU_CALLBACK_WITH_ARG_NFREE       = &
+                                fstarpu_get_constant(C_CHAR_"FSTARPU_CALLBACK_WITH_ARG_NFREE"//C_NULL_CHAR)
                         FSTARPU_CALLBACK_ARG    = fstarpu_get_constant(C_CHAR_"FSTARPU_CALLBACK_ARG"//C_NULL_CHAR)
+                        FSTARPU_CALLBACK_ARG_NFREE    = fstarpu_get_constant(C_CHAR_"FSTARPU_CALLBACK_ARG_NFREE"//C_NULL_CHAR)
                         FSTARPU_PROLOGUE_CALLBACK       = fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK"//C_NULL_CHAR)
                         FSTARPU_PROLOGUE_CALLBACK_ARG   = fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK_ARG"//C_NULL_CHAR)
+                        FSTARPU_PROLOGUE_CALLBACK_ARG_NFREE   = &
+                                fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK_ARG_NFREE"//C_NULL_CHAR)
                         FSTARPU_PROLOGUE_CALLBACK_POP   = fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK_POP"//C_NULL_CHAR)
                         FSTARPU_PROLOGUE_CALLBACK_POP_ARG       = &
                                 fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK_POP_ARG"//C_NULL_CHAR)
+                        FSTARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE       = &
+                                fstarpu_get_constant(C_CHAR_"FSTARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE"//C_NULL_CHAR)
                         FSTARPU_PRIORITY        = fstarpu_get_constant(C_CHAR_"FSTARPU_PRIORITY"//C_NULL_CHAR)
                         FSTARPU_EXECUTE_ON_NODE = fstarpu_get_constant(C_CHAR_"FSTARPU_EXECUTE_ON_NODE"//C_NULL_CHAR)
                         FSTARPU_EXECUTE_ON_DATA = fstarpu_get_constant(C_CHAR_"FSTARPU_EXECUTE_ON_DATA"//C_NULL_CHAR)
@@ -2432,7 +2419,6 @@ module fstarpu_mod
                         FSTARPU_CPU_WORKER   = fstarpu_get_constant(C_CHAR_"FSTARPU_CPU_WORKER"//C_NULL_CHAR)
                         FSTARPU_CUDA_WORKER   = fstarpu_get_constant(C_CHAR_"FSTARPU_CUDA_WORKER"//C_NULL_CHAR)
                         FSTARPU_OPENCL_WORKER   = fstarpu_get_constant(C_CHAR_"FSTARPU_OPENCL_WORKER"//C_NULL_CHAR)
-                        FSTARPU_MIC_WORKER   = fstarpu_get_constant(C_CHAR_"FSTARPU_MIC_WORKER"//C_NULL_CHAR)
                         FSTARPU_ANY_WORKER   = fstarpu_get_constant(C_CHAR_"FSTARPU_ANY_WORKER"//C_NULL_CHAR)
 
                         FSTARPU_NMAXBUFS   = int(p_to_ip(fstarpu_get_constant(C_CHAR_"FSTARPU_NMAXBUFS"//C_NULL_CHAR)),c_int)
@@ -2464,8 +2450,6 @@ module fstarpu_mod
                             fstarpu_get_constant(C_CHAR_"FSTARPU_CUDA"//C_NULL_CHAR)
                         FSTARPU_OPENCL = &
                             fstarpu_get_constant(C_CHAR_"FSTARPU_OPENCL"//C_NULL_CHAR)
-                        FSTARPU_MIC = &
-                            fstarpu_get_constant(C_CHAR_"FSTARPU_MIC"//C_NULL_CHAR)
 
                         FSTARPU_CODELET_SIMGRID_EXECUTE = &
                              fstarpu_get_constant(C_CHAR_"FSTARPU_CODELET_SIMGRID_EXECUTE"//C_NULL_CHAR)

+ 78 - 80
include/starpu.h

@@ -99,7 +99,8 @@ extern "C"
    It has to be initialized with starpu_conf_init(). When the default
    value is used, StarPU automatically selects the number of
    processing units and takes the default scheduling policy. The
-   environment variables overwrite the equivalent parameters.
+   environment variables overwrite the equivalent parameters unless
+   starpu_conf::precedence_over_environment_variables is set.
 */
 struct starpu_conf
 {
@@ -129,7 +130,13 @@ struct starpu_conf
 	   (default = <c>NULL</c>)
 	*/
 	struct starpu_sched_policy *sched_policy;
-	void (*sched_policy_init)(unsigned);
+
+	/**
+	   Callback function that can later be used by the scheduler.
+	   The scheduler can retrieve this function by calling
+	   starpu_sched_ctx_get_sched_policy_callback()
+	*/
+	void (*sched_policy_callback)(unsigned);
 
 	/**
 	   For all parameters specified in this structure that can
@@ -145,7 +152,7 @@ struct starpu_conf
 	/**
 	   Number of CPU cores that StarPU can use. This can also be
 	   specified with the environment variable \ref STARPU_NCPU.
-	   (default = -1)
+	   (default = \c -1)
 	*/
 	int ncpus;
 
@@ -160,7 +167,7 @@ struct starpu_conf
 	   Number of CUDA devices that StarPU can use. This can also
 	   be specified with the environment variable \ref
 	   STARPU_NCUDA.
-	   (default = -1)
+	   (default = \c -1)
 	*/
 	int ncuda;
 
@@ -168,22 +175,15 @@ struct starpu_conf
 	   Number of OpenCL devices that StarPU can use. This can also
 	   be specified with the environment variable \ref
 	   STARPU_NOPENCL.
-	   (default = -1)
+	   (default = \c -1)
 	*/
 	int nopencl;
 
 	/**
-	   Number of MIC devices that StarPU can use. This can also be
-	   specified with the environment variable \ref STARPU_NMIC.
-	   (default = -1)
-	*/
-	int nmic;
-
-	/**
 	   Number of MPI Master Slave devices that StarPU can use.
 	   This can also be specified with the environment variable
 	   \ref STARPU_NMPI_MS.
-	   (default = -1)
+	   (default = \c -1)
 	*/
         int nmpi_ms;
 
@@ -193,7 +193,7 @@ struct starpu_conf
 	   StarPU automatically selects where to bind the different
 	   workers. This can also be specified with the environment
 	   variable \ref STARPU_WORKERS_CPUID.
-	   (default = 0)
+	   (default = \c 0)
 	*/
 	unsigned use_explicit_workers_bindid;
 
@@ -204,7 +204,7 @@ struct starpu_conf
 	   indicates the logical identifier of the processor which
 	   should execute the i-th worker. Note that the logical
 	   ordering of the CPUs is either determined by the OS, or
-	   provided by the hwloc library in case it is available.
+	   provided by the \c hwloc library in case it is available.
 	*/
 	unsigned workers_bindid[STARPU_NMAXWORKERS];
 
@@ -215,7 +215,7 @@ struct starpu_conf
 	   affects the CUDA devices in a round-robin fashion. This can
 	   also be specified with the environment variable \ref
 	   STARPU_WORKERS_CUDAID.
-	   (default = 0)
+	   (default = \c 0)
 	*/
 	unsigned use_explicit_workers_cuda_gpuid;
 
@@ -233,7 +233,7 @@ struct starpu_conf
 	   affects the OpenCL devices in a round-robin fashion. This
 	   can also be specified with the environment variable \ref
 	   STARPU_WORKERS_OPENCLID.
-	   (default = 0)
+	   (default = \c 0)
 	*/
 	unsigned use_explicit_workers_opencl_gpuid;
 
@@ -245,30 +245,12 @@ struct starpu_conf
 	unsigned workers_opencl_gpuid[STARPU_NMAXWORKERS];
 
 	/**
-	   If this flag is set, the MIC workers will be attached to
-	   the MIC devices specified in the array
-	   starpu_conf::workers_mic_deviceid. Otherwise, StarPU
-	   affects the MIC devices in a round-robin fashion. This can
-	   also be specified with the environment variable \ref
-	   STARPU_WORKERS_MICID.
-	   (default = 0)
-	*/
-	unsigned use_explicit_workers_mic_deviceid;
-
-	/**
-	   If the flag starpu_conf::use_explicit_workers_mic_deviceid
-	   is set, the array contains the logical identifiers of the
-	   MIC devices to be used.
-	*/
-	unsigned workers_mic_deviceid[STARPU_NMAXWORKERS];
-
-	/**
 	   If this flag is set, the MPI Master Slave workers will be
 	   attached to the MPI Master Slave devices specified in the
 	   array starpu_conf::workers_mpi_ms_deviceid. Otherwise,
 	   StarPU affects the MPI Master Slave devices in a
 	   round-robin fashion.
-	   (default = 0)
+	   (default = \c 0)
 	*/
 	unsigned use_explicit_workers_mpi_ms_deviceid;
 
@@ -285,7 +267,7 @@ struct starpu_conf
 	   this value is equal to -1, the default value is used. This
 	   can also be specified with the environment variable \ref
 	   STARPU_BUS_CALIBRATE.
-	   (default = 0)
+	   (default = \c 0)
 	*/
 	int bus_calibrate;
 
@@ -297,7 +279,7 @@ struct starpu_conf
 	   2, the existing performance models will be overwritten.
 	   This can also be specified with the environment variable
 	   \ref STARPU_CALIBRATE.
-	   (default = 0)
+	   (default = \c 0)
 	*/
 	int calibrate;
 
@@ -312,19 +294,11 @@ struct starpu_conf
 	   worker sizes to look for the most efficient ones.
 	   This can also be specified with the environment variable
 	   \ref STARPU_SINGLE_COMBINED_WORKER.
-	   (default = 0)
+	   (default = \c 0)
 	*/
 	int single_combined_worker;
 
 	/**
-	   Path to the kernel to execute on the MIC device, compiled
-	   for MIC architecture. When set to <c>NULL</c>, StarPU
-	   automatically looks next to the host program location.
-	   (default = <c>NULL</c>)
-	*/
-	char *mic_sink_program_path;
-
-	/**
 	   This flag should be set to 1 to disable asynchronous copies
 	   between CPUs and all accelerators.
 	   The AMD implementation of OpenCL is known to fail when
@@ -336,7 +310,7 @@ struct starpu_conf
 	   This can also be specified at compilation time by giving to
 	   the configure script the option \ref
 	   disable-asynchronous-copy "--disable-asynchronous-copy".
-	   (default = 0)
+	   (default = \c 0)
 	*/
 	int disable_asynchronous_copy;
 
@@ -349,7 +323,7 @@ struct starpu_conf
 	   the configure script the option \ref
 	   disable-asynchronous-cuda-copy
 	   "--disable-asynchronous-cuda-copy".
-	   (default = 0)
+	   (default = \c 0)
 	*/
 	int disable_asynchronous_cuda_copy;
 
@@ -366,25 +340,12 @@ struct starpu_conf
 	   the configure script the option \ref
 	   disable-asynchronous-opencl-copy
 	   "--disable-asynchronous-opencl-copy".
-	   (default = 0)
+	   (default = \c 0)
 	*/
 	int disable_asynchronous_opencl_copy;
 
 	/**
 	   This flag should be set to 1 to disable asynchronous copies
-	   between CPUs and MIC accelerators.
-	   This can also be specified with the environment variable
-	   \ref STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY.
-	   This can also be specified at compilation time by giving to
-	   the configure script the option \ref
-	   disable-asynchronous-mic-copy
-	   "--disable-asynchronous-mic-copy".
-	   (default = 0).
-	*/
-	int disable_asynchronous_mic_copy;
-
-	/**
-	   This flag should be set to 1 to disable asynchronous copies
 	   between CPUs and MPI Master Slave devices.
 	   This can also be specified with the environment variable
 	   \ref STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY.
@@ -392,7 +353,7 @@ struct starpu_conf
 	   the configure script the option \ref
 	   disable-asynchronous-mpi-master-slave-copy
 	   "--disable-asynchronous-mpi-master-slave-copy".
-	   (default = 0).
+	   (default = \c 0).
 	*/
 	int disable_asynchronous_mpi_ms_copy;
 
@@ -422,7 +383,7 @@ struct starpu_conf
 	   The number of StarPU drivers that should not be launched by
 	   StarPU, i.e number of elements of the array
 	   starpu_conf::not_launched_drivers.
-	   (default = 0)
+	   (default = \c 0)
 	*/
 	unsigned n_not_launched_drivers;
 
@@ -435,8 +396,20 @@ struct starpu_conf
 	*/
 	uint64_t trace_buffer_size;
 
+	/**
+	   Set the minimum priority used by priorities-aware
+	   schedulers.
+	   This can also be specified with the environment variable \ref
+	   STARPU_MIN_PRIO
+	*/
 	int global_sched_ctx_min_priority;
 
+	/**
+	   Set the maximum priority used by priorities-aware
+	   schedulers.
+	   This can also be specified with the environment variable \ref
+	   STARPU_MAX_PRIO
+	*/
 	int global_sched_ctx_max_priority;
 
 #ifdef STARPU_WORKER_CALLBACKS
@@ -445,14 +418,14 @@ struct starpu_conf
 #endif
 
 	/**
-	   Specify if StarPU should catch SIGINT, SIGSEGV and SIGTRAP
+	   Specify if StarPU should catch \c SIGINT, \c SIGSEGV and \c SIGTRAP
 	   signals to make sure final actions (e.g dumping FxT trace
 	   files) are done even though the application has crashed. By
 	   default (value = \c 1), signals are catched. It should be
 	   disabled on systems which already catch these signals for
 	   their own needs (e.g JVM)
 	   This can also be specified with the environment variable
-	   \ref STARPU_CATCH_SIGNALS
+	   \ref STARPU_CATCH_SIGNALS.
 	 */
 	int catch_signals;
 
@@ -463,14 +436,26 @@ struct starpu_conf
 	unsigned start_perf_counter_collection;
 
 	/**
-	   Minimum spinning backoff of drivers. Default value: \c 1
+	   Minimum spinning backoff of drivers (default = \c 1)
 	 */
 	unsigned driver_spinning_backoff_min;
 
 	/**
-	   Maximum spinning backoff of drivers. Default value: \c 32
+	   Maximum spinning backoff of drivers. (default = \c 32)
 	 */
 	unsigned driver_spinning_backoff_max;
+
+	/**
+	   Specify if CUDA workers should do only fast allocations
+	   when running the datawizard progress of
+	   other memory nodes. This will pass the interval value
+	   _STARPU_DATAWIZARD_ONLY_FAST_ALLOC to the allocation method.
+	   Default value is 0, allowing CUDA workers to do slow
+	   allocations.
+	   This can also be specified with the environment variable
+	   \ref STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES.
+	 */
+	int cuda_only_fast_alloc_other_memnodes;
 };
 
 /**
@@ -490,9 +475,11 @@ int starpu_conf_init(struct starpu_conf *conf);
    starpu_conf::ncpus = 0, starpu_conf::ncuda = 0, etc.
 
    This allows to portably enable only a given type of worker:
-
-   starpu_conf_noworker(&conf);
+   <br/>
+   <c>
+   starpu_conf_noworker(&conf);<br/>
    conf.ncpus = -1;
+   </c>
 */
 int starpu_conf_noworker(struct starpu_conf *conf);
 
@@ -509,7 +496,8 @@ int starpu_init(struct starpu_conf *conf) STARPU_WARN_UNUSED_RESULT;
 
 /**
    Similar to starpu_init(), but also take the \p argc and \p argv as
-   defined by the application.
+   defined by the application, which is necessary when running in
+   Simgrid mode or MPI Master Slave mode.
    Do not call starpu_init() and starpu_initialize() in the same
    program.
 */
@@ -541,6 +529,7 @@ void starpu_shutdown(void);
    should be used to unfreeze the workers.
 */
 void starpu_pause(void);
+
 /**
    Symmetrical call to starpu_pause(), used to resume the workers
    polling for new tasks.
@@ -558,7 +547,7 @@ void starpu_resume(void);
 /**
    Return a PU binding ID which can be used to bind threads with
    starpu_bind_thread_on(). \p flags can be set to
-   STARPU_THREAD_ACTIVE or 0. When \p npreferred is set to non-zero,
+   ::STARPU_THREAD_ACTIVE or 0. When \p npreferred is set to non-zero,
    \p preferred is an array of size \p npreferred in which a
    preference of PU binding IDs can be set. By default StarPU will
    return the first PU available for binding.
@@ -574,7 +563,7 @@ unsigned starpu_get_next_bindid(unsigned flags, unsigned *preferred, unsigned np
    so the caller can tell the user how to avoid the issue.
 
    \p name should be set to a unique string so that different calls
-   with the same name for the same cpuid does not produce a warning.
+   with the same name for the same \p cpuid does not produce a warning.
 */
 int starpu_bind_thread_on(int cpuid, unsigned flags, const char *name);
 
@@ -602,19 +591,28 @@ int starpu_asynchronous_cuda_copy_disabled(void);
 int starpu_asynchronous_opencl_copy_disabled(void);
 
 /**
-   Return 1 if asynchronous data transfers between CPU and MIC devices
-   are disabled.
-*/
-int starpu_asynchronous_mic_copy_disabled(void);
-
-/**
    Return 1 if asynchronous data transfers between CPU and MPI Slave
    devices are disabled.
 */
 int starpu_asynchronous_mpi_ms_copy_disabled(void);
 
+/**
+   Call starpu_profiling_bus_helper_display_summary() and
+   starpu_profiling_worker_helper_display_summary()
+ */
 void starpu_display_stats(void);
 
+/** @} */
+
+/**
+   @defgroup API_Versioning Versioning
+   @{
+*/
+
+/**
+   Return as 3 integers the version of StarPU used when running the
+   application.
+*/
 void starpu_get_version(int *major, int *minor, int *release);
 
 /** @} */

+ 14 - 14
include/starpu_clusters.h

@@ -35,71 +35,71 @@ extern "C"
  */
 
 /**
-   Used when calling starpu_cluster_machine
+   Used when calling starpu_cluster_machine()
  */
 #define STARPU_CLUSTER_MIN_NB			(1<<STARPU_MODE_SHIFT)
 
 /**
-   Used when calling starpu_cluster_machine
+   Used when calling starpu_cluster_machine()
  */
 #define STARPU_CLUSTER_MAX_NB			(2<<STARPU_MODE_SHIFT)
 
 /**
-   Used when calling starpu_cluster_machine
+   Used when calling starpu_cluster_machine()
  */
 #define STARPU_CLUSTER_NB			(3<<STARPU_MODE_SHIFT)
 
 /**
-   Used when calling starpu_cluster_machine
+   Used when calling starpu_cluster_machine()
  */
 #define STARPU_CLUSTER_PREFERE_MIN		(4<<STARPU_MODE_SHIFT)
 
 /**
-   Used when calling starpu_cluster_machine
+   Used when calling starpu_cluster_machine()
  */
 #define STARPU_CLUSTER_KEEP_HOMOGENEOUS		(5<<STARPU_MODE_SHIFT)
 
 /**
-   Used when calling starpu_cluster_machine
+   Used when calling starpu_cluster_machine()
  */
 #define STARPU_CLUSTER_POLICY_NAME		(6<<STARPU_MODE_SHIFT)
 
 /**
-   Used when calling starpu_cluster_machine
+   Used when calling starpu_cluster_machine()
  */
 #define STARPU_CLUSTER_POLICY_STRUCT		(7<<STARPU_MODE_SHIFT)
 
 /**
-   Used when calling starpu_cluster_machine
+   Used when calling starpu_cluster_machine()
  */
 #define STARPU_CLUSTER_CREATE_FUNC		(8<<STARPU_MODE_SHIFT)
 
 /**
-   Used when calling starpu_cluster_machine
+   Used when calling starpu_cluster_machine()
  */
 #define STARPU_CLUSTER_CREATE_FUNC_ARG		(9<<STARPU_MODE_SHIFT)
 /**
-   Used when calling starpu_cluster_machine
+   Used when calling starpu_cluster_machine()
  */
 #define STARPU_CLUSTER_TYPE			(10<<STARPU_MODE_SHIFT)
 
 /**
-   Used when calling starpu_cluster_machine
+   Used when calling starpu_cluster_machine()
  */
 #define STARPU_CLUSTER_AWAKE_WORKERS		(11<<STARPU_MODE_SHIFT)
 
 /**
-   Used when calling starpu_cluster_machine
+   Used when calling starpu_cluster_machine()
  */
 #define STARPU_CLUSTER_PARTITION_ONE		(12<<STARPU_MODE_SHIFT)
 
 /**
-   Used when calling starpu_cluster_machine
+   Used when calling starpu_cluster_machine()
  */
 #define STARPU_CLUSTER_NEW			(13<<STARPU_MODE_SHIFT)
 
 /**
-   Used when calling starpu_cluster_machine
+   Used when calling starpu_cluster_machine()
  */
 #define STARPU_CLUSTER_NCORES			(14<<STARPU_MODE_SHIFT)
 

+ 9 - 14
include/starpu_config.h.in

@@ -57,6 +57,14 @@
 #undef STARPU_USE_CUDA
 
 /**
+   Defined when StarPU has been installed with
+   NVidia-ML support. It should be used in your code to detect the
+   availability of NVML-related functions.
+   @ingroup API_CUDA_Extensions
+*/
+#undef STARPU_HAVE_LIBNVIDIA_ML
+
+/**
    Defined when StarPU has been installed with OpenCL support. It
    should be used in your code to detect the availability of OpenCL as
    shown in \ref FullSourceCodeVectorScal.
@@ -65,13 +73,6 @@
 #undef STARPU_USE_OPENCL
 
 /**
-   Defined when StarPU has been installed with MIC support. It should
-   be used in your code to detect the availability of MIC.
-   @ingroup API_MIC_Extensions
-*/
-#undef STARPU_USE_MIC
-
-/**
    Defined when StarPU has been installed with MPI Master Slave
    support. It should be used in your code to detect the availability
    of MPI Master Slave.
@@ -152,6 +153,7 @@
 #undef STARPU_HAVE_SYNC_FETCH_AND_OR
 #undef STARPU_HAVE_SYNC_LOCK_TEST_AND_SET
 #undef STARPU_HAVE_SYNC_SYNCHRONIZE
+#undef STARPU_HAVE_ATOMIC_EXCHANGE_N
 
 #undef STARPU_DEVEL
 #undef STARPU_MODEL_DEBUG
@@ -224,13 +226,6 @@
 #undef STARPU_MAXOPENCLDEVS
 
 /**
-   Define the maximum number of MIC devices that are supported by
-   StarPU.
-   @ingroup API_MIC_Extensions
-*/
-#undef STARPU_MAXMICDEVS
-
-/**
    Define the maximum number of workers managed by StarPU.
    @ingroup API_Workers_Properties
 */

+ 12 - 0
include/starpu_cuda.h

@@ -24,6 +24,10 @@
 #include <cuda_runtime.h>
 #include <cuda_runtime_api.h>
 
+#ifdef STARPU_HAVE_LIBNVIDIA_ML
+#include <nvml.h>
+#endif
+
 #ifdef __cplusplus
 extern "C"
 {
@@ -129,6 +133,14 @@ int starpu_cuda_copy3d_async_sync(void *src_ptr, unsigned src_node, void *dst_pt
 */
 void starpu_cuda_set_device(unsigned devid);
 
+#ifdef STARPU_HAVE_LIBNVIDIA_ML
+/**
+  Return the nvml device for a CUDA device
+*/
+nvmlDevice_t starpu_cuda_get_nvmldev(unsigned devid);
+#endif
+
+
 /** @} */
 
 #ifdef __cplusplus

+ 53 - 6
include/starpu_data.h

@@ -110,7 +110,15 @@ enum starpu_data_access_mode
 				   src/sched_policies/work_stealing_policy.c
 				   source code.
 				*/
-	STARPU_ACCESS_MODE_MAX=(1<<7) /**< todo */
+	STARPU_MPI_REDUX=(1<<7), /**< Inter-node reduction only. Codelets
+				    contributing to these reductions should
+				    be registered with ::STARPU_RW | ::STARPU_COMMUTE
+				    access modes.
+			            When inserting these tasks through the
+				    MPI layer however, the access mode needs
+				    to be ::STARPU_MPI_REDUX. */
+	STARPU_ACCESS_MODE_MAX=(1<<8) /**< The purpose of ::STARPU_ACCESS_MODE_MAX is to
+					be the maximum of this enum. */
 };
 
 struct starpu_data_interface_ops;
@@ -298,8 +306,14 @@ int starpu_data_acquire_on_node_cb_sequential_consistency_quick(starpu_data_hand
    to retrieve the jobid of the synchronization tasks. \e pre_sync_jobid happens
    just before the acquisition, and \e post_sync_jobid happens just after the
    release.
+
+   callback_acquired is called when the data is acquired in terms of semantic,
+   but the data is not fetched yet. It is given a pointer to the node, which it
+   can modify if it wishes so.
+
+   This is a very internal interface, subject to changes, do not use this.
 */
-int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency, int quick, long *pre_sync_jobid, long *post_sync_jobid);
+int starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode, void (*callback_acquired)(void *arg, int *node, enum starpu_data_access_mode mode), void (*callback)(void *arg), void *arg, int sequential_consistency, int quick, long *pre_sync_jobid, long *post_sync_jobid, int prio);
 
 /**
    The application can call this function instead of starpu_data_acquire() so as to
@@ -409,6 +423,26 @@ void starpu_arbiter_destroy(starpu_arbiter_t arbiter);
 int starpu_data_request_allocation(starpu_data_handle_t handle, unsigned node);
 
 /**
+   Prefetch levels
+
+   Data requests are ordered by priorities, but also by prefetching level,
+   between data that a task wants now, and data that we will probably want
+   "soon".
+*/
+enum starpu_is_prefetch
+{
+	/** A task really needs it now! */
+	STARPU_FETCH = 0,
+	/** A task will need it soon */
+	STARPU_TASK_PREFETCH = 1,
+	/** It is a good idea to have it asap */
+	STARPU_PREFETCH = 2,
+	/** Get this here when you have time to */
+	STARPU_IDLEFETCH = 3,
+	STARPU_NFETCH
+};
+
+/**
    Issue a fetch request for the data \p handle to \p node, i.e.
    requests that the data be replicated to the given node as soon as possible, so that it is
    available there for tasks. If \p async is 0, the call will
@@ -445,7 +479,7 @@ int starpu_data_idle_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned
 
 /**
    Check whether a valid copy of \p handle is currently available on
-   memory node \p node.
+   memory node \p node (or a transfer request for getting so is ongoing).
 */
 unsigned starpu_data_is_on_node(starpu_data_handle_t handle, unsigned node);
 
@@ -526,6 +560,17 @@ unsigned starpu_data_get_ooc_flag(starpu_data_handle_t handle);
 
 /**
    Query the status of \p handle on the specified \p memory_node.
+
+   \p is_allocated tells whether memory was allocated there for the data.
+   \p is_valid tells whether the actual value is available there.
+   \p is_loading tells whether the actual value is getting loaded there.
+   \p is_requested tells whether the actual value is requested to be loaded
+   there by some fetch/prefetch/idlefetch request.
+*/
+void starpu_data_query_status2(starpu_data_handle_t handle, int memory_node, int *is_allocated, int *is_valid, int *is_loading, int *is_requested);
+
+/**
+   Same as starpu_data_query_status2(), but without the is_loading parameter.
 */
 void starpu_data_query_status(starpu_data_handle_t handle, int memory_node, int *is_allocated, int *is_valid, int *is_requested);
 
@@ -534,8 +579,10 @@ struct starpu_codelet;
 /**
    Set the codelets to be used for \p handle when it is accessed in the
    mode ::STARPU_REDUX. Per-worker buffers will be initialized with
-   the codelet \p init_cl, and reduction between per-worker buffers will be
-   done with the codelet \p redux_cl.
+   the codelet \p init_cl (which has to take one handle with STARPU_W), and
+   reduction between per-worker buffers will be done with the codelet \p
+   redux_cl (which has to take a first accumulation handle with
+   STARPU_RW|STARPU_COMMUTE, and a second contribution handle with STARPU_R).
 */
 void starpu_data_set_reduction_methods(starpu_data_handle_t handle, struct starpu_codelet *redux_cl, struct starpu_codelet *init_cl);
 
@@ -561,7 +608,7 @@ void *starpu_data_get_user_data(starpu_data_handle_t handle);
 /**
   Check whether data \p handle can be evicted now from node \p node
 */
-int starpu_data_can_evict(starpu_data_handle_t handle, unsigned node);
+int starpu_data_can_evict(starpu_data_handle_t handle, unsigned node, enum starpu_is_prefetch is_prefetch);
 
 /** @} */
 

+ 24 - 44
include/starpu_data_interfaces.h

@@ -123,13 +123,6 @@ struct starpu_data_copy_methods
 
 	/**
 	   Define how to copy data from the \p src_interface interface on the
-	   \p src_node CPU node to the \p dst_interface interface on the \p
-	   dst_node MIC node. Return 0 on success.
-	*/
-	int (*ram_to_mic)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
-
-	/**
-	   Define how to copy data from the \p src_interface interface on the
 	   \p src_node CUDA node to the \p dst_interface interface on the \p
 	   dst_node CPU node. Return 0 on success.
 	*/
@@ -158,13 +151,6 @@ struct starpu_data_copy_methods
 
 	/**
 	   Define how to copy data from the \p src_interface interface on the
-	   \p src_node MIC node to the \p dst_interface interface on the \p
-	   dst_node CPU node. Return 0 on success.
-	*/
-	int (*mic_to_ram)(void *src_interface, unsigned srd_node, void *dst_interface, unsigned dst_node);
-
-	/**
-	   Define how to copy data from the \p src_interface interface on the
 	   \p src_node CPU node to the \p dst_interface interface on the \p
 	   dst_node MPI Slave node. Return 0 on success.
 	*/
@@ -289,26 +275,6 @@ struct starpu_data_copy_methods
 
 	/**
 	   Define how to copy data from the \p src_interface interface on the
-	   \p src_node CPU node to the \p dst_interface interface on the \p
-	   dst_node MIC node. Must return 0 if the transfer was actually
-	   completed completely synchronously, or <c>-EAGAIN</c> if at least
-	   some transfers are still ongoing and should be awaited for by the
-	   core.
-	*/
-	int (*ram_to_mic_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
-
-	/**
-	   Define how to copy data from the \p src_interface interface on the
-	   \p src_node MIC node to the \p dst_interface interface on the \p
-	   dst_node CPU node. Must return 0 if the transfer was actually
-	   completed completely synchronously, or <c>-EAGAIN</c> if at least
-	   some transfers are still ongoing and should be awaited for by the
-	   core.
-	*/
-	int (*mic_to_ram_async)(void *src_interface, unsigned srd_node, void *dst_interface, unsigned dst_node);
-
-	/**
-	   Define how to copy data from the \p src_interface interface on the
 	   \p src_node node to the \p dst_interface interface on the \p
 	   dst_node node. This is meant to be implemented through the
 	   starpu_interface_copy() helper, to which async_data should be
@@ -321,7 +287,7 @@ struct starpu_data_copy_methods
 	   data blocks. If the interface is more involved than
 	   this, i.e. it needs to collect pieces of data before
 	   transferring, starpu_data_interface_ops::pack_data and
-	   starpu_data_interface_ops::unpack_data should be implemented instead,
+	   starpu_data_interface_ops::peek_data should be implemented instead,
 	   and the core will just transfer the resulting data buffer.
 	*/
 	int (*any_to_any)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
@@ -540,6 +506,12 @@ struct starpu_data_interface_ops
 	int (*pack_data) (starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ssize_t *count);
 
 	/**
+	   Read the data handle from the contiguous buffer at the address
+	   \p ptr of size \p count.
+	*/
+	int (*peek_data) (starpu_data_handle_t handle, unsigned node, void *ptr, size_t count);
+
+	/**
 	   Unpack the data handle from the contiguous buffer at the address
 	   \p ptr of size \p count.
 	   The memory at the address \p ptr should be freed after the data unpacking operation.
@@ -637,9 +609,25 @@ int starpu_data_pack_node(starpu_data_handle_t handle, unsigned node, void **ptr
 int starpu_data_pack(starpu_data_handle_t handle, void **ptr, starpu_ssize_t *count);
 
 /**
+   Read in handle's \p node replicate the data located at \p ptr
+   of size \p count as described by the interface of the data. The interface
+   registered at \p handle must define a peeking operation (see
+   starpu_data_interface_ops).
+*/
+int starpu_data_peek_node(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count);
+
+/**
+   Read in handle's local replicate the data located at \p ptr
+   of size \p count as described by the interface of the data. The interface
+   registered at \p handle must define a peeking operation (see
+   starpu_data_interface_ops).
+*/
+int starpu_data_peek(starpu_data_handle_t handle, void *ptr, size_t count);
+
+/**
    Unpack in handle the data located at \p ptr of size \p count allocated
    on node \p node as described by the interface of the data. The interface
-   registered at \p handle must define a unpacking operation (see
+   registered at \p handle must define an unpacking operation (see
    starpu_data_interface_ops).
 */
 int starpu_data_unpack_node(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count);
@@ -2021,9 +2009,6 @@ struct starpu_multiformat_data_interface_ops
 	size_t cuda_elemsize;                     /**< size of each element on CUDA devices */
 	struct starpu_codelet *cpu_to_cuda_cl;    /**< pointer to a codelet which converts from CPU to CUDA */
 	struct starpu_codelet *cuda_to_cpu_cl;    /**< pointer to a codelet which converts from CUDA to CPU */
-	size_t mic_elemsize;                      /**< size of each element on MIC devices */
-	struct starpu_codelet *cpu_to_mic_cl;     /**< pointer to a codelet which converts from CPU to MIC */
-	struct starpu_codelet *mic_to_cpu_cl;     /**< pointer to a codelet which converts from MIC to CPU */
 };
 
 struct starpu_multiformat_interface
@@ -2033,7 +2018,6 @@ struct starpu_multiformat_interface
 	void *cpu_ptr;
 	void *cuda_ptr;
 	void *opencl_ptr;
-	void *mic_ptr;
 	uint32_t nx;
 	struct starpu_multiformat_data_interface_ops *ops;
 };
@@ -2061,10 +2045,6 @@ void starpu_multiformat_data_register(starpu_data_handle_t *handle, int home_nod
 */
 #define STARPU_MULTIFORMAT_GET_OPENCL_PTR(interface) (((struct starpu_multiformat_interface *)(interface))->opencl_ptr)
 /**
-   Return the local pointer to the data with MIC format.
- */
-#define STARPU_MULTIFORMAT_GET_MIC_PTR(interface) (((struct starpu_multiformat_interface *)(interface))->mic_ptr)
-/**
    Return the number of elements in the data.
  */
 #define STARPU_MULTIFORMAT_GET_NX(interface)  (((struct starpu_multiformat_interface *)(interface))->nx)

+ 8 - 0
include/starpu_hash.h

@@ -39,6 +39,14 @@ extern "C"
 uint32_t starpu_hash_crc32c_be_n(const void *input, size_t n, uint32_t inputcrc);
 
 /**
+   Compute the CRC of a pointer value seeded by the \p inputcrc
+   <em>current state</em>. The return value should be considered as the new
+   <em>current state</em> for future CRC computation. This is used for computing
+   data size footprint.
+*/
+uint32_t starpu_hash_crc32c_be_ptr(void *input, uint32_t inputcrc);
+
+/**
    Compute the CRC of a 32bit number seeded by the \p inputcrc
    <em>current state</em>. The return value should be considered as the new
    <em>current state</em> for future CRC computation. This is used for computing

+ 0 - 61
include/starpu_mic.h

@@ -1,61 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2012-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#ifndef __STARPU_MIC_H__
-#define __STARPU_MIC_H__
-
-#include <starpu_config.h>
-
-#ifdef STARPU_USE_MIC
-
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-
-/**
-   @defgroup API_MIC_Extensions MIC Extensions
-   @{
-*/
-
-/**
-   Type for MIC function symbols
-*/
-typedef void *starpu_mic_func_symbol_t;
-
-/**
-   Initiate a lookup on each MIC device to find the address of the
-   function named \p func_name, store it in the global array kernels
-   and return the index in the array through \p symbol.
-*/
-int starpu_mic_register_kernel(starpu_mic_func_symbol_t *symbol, const char *func_name);
-
-/**
-   If successfull, return the pointer to the function defined by \p symbol on
-   the device linked to the called device. This can for instance be used
-   in a starpu_mic_func_t implementation.
-*/
-starpu_mic_kernel_t starpu_mic_get_kernel(starpu_mic_func_symbol_t symbol);
-
-/** @} */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* STARPU_USE_MIC */
-
-#endif /* __STARPU_MIC_H__ */

+ 0 - 7
include/starpu_mod.f90

@@ -79,13 +79,6 @@ MODULE starpu_mod
      END SUBROUTINE starpu_asynchronous_opencl_copy_disabled
   END INTERFACE
 
-  ! starpu_asynchronous_mic_copy_disabled
-  INTERFACE
-     SUBROUTINE starpu_asynchronous_mic_copy_disabled() BIND(C)
-       USE iso_c_binding
-     END SUBROUTINE starpu_asynchronous_mic_copy_disabled
-  END INTERFACE
-
   ! starpu_display_stats
   INTERFACE
      SUBROUTINE starpu_display_stats() BIND(C)

+ 9 - 9
include/starpu_perf_monitoring.h

@@ -40,10 +40,10 @@ extern "C"
  */
 enum starpu_perf_counter_scope
 {
-	starpu_perf_counter_scope_undefined     = 0, /** undefined scope */
-	starpu_perf_counter_scope_global        = 2, /** global scope */
-	starpu_perf_counter_scope_per_worker    = 4, /** per-worker scope */
-	starpu_perf_counter_scope_per_codelet   = 6  /** per-codelet scope */
+	starpu_perf_counter_scope_undefined     = 0, /**< undefined scope */
+	starpu_perf_counter_scope_global        = 2, /**< global scope */
+	starpu_perf_counter_scope_per_worker    = 4, /**< per-worker scope */
+	starpu_perf_counter_scope_per_codelet   = 6  /**< per-codelet scope */
 };
 
 /**
@@ -51,11 +51,11 @@ enum starpu_perf_counter_scope
  */
 enum starpu_perf_counter_type
 {
-	starpu_perf_counter_type_undefined = 0, /** underfined value type */
-	starpu_perf_counter_type_int32     = 1, /** signed 32-bit integer value */
-	starpu_perf_counter_type_int64     = 2, /** signed 64-bit integer value */
-	starpu_perf_counter_type_float     = 3, /** 32-bit single precision floating-point value */
-	starpu_perf_counter_type_double    = 4  /** 64-bit double precision floating-point value */
 +	starpu_perf_counter_type_undefined = 0, /**< undefined value type */
+	starpu_perf_counter_type_int32     = 1, /**< signed 32-bit integer value */
+	starpu_perf_counter_type_int64     = 2, /**< signed 64-bit integer value */
+	starpu_perf_counter_type_float     = 3, /**< 32-bit single precision floating-point value */
+	starpu_perf_counter_type_double    = 4  /**< 64-bit double precision floating-point value */
 };
 
 struct starpu_perf_counter_listener;

+ 109 - 84
include/starpu_perf_steering.h

@@ -35,183 +35,208 @@ extern "C"
    \anchor PM_API
    @{
 */
+
 /**
    Enum of all possible performance knob scopes.
  */
 enum starpu_perf_knob_scope
 {
-	starpu_perf_knob_scope_undefined     = 0, /** undefined scope */
-	starpu_perf_knob_scope_global        = 1, /** global scope */
-	starpu_perf_knob_scope_per_worker    = 3, /** per-worker scope */
-	starpu_perf_knob_scope_per_scheduler = 5  /** per-scheduler scope */
+	starpu_perf_knob_scope_undefined     = 0, /**< undefined scope */
+	starpu_perf_knob_scope_global        = 1, /**< global scope */
+	starpu_perf_knob_scope_per_worker    = 3, /**< per-worker scope */
+	starpu_perf_knob_scope_per_scheduler = 5  /**< per-scheduler scope */
 };
 
 /**
-  Enum of all possible performance knob value type.
+   Enum of all possible performance knob value type.
  */
 enum starpu_perf_knob_type
 {
-	starpu_perf_knob_type_undefined = 0, /** underfined value type */
-	starpu_perf_knob_type_int32     = 1, /** signed 32-bit integer value */
-	starpu_perf_knob_type_int64     = 2, /** signed 64-bit integer value */
-	starpu_perf_knob_type_float     = 3, /** 32-bit single precision floating-point value */
-	starpu_perf_knob_type_double    = 4  /** 64-bit double precision floating-point value */
 +	starpu_perf_knob_type_undefined = 0, /**< undefined value type */
+	starpu_perf_knob_type_int32     = 1, /**< signed 32-bit integer value */
+	starpu_perf_knob_type_int64     = 2, /**< signed 64-bit integer value */
+	starpu_perf_knob_type_float     = 3, /**< 32-bit single precision floating-point value */
+	starpu_perf_knob_type_double    = 4  /**< 64-bit double precision floating-point value */
 };
 
 /**
-  Translate scope name constant string to scope id.
-  */
+   Translate scope name constant string to scope id.
+*/
 int starpu_perf_knob_scope_name_to_id(const char *name);
+
 /**
-  Translate scope id to scope name constant string.
-  */
+   Translate scope id to scope name constant string.
+*/
 const char *starpu_perf_knob_scope_id_to_name(enum starpu_perf_knob_scope scope);
 
 /**
-  Translate type name constant string to type id.
-  */
+   Translate type name constant string to type id.
+*/
 int starpu_perf_knob_type_name_to_id(const char *name);
+
 /**
-  Translate type id to type name constant string.
-  */
+   Translate type id to type name constant string.
+*/
 const char *starpu_perf_knob_type_id_to_name(enum starpu_perf_knob_type type);
 
 /**
-  Return the number of performance steering knobs for the given scope.
-  */
+   Return the number of performance steering knobs for the given scope.
+*/
 int starpu_perf_knob_nb(enum starpu_perf_knob_scope scope);
+
 /**
-  Translate a performance knob name to its id.
-  */
+   Translate a performance knob name to its id.
+*/
 int starpu_perf_knob_name_to_id(enum starpu_perf_knob_scope scope, const char *name);
+
 /**
-  Translate a performance knob name to its id.
-  */
+   Translate a performance knob name to its id.
+*/
 int starpu_perf_knob_nth_to_id(enum starpu_perf_knob_scope scope, int nth);
+
 /**
-  Translate a performance knob rank in its scope to its knob id.
-  */
+   Translate a performance knob rank in its scope to its knob id.
+*/
 const char *starpu_perf_knob_id_to_name(int id);
+
 /**
-  Translate a knob id to its name constant string.
-  */
+   Translate a knob id to its name constant string.
+*/
 int starpu_perf_knob_get_type_id(int id);
+
 /**
-  Return the knob's help string.
-  */
+   Return the knob's help string.
+*/
 const char *starpu_perf_knob_get_help_string(int id);
 
 /**
-  Display the list of knobs defined in the given scope.
-  */
+   Display the list of knobs defined in the given scope.
+*/
 void starpu_perf_knob_list_avail(enum starpu_perf_knob_scope scope);
+
 /**
-  Display the list of knobs defined in all scopes.
-  */
+   Display the list of knobs defined in all scopes.
+*/
 void starpu_perf_knob_list_all_avail(void);
 
 /**
-  Get knob value for Global scope.
-  */
+   Get knob value for Global scope.
+*/
 int32_t starpu_perf_knob_get_global_int32_value (const int knob_id);
+
 /**
-  Get knob value for Global scope.
-  */
+   Get knob value for Global scope.
+*/
 int64_t starpu_perf_knob_get_global_int64_value (const int knob_id);
+
 /**
-  Get knob value for Global scope.
-  */
+   Get knob value for Global scope.
+*/
 float   starpu_perf_knob_get_global_float_value (const int knob_id);
+
 /**
-  Get knob value for Global scope.
-  */
+   Get knob value for Global scope.
+*/
 double  starpu_perf_knob_get_global_double_value(const int knob_id);
 
 /**
-  Set int32 knob value for Global scope.
-  */
+   Set int32 knob value for Global scope.
+*/
 void starpu_perf_knob_set_global_int32_value (const int knob_id, int32_t new_value);
+
 /**
-  Set int64 knob value for Global scope.
-  */
+   Set int64 knob value for Global scope.
+*/
 void starpu_perf_knob_set_global_int64_value (const int knob_id, int64_t new_value);
+
 /**
-  Set float knob value for Global scope.
-  */
+   Set float knob value for Global scope.
+*/
 void starpu_perf_knob_set_global_float_value (const int knob_id, float   new_value);
+
 /**
-  Set double knob value for Global scope.
-  */
+   Set double knob value for Global scope.
+*/
 void starpu_perf_knob_set_global_double_value(const int knob_id, double  new_value);
 
-
 /**
- Get int32 value for Per_worker scope.
-  */
+   Get int32 value for Per_worker scope.
+*/
 int32_t starpu_perf_knob_get_per_worker_int32_value (const int knob_id, unsigned workerid);
+
 /**
- Get int64 value for Per_worker scope.
-  */
+   Get int64 value for Per_worker scope.
+*/
 int64_t starpu_perf_knob_get_per_worker_int64_value (const int knob_id, unsigned workerid);
+
 /**
- Get float value for Per_worker scope.
-  */
+   Get float value for Per_worker scope.
+*/
 float   starpu_perf_knob_get_per_worker_float_value (const int knob_id, unsigned workerid);
+
 /**
- Get double value for Per_worker scope.
-  */
+   Get double value for Per_worker scope.
+*/
 double  starpu_perf_knob_get_per_worker_double_value(const int knob_id, unsigned workerid);
 
 /**
- Set int32 value for Per_worker scope.
-  */
+   Set int32 value for Per_worker scope.
+*/
 void starpu_perf_knob_set_per_worker_int32_value (const int knob_id, unsigned workerid, int32_t new_value);
+
 /**
- Set int64 value for Per_worker scope.
-  */
+   Set int64 value for Per_worker scope.
+*/
 void starpu_perf_knob_set_per_worker_int64_value (const int knob_id, unsigned workerid, int64_t new_value);
+
 /**
- Set float value for Per_worker scope.
-  */
+   Set float value for Per_worker scope.
+*/
 void starpu_perf_knob_set_per_worker_float_value (const int knob_id, unsigned workerid, float   new_value);
+
 /**
- Set double value for Per_worker scope.
-  */
+   Set double value for Per_worker scope.
+*/
 void starpu_perf_knob_set_per_worker_double_value(const int knob_id, unsigned workerid, double  new_value);
 
-
 /**
- Get int32 value for per_scheduler scope.
-  */
+   Get int32 value for per_scheduler scope.
+*/
 int32_t starpu_perf_knob_get_per_scheduler_int32_value (const int knob_id, const char * sched_policy_name);
+
 /**
- Get int64 value for per_scheduler scope.
-  */
+   Get int64 value for per_scheduler scope.
+*/
 int64_t starpu_perf_knob_get_per_scheduler_int64_value (const int knob_id, const char * sched_policy_name);
+
 /**
- Get float value for per_scheduler scope.
-  */
+   Get float value for per_scheduler scope.
+*/
 float   starpu_perf_knob_get_per_scheduler_float_value (const int knob_id, const char * sched_policy_name);
+
 /**
- Get double value for per_scheduler scope.
-  */
+   Get double value for per_scheduler scope.
+*/
 double  starpu_perf_knob_get_per_scheduler_double_value(const int knob_id, const char * sched_policy_name);
 
 /**
- Set int32 value for per_scheduler scope.
-  */
+   Set int32 value for per_scheduler scope.
+*/
 void starpu_perf_knob_set_per_scheduler_int32_value (const int knob_id, const char * sched_policy_name, int32_t new_value);
+
 /**
- Set int64 value for per_scheduler scope.
-  */
+   Set int64 value for per_scheduler scope.
+*/
 void starpu_perf_knob_set_per_scheduler_int64_value (const int knob_id, const char * sched_policy_name, int64_t new_value);
+
 /**
- Set float value for per_scheduler scope.
-  */
+   Set float value for per_scheduler scope.
+*/
 void starpu_perf_knob_set_per_scheduler_float_value (const int knob_id, const char * sched_policy_name, float   new_value);
+
 /**
- Set double value for per_scheduler scope.
-  */
+   Set double value for per_scheduler scope.
+*/
 void starpu_perf_knob_set_per_scheduler_double_value(const int knob_id, const char * sched_policy_name, double  new_value);
 
 /** @} */

+ 3 - 5
include/starpu_perfmodel.h

@@ -310,10 +310,10 @@ struct starpu_perfmodel
 void starpu_perfmodel_init(struct starpu_perfmodel *model);
 
 /**
-   Deinitialize the \p model performance model structure. You need to call this 
-   before deallocating the structure. You will probably want to call 
+   Deinitialize the \p model performance model structure. You need to call this
+   before deallocating the structure. You will probably want to call
    starpu_perfmodel_unload_model() before calling this function, to save the perfmodel.
-*/   
+*/
 int starpu_perfmodel_deinit(struct starpu_perfmodel *model);
 
 /**
@@ -322,7 +322,6 @@ int starpu_perfmodel_deinit(struct starpu_perfmodel *model);
    - \p workerid is the worker on which calibration is to be performed (in the case of GPUs, use -1 for CPUs)
    - \p archi is the type of architecture on which calibration will be run
 */
-
 int starpu_energy_start(int workerid, enum starpu_worker_archtype archi);
 
 /**
@@ -335,7 +334,6 @@ int starpu_energy_start(int workerid, enum starpu_worker_archtype archi);
    - \p workerid is the worker on which calibration was performed (in the case of GPUs, use -1 for CPUs)
    - \p archi is the type of architecture on which calibration was run
 */
-
 int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task, unsigned nimpl, unsigned ntasks, int workerid, enum starpu_worker_archtype archi);
 
 

+ 11 - 0
include/starpu_sched_component.h

@@ -215,6 +215,10 @@ struct starpu_sched_tree *starpu_sched_tree_create(unsigned sched_ctx_id) STARPU
    destroy tree and free all non shared component in it.
 */
 void starpu_sched_tree_destroy(struct starpu_sched_tree *tree);
+/**
 +   Call starpu_sched_tree_destroy(), ready for use for the starpu_sched_policy::deinit_sched field.
+ */
+void starpu_sched_tree_deinitialize(unsigned sched_ctx_id);
 struct starpu_sched_tree *starpu_sched_tree_get(unsigned sched_ctx_id);
 /**
    recursively set all starpu_sched_component::workers, do not take into account shared parts (except workers).
@@ -789,6 +793,13 @@ struct starpu_sched_tree *starpu_sched_component_make_scheduler(unsigned sched_c
 #define STARPU_SCHED_SIMPLE_FIFOS_BELOW_EXP	(1<<13)
 
 /**
+   Request to prepend a component before the decision component. This should be
+   used alone and followed by the component creation function pointer and its
+   data.
+*/
+#define STARPU_SCHED_SIMPLE_PRE_DECISION	(1<<14)
+
+/**
    Create a simple modular scheduler tree around a scheduling decision-making
    component \p component. The details of what should be built around \p component
    is described by \p flags. The different STARPU_SCHED_SIMPL_DECIDE_* flags are

+ 6 - 1
include/starpu_sched_ctx.h

@@ -336,7 +336,12 @@ void starpu_sched_ctx_move_task_to_ctx_locked(struct starpu_task *task, unsigned
 
 int starpu_sched_ctx_get_worker_rank(unsigned sched_ctx_id);
 
-void (*starpu_sched_ctx_get_sched_policy_init(unsigned sched_ctx_id))(unsigned);
+/**
+   Return the function associated with the scheduler context \p
+   sched_ctx_id which was given through the field
+   starpu_conf::sched_policy_callback
+*/
+void (*starpu_sched_ctx_get_sched_policy_callback(unsigned sched_ctx_id))(unsigned);
 
 unsigned starpu_sched_ctx_has_starpu_scheduler(unsigned sched_ctx_id, unsigned *awake_workers);
 

+ 113 - 43
include/starpu_task.h

@@ -75,13 +75,6 @@ extern "C"
 /**
    To be used when setting the field starpu_codelet::where (or
    starpu_task::where) to specify the codelet (or the task) may be
-   executed on a MIC processing unit.
-*/
-#define STARPU_MIC	STARPU_WORKER_TO_MASK(STARPU_MIC_WORKER)
-
-/**
-   To be used when setting the field starpu_codelet::where (or
-   starpu_task::where) to specify the codelet (or the task) may be
    executed on a MPI Slave processing unit.
 */
 #define STARPU_MPI_MS	STARPU_WORKER_TO_MASK(STARPU_MPI_MS_WORKER)
@@ -178,16 +171,6 @@ typedef void (*starpu_cuda_func_t)(void **, void*);
 typedef void (*starpu_opencl_func_t)(void **, void*);
 
 /**
-   MIC implementation of a codelet.
-*/
-typedef void (*starpu_mic_kernel_t)(void **, void*);
-
-/**
-  MIC kernel for a codelet
-*/
-typedef starpu_mic_kernel_t (*starpu_mic_func_t)(void);
-
-/**
    MPI Master Slave kernel for a codelet
 */
 typedef void (*starpu_mpi_ms_kernel_t)(void **, void*);
@@ -397,22 +380,6 @@ struct starpu_codelet
 
 	/**
 	   Optional array of function pointers to a function which
-	   returns the MIC implementation of the codelet. The
-	   functions prototype must be:
-	   \code{.c}
-	   starpu_mic_kernel_t mic_func(struct starpu_codelet *cl, unsigned nimpl)
-	   \endcode
-	   If the field starpu_codelet::where is set, then the field
-	   starpu_codelet::mic_funcs is ignored if ::STARPU_MIC does
-	   not appear in the field starpu_codelet::where. It can be
-	   <c>NULL</c> if starpu_codelet::cpu_funcs_name is
-	   non-<c>NULL</c>, in which case StarPU will simply make a
-	   symbol lookup to get the implementation.
-	*/
-	starpu_mic_func_t mic_funcs[STARPU_MAXIMPLEMENTATIONS];
-
-	/**
-	   Optional array of function pointers to a function which
 	   returns the MPI Master Slave implementation of the codelet.
 	   The functions prototype must be:
 	   \code{.c}
@@ -431,8 +398,8 @@ struct starpu_codelet
 	   Optional array of strings which provide the name of the CPU
 	   functions referenced in the array
 	   starpu_codelet::cpu_funcs. This can be used when running on
-	   MIC devices for StarPU to simply look
-	   up the MIC function implementation through its name.
+	   MPI MS devices for StarPU to simply look
+	   up the MPI MS function implementation through its name.
 	*/
 	const char *cpu_funcs_name[STARPU_MAXIMPLEMENTATIONS];
 
@@ -703,6 +670,7 @@ struct starpu_task
 	   when using ::STARPU_R and alike.
 	*/
 	starpu_data_handle_t handles[STARPU_NMAXBUFS];
+
 	/**
 	   Array of Data pointers to the memory node where execution
 	   will happen, managed by the DSM.
@@ -710,6 +678,7 @@ struct starpu_task
 	   This is filled by StarPU.
 	*/
 	void *interfaces[STARPU_NMAXBUFS];
+
 	/**
 	   Used only when starpu_codelet::nbuffers is \ref
 	   STARPU_VARIABLE_NBUFFERS.
@@ -770,12 +739,56 @@ struct starpu_task
 	size_t cl_arg_size;
 
 	/**
+	   Optional pointer which points to the return value of the submitted task.
+	   The default value is <c>NULL</c>. starpu_codelet_pack_arg()
+	   and starpu_codelet_unpack_arg() can be used to respectively
+	   pack and unpack the return value into and from it. starpu_task::cl_ret
+	   can be used for MPI support. The only requirement is that
+	   the size of the return value must be set in starpu_task::cl_ret_size.
+	*/
+	void *cl_ret;
+
+	/**
+	   Optional field. The buffer of starpu_codelet_pack_arg()
+	   and starpu_codelet_unpack_arg() can be allocated with
+	   the starpu_task::cl_ret_size bytes starting at address starpu_task::cl_ret.
+	   starpu_task::cl_ret_size can be used for MPI support.
+	*/
+	size_t cl_ret_size;
+
+	/**
+	   Optional field, the default value is <c>NULL</c>. This is a
+	   function pointer of prototype <c>void (*f)(void *)</c> which
+	   specifies a possible callback. If this pointer is non-<c>NULL</c>,
+	   the callback function is executed on the host after the execution of
+	   the task. Contrary to starpu_task::callback_func, it is called
+	   before releasing tasks which depend on this task, so those cannot be
+	   already executing. The callback is passed
+	   the value contained in the starpu_task::epilogue_callback_arg field.
+	   No callback is executed if the field is set to <c>NULL</c>.
+
+	   With starpu_task_insert() and alike this can be specified thanks to
+	   ::STARPU_EPILOGUE_CALLBACK followed by the function pointer.
+	*/
+	void (*epilogue_callback_func)(void *);
+
+	/**
+	   Optional field, the default value is <c>NULL</c>. This is
+	   the pointer passed to the epilogue callback function. This field is
+	   ignored if the field starpu_task::epilogue_callback_func is set to
+	   <c>NULL</c>.
+	*/
+	void *epilogue_callback_arg;
+
+	/**
 	   Optional field, the default value is <c>NULL</c>. This is a
 	   function pointer of prototype <c>void (*f)(void *)</c>
 	   which specifies a possible callback. If this pointer is
 	   non-<c>NULL</c>, the callback function is executed on the
-	   host after the execution of the task. Tasks which depend on
-	   it might already be executing. The callback is passed the
+	   host after the execution of the task. Contrary to
+	   starpu_task::epilogue_callback_func, it is called after releasing
+	   tasks which depend on this task, so those
+	   might already be executing. The callback is passed the
 	   value contained in the starpu_task::callback_arg field. No
 	   callback is executed if the field is set to <c>NULL</c>.
 
@@ -786,6 +799,7 @@ struct starpu_task
 	   pointer and the argument.
 	*/
 	void (*callback_func)(void *);
+
 	/**
 	   Optional field, the default value is <c>NULL</c>. This is
 	   the pointer passed to the callback function. This field is
@@ -826,7 +840,30 @@ struct starpu_task
 	*/
 	void *prologue_callback_arg;
 
+	/**
+	   Optional field, the default value is <c>NULL</c>. This is a
+	   function pointer of prototype <c>void (*f)(void*)</c>
+	   which specifies a possible callback. If this pointer is
+	   non-<c>NULL</c>, the callback function is executed on the host
+	   when the task is pop-ed from the scheduler, just before getting
+	   executed. The callback is passed the value contained in the
+	   starpu_task::prologue_callback_pop_arg field.
+	   No callback is executed if the field is set to <c>NULL</c>.
+
+	   With starpu_task_insert() and alike this can be specified thanks to
+	   ::STARPU_PROLOGUE_CALLBACK_POP followed by the function pointer.
+	*/
 	void (*prologue_callback_pop_func)(void *);
+
+	/**
+	   Optional field, the default value is <c>NULL</c>. This is
+	   the pointer passed to the prologue_callback_pop function. This
+	   field is ignored if the field
+	   starpu_task::prologue_callback_pop_func is set to <c>NULL</c>.
+
+	   With starpu_task_insert() and alike this can be specified thanks to
+	   ::STARPU_PROLOGUE_CALLBACK_POP_ARG followed by the argument.
+	   */
 	void *prologue_callback_pop_arg;
 
 	/**
@@ -844,10 +881,7 @@ struct starpu_task
 	   by the application through <c>malloc()</c>, setting
 	   starpu_task::cl_arg_free to 1 makes StarPU automatically
 	   call <c>free(cl_arg)</c> when destroying the task. This
-	   saves the user from defining a callback just for that. This
-	   is mostly useful when targetting MIC, where the
-	   codelet does not execute in the same memory space as the
-	   main thread.
+	   saves the user from defining a callback just for that.
 
 	   With starpu_task_insert() and alike this is set to 1 when using
 	   ::STARPU_CL_ARGS.
@@ -855,6 +889,14 @@ struct starpu_task
 	unsigned cl_arg_free:1;
 
 	/**
+	   Optional field. In case starpu_task::cl_ret was allocated
+	   by the application through <c>malloc()</c>, setting
+	   starpu_task::cl_ret_free to 1 makes StarPU automatically
+	   call <c>free(cl_ret)</c> when destroying the task.
+	*/
+	unsigned cl_ret_free:1;
+
+	/**
 	   Optional field. In case starpu_task::callback_arg was
 	   allocated by the application through <c>malloc()</c>,
 	   setting starpu_task::callback_arg_free to 1 makes StarPU
@@ -868,6 +910,15 @@ struct starpu_task
 	unsigned callback_arg_free:1;
 
 	/**
+	   Optional field. In case starpu_task::epilogue_callback_arg was
+	   allocated by the application through <c>malloc()</c>,
+	   setting starpu_task::epilogue_callback_arg_free to 1 makes StarPU
+	   automatically call <c>free(epilogue_callback_arg)</c> when
+	   destroying the task.
+	*/
+	unsigned epilogue_callback_arg_free:1;
+
+	/**
 	   Optional field. In case starpu_task::prologue_callback_arg
 	   was allocated by the application through <c>malloc()</c>,
 	   setting starpu_task::prologue_callback_arg_free to 1 makes
@@ -1264,8 +1315,12 @@ struct starpu_task
 	.where = -1,					\
 	.cl_arg = NULL,					\
 	.cl_arg_size = 0,				\
+	.cl_ret = NULL,					\
+	.cl_ret_size = 0,				\
 	.callback_func = NULL,				\
 	.callback_arg = NULL,				\
+	.epilogue_callback_func = NULL,			\
+	.epilogue_callback_arg = NULL,			\
 	.priority = STARPU_DEFAULT_PRIO,		\
 	.use_tag = 0,					\
 	.sequential_consistency = 1,			\
@@ -1312,6 +1367,13 @@ struct starpu_task
    SettingManyDataHandlesForATask)
 */
 #define STARPU_TASK_GET_HANDLE(task, i) (((task)->dyn_handles) ? (task)->dyn_handles[i] : (task)->handles[i])
+
+/**
+   Return all the data handles of \p task. If \p task is defined
+   with a static or dynamic number of handles, will either return all
+   the element of the field starpu_task::handles or all the elements
+   of the field starpu_task::dyn_handles (see \ref SettingManyDataHandlesForATask)
+*/
 #define STARPU_TASK_GET_HANDLES(task) (((task)->dyn_handles) ? (task)->dyn_handles : (task)->handles)
 
 /**
@@ -1367,8 +1429,13 @@ struct starpu_task
 	do {								\
 		if ((task)->cl->nbuffers == STARPU_VARIABLE_NBUFFERS || (task)->cl->nbuffers > STARPU_NMAXBUFS) \
 			if ((task)->dyn_modes) (task)->dyn_modes[i] = mode; else (task)->modes[i] = mode; \
-		else							\
-			STARPU_CODELET_SET_MODE((task)->cl, mode, i);	\
+		else \
+		{							\
+			enum starpu_data_access_mode cl_mode = STARPU_CODELET_GET_MODE((task)->cl, i); \
+			STARPU_ASSERT_MSG(cl_mode == mode,	\
+				"Task <%s> can't set its  %d-th buffer mode to %d as the codelet it derives from uses %d", \
+				(task)->cl->name, i, mode, cl_mode);	\
+		} \
 	} while(0)
 
 /**
@@ -1479,6 +1546,9 @@ int starpu_task_submit_nodeps(struct starpu_task *task) STARPU_WARN_UNUSED_RESUL
 */
 int starpu_task_submit_to_ctx(struct starpu_task *task, unsigned sched_ctx_id);
 
+/**
+   Return 1 if \p task is terminated
+*/
 int starpu_task_finished(struct starpu_task *task) STARPU_WARN_UNUSED_RESULT;
 
 /**

+ 3 - 0
include/starpu_task_dep.h

@@ -217,6 +217,9 @@ void starpu_tag_notify_from_apps(starpu_tag_t id);
 */
 void starpu_tag_notify_restart_from_apps(starpu_tag_t id);
 
+/**
+   Return the task associated to the tag \p id
+ */
 struct starpu_task *starpu_tag_get_task(starpu_tag_t id);
 
 /** @} */

+ 69 - 7
include/starpu_task_util.h

@@ -88,14 +88,15 @@ extern "C"
 #define STARPU_EXECUTE_ON_DATA	 (7<<STARPU_MODE_SHIFT)
 
 /**
-   Used when calling starpu_task_in_sert(), must be followed by an array of
+   Used when calling starpu_task_insert(), must be followed by an array of
    handles and the number of elements in the array (as int). This is equivalent
-   to passing the handles as separate parameters with STARPU_R/W/RW.
+   to passing the handles as separate parameters with ::STARPU_R,
+   ::STARPU_W or ::STARPU_RW.
 */
 #define STARPU_DATA_ARRAY        (8<<STARPU_MODE_SHIFT)
 
 /**
-   Used when calling starpu_task_in_sert(), must be followed by an array of
+   Used when calling starpu_task_insert(), must be followed by an array of
    struct starpu_data_descr and the number of elements in the array (as int).
    This is equivalent to passing the handles with the corresponding modes.
 */
@@ -322,8 +323,7 @@ extern "C"
 
 /**
    Used when calling starpu_task_insert() and alike, must be followed
-   by a void* specifying the value to be set in the sched_data field of the
-   task.
+   by a void* specifying the value to be set in starpu_task::sched_data
  */
 #define STARPU_TASK_SCHED_DATA (41<<STARPU_MODE_SHIFT)
 
@@ -343,7 +343,23 @@ extern "C"
 */
 #define STARPU_TASK_LINE	 (43<<STARPU_MODE_SHIFT)
 
-#define STARPU_SHIFTED_MODE_MAX (44<<STARPU_MODE_SHIFT)
+/**
+   Used when calling starpu_task_insert(), must be followed by a
+   pointer to a epilogue callback function
+*/
+#define STARPU_EPILOGUE_CALLBACK   (44<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_task_insert(), must be followed by a
+   pointer to be given as an argument to the epilogue callback
+   function
+*/
+#define STARPU_EPILOGUE_CALLBACK_ARG   (45<<STARPU_MODE_SHIFT)
+
+/**
+   This has to be the last mode value plus 1
+*/
+#define STARPU_SHIFTED_MODE_MAX (46<<STARPU_MODE_SHIFT)
 
 /**
    Set the given \p task corresponding to \p cl with the following arguments.
@@ -359,7 +375,6 @@ int starpu_task_set(struct starpu_task *task, struct starpu_codelet *cl, ...);
 	starpu_task_set((task), (cl), STARPU_TASK_FILE, __FILE__, STARPU_TASK_LINE, __LINE__, ##__VA_ARGS__)
 #endif
 
-
 /**
    Create a task corresponding to \p cl with the following arguments.
    The argument list must be zero-terminated. The arguments
@@ -470,6 +485,12 @@ void starpu_task_insert_data_process_mode_array_arg(struct starpu_codelet *cl, s
 */
 void starpu_codelet_pack_args(void **arg_buffer, size_t *arg_buffer_size, ...);
 
+/**
+   Structure to be used for starpu_codelet_pack_arg_init() & co, and
+   starpu_codelet_unpack_arg_init() & co. The contents are public;
+   however, users should not access them directly, but only use the
+   structure as a parameter to the appropriate functions.
+*/
 struct starpu_codelet_pack_arg_data
 {
 	char *arg_buffer;
@@ -510,6 +531,47 @@ void starpu_codelet_pack_arg_fini(struct starpu_codelet_pack_arg_data *state, vo
 void starpu_codelet_unpack_args(void *cl_arg, ...);
 
 /**
+   Initialize \p state with \p cl_arg and \p cl_arg_size. This has to
+   be called before calling starpu_codelet_unpack_arg().
+*/
+void starpu_codelet_unpack_arg_init(struct starpu_codelet_pack_arg_data *state, void *cl_arg, size_t cl_arg_size);
+
+/**
+   Unpack the next argument of size \p size from \p state into \p ptr with a copy.
+   \p state has to be initialized before with starpu_codelet_unpack_arg_init().
+*/
+void starpu_codelet_unpack_arg(struct starpu_codelet_pack_arg_data *state, void *ptr, size_t size);
+
+/**
+   Unpack the next argument of unknown size from \p state into \p ptr
+   with a copy. \p ptr is allocated before copying in it the value of
+   the argument.
+   The size of the argument is returned in \p size.
+   \p state has to be initialized before with starpu_codelet_unpack_arg_init().
+*/
+void starpu_codelet_dup_arg(struct starpu_codelet_pack_arg_data *state, void **ptr, size_t *size);
+
+/**
+   Unpack the next argument of unknown size from \p state into \p ptr.
+   \p ptr will be a pointer to the memory of the argument.
+   The size of the argument is returned in \p size.
+   \p state has to be initialized before with starpu_codelet_unpack_arg_init().
+*/
+void starpu_codelet_pick_arg(struct starpu_codelet_pack_arg_data *state, void **ptr, size_t *size);
+
+/**
+   Finish unpacking data, after calling starpu_codelet_unpack_arg_init()
+   once and starpu_codelet_unpack_arg() or starpu_codelet_dup_arg() or
+   starpu_codelet_pick_arg() several times.
+*/
+void starpu_codelet_unpack_arg_fini(struct starpu_codelet_pack_arg_data *state);
+
+/**
+   Call this function during unpacking to skip saving the argument in ptr.
+*/
+void starpu_codelet_unpack_discard_arg(struct starpu_codelet_pack_arg_data *state);
+
+/**
    Similar to starpu_codelet_unpack_args(), but if any parameter is 0,
    copy the part of \p cl_arg that has not been read in \p buffer
    which can then be used in a later call to one of the unpack

+ 0 - 0
include/starpu_util.h


Some files were not shown because too many files changed in this diff