Selaa lähdekoodia

Merge branch 'fpga' of gitlab.inria.fr:starpu/starpu into fpga

Samuel Thibault 4 vuotta sitten
vanhempi
commit
18215eabb1
100 muutettua tiedostoa jossa 4588 lisäystä ja 3351 poistoa
  1. 16 0
      ChangeLog
  2. 1 2
      Makefile.am
  3. 38 292
      configure.ac
  4. 1 1
      contrib/ci.inria.fr/job-0-tarball.sh
  5. 1 2
      contrib/ci.inria.fr/job-1-check.sh
  6. 0 3
      doc/doxygen/Makefile.am
  7. 0 1
      doc/doxygen/chapters/000_introduction.doxy
  8. 1 1
      doc/doxygen/chapters/101_building.doxy
  9. 1 1
      doc/doxygen/chapters/110_basic_examples.doxy
  10. 21 14
      doc/doxygen/chapters/210_check_list_performance.doxy
  11. 4 4
      doc/doxygen/chapters/301_tasks.doxy
  12. 61 55
      doc/doxygen/chapters/310_data_management.doxy
  13. 35 31
      doc/doxygen/chapters/320_scheduling.doxy
  14. 2 2
      doc/doxygen/chapters/380_offline_performance_tools.doxy
  15. 0 97
      doc/doxygen/chapters/430_mic_support.doxy
  16. 0 1
      doc/doxygen/chapters/495_interoperability.doxy
  17. 302 303
      doc/doxygen/chapters/501_environment_variables.doxy
  18. 10 33
      doc/doxygen/chapters/510_configure_options.doxy
  19. 0 1
      doc/doxygen/chapters/520_files.doxy
  20. 0 24
      doc/doxygen/chapters/api/versioning.doxy
  21. 3446 1032
      doc/doxygen/chapters/images/tasks_size_overhead.eps
  22. BIN
      doc/doxygen/chapters/images/tasks_size_overhead.pdf
  23. BIN
      doc/doxygen/chapters/images/tasks_size_overhead.png
  24. 0 1
      doc/doxygen/doxygen-config.cfg.in
  25. 2 3
      doc/doxygen/doxygen.cfg
  26. 1 1
      doc/doxygen/doxygen_filter.sh.in
  27. 0 7
      doc/doxygen/refman.tex
  28. 1 3
      doc/doxygen_dev/Makefile.am
  29. 0 3
      doc/doxygen_dev/doxygen-config.cfg.in
  30. 0 1
      doc/doxygen_dev/doxygen.cfg
  31. 4 0
      doc/doxygen_dev/modules.tex
  32. 0 3
      doc/doxygen_dev/refman.tex
  33. 6 6
      examples/cg/cg.c
  34. 12 14
      examples/cg/cg_kernels.c
  35. 0 1
      examples/cpp/add_vectors.cpp
  36. 0 1
      examples/cpp/add_vectors_cpp11.cpp
  37. 0 12
      examples/cpp/add_vectors_interface.cpp
  38. 0 1
      examples/cpp/incrementer_cpp.cpp
  39. 0 1
      examples/dependency/sequential_consistency.c
  40. 0 1
      examples/dependency/task_end_dep.c
  41. 0 1
      examples/dependency/task_end_dep_add.c
  42. 0 6
      examples/heat/heat.sh
  43. 1 1
      examples/interface/complex_codelet.h
  44. 5 5
      examples/interface/complex_interface.c
  45. 0 1
      examples/loader-cross.sh.in
  46. 0 12
      examples/lu/lu.sh
  47. 1 1
      examples/lu/xlu_implicit_pivot.c
  48. 1 1
      examples/lu/xlu_kernels.c
  49. 1 1
      examples/lu/xlu_pivot.c
  50. 0 6
      examples/mult/sgemm.sh
  51. 5 1
      examples/native_fortran/nf_vector.f90
  52. 0 1
      examples/perf_steering/perf_knobs_03.c
  53. 0 1
      examples/pi/pi_redux.c
  54. 2 1
      examples/reductions/dot_product.c
  55. 0 1
      examples/sched_ctx/parallel_tasks_with_cluster_api.c
  56. 6 1
      examples/sched_ctx/two_cpu_contexts.c
  57. 1 1
      examples/scheduler/heteroprio_test.c
  58. 0 1
      examples/stencil/loader-cross.sh.in
  59. 0 29
      include/fstarpu_mod.f90
  60. 72 82
      include/starpu.h
  61. 14 14
      include/starpu_clusters.h
  62. 9 14
      include/starpu_config.h.in
  63. 12 0
      include/starpu_cuda.h
  64. 15 4
      include/starpu_data.h
  65. 0 42
      include/starpu_data_interfaces.h
  66. 0 61
      include/starpu_mic.h
  67. 0 7
      include/starpu_mod.f90
  68. 9 9
      include/starpu_perf_monitoring.h
  69. 109 84
      include/starpu_perf_steering.h
  70. 11 0
      include/starpu_sched_component.h
  71. 6 1
      include/starpu_sched_ctx.h
  72. 13 39
      include/starpu_task.h
  73. 3 0
      include/starpu_task_dep.h
  74. 43 18
      include/starpu_task_util.h
  75. 2 15
      include/starpu_worker.h
  76. 1 2
      julia/src/StarPU.jl
  77. 1 2
      julia/src/translate_headers.jl
  78. 0 268
      mic-configure
  79. 1 1
      mpi/examples/cg/cg.c
  80. 26 18
      mpi/src/starpu_mpi_task_insert.c
  81. 24 14
      mpi/src/starpu_mpi_task_insert_fortran.c
  82. 1 18
      src/Makefile.am
  83. 0 4
      src/common/utils.h
  84. 0 5
      src/core/combined_workers.c
  85. 1 1
      src/core/dependencies/data_arbiter_concurrency.c
  86. 2 2
      src/core/dependencies/data_concurrency.c
  87. 0 53
      src/core/detect_combined_workers.c
  88. 2 2
      src/core/perfmodel/energy_model.c
  89. 154 66
      src/core/perfmodel/perfmodel_bus.c
  90. 7 12
      src/core/perfmodel/perfmodel_history.c
  91. 6 6
      src/core/sched_ctx.c
  92. 4 4
      src/core/sched_ctx.h
  93. 2 20
      src/core/sched_policy.c
  94. 1 16
      src/core/task.c
  95. 0 5
      src/core/task.h
  96. 13 302
      src/core/topology.c
  97. 25 85
      src/core/workers.c
  98. 8 23
      src/core/workers.h
  99. 2 1
      src/datawizard/coherency.c
  100. 0 0
      src/datawizard/copy_driver.c

+ 16 - 0
ChangeLog

@@ -52,10 +52,21 @@ New features:
     MPI/NUMA/GPUDirect.
   * Add peek_data interface method.
   * Add STARPU_MPI_REDUX
+  * Add starpu_data_query_status2 function.
+
+Small features:
+  * New configure option --with-check-cflags to define flags for C,
+    CXX and Fortran compilers
 
 Small changes:
   * Add a synthetic energy efficiency testcase.
   * Make reduction methods want the commute flag.
+  * Delete old MIC driver code
+  * Rename
+    - starpu_conf::sched_policy_init to starpu_conf::sched_policy_callback
+    and
+   - starpu_sched_ctx_get_sched_policy_init() to starpu_sched_ctx_get_sched_policy_callback()
+   as the callback function may not only be used for init purposes
 
 StarPU 1.3.8
 ====================================================================
@@ -70,6 +81,11 @@ Small features:
     instead of hyperthreads.
   * New STARPU_TASK_PROGRESS environment variable to show task progression.
   * Add STARPU_SIMGRID environment variable guard against native builds.
+  * Add starpu_cuda_get_nvmldev function.
+  * Add starpu_sched_tree_deinitialize function.
+  * Add STARPU_SCHED_SORTED_ABOVE and STARPU_SCHED_SORTED_BELOW environment
+    variables.
+  * Add STARPU_SCHED_SIMPLE_PRE_DECISION.
 
 StarPU 1.3.7
 ====================================================================

+ 1 - 2
Makefile.am

@@ -92,7 +92,6 @@ versinclude_HEADERS = 				\
 	include/starpu_fpga.h			\
 	include/starpu_openmp.h			\
 	include/starpu_sink.h			\
-	include/starpu_mic.h			\
 	include/starpu_mpi_ms.h			\
 	include/starpu_expert.h			\
 	include/starpu_profiling.h		\
@@ -163,7 +162,7 @@ else
 txtdir = ${docdir}
 endif
 txt_DATA = AUTHORS COPYING.LGPL README README.dev STARPU-REVISION
-EXTRA_DIST = autogen.sh AUTHORS COPYING.LGPL README README.dev STARPU-VERSION STARPU-REVISION build-aux/svn2cl.xsl mic-configure
+EXTRA_DIST = autogen.sh AUTHORS COPYING.LGPL README README.dev STARPU-VERSION STARPU-REVISION build-aux/svn2cl.xsl
 
 DISTCLEANFILES = STARPU-REVISION
 

+ 38 - 292
configure.ac

@@ -585,6 +585,12 @@ AC_ARG_ENABLE(mpi-master-slave, [AS_HELP_STRING([--enable-mpi-master-slave],
                               [Enable StarPU to run with the master-slave mode])],
               use_mpi_master_slave=$enableval,
               use_mpi_master_slave=no)
+
+# in case it is explicitely required, but mpicc is not available, this is an error
+if test x$use_mpi_master_slave = xyes -a ! -x "$mpicc_path"; then
+   AC_MSG_ERROR([Compiler MPI '$mpicc_path' not valid])
+fi
+
 #We can only build MPI Master Slave if User wants it and MPI compiler are available
 if test x$use_mpi_master_slave = xyes -a x$mpicc_path != xno -a x${mpicxx_path} != xno ; then
     build_mpi_master_slave=yes
@@ -747,29 +753,6 @@ AC_SUBST(CC_OR_MPICC, $cc_or_mpicc)
 
 ###############################################################################
 #                                                                             #
-#                           MIC device compilation                            #
-#   (Must be done in beginning to change prefix in the whole configuration)   #
-#                                                                             #
-###############################################################################
-
-AC_ARG_ENABLE(mic, [AS_HELP_STRING([--enable-mic],
-	      [use MIC device(s)])], [enable_mic=$enableval], [enable_mic=no])
-AC_ARG_ENABLE(mic-rma, [AS_HELP_STRING([--disable-mic-rma],
-	      [disable MIC RMA transfer])], [enable_mic_rma=$enableval], [enable_mic_rma=yes])
-
-if test x$enable_mic = xyes ; then
-	AC_DEFINE(STARPU_USE_MIC, [1], [MIC workers support is enabled])
-fi
-if test x$enable_mic_rma = xyes ; then
-	AC_DEFINE([STARPU_MIC_USE_RMA], [1], [MIC RMA transfer is enable])
-fi
-
-AM_CONDITIONAL([STARPU_USE_MIC], [test "x$enable_mic" = "xyes"])
-
-###############################################################################
-
-###############################################################################
-#                                                                             #
 #                           NUMA memory nodes                                 #
 #                                                                             #
 ###############################################################################
@@ -1482,7 +1465,7 @@ if test x$enable_cuda = xyes; then
 		      STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lcusparse"])
 	AC_CHECK_DECLS([cusparseSetStream], [], [], [[#include <cusparse.h>]])
 
-	LDFLAGS="${LDFLAGS} -lnvidia-ml"
+	LIBS="${LIBS} -lnvidia-ml"
 	AC_COMPILE_IFELSE([AC_LANG_PROGRAM(
 		[[#include <nvml.h>]],
 		[[nvmlInit();]]
@@ -1507,7 +1490,7 @@ if test x$enable_cuda = xyes; then
 		AC_CHECK_DECLS([nvmlDeviceGetTotalEnergyConsumption], [
 			AC_CHECK_FUNCS([nvmlDeviceGetTotalEnergyConsumption])
 			], [], [[#include <nvml.h>]])
-		AC_DEFINE([HAVE_LIBNVIDIA_ML], [1], [Define to 1 if you have the nvidia-ml library])
+		AC_DEFINE([STARPU_HAVE_LIBNVIDIA_ML], [1], [Define to 1 if you have the nvidia-ml library])
 		STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lnvidia-ml"
 	fi
 	AC_MSG_CHECKING(whether nvidia-ml should be used)
@@ -1810,19 +1793,6 @@ if test x$disable_asynchronous_opencl_copy = xyes ; then
    AC_DEFINE([STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY], [1], [Define to 1 to disable asynchronous copy between CPU and OpenCL devices])
 fi
 
-AC_MSG_CHECKING(whether asynchronous MIC copy should be disabled)
-AC_ARG_ENABLE(asynchronous-mic-copy, [AS_HELP_STRING([--disable-asynchronous-mic-copy],
-			[disable asynchronous copy between CPU and MIC devices])],
-			enable_asynchronous_mic_copy=$enableval, enable_asynchronous_mic_copy=yes)
-disable_asynchronous_mic_copy=no
-if test x$enable_asynchronous_mic_copy = xno ; then
-   disable_asynchronous_mic_copy=yes
-fi
-AC_MSG_RESULT($disable_asynchronous_mic_copy)
-if test x$disable_asynchronous_mic_copy = xyes ; then
-   AC_DEFINE([STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY], [1], [Define to 1 to disable asynchronous copy between CPU and MIC devices])
-fi
-
 AC_MSG_CHECKING(whether asynchronous MPI Master Slave copy should be disabled)
 AC_ARG_ENABLE(asynchronous-mpi-master-slave-copy, [AS_HELP_STRING([--disable-asynchronous-mpi-master-slave-copy],
 			[disable asynchronous copy between MPI Master and MPI Slave devices])],
@@ -1851,224 +1821,6 @@ fi
 
 ###############################################################################
 #                                                                             #
-#                                 MIC settings                                #
-#                                                                             #
-###############################################################################
-
-# ignore these otions, only meant for mic-configure, but also passed here.
-AC_ARG_ENABLE(native-mic)
-AC_ARG_WITH(compiler)
-AC_ARG_WITH(mic-param)
-AC_ARG_WITH(host-param)
-
-AC_MSG_CHECKING(maximum number of MIC devices)
-AC_ARG_ENABLE(maxmicdev, [AS_HELP_STRING([--enable-maxmicdev=<number>],
-			[maximum number of MIC devices])],
-			nmaxmicdev=$enableval,
-            [
-             if test x$enable_mic = xyes; then
-                 nmaxmicdev=4
-             else
-                 nmaxmicdev=0
-             fi
-            ])
-AC_MSG_RESULT($nmaxmicdev)
-
-AC_DEFINE_UNQUOTED(STARPU_MAXMICDEVS, [$nmaxmicdev],
-	[maximum number of MIC devices])
-
-AC_MSG_CHECKING(maximum number of MIC threads)
-AC_ARG_ENABLE(maxmicthreads, [AS_HELP_STRING([--enable-maxmicthreads=<number>],
-			[maximum number of MIC threads])],
-			nmaxmicthreads=$enableval, nmaxmicthreads=120)
-AC_MSG_RESULT($nmaxmicthread)
-
-AC_ARG_WITH(coi-dir,
-	[AS_HELP_STRING([--with-coi-dir=<path>],
-	[specify the MIC's COI installation directory])],
-	[coi_dir="$withval"],
-	[coi_dir=no])
-
-AC_ARG_WITH(coi-include-dir,
-	[AS_HELP_STRING([--with-coi-include-dir=<path>],
-	[specify where the MIC's COI headers are installed])],
-	[coi_include_dir="$withval"],
-	[coi_include_dir=no])
-
-AC_ARG_WITH(coi-lib-dir,
-	[AS_HELP_STRING([--with-coi-lib-dir=<path>],
-	[specify where the MIC's COI libraries are installed])],
-	[coi_lib_dir="$withval"],
-	[coi_lib_dir=no])
-
-AC_DEFUN([STARPU_CHECK_COI_RUNTIME],
-[
-    __coi_dir=$1
-    __coi_include_dir=$2
-    __coi_lib_dir=$3
-
-    if test "$__coi_dir" != "no" -a "$__coi_dir" != "" ; then
-	AC_MSG_CHECKING(whether MIC's COI runtime is available in $__coi_dir)
-    else
-	AC_MSG_CHECKING(whether MIC's COI runtime is available)
-    fi
-    AC_MSG_RESULT()
-
-    if test "$__coi_include_dir" = "no" -a "$__coi_dir" != "no" ; then
-        __coi_include_dir="${__coi_dir}/include"
-    fi
-    if test "$__coi_lib_dir" = "no" -a "$__coi_dir" != "no" ; then
-        __coi_lib_dir="${__coi_dir}/lib"
-    fi
-
-    SAVED_CPPFLAGS="$CPPFLAGS"
-    SAVED_LDFLAGS="$LDFLAGS"
-
-    if test "$__coi_include_dir" != "no" ; then
-        CPPFLAGS="${CPPFLAGS} -I$__coi_include_dir"
-    fi
-    if test "$__coi_lib_dir" != "no" ; then
-	LDFLAGS="${LDFLAGS} -L$__coi_lib_dir ${STARPU_SCIF_LDFLAGS}"
-    fi
-
-    AC_CHECK_HEADER([source/COIEngine_source.h],[have_valid_coi=yes],[have_valid_coi=no])
-
-    if test "$have_valid_coi" = "yes" ; then
-	AC_HAVE_LIBRARY([$4],[have_valid_coi=yes],[have_valid_coi=no])
-
-        if test "$have_valid_coi" = "no" ; then
-            if test "$3" = "no" -a "$__coi_dir" != "no" ; then
-		# ${__coi_dir}/lib didn't work, let's try with lib64
-                __coi_lib_dir="$__coi_dir/lib64"
-		LDFLAGS="${SAVED_LDFLAGS} -L$__coi_lib_dir"
-	        AC_HAVE_LIBRARY([$4],[have_valid_coi=yes],[have_valid_coi=no])
-            fi
-        fi
-    fi
-
-    if test "$have_valid_coi" = "yes" -a "$__coi_include_dir" != "no"; then
-        STARPU_COI_CPPFLAGS="-I$__coi_include_dir"
-    fi
-
-    if test "$have_valid_coi" = "yes" ; then
-        if test "$__coi_lib_dir" != "no"; then
-	    STARPU_COI_LDFLAGS="-L$__coi_lib_dir"
-        fi
-	STARPU_COI_LDFLAGS="${STARPU_COI_LDFLAGS} -l$4"
-    fi
-
-    CPPFLAGS="${SAVED_CPPFLAGS}"
-    LDFLAGS="${SAVED_LDFLAGS}"
-])
-
-AC_ARG_WITH(scif-dir,
-	[AS_HELP_STRING([--with-scif-dir=<path>],
-	[specify the MIC's SCIF installation directory])],
-	[scif_dir="$withval"],
-	[scif_dir=no])
-
-AC_ARG_WITH(scif-include-dir,
-	[AS_HELP_STRING([--with-scif-include-dir=<path>],
-	[specify where the MIC's SCIF headers are installed])],
-	[scif_include_dir="$withval"],
-	[scif_include_dir=no])
-
-AC_ARG_WITH(scif-lib-dir,
-	[AS_HELP_STRING([--with-scif-lib-dir=<path>],
-	[specify where the MIC's SCIF libraries are installed])],
-	[scif_lib_dir="$withval"],
-	[scif_lib_dir=no])
-
-AC_DEFUN([STARPU_CHECK_SCIF_RUNTIME],
-[
-    __scif_dir=$1
-    __scif_include_dir=$2
-    __scif_lib_dir=$3
-
-    if test "$__scif_dir" != "no" -a "$__scif_dir" != "" ; then
-	AC_MSG_CHECKING(whether MIC's SCIF runtime is available in $__scif_dir)
-    else
-	AC_MSG_CHECKING(whether MIC's SCIF runtime is available)
-    fi
-    AC_MSG_RESULT()
-
-    if test "$__scif_include_dir" = "no" -a "$__scif_dir" != "no" ; then
-        __scif_include_dir="${__scif_dir}/include"
-    fi
-    if test "$__scif_lib_dir" = "no" -a "$__scif_dir" != "no" ; then
-        __scif_lib_dir="${__scif_dir}/lib"
-    fi
-
-    SAVED_CPPFLAGS="$CPPFLAGS"
-    SAVED_LDFLAGS="$LDFLAGS"
-
-    if test "$__scif_include_dir" != "no" ; then
-        CPPFLAGS="${CPPFLAGS} -I$__scif_include_dir"
-    fi
-    if test "$__scif_lib_dir" != "no" ; then
-	LDFLAGS="${LDFLAGS} -L$__scif_lib_dir"
-    fi
-
-#    AC_CHECK_HEADER([source/SCIFEngine_source.h],[have_valid_scif=yes],[have_valid_scif=no])
-
-#    if test "$have_valid_scif" = "yes" ; then
-	AC_HAVE_LIBRARY([scif],[have_valid_scif=yes],[have_valid_scif=no])
-
-        if test "$have_valid_scif" = "no" ; then
-            if test "$3" = "no" -a "$__scif_dir" != "no" ; then
-		# ${__scif_dir}/lib didn't work, let's try with lib64
-                __scif_lib_dir="$__scif_dir/lib64"
-		LDFLAGS="${SAVED_LDFLAGS} -L$__scif_lib_dir"
-	        AC_HAVE_LIBRARY([scif],[have_valid_scif=yes],[have_valid_scif=no])
-            fi
-        fi
-#    fi
-
-    if test "$have_valid_scif" = "yes" -a "$__scif_include_dir" != "no"; then
-        STARPU_SCIF_CPPFLAGS="-I$__scif_include_dir"
-    fi
-
-    if test "$have_valid_scif" = "yes" ; then
-        if test "$__scif_lib_dir" != "no"; then
-	    STARPU_SCIF_LDFLAGS="-L$__scif_lib_dir"
-        fi
-	STARPU_SCIF_LDFLAGS="${STARPU_SCIF_LDFLAGS} -lscif"
-    fi
-
-    CPPFLAGS="${SAVED_CPPFLAGS}"
-    LDFLAGS="${SAVED_LDFLAGS}"
-])
-
-if test x$enable_mic = xyes ; then
-
-    STARPU_CHECK_SCIF_RUNTIME($scif_dir, $scif_include_dir, $scif_lib_dir)
-    if test "$have_valid_scif" = "no" ; then
-	AC_MSG_ERROR([cannot find MIC's SCIF runtime])
-    fi
-
-    case $host_vendor in
-	*1om)
-	    # We are cross-compiling.
-	    # Let's have a look for the device runtime which lib has a different name
-	    STARPU_CHECK_COI_RUNTIME($coi_dir, $coi_include_dir, $coi_lib_dir, coi_device)
-	    ;;
-	*)
-	    STARPU_CHECK_COI_RUNTIME($coi_dir, $coi_include_dir, $coi_lib_dir, coi_host)
-	    ;;
-    esac
-
-    if test "$have_valid_coi" = "no" ; then
-	AC_MSG_ERROR([cannot find MIC's COI runtime])
-    fi
-
-    AC_SUBST(STARPU_COI_CPPFLAGS)
-    AC_SUBST(STARPU_COI_LDFLAGS)
-    AC_SUBST(STARPU_SCIF_CPPFLAGS)
-    AC_SUBST(STARPU_SCIF_LDFLAGS)
-fi
-
-###############################################################################
-#                                                                             #
 #                   Debug and Performance analysis tools                      #
 #                                                                             #
 ###############################################################################
@@ -2150,11 +1902,11 @@ if test x$enable_fast = xyes; then
 	AC_DEFINE(STARPU_NO_ASSERT, [1], [disable assertions])
 else
         # fortify gets really enabled only with optimizations, avoid enabling it
-        # when they optimizations are not enabled, because with some glibc it
+        # when optimizations are not enabled, because with some glibc it
         # spews a lot of warnings.
 	if test x$enable_debug != xyes; then
 		if test x$GCC = xyes; then
-			CPPFLAGS="$CPPFLAGS -D_FORTIFY_SOURCE=1"
+			CPPFLAGS="-D_FORTIFY_SOURCE=1 $CPPFLAGS"
 		fi
 	fi
 fi
@@ -2381,7 +2133,6 @@ fi
 AM_CONDITIONAL([STARPU_USE_AYUDAME1], [test "x$enable_ayudame1" = "xyes"])
 AM_CONDITIONAL([STARPU_USE_AYUDAME2], [test "x$enable_ayudame2" = "xyes"])
 
-
 STARPU_FXT_EVENT_DEFINES="`grep -E '#define\s+_STARPU_(MPI_)?FUT_' ${srcdir}/src/common/fxt.h ${srcdir}/mpi/src/starpu_mpi_fxt.h | grep 0x | grep -v 0x1 | cut -d : -f 2`"
 AC_SUBST([STARPU_FXT_EVENT_DEFINES])
 
@@ -2415,10 +2166,9 @@ AC_ARG_ENABLE(maxnodes, [AS_HELP_STRING([--enable-maxnodes=<nnodes>],
 if test x$maxnodes = x0 ; then
 	if test x$enable_simgrid = xyes ; then
 		# We need the room for the virtual CUDA/OpenCL devices
-		nodes=`expr 4 + $nmaxcudadev + $nmaxopencldev + $nmaxmicdev + 1 + $nmaxmpidev`
+		nodes=`expr 4 + $nmaxcudadev + $nmaxopencldev + 1 + $nmaxmpidev`
 	else
 		# We have one memory node shared by all CPU workers, one node per GPU
-		# and per MIC device
 		# we add nodes to use 2 memory disks
 		nodes=`expr $nmaxnumanodes + 2`
 		if test x$enable_cuda = xyes ; then
@@ -2431,18 +2181,11 @@ if test x$maxnodes = x0 ; then
 			# odd number.
 			nodes=`expr $nodes + $nmaxopencldev`
 		fi
-		if test x$enable_mic = xyes ; then
-			nodes=`expr $nodes + $nmaxmicdev`
-		fi
 		if test x$enable_fpga = xyes ; then
 			# we could have used nmaxcudadev + 1, but this would certainly give an
 			# odd number.
 			nodes=`expr $nodes + $nmaxfpgadev`
 		fi
-		if test x$enable_rcce = xyes ; then
-			# Only 1 memory node for the shared memory.
-			nodes=`expr $nodes + 1`
-		fi
 
 		#nmaxmpidev = 0 if mpi master-slave is disabled
 		nodes=`expr $nodes + $nmaxmpidev`
@@ -2516,16 +2259,21 @@ if test x$enable_simgrid != xyes; then
 	if test x$enable_opencl != xyes; then
 		nmaxopencldev=0
 	fi
-	if test x$enable_mic != xyes; then
-		nmaxmicthreads=0
-	fi
     #By default, if we cannot build mpi master-slave nmaxmpidev is set to zero.
     #But with the multiplication with maxcpus, we need to put it to one.
     if test x$build_mpi_master_slave != xyes; then
         nmaxmpidev=1
     fi
 fi
-nmaxworkers=`expr 16 \* \( \( \( $nmaxmpidev \* $maxcpus \) + $nmaxcudadev +  $nmaxfpgadev + $nmaxopencldev + $nmaxmicthreads + 15 \) / 16 \) `
+if test $maxcpus == 0
+then
+	nmaxworkers=`expr 16 \* \( \( \( $nmaxmpidev \* 64 \) + $nmaxcudadev + $nmaxopencldev + $nmaxfpgadev + 15 \) / 16 \) `
+elif test $nmaxmpidev == 0
+then
+	nmaxworkers=`expr 16 \* \( \( $maxcpus + $nmaxcudadev + $nmaxopencldev + $nmaxfpgadev + 15 \) / 16 \) `
+else
+	nmaxworkers=`expr 16 \* \( \( \( $nmaxmpidev \* $maxcpus \) + $nmaxcudadev + $nmaxopencldev + $nmaxfpgadev + 15 \) / 16 \) `
+fi
 AC_MSG_CHECKING(Maximum number of workers)
 AC_MSG_RESULT($nmaxworkers)
 AC_DEFINE_UNQUOTED(STARPU_NMAXWORKERS, [$nmaxworkers], [Maximum number of workers])
@@ -2539,16 +2287,13 @@ fi
 if test $nmaxdevs -lt $nmaxfpgadev; then
 	nmaxdevs=$nmaxfpgadev
 fi
-if test $nmaxdevs -lt $nmaxmicdev; then
-	nmaxdevs=$nmaxmicdev
-fi
 if test $nmaxdevs -lt $nmaxmpidev; then
 	nmaxdevs=$nmaxmpidev
 fi
 AC_DEFINE_UNQUOTED(STARPU_NMAXDEVS, [$nmaxdevs], [Maximum number of device per device arch])
 
 # Computes the maximun number of combined worker
-nmaxcombinedworkers=`expr $maxcpus + $nmaxmicthreads`
+nmaxcombinedworkers=$maxcpus
 AC_MSG_CHECKING(Maximum number of workers combinations)
 AC_MSG_RESULT($nmaxcombinedworkers)
 AC_DEFINE_UNQUOTED(STARPU_NMAX_COMBINEDWORKERS,
@@ -2592,12 +2337,12 @@ AC_DEFINE_UNQUOTED(STARPU_HISTORYMAXERROR, [$calibration_heuristic], [calibratio
 #                                                                             #
 ###############################################################################
 
-AM_CONDITIONAL([STARPU_USE_MP], [test "x$enable_mic" = "xyes" -o "x$build_mpi_master_slave" = "xyes" -o "x$enable_rcce" = "xyes"])
+AM_CONDITIONAL([STARPU_USE_MP], [test "x$build_mpi_master_slave" = "xyes"])
 
 AC_ARG_ENABLE([export-dynamic], [AS_HELP_STRING([--disable-export-dynamic],
 			  [Prevent the linker from adding all symbols to the dynamic symbol table])], [], [])
 
-if test x$enable_mic = xyes -o x$build_mpi_master_slave = xyes -o x$enable_rcce = xyes ; then
+if test x$build_mpi_master_slave = xyes ; then
 	AC_DEFINE(STARPU_USE_MP, [1], [Message-passing SINKs support
 		  is enabled])
 
@@ -2660,6 +2405,16 @@ AC_SUBST(GLOBAL_AM_FCFLAGS)
 pkglibdir="\${libdir}/$PACKAGE"
 AC_SUBST([pkglibdir])
 
+AC_ARG_WITH(check-flags, [AS_HELP_STRING([--with-check-flags],
+			 [Specify flags for C and Fortran compilers])],
+			 check_flags=$withval, check_flags="")
+if test "x$check_flags" != "x" ; then
+	for xflag in $check_flags
+	do
+		IS_SUPPORTED_FLAG($xflag)
+	done
+fi
+
 ###############################################################################
 #                                                                             #
 #                               Fortran                                       #
@@ -2819,7 +2574,7 @@ if test x$starpu_windows = xyes ; then
    enable_openmp=no
    openmp_msg="disabled on windows"
 fi
-if test "x$use_mpi_master_slave" = "xyes" ; then
+if test "x$build_mpi_master_slave" = "xyes" ; then
    enable_openmp=no
    openmp_msg="incompatibility with MPI master slave support"
 fi
@@ -2865,7 +2620,7 @@ if test "x$enable_socl" = "xyes" -a "$have_valid_opencl" = "no" ; then
 fi
 
 # MPI Master Slave and SOCL are not compatible
-if test "x$use_mpi_master_slave" = "xyes" ; then
+if test "x$build_mpi_master_slave" = "xyes" ; then
    if test "x$enable_socl" = "xyes" ; then
       AC_MSG_ERROR([MPI Master-Slave and SOCL can not be used at the same time !])
    fi
@@ -3693,7 +3448,7 @@ STARPU_H_CPPFLAGS="$HWLOC_CFLAGS $STARPU_CUDA_CPPFLAGS $STARPU_OPENCL_CPPFLAGS $
 AC_SUBST([STARPU_H_CPPFLAGS])
 
 # these are the flags needed for linking libstarpu (and thus also for static linking)
-LIBSTARPU_LDFLAGS="$STARPU_OPENCL_LDFLAGS $STARPU_CUDA_LDFLAGS $HWLOC_LIBS $FXT_LDFLAGS $FXT_LIBS $PAPI_LIBS $STARPU_COI_LDFLAGS $STARPU_SCIF_LDFLAGS $STARPU_RCCE_LDFLAGS $STARPU_LEVELDB_LDFLAGS $STARPU_GLPK_LDFLAGS $STARPU_LEVELDB_LDFLAGS $SIMGRID_LDFLAGS $STARPU_BLAS_LDFLAGS $STARPU_OMP_LDFLAGS $DGELS_LIBS $STARPU_FPGA_LDFLAGS"
+LIBSTARPU_LDFLAGS="$STARPU_OPENCL_LDFLAGS $STARPU_CUDA_LDFLAGS $HWLOC_LIBS $FXT_LDFLAGS $FXT_LIBS $PAPI_LIBS $STARPU_LEVELDB_LDFLAGS $STARPU_GLPK_LDFLAGS $STARPU_LEVELDB_LDFLAGS $SIMGRID_LDFLAGS $STARPU_BLAS_LDFLAGS $STARPU_OMP_LDFLAGS $DGELS_LIBS $STARPU_FPGA_LDFLAGS"
 AC_SUBST([LIBSTARPU_LDFLAGS])
 
 # these are the flags needed for linking against libstarpu (because starpu.h makes its includer use pthread_*, simgrid, etc.)
@@ -3712,10 +3467,7 @@ AC_SUBST([LIBSTARPU_LINK])
 # File configuration
 AC_CONFIG_COMMANDS([executable-scripts], [
   chmod +x tests/regression/regression.sh
-  chmod +x tests/loader-cross.sh
   chmod +x tests/model-checking/starpu-mc.sh
-  chmod +x examples/loader-cross.sh
-  chmod +x examples/stencil/loader-cross.sh
   chmod +x tools/starpu_env
   chmod +x tools/starpu_codelet_profile
   chmod +x tools/starpu_codelet_histo_profile
@@ -3858,11 +3610,8 @@ AC_OUTPUT([
 	examples/Makefile
 	examples/stencil/Makefile
 	tests/Makefile
-	tests/loader-cross.sh
 	tests/model-checking/Makefile
 	tests/model-checking/starpu-mc.sh
-	examples/loader-cross.sh
-	examples/stencil/loader-cross.sh
 	mpi/Makefile
 	mpi/src/Makefile
 	mpi/tests/Makefile
@@ -3894,12 +3643,10 @@ AC_MSG_NOTICE([
 	CUDA   enabled: $enable_cuda
 	OpenCL enabled: $enable_opencl
 	FPGA   enabled: $enable_fpga
-	MIC    enabled: $enable_mic
 
 	Compile-time limits
 	(change these with --enable-maxcpus, --enable-maxcudadev,
-	--enable-maxopencldev, --enable-maxfpgadev, --enable-maxmicdev, --enable-maxnodes,
-        --enable-maxbuffers)
+	--enable-maxopencldev, --enable-maxfpgadev, --enable-maxnodes, --enable-maxbuffers)
         (Note these numbers do not represent the number of detected
 	devices, but the maximum number of devices StarPU can manage)
 
@@ -3907,7 +3654,6 @@ AC_MSG_NOTICE([
 	Maximum number of CUDA devices:             $nmaxcudadev
 	Maximum number of OpenCL devices:           $nmaxopencldev
 	Maximum number of FPGA devices:             $nmaxfpgadev
-	Maximum number of MIC threads:              $nmaxmicthreads
 	Maximum number of MPI master-slave devices: $nmaxmpidev
 	Maximum number of memory nodes:             $maxnodes
 	Maximum number of task buffers:             $nmaxbuffers
@@ -3928,7 +3674,7 @@ AC_MSG_NOTICE([
 	       StarPU MPI enabled:                            $build_mpi_lib
 	       StarPU MPI(nmad) enabled:                      $build_nmad_lib
 	       MPI test suite:                                $running_mpi_check
-	       Master-Slave MPI enabled:                      $use_mpi_master_slave
+	       Master-Slave MPI enabled:                      $build_mpi_master_slave
 	       FFT Support:                                   $fft_support
 	       Resource Management enabled:                   $starpurm_support
 	       Python Interface enabled:                      $starpupy_support

+ 1 - 1
contrib/ci.inria.fr/job-0-tarball.sh

@@ -22,7 +22,7 @@ export LD_LIBRARY_PATH=/home/ci/usr/local/lib:$LD_LIBRARY_PATH
 if test -d build ; then chmod -R 777 build && rm -rf build ; fi
 mkdir build && cd build
 ../configure --enable-build-doc-pdf
-make V=1
+make -j4
 make dist
 cp *gz ..
 cp doc/doxygen/starpu.pdf ..

+ 1 - 2
contrib/ci.inria.fr/job-1-check.sh

@@ -91,8 +91,7 @@ fi
 export STARPU_TIMEOUT_ENV=1800
 export MPIEXEC_TIMEOUT=1800
 
-make
-#make check
+make -j4
 (make -k check || true) 2>&1 | tee  ../check_$$
 make showsuite
 

+ 0 - 3
doc/doxygen/Makefile.am

@@ -81,7 +81,6 @@ chapters =	\
 	chapters/410_mpi_support.doxy		\
 	chapters/415_fault_tolerance.doxy	\
 	chapters/420_fft_support.doxy		\
-	chapters/430_mic_support.doxy		\
 	chapters/440_fpga_support.doxy		\
 	chapters/450_native_fortran_support.doxy		\
 	chapters/460_socl_opencl_extensions.doxy		\
@@ -107,7 +106,6 @@ chapters =	\
 	chapters/code/disk_compute.c \
 	chapters/code/nf_initexit.f90 \
 	chapters/api/fft_support.doxy \
-	chapters/api/versioning.doxy \
 	chapters/api/threads.doxy
 
 images = 	\
@@ -215,7 +213,6 @@ dox_inputs = $(DOX_CONFIG) 				\
 	$(top_srcdir)/include/starpu_fxt.h		\
 	$(top_srcdir)/include/starpu_hash.h		\
 	$(top_srcdir)/include/starpu_helper.h		\
-	$(top_srcdir)/include/starpu_mic.h		\
 	$(top_srcdir)/include/starpu_fpga.h		\
 	$(top_srcdir)/include/starpu_mpi_ms.h		\
 	$(top_srcdir)/include/starpu_mod.f90		\

+ 0 - 1
doc/doxygen/chapters/000_introduction.doxy

@@ -295,7 +295,6 @@ The documentation chapters include
 <li> \ref MPISupport
 <li> \ref FaultTolerance
 <li> \ref FFTSupport
-<li> \ref MICSupport
 <li> \ref NativeFortranSupport
 <li> \ref SOCLOpenclExtensions
 <li> \ref SimGridSupport

+ 1 - 1
doc/doxygen/chapters/101_building.doxy

@@ -46,7 +46,7 @@ recommended.  It allows for topology aware scheduling, which improves
 performance. <c>hwloc</c> is available in major free operating system
 distributions, and for most operating systems. Make sure to not only install
 a <c>hwloc</c> or <c>libhwloc</c> package, but also <c>hwloc-devel</c> or
-<c>libhwloc-dev</c> so as to have hwloc headers etc.
+<c>libhwloc-dev</c> so as to have \c hwloc headers etc.
 
 If <c>libhwloc</c> is installed in a standard
 location, no option is required, it will be detected automatically,

+ 1 - 1
doc/doxygen/chapters/110_basic_examples.doxy

@@ -214,7 +214,7 @@ Once a task has been executed, an optional callback function
 starpu_task::callback_func is called when defined.
 While the computational kernel could be offloaded on various architectures, the
 callback function is always executed on a CPU. The pointer
-starpu_task::callback_arg is passed as an argument of the callback
+starpu_task::callback_arg is passed as an argument to the callback
 function. The prototype of a callback function must be:
 
 \code{.c}

+ 21 - 14
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -224,7 +224,7 @@ starpu_malloc_on_node() which are used by the data interfaces
 (matrix, vector, etc.).  This does not include allocations performed by
 the application through e.g. malloc(). It does not include allocations
 performed through starpu_malloc() either, only allocations
-performed explicitly with the \ref STARPU_MALLOC_COUNT flag, i.e. by calling
+performed explicitly with the flag \ref STARPU_MALLOC_COUNT, i.e. by calling
 
 \code{.c}
 starpu_malloc_flags(STARPU_MALLOC_COUNT)
@@ -258,18 +258,18 @@ structures of StarPU by describing the shape of your machine and/or your
 application when calling \c configure.
 
 To reduce the memory footprint of the data internal structures of StarPU, one
-can set the
+can set the \c configure parameters 
 \ref enable-maxcpus "--enable-maxcpus",
 \ref enable-maxnumanodes "--enable-maxnumanodes",
 \ref enable-maxcudadev "--enable-maxcudadev",
 \ref enable-maxopencldev "--enable-maxopencldev" and
 \ref enable-maxnodes "--enable-maxnodes"
-\c configure parameters to give StarPU
+to give StarPU
 the architecture of the machine it will run on, thus tuning the size of the
 structures to the machine.
 
 To reduce the memory footprint of the task internal structures of StarPU, one
-can set the \ref enable-maxbuffers "--enable-maxbuffers" \c configure parameter to
+can set the \c configure parameter \ref enable-maxbuffers "--enable-maxbuffers" to
 give StarPU the maximum number of buffers that a task can use during an
 execution. For example, in the Cholesky factorization (dense linear algebra
 application), the GEMM task uses up to 3 buffers, so it is possible to set the
@@ -278,15 +278,15 @@ maximum number of task buffers to 3 to run a Cholesky factorization on StarPU.
 The size of the various structures of StarPU can be printed by
 <c>tests/microbenchs/display_structures_size</c>.
 
-It is also often useless to submit *all* the tasks at the same time.
+It is also often useless to submit \b all the tasks at the same time.
 Task submission can be blocked when a reasonable given number of
 tasks have been submitted, by setting the environment variables \ref
 STARPU_LIMIT_MIN_SUBMITTED_TASKS and \ref STARPU_LIMIT_MAX_SUBMITTED_TASKS.
 
-<c>
+\code{.sh}
 export STARPU_LIMIT_MAX_SUBMITTED_TASKS=10000
 export STARPU_LIMIT_MIN_SUBMITTED_TASKS=9000
-</c>
+\endcode
 
 will make StarPU block submission when 10000 tasks are submitted, and unblock
 submission when only 9000 tasks are still submitted, i.e. 1000 tasks have
@@ -341,11 +341,17 @@ overrides the hostname of the system.
 
 By default, StarPU stores separate performance models for each GPU. To avoid
 having to calibrate performance models for each GPU of a homogeneous set of GPU
-devices for instance, the model can be shared by setting
-<c>export STARPU_PERF_MODEL_HOMOGENEOUS_CUDA=1</c> (\ref STARPU_PERF_MODEL_HOMOGENEOUS_CUDA),
-<c>export STARPU_PERF_MODEL_HOMOGENEOUS_OPENCL=1</c> (\ref STARPU_PERF_MODEL_HOMOGENEOUS_OPENCL),
-<c>export STARPU_PERF_MODEL_HOMOGENEOUS_MIC=1</c> (\ref STARPU_PERF_MODEL_HOMOGENEOUS_MIC),
-<c>export STARPU_PERF_MODEL_HOMOGENEOUS_MPI_MS=1</c> (\ref STARPU_PERF_MODEL_HOMOGENEOUS_MPI_MS) depending on your GPU device type.
+devices for instance, the model can be shared by using the environment
+variables \ref STARPU_PERF_MODEL_HOMOGENEOUS_CUDA, \ref
+STARPU_PERF_MODEL_HOMOGENEOUS_OPENCL and \ref
+STARPU_PERF_MODEL_HOMOGENEOUS_MPI_MS depending on your GPU device
+type.
+
+\code{.shell}
+export STARPU_PERF_MODEL_HOMOGENEOUS_CUDA=1
+export STARPU_PERF_MODEL_HOMOGENEOUS_OPENCL=1
+export STARPU_PERF_MODEL_HOMOGENEOUS_MPI_MS=1
+\endcode
 
 To force continuing calibration,
 use <c>export STARPU_CALIBRATE=1</c> (\ref STARPU_CALIBRATE). This may be necessary if your application
@@ -453,12 +459,13 @@ and in Joules for the energy consumption models.
 A quick view of how many tasks each worker has executed can be obtained by setting
 <c>export STARPU_WORKER_STATS=1</c> (\ref STARPU_WORKER_STATS). This is a convenient way to check that
 execution did happen on accelerators, without penalizing performance with
-the profiling overhead. \ref STARPU_WORKER_STATS_FILE can be defined
+the profiling overhead. The environment variable \ref STARPU_WORKER_STATS_FILE can be defined
 to specify a filename in which to display statistics, by default
 statistics are printed on the standard error stream.
 
 A quick view of how much data transfers have been issued can be obtained by setting
-<c>export STARPU_BUS_STATS=1</c> (\ref STARPU_BUS_STATS). \ref
+<c>export STARPU_BUS_STATS=1</c> (\ref STARPU_BUS_STATS). The
+environment variable \ref
 STARPU_BUS_STATS_FILE can be defined to specify a filename in which to
 display statistics, by default statistics are printed on the standard error stream.
 

+ 4 - 4
doc/doxygen/chapters/301_tasks.doxy

@@ -612,13 +612,13 @@ tasks or tags, or to be submitted in callbacks, etc.
 The obvious way is of course to make kernel functions empty, but such task will
 thus have to wait for a worker to become ready, transfer data, etc.
 
-A much lighter way to define a synchronization task is to set its starpu_task::cl
-field to <c>NULL</c>. The task will thus be a mere synchronization point,
+A much lighter way to define a synchronization task is to set its field starpu_task::cl
+to <c>NULL</c>. The task will thus be a mere synchronization point,
 without any data access or execution content: as soon as its dependencies become
 available, it will terminate, call the callbacks, and release dependencies.
 
-An intermediate solution is to define a codelet with its
-starpu_codelet::where field set to \ref STARPU_NOWHERE, for instance:
+An intermediate solution is to define a codelet with its field
+starpu_codelet::where set to \ref STARPU_NOWHERE, for instance:
 
 \code{.c}
 struct starpu_codelet cl =

+ 61 - 55
doc/doxygen/chapters/310_data_management.doxy

@@ -93,8 +93,8 @@ starpu_block_data_register(&block_handle, STARPU_MAIN_RAM, (uintptr_t)block, nx,
 
 3D matrices can be partitioned along the x dimension by
 using starpu_block_filter_block(), or along the y dimension
-by using starpu_block_filter_vertical_block, or along the
-z dimension by using starpu_block_filter_depth_block. They
+by using starpu_block_filter_vertical_block(), or along the
+z dimension by using starpu_block_filter_depth_block(). They
 can also be partitioned with some overlapping by using
 starpu_block_filter_block_shadow(), starpu_block_filter_vertical_block_shadow(),
 or starpu_block_filter_depth_block_shadow().
@@ -206,23 +206,23 @@ need to be set within the original allocation. To reserve room for increasing
 the NX/NY values, one can use starpu_matrix_data_register_allocsize() instead of
 starpu_matrix_data_register(), to specify the allocation size to be used instead
 of the default NX*NY*ELEMSIZE. To support this, the data interface
-has to implement the starpu_data_interface_ops::alloc_footprint and
-starpu_data_interface_ops::alloc_compare methods, for proper StarPU allocation
+has to implement the functions starpu_data_interface_ops::alloc_footprint and
+starpu_data_interface_ops::alloc_compare, for proper StarPU allocation
 management.
 
 A more involved case is changing the amount of allocated data.
 The task implementation can just reallocate the buffer during its execution, and
 set the proper new values in the interface structure, e.g. nx, ny, ld, etc. so
-that the StarPU core knows the new data layout. The starpu_data_interface_ops
-structure however then needs to have the starpu_data_interface_ops::dontcache
-field set to 1, to prevent StarPU from trying to perform any cached allocation,
+that the StarPU core knows the new data layout. The structure starpu_data_interface_ops
+however then needs to have the field starpu_data_interface_ops::dontcache
+set to 1, to prevent StarPU from trying to perform any cached allocation,
 since the allocated size will vary. An example is available in
 <c>tests/datawizard/variable_size.c</c>. The example uses its own data
 interface so as to contain some simulation information for data growth, but the
 principle can be applied for any data interface.
 
-The principle is to use <c>starpu_malloc_on_node_flags</c> to make the new
-allocation, and use <c>starpu_free_on_node_flags</c> to release any previous
+The principle is to use starpu_malloc_on_node_flags() to make the new
+allocation, and use starpu_free_on_node_flags() to release any previous
 allocation. The flags have to be precisely like in the example:
 
 \code{.c}
@@ -244,7 +244,7 @@ stored in the data and the actually allocated buffer. For instance, the vector
 interface uses the <c>nx</c> field for the former, and the <c>allocsize</c> for
 the latter. This allows for lazy reallocation to avoid reallocating the buffer
 everytime to exactly match the actual number of elements. Computations and data
-transfers will use <c>nx</c> field, while allocation functions will use the
+transfers will use the field <c>nx</c>, while allocation functions will use the field
 <c>allocsize</c>. One just has to make sure that <c>allocsize</c> is always
 bigger or equal to <c>nx</c>.
 
@@ -254,12 +254,12 @@ Important note: one can not change the size of a partitioned data.
 \section DataManagement Data Management
 
 When the application allocates data, whenever possible it should use
-the starpu_malloc() function, which will ask CUDA or OpenCL to make
-the allocation itself and pin the corresponding allocated memory, or to use the
-starpu_memory_pin() function to pin memory allocated by other ways, such as local arrays. This
+the function starpu_malloc(), which will ask CUDA or OpenCL to make
+the allocation itself and pin the corresponding allocated memory, or to use the function
+starpu_memory_pin() to pin memory allocated by other ways, such as local arrays. This
 is needed to permit asynchronous data transfer, i.e. permit data
 transfer to overlap with computations. Otherwise, the trace will show
-that the <c>DriverCopyAsync</c> state takes a lot of time, this is
+that the state <c>DriverCopyAsync</c> takes a lot of time, this is
 because CUDA or OpenCL then reverts to synchronous transfers.
 
 The application can provide its own allocation function by calling
@@ -351,8 +351,8 @@ before submitting tasks, which will thus guide StarPU toward an initial task
 distribution (since StarPU will try to avoid further transfers).
 
 This can be achieved by giving the function starpu_data_prefetch_on_node() the
-handle and the desired target memory node. The
-starpu_data_idle_prefetch_on_node() variant can be used to issue the transfer
+handle and the desired target memory node. The variant
+starpu_data_idle_prefetch_on_node() can be used to issue the transfer
 only when the bus is idle.
 
 Conversely, one can advise StarPU that some data will not be useful in the
@@ -434,8 +434,8 @@ __kernel void opencl_kernel(__global int *vector, unsigned offset)
 }
 \endcode
 
-When the sub-data is not of the same type as the original data, the
-starpu_data_filter::get_child_ops field needs to be set appropriately for StarPU
+When the sub-data is not of the same type as the original data, the field
+starpu_data_filter::get_child_ops needs to be set appropriately for StarPU
 to know which type should be used.
 
 StarPU provides various interfaces and filters for matrices, vectors, etc.,
@@ -476,9 +476,8 @@ starpu_data_partition_plan(handle, &f_vert, vert_handle);
 
 starpu_data_partition_plan() returns the handles for the partition in <c>vert_handle</c>.
 
-One can then submit tasks working on the main handle, and tasks working on
-<c>vert_handle</c> handles. Between using the main handle and <c>vert_handle</c>
-handles, StarPU will automatically call starpu_data_partition_submit() and
+One can then submit tasks working on the main handle, and tasks working on the handles
+<c>vert_handle</c>. Between using the main handle and the handles <c>vert_handle</c>, StarPU will automatically call starpu_data_partition_submit() and
 starpu_data_unpartition_submit().
 
 All this code is asynchronous, just submitting which tasks, partitioning and
@@ -544,7 +543,7 @@ ret = starpu_task_insert(&cl_switch, STARPU_RW, handle,
 			0);
 \endcode
 
-The execution of the <c>switch</c> task will get back the matrix data into the
+The execution of the task <c>switch</c> will get back the matrix data into the
 main memory, and thus the vertical slices will get the updated value there.
 
 Again, we prefer to make sure that we don't accidentally access the matrix through the whole-matrix handle:
@@ -557,7 +556,7 @@ And now we can start using vertical slices, etc.
 
 \section DataPointers Handles data buffer pointers
 
-A simple understanding of starpu handles is that it's a collection of buffers on
+A simple understanding of StarPU handles is that it's a collection of buffers on
 each memory node of the machine, which contain the same data.  The picture is
 however made more complex with the OpenCL support and with partitioning.
 
@@ -565,21 +564,28 @@ When partitioning a handle, the data buffers of the subhandles will indeed
 be inside the data buffers of the main handle (to save transferring data
 back and forth between the main handle and the subhandles). But in OpenCL,
 a <c>cl_mem</c> is not a pointer, but an opaque value on which pointer
-arithmetic can not be used. That is why data interfaces contain three members:
-<c>dev_handle</c>, <c>offset</c>, and <c>ptr</c>. The <c>dev_handle</c> member
-is what the allocation function returned, and one can not do arithmetic on
-it. The <c>offset</c> member is the offset inside the allocated area, most often
-it will be 0 because data start at the beginning of the allocated area, but
-when the handle is partitioned, the subhandles will have varying <c>offset</c>
-values, for each subpiece. The <c>ptr</c> member, in the non-OpenCL case, i.e.
-when pointer arithmetic can be used on <c>dev_handle</c>, is just the sum of
+arithmetic can not be used. That is why data interfaces contain three fields:
+<c>dev_handle</c>, <c>offset</c>, and <c>ptr</c>.
+<ul>
+<li> The field <c>dev_handle</c> is what the allocation function
+returned, and one can not do arithmetic on it.
+</li>
+<li> The field <c>offset</c> is the offset inside the allocated area,
+most often it will be 0 because data start at the beginning of the
+allocated area, but when the handle is partitioned, the subhandles
+will have varying <c>offset</c> values, for each subpiece.
+</li>
+<li> The field <c>ptr</c>, in the non-OpenCL case, i.e. when pointer
+arithmetic can be used on <c>dev_handle</c>, is just the sum of
 <c>dev_handle</c> and <c>offset</c>, provided for convenience.
+</li>
+</ul>
 
 This means that:
 <ul>
 <li>computation kernels can use <c>ptr</c> in non-OpenCL implementations.</li>
 <li>computation kernels have to use <c>dev_handle</c> and <c>offset</c> in the OpenCL implementation.</li>
-<li>allocation methods of data interfaces have to store the value returned by starpu_malloc_on_node in <c>dev_handle</c> and <c>ptr</c>, and set <c>offset</c> to 0.</li>
+<li>allocation methods of data interfaces have to store the value returned by starpu_malloc_on_node() in <c>dev_handle</c> and <c>ptr</c>, and set <c>offset</c> to 0.</li>
 <li>partitioning filters have to copy over <c>dev_handle</c> without modifying it, set in the child different values of <c>offset</c>, and set <c>ptr</c> accordingly as the sum of <c>dev_handle</c> and <c>offset</c>.</li>
 </ul>
 
@@ -589,8 +595,8 @@ StarPU provides a series of predefined filters in \ref API_Data_Partition, but
 additional filters can be defined by the application. The principle is that the
 filter function just fills the memory location of the <c>i-th</c> subpart of a data.
 Examples are provided in <c>src/datawizard/interfaces/*_filters.c</c>,
-and see \ref starpu_data_filter::filter_func for the details.
-The starpu_filter_nparts_compute_chunk_size_and_offset() helper can be used to
+check \ref starpu_data_filter::filter_func for further details.
+The helper function starpu_filter_nparts_compute_chunk_size_and_offset() can be used to
 compute the division of pieces of data.
 
 \section DataReduction Data Reduction
@@ -709,7 +715,7 @@ of submission. In some applicative cases, the write contributions can actually
 be performed in any order without affecting the eventual result. In this case
 it is useful to drop the strictly sequential semantic, to improve parallelism
 by allowing StarPU to reorder the write accesses. This can be done by using
-the ::STARPU_COMMUTE data access flag. Accesses without this flag will however
+the data access flag ::STARPU_COMMUTE. Accesses without this flag will however
 properly be serialized against accesses with this flag. For instance:
 
 \code{.c}
@@ -739,7 +745,7 @@ already serialized, and thus by default StarPU uses the Dijkstra solution which
 scales very well in terms of overhead: tasks will just acquire data one by one
 by data handle pointer value order.
 
-When sequential ordering is disabled or the ::STARPU_COMMUTE flag is used, there
+When sequential ordering is disabled or the flag ::STARPU_COMMUTE is used, there
 may be a lot of concurrent accesses to the same data, and the Dijkstra solution
 gets only poor parallelism, typically in some pathological cases which do happen
 in various applications. In this case, one can use a data access arbiter, which
@@ -752,7 +758,7 @@ will acquire them arbiter by arbiter, in arbiter pointer value order.
 
 See the <c>tests/datawizard/test_arbiter.cpp</c> example.
 
-Arbiters however do not support the ::STARPU_REDUX flag yet.
+Arbiters however do not support the flag ::STARPU_REDUX yet.
 
 \section TemporaryBuffers Temporary Buffers
 
@@ -885,7 +891,7 @@ struct starpu_complex_interface
 That structure stores enough to describe <b>one</b> buffer of such kind of
 data. It is used for the buffer stored in the main memory, another instance
 is used for the buffer stored in a GPU, etc. A <i>data handle</i> is thus a
-collection of such structures, to remember each buffer on each memory node.
+collection of such structures, to describe each buffer on each memory node.
 
 Note: one should not take pointers into such structures, because StarPU needs
 to be able to copy over the content of it to various places, for instance to
@@ -921,8 +927,8 @@ void starpu_complex_data_register(starpu_data_handle_t *handleptr,
 The <c>struct starpu_complex_interface complex</c> is here used just to store the
 parameters that the user provided to <c>starpu_complex_data_register</c>.
 starpu_data_register() will first allocate the handle, and
-then pass the <c>starpu_complex_interface</c> structure to the
-starpu_data_interface_ops::register_data_handle method, which records them
+then pass the structure <c>starpu_complex_interface</c> to the method
+starpu_data_interface_ops::register_data_handle, which records them
 within the data handle (it is called once per node by starpu_data_register()):
 
 \code{.c}
@@ -975,7 +981,7 @@ static struct starpu_data_interface_ops interface_complex_ops =
 \endcode
 
 Convenience functions can defined to access the different fields of the
-complex interface from a StarPU data handle after a starpu_data_acquire() call:
+complex interface from a StarPU data handle after a call to starpu_data_acquire():
 
 \code{.c}
 double *starpu_complex_get_real(starpu_data_handle_t handle)
@@ -1029,7 +1035,7 @@ directory <c>examples/interface/</c>.
 
 To be able to run tasks on GPUs etc. StarPU needs to know how to allocate a
 buffer for the interface. In our example, two allocations are needed in the
-allocation complex_allocate_data_on_node() method: one for the real part and one
+allocation method \c complex_allocate_data_on_node(): one for the real part and one
 for the imaginary part.
 
 \code{.c}
@@ -1062,10 +1068,10 @@ fail_real:
 \endcode
 
 Here we try to allocate the two parts. If either of them fails, we return
--ENOMEM. If they succeed, we can record the obtained pointers and returned the
+\c -ENOMEM. If they succeed, we can record the obtained pointers and returned the
 amount of allocated memory (for memory usage accounting).
 
-Conversely, complex_free_data_on_node() frees the two parts:
+Conversely, \c complex_free_data_on_node() frees the two parts:
 
 \code{.c}
 static void complex_free_data_on_node(void *data_interface, unsigned node)
@@ -1085,7 +1091,7 @@ returns the resulting pointer, be it in main memory, in GPU memory, etc.
 \subsection DefiningANewDataInterface_copy Data copy
 
 Now that StarPU knows how to allocate/free a buffer, it needs to be able to
-copy over data into/from it. Defining a copy_any_to_any method allows StarPU to
+copy over data into/from it. Defining a method \c copy_any_to_any() allows StarPU to
 perform direct transfers between main memory and GPU memory.
 
 \code{.c}
@@ -1115,10 +1121,10 @@ static int copy_any_to_any(void *src_interface, unsigned src_node,
 We here again have no idea what is main memory or GPU memory, or even if the
 copy is synchronous or asynchronous: we just call starpu_interface_copy()
 according to the interface, passing it the pointers, and checking whether it
-returned -EAGAIN, which means the copy is asynchronous, and StarPU will
-appropriately wait for it thanks to the \c async_data pointer.
+returned \c -EAGAIN, which means the copy is asynchronous, and StarPU will
+appropriately wait for it thanks to the pointer \c async_data.
 
-This copy method is referenced in a \ref starpu_data_copy_methods structure:
+This copy method is referenced in a structure \ref starpu_data_copy_methods
 
 \code{.c}
 static const struct starpu_data_copy_methods complex_copy_methods =
@@ -1127,7 +1133,7 @@ static const struct starpu_data_copy_methods complex_copy_methods =
 };
 \endcode
 
-which was referenced in the \ref starpu_data_interface_ops structure above.
+which was referenced in the structure \ref starpu_data_interface_ops above.
 
 Other fields of \ref starpu_data_copy_methods allow to provide optimized
 variants, notably for the case of 2D or 3D matrix tiles with non-trivial ld.
@@ -1136,7 +1142,7 @@ variants, notably for the case of 2D or 3D matrix tiles with non-trivial ld.
 
 The copy methods allow for RAM/GPU transfers, but is not enough for e.g.
 transferring over MPI. That requires defining the pack/peek/unpack methods. The
-principle is that the starpu_data_interface_ops::pack_data method concatenates
+principle is that the method starpu_data_interface_ops::pack_data concatenates
 the buffer data into a newly-allocated contiguous bytes array, conversely
 starpu_data_interface_ops::peek_data extracts from a bytes array into the
 buffer data, and starpu_data_interface_ops::unpack_data does the same as
@@ -1164,7 +1170,7 @@ static int complex_pack_data(starpu_data_handle_t handle, unsigned node, void **
 }
 \endcode
 
-complex_pack_data() first computes the size to be allocated, then allocates it,
+\c complex_pack_data() first computes the size to be allocated, then allocates it,
 and copies over into it the content of the two real and imaginary arrays.
 
 \code{.c}
@@ -1184,7 +1190,7 @@ static int complex_peek_data(starpu_data_handle_t handle, unsigned node, void *p
 }
 \endcode
 
-complex_peek_data() simply uses memcpy to copy over from the bytes array into the data buffer.
+\c complex_peek_data() simply uses \c memcpy() to copy over from the bytes array into the data buffer.
 
 \code{.c}
 static int complex_unpack_data(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
@@ -1197,7 +1203,7 @@ static int complex_unpack_data(starpu_data_handle_t handle, unsigned node, void
 }
 \endcode
 
-And complex_unpack_data() just calls complex_peek_data() and releases the bytes array.
+And \c complex_unpack_data() just calls \c complex_peek_data() and releases the bytes array.
 
 
 \section SpecifyingATargetNode Specifying A Target Node For Task Data
@@ -1206,8 +1212,8 @@ When executing a task on a GPU for instance, StarPU would normally copy all the
 needed data for the tasks on the embedded memory of the GPU.  It may however
 happen that the task kernel would rather have some of the datas kept in the
 main memory instead of copied in the GPU, a pivoting vector for instance.
-This can be achieved by setting the starpu_codelet::specific_nodes flag to
-<c>1</c>, and then fill the starpu_codelet::nodes array (or starpu_codelet::dyn_nodes when
+This can be achieved by setting the flag starpu_codelet::specific_nodes to
+<c>1</c>, and then fill the array starpu_codelet::nodes (or starpu_codelet::dyn_nodes when
 starpu_codelet::nbuffers is greater than \ref STARPU_NMAXBUFS) with the node numbers
 where data should be copied to, or ::STARPU_SPECIFIC_NODE_LOCAL to let
 StarPU copy it to the memory node where the task will be executed.

Tiedoston diff-näkymää rajattu, sillä se on liian suuri
+ 35 - 31
doc/doxygen/chapters/320_scheduling.doxy


+ 2 - 2
doc/doxygen/chapters/380_offline_performance_tools.doxy

@@ -155,8 +155,8 @@ starpu_task::name or by using \ref STARPU_NAME when calling
 starpu_task_insert().
 
 Tasks are assigned default colors based on the worker which executed
-them (green for CPUs, yellow/orange/red for CUDAs, blue for OpenCLs,
-red for MICs, ...). To use a different color for every type of task,
+them (green for CPUs, yellow/orange/red for CUDAs, blue for OpenCLs, ...).
+To use a different color for every type of task,
 one can specify the option <c>-c</c> to <c>starpu_fxt_tool</c> or in
 \ref STARPU_GENERATE_TRACE_OPTIONS. Tasks can also be given a specific
 color by setting the field starpu_codelet::color or the

+ 0 - 97
doc/doxygen/chapters/430_mic_support.doxy

@@ -1,97 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*! \page MICSupport MIC Xeon Phi Support
-
-\section MICCompilation Compilation
-
-MIC Xeon Phi support actually needs two compilations of StarPU, one for the host and one for
-the device. The <c>PATH</c> environment variable has to include the path to the
-cross-compilation toolchain, for instance <c>/usr/linux-k1om-4.7/bin</c> .
-The <c>SINK_PKG_CONFIG_PATH</c> environment variable should include the path to the
-cross-compiled <c>hwloc.pc</c>.
-The script <c>mic-configure</c> can then be used to achieve the two compilations: it basically
-calls <c>configure</c> as appropriate from two new directories: <c>build_mic</c> and
-<c>build_host</c>. <c>make</c> and <c>make install</c> can then be used as usual and will
-recurse into both directories. If different \c configure options are needed
-for the host and for the mic, one can use <c>--with-host-param=--with-fxt</c>
-for instance to specify the <c>--with-fxt</c> option for the host only, or
-<c>--with-mic-param=--with-fxt</c> for the mic only.
-
-One can also run StarPU just natively on the Xeon Phi, i.e. it will only run
-directly on the Phi without any exchange with the host CPU. The binaries in
-<c>build_mic</c> can be run that way.
-
-For MPI support, you will probably have to specify different MPI compiler path
-or option for the host and the device builds, for instance:
-
-\verbatim
-./mic-configure --with-mic-param=--with-mpicc="/.../mpiicc -mmic" \
-    --with-host-param=--with-mpicc=/.../mpiicc
-\endverbatim
-
-In case you have troubles with the \c coi or \c scif libraries (the Intel paths are
-really not standard, it seems...), you can still make a build in native mode
-only, by using <c>mic-configure --enable-native-mic</c> (and notably without
-<c>--enable-mic</c> since in that case we don't need \c mic offloading support).
-
-\section PortingApplicationsToMIC Porting Applications To MIC Xeon Phi
-
-The simplest way to port an application to MIC Xeon Phi is to set the field
-starpu_codelet::cpu_funcs_name, to provide StarPU with the function
-name of the CPU implementation, so for instance:
-
-\verbatim
-struct starpu_codelet cl =
-{
-    .cpu_funcs = {myfunc},
-    .cpu_funcs_name = {"myfunc"},
-    .nbuffers = 1,
-}
-\endverbatim
-
-StarPU will thus simply use the
-existing CPU implementation (cross-rebuilt in the MIC Xeon Phi case). The
-functions have to be globally-visible (i.e. not <c>static</c>) for
-StarPU to be able to look them up, and \c -rdynamic must be passed to \c gcc (or
-\c -export-dynamic to \c ld) so that symbols of the main program are visible.
-
-If you have used the starpu_codelet::where field, you additionally need to add in it
-::STARPU_MIC for the Xeon Phi.
-
-For non-native MIC Xeon Phi execution, the 'main' function of the application, on the sink, should call starpu_init() immediately upon start-up; the starpu_init() function never returns. On the host, the 'main' function may freely perform application related initialization calls as usual, before calling starpu_init().
-
-For MIC Xeon Phi, the application may programmatically detect whether executing on the sink or on the host, by checking whether the \ref STARPU_SINK environment variable is defined (on the sink) or not (on the host).
-
-\section LaunchingPrograms Launching Programs
-
-MIC programs are started from the host. StarPU automatically
-starts the same program on MIC devices. It however needs to get
-the MIC-cross-built binary. It will look for the file given by the
-environment variable \ref STARPU_MIC_SINK_PROGRAM_NAME or in the
-directory given by the environment variable \ref STARPU_MIC_SINK_PROGRAM_PATH,
-or in the field
-starpu_conf::mic_sink_program_path. It will also look in the current
-directory for the same binary name plus the suffix <c>-mic</c> or
-<c>_mic</c>.
-
-The testsuite can be started by simply running <c>make check</c> from the
-top directory. It will recurse into both <c>build_host</c> to run tests with only
-the host, and into <c>build_mic</c> to run tests with both the host and the MIC
-devices. Single tests with the host and the MIC can be run by starting
-<c>./loader-cross.sh ./the_test</c> from <c>build_mic/tests</c>.
-
-*/

+ 0 - 1
doc/doxygen/chapters/495_interoperability.doxy

@@ -65,7 +65,6 @@ of the following strings as argument and returns the corresponding ID:
 <li><c>"cpu"</c></li>
 <li><c>"opencl"</c></li>
 <li><c>"cuda"</c></li>
-<li><c>"mic"</c></li>
 </ul>
 The \c cpu pseudo device type is defined for convenience and designates CPU
 cores. The number of units of each type available for computation can be

+ 302 - 303
doc/doxygen/chapters/501_environment_variables.doxy

@@ -23,161 +23,8 @@ the following environment variables.
 
 \section EnvConfiguringWorkers Configuring Workers
 
+\subsection Basic General Configuration
 <dl>
-
-<dt>STARPU_NCPU</dt>
-<dd>
-\anchor STARPU_NCPU
-\addindex __env__STARPU_NCPU
-Specify the number of CPU workers (thus not including workers
-dedicated to control accelerators). Note that by default, StarPU will
-not allocate more CPU workers than there are physical CPUs, and that
-some CPUs are used to control the accelerators.
-</dd>
-
-<dt>STARPU_RESERVE_NCPU</dt>
-<dd>
-\anchor STARPU_RESERVE_NCPU
-\addindex __env__STARPU_RESERVE_NCPU
-Specify the number of CPU cores that should not be used by StarPU, so the
-application can use starpu_get_next_bindid() and starpu_bind_thread_on() to bind
-its own threads.
-
-This option is ignored if \ref STARPU_NCPU or starpu_conf::ncpus is set.
-</dd>
-
-<dt>STARPU_NCPUS</dt>
-<dd>
-\anchor STARPU_NCPUS
-\addindex __env__STARPU_NCPUS
-This variable is deprecated. You should use \ref STARPU_NCPU.
-</dd>
-
-<dt>STARPU_NCUDA</dt>
-<dd>
-\anchor STARPU_NCUDA
-\addindex __env__STARPU_NCUDA
-Specify the number of CUDA devices that StarPU can use. If
-\ref STARPU_NCUDA is lower than the number of physical devices, it is
-possible to select which CUDA devices should be used by the means of the
-environment variable \ref STARPU_WORKERS_CUDAID. By default, StarPU will
-create as many CUDA workers as there are CUDA devices.
-</dd>
-
-<dt>STARPU_NWORKER_PER_CUDA</dt>
-<dd>
-\anchor STARPU_NWORKER_PER_CUDA
-\addindex __env__STARPU_NWORKER_PER_CUDA
-Specify the number of workers per CUDA device, and thus the number of kernels
-which will be concurrently running on the devices, i.e. the number of CUDA
-streams. The default value is 1.
-</dd>
-
-<dt>STARPU_CUDA_THREAD_PER_WORKER</dt>
-<dd>
-\anchor STARPU_CUDA_THREAD_PER_WORKER
-\addindex __env__STARPU_CUDA_THREAD_PER_WORKER
-Specify whether the cuda driver should use one thread per stream (1) or to use
-a single thread to drive all the streams of the device or all devices (0), and
-\ref STARPU_CUDA_THREAD_PER_DEV determines whether is it one thread per device or one
-thread for all devices. The default value is 0. Setting it to 1 is contradictory
-with setting \ref STARPU_CUDA_THREAD_PER_DEV.
-</dd>
-
-<dt>STARPU_CUDA_THREAD_PER_DEV</dt>
-<dd>
-\anchor STARPU_CUDA_THREAD_PER_DEV
-\addindex __env__STARPU_CUDA_THREAD_PER_DEV
-Specify whether the cuda driver should use one thread per device (1) or to use a
-single thread to drive all the devices (0). The default value is 1.  It does not
-make sense to set this variable if \ref STARPU_CUDA_THREAD_PER_WORKER is set to to 1
-(since \ref STARPU_CUDA_THREAD_PER_DEV is then meaningless).
-</dd>
-
-<dt>STARPU_CUDA_PIPELINE</dt>
-<dd>
-\anchor STARPU_CUDA_PIPELINE
-\addindex __env__STARPU_CUDA_PIPELINE
-Specify how many asynchronous tasks are submitted in advance on CUDA
-devices. This for instance permits to overlap task management with the execution
-of previous tasks, but it also allows concurrent execution on Fermi cards, which
-otherwise bring spurious synchronizations. The default is 2. Setting the value to 0 forces a synchronous
-execution of all tasks.
-</dd>
-
-<dt>STARPU_NOPENCL</dt>
-<dd>
-\anchor STARPU_NOPENCL
-\addindex __env__STARPU_NOPENCL
-OpenCL equivalent of the environment variable \ref STARPU_NCUDA.
-</dd>
-
-<dt>STARPU_OPENCL_PIPELINE</dt>
-<dd>
-\anchor STARPU_OPENCL_PIPELINE
-\addindex __env__STARPU_OPENCL_PIPELINE
-Specify how many asynchronous tasks are submitted in advance on OpenCL
-devices. This for instance permits to overlap task management with the execution
-of previous tasks, but it also allows concurrent execution on Fermi cards, which
-otherwise bring spurious synchronizations. The default is 2. Setting the value to 0 forces a synchronous
-execution of all tasks.
-</dd>
-
-<dt>STARPU_OPENCL_ON_CPUS</dt>
-<dd>
-\anchor STARPU_OPENCL_ON_CPUS
-\addindex __env__STARPU_OPENCL_ON_CPUS
-By default, the OpenCL driver only enables GPU and accelerator
-devices. By setting the environment variable \ref STARPU_OPENCL_ON_CPUS
-to 1, the OpenCL driver will also enable CPU devices.
-</dd>
-
-<dt>STARPU_OPENCL_ONLY_ON_CPUS</dt>
-<dd>
-\anchor STARPU_OPENCL_ONLY_ON_CPUS
-\addindex __env__STARPU_OPENCL_ONLY_ON_CPUS
-By default, the OpenCL driver enables GPU and accelerator
-devices. By setting the environment variable \ref STARPU_OPENCL_ONLY_ON_CPUS
-to 1, the OpenCL driver will ONLY enable CPU devices.
-</dd>
-
-<dt>STARPU_NMIC</dt>
-<dd>
-\anchor STARPU_NMIC
-\addindex __env__STARPU_NMIC
-MIC equivalent of the environment variable \ref STARPU_NCUDA, i.e. the number of
-MIC devices to use.
-</dd>
-
-<dt>STARPU_NMICTHREADS</dt>
-<dd>
-\anchor STARPU_NMICTHREADS
-\addindex __env__STARPU_NMICTHREADS
-Number of threads to use on the MIC devices.
-</dd>
-
-<dt>STARPU_NMPI_MS</dt>
-<dd>
-\anchor STARPU_NMPI_MS
-\addindex __env__STARPU_NMPI_MS
-MPI Master Slave equivalent of the environment variable \ref STARPU_NCUDA, i.e. the number of
-MPI Master Slave devices to use.
-</dd>
-
-<dt>STARPU_NMPIMSTHREADS</dt>
-<dd>
-\anchor STARPU_NMPIMSTHREADS
-\addindex __env__STARPU_NMPIMSTHREADS
-Number of threads to use on the MPI Slave devices.
-</dd>
-
-<dt>STARPU_MPI_MASTER_NODE</dt>
-<dd>
-\anchor STARPU_MPI_MASTER_NODE
-\addindex __env__STARPU_MPI_MASTER_NODE
-This variable allows to chose which MPI node (with the MPI ID) will be the master.
-</dd>
-
 <dt>STARPU_WORKERS_NOBIND</dt>
 <dd>
 \anchor STARPU_WORKERS_NOBIND
@@ -262,69 +109,6 @@ Same as \ref STARPU_MAIN_THREAD_CPUID, but bind the thread that calls
 starpu_initialize() to the given core, instead of the PU (hyperthread).
 </dd>
 
-<dt>STARPU_MPI_THREAD_CPUID</dt>
-<dd>
-\anchor STARPU_MPI_THREAD_CPUID
-\addindex __env__STARPU_MPI_THREAD_CPUID
-When defined, this make StarPU bind its MPI thread to the given CPU ID. Setting
-it to -1 (the default value) will use a reserved CPU, subtracted from the CPU
-workers.
-</dd>
-
-<dt>STARPU_MPI_THREAD_COREID</dt>
-<dd>
-\anchor STARPU_MPI_THREAD_COREID
-\addindex __env__STARPU_MPI_THREAD_COREID
-Same as \ref STARPU_MPI_THREAD_CPUID, but bind the MPI thread to the given core
-ID, instead of the PU (hyperthread).
-</dd>
-
-<dt>STARPU_MPI_NOBIND</dt>
-<dd>
-\anchor STARPU_MPI_NOBIND
-\addindex __env__STARPU_MPI_NOBIND
-Setting it to non-zero will prevent StarPU from binding the MPI to
-a separate core. This is for instance useful when running the testsuite on a single system.
-</dd>
-
-<dt>STARPU_WORKERS_CUDAID</dt>
-<dd>
-\anchor STARPU_WORKERS_CUDAID
-\addindex __env__STARPU_WORKERS_CUDAID
-Similarly to the \ref STARPU_WORKERS_CPUID environment variable, it is
-possible to select which CUDA devices should be used by StarPU. On a machine
-equipped with 4 GPUs, setting <c>STARPU_WORKERS_CUDAID = "1 3"</c> and
-<c>STARPU_NCUDA=2</c> specifies that 2 CUDA workers should be created, and that
-they should use CUDA devices #1 and #3 (the logical ordering of the devices is
-the one reported by CUDA).
-
-This variable is ignored if the field
-starpu_conf::use_explicit_workers_cuda_gpuid passed to starpu_init()
-is set.
-</dd>
-
-<dt>STARPU_WORKERS_OPENCLID</dt>
-<dd>
-\anchor STARPU_WORKERS_OPENCLID
-\addindex __env__STARPU_WORKERS_OPENCLID
-OpenCL equivalent of the \ref STARPU_WORKERS_CUDAID environment variable.
-
-This variable is ignored if the field
-starpu_conf::use_explicit_workers_opencl_gpuid passed to starpu_init()
-is set.
-</dd>
-
-<dt>STARPU_WORKERS_MICID</dt>
-<dd>
-\anchor STARPU_WORKERS_MICID
-\addindex __env__STARPU_WORKERS_MICID
-MIC equivalent of the \ref STARPU_WORKERS_CUDAID environment variable.
-
-This variable is ignored if the field
-starpu_conf::use_explicit_workers_mic_deviceid passed to starpu_init()
-is set.
-</dd>
-
 <dt>STARPU_WORKER_TREE</dt>
 <dd>
 \anchor STARPU_WORKER_TREE
@@ -346,25 +130,23 @@ and \ref STARPU_MAX_WORKERSIZE can be used to change this default.
 <dd>
 \anchor STARPU_MIN_WORKERSIZE
 \addindex __env__STARPU_MIN_WORKERSIZE
-\ref STARPU_MIN_WORKERSIZE
-permits to specify the minimum size of the combined workers (instead of the default 2)
+Specify the minimum size of the combined workers. Default value is 2.
 </dd>
 
 <dt>STARPU_MAX_WORKERSIZE</dt>
 <dd>
 \anchor STARPU_MAX_WORKERSIZE
 \addindex __env__STARPU_MAX_WORKERSIZE
-\ref STARPU_MAX_WORKERSIZE
-permits to specify the minimum size of the combined workers (instead of the
-number of CPU workers in the system)
+Specify the minimum size of the combined workers. Default value is the
+number of CPU workers in the system.
 </dd>
 
 <dt>STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER</dt>
 <dd>
 \anchor STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER
 \addindex __env__STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER
-Let the user decide how many elements are allowed between combined workers
-created from hwloc information. For instance, in the case of sockets with 6
+Specify how many elements are allowed between combined workers
+created from \c hwloc information. For instance, in the case of sockets with 6
 cores without shared L2 caches, if \ref STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER is
 set to 6, no combined worker will be synthesized beyond one for the socket
 and one per core. If it is set to 3, 3 intermediate combined workers will be
@@ -387,90 +169,162 @@ Disable asynchronous copies between CPU and GPU devices.
 The AMD implementation of OpenCL is known to
 fail when copying data asynchronously. When using this implementation,
 it is therefore necessary to disable asynchronous data transfers.
+
+See also \ref STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY and \ref
+STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY.
 </dd>
 
-<dt>STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY</dt>
+<dt>STARPU_DISABLE_PINNING</dt>
 <dd>
-\anchor STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY
-\addindex __env__STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY
-Disable asynchronous copies between CPU and CUDA devices.
+\anchor STARPU_DISABLE_PINNING
+\addindex __env__STARPU_DISABLE_PINNING
+Disable (1) or Enable (0) pinning host memory allocated through starpu_malloc(), starpu_memory_pin()
+and friends.  The default is Enabled.
+This permits to test the performance effect of memory pinning.
 </dd>
 
-<dt>STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY</dt>
+<dt>STARPU_BACKOFF_MIN</dt>
 <dd>
-\anchor STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY
-\addindex __env__STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY
-Disable asynchronous copies between CPU and OpenCL devices.
-The AMD implementation of OpenCL is known to
-fail when copying data asynchronously. When using this implementation,
-it is therefore necessary to disable asynchronous data transfers.
+\anchor STARPU_BACKOFF_MIN
+\addindex __env__STARPU_BACKOFF_MIN
+Set minimum exponential backoff of number of cycles to pause when spinning. Default value is 1.
 </dd>
 
-<dt>STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY</dt>
+<dt>STARPU_BACKOFF_MAX</dt>
 <dd>
-\anchor STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY
-\addindex __env__STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY
-Disable asynchronous copies between CPU and MIC devices.
+\anchor STARPU_BACKOFF_MAX
+\addindex __env__STARPU_BACKOFF_MAX
+Set maximum exponential backoff of number of cycles to pause when spinning. Default value is 32.
 </dd>
 
-<dt>STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY</dt>
+<dt>STARPU_SINK</dt>
 <dd>
-\anchor STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY
-\addindex __env__STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY
-Disable asynchronous copies between CPU and MPI Slave devices.
+\anchor STARPU_SINK
+\addindex __env__STARPU_SINK
+Defined internally by StarPU when running in master slave mode.
 </dd>
 
-<dt>STARPU_ENABLE_CUDA_GPU_GPU_DIRECT</dt>
+</dl>
+
+\subsection cpuWorkers CPU Workers
+<dl>
+<dt>STARPU_NCPU</dt>
 <dd>
-\anchor STARPU_ENABLE_CUDA_GPU_GPU_DIRECT
-\addindex __env__STARPU_ENABLE_CUDA_GPU_GPU_DIRECT
-Enable (1) or Disable (0) direct CUDA transfers from GPU to GPU, without copying
-through RAM. The default is Enabled.
-This permits to test the performance effect of GPU-Direct.
+\anchor STARPU_NCPU
+\addindex __env__STARPU_NCPU
+Specify the number of CPU workers (thus not including workers
+dedicated to control accelerators). Note that by default, StarPU will
+not allocate more CPU workers than there are physical CPUs, and that
+some CPUs are used to control the accelerators.
 </dd>
 
-<dt>STARPU_DISABLE_PINNING</dt>
+<dt>STARPU_RESERVE_NCPU</dt>
 <dd>
-\anchor STARPU_DISABLE_PINNING
-\addindex __env__STARPU_DISABLE_PINNING
-Disable (1) or Enable (0) pinning host memory allocated through starpu_malloc, starpu_memory_pin
-and friends.  The default is Enabled.
-This permits to test the performance effect of memory pinning.
+\anchor STARPU_RESERVE_NCPU
+\addindex __env__STARPU_RESERVE_NCPU
+Specify the number of CPU cores that should not be used by StarPU, so the
+application can use starpu_get_next_bindid() and starpu_bind_thread_on() to bind
+its own threads.
+
+This option is ignored if \ref STARPU_NCPU or starpu_conf::ncpus is set.
 </dd>
 
-<dt>STARPU_BACKOFF_MIN</dt>
+<dt>STARPU_NCPUS</dt>
 <dd>
-\anchor STARPU_BACKOFF_MIN
-\addindex __env__STARPU_BACKOFF_MIN
-Set minimum exponential backoff of number of cycles to pause when spinning. Default value is 1.
+\anchor STARPU_NCPUS
+\addindex __env__STARPU_NCPUS
+This variable is deprecated. You should use \ref STARPU_NCPU.
 </dd>
 
-<dt>STARPU_BACKOFF_MAX</dt>
+</dl>
+
+\subsection cudaWorkers CUDA Workers
+<dl>
+<dt>STARPU_NCUDA</dt>
 <dd>
-\anchor STARPU_BACKOFF_MAX
-\addindex __env__STARPU_BACKOFF_MAX
-Set maximum exponential backoff of number of cycles to pause when spinning. Default value is 32.
+\anchor STARPU_NCUDA
+\addindex __env__STARPU_NCUDA
+Specify the number of CUDA devices that StarPU can use. If
+\ref STARPU_NCUDA is lower than the number of physical devices, it is
+possible to select which GPU devices should be used by the means of the
+environment variable \ref STARPU_WORKERS_CUDAID. By default, StarPU will
+create as many CUDA workers as there are GPU devices.
 </dd>
 
-<dt>STARPU_MIC_SINK_PROGRAM_NAME</dt>
+<dt>STARPU_NWORKER_PER_CUDA</dt>
 <dd>
-\anchor STARPU_MIC_SINK_PROGRAM_NAME
-\addindex __env__STARPU_MIC_SINK_PROGRAM_NAME
-todo
+\anchor STARPU_NWORKER_PER_CUDA
+\addindex __env__STARPU_NWORKER_PER_CUDA
+Specify the number of workers per CUDA device, and thus the number of kernels
+which will be concurrently running on the devices, i.e. the number of CUDA
+streams. The default value is 1.
 </dd>
 
-<dt>STARPU_MIC_SINK_PROGRAM_PATH</dt>
+<dt>STARPU_CUDA_THREAD_PER_WORKER</dt>
 <dd>
-\anchor STARPU_MIC_SINK_PROGRAM_PATH
-\addindex __env__STARPU_MIC_SINK_PROGRAM_PATH
-todo
+\anchor STARPU_CUDA_THREAD_PER_WORKER
+\addindex __env__STARPU_CUDA_THREAD_PER_WORKER
+Specify whether the cuda driver should use one thread per stream (1) or to use
+a single thread to drive all the streams of the device or all devices (0), and
+\ref STARPU_CUDA_THREAD_PER_DEV determines whether is it one thread per device or one
+thread for all devices. The default value is 0. Setting it to 1 is contradictory
+with setting \ref STARPU_CUDA_THREAD_PER_DEV.
 </dd>
 
-<dt>STARPU_MIC_PROGRAM_PATH</dt>
+<dt>STARPU_CUDA_THREAD_PER_DEV</dt>
 <dd>
-\anchor STARPU_MIC_PROGRAM_PATH
-\addindex __env__STARPU_MIC_PROGRAM_PATH
-todo
+\anchor STARPU_CUDA_THREAD_PER_DEV
+\addindex __env__STARPU_CUDA_THREAD_PER_DEV
+Specify whether the cuda driver should use one thread per device (1) or to use a
+single thread to drive all the devices (0). The default value is 1.  It does not
+make sense to set this variable if \ref STARPU_CUDA_THREAD_PER_WORKER is set to to 1
+(since \ref STARPU_CUDA_THREAD_PER_DEV is then meaningless).
+</dd>
+
+<dt>STARPU_CUDA_PIPELINE</dt>
+<dd>
+\anchor STARPU_CUDA_PIPELINE
+\addindex __env__STARPU_CUDA_PIPELINE
+Specify how many asynchronous tasks are submitted in advance on CUDA
+devices. This for instance permits to overlap task management with the execution
+of previous tasks, but it also allows concurrent execution on Fermi cards, which
+otherwise bring spurious synchronizations. The default is 2. Setting the value to 0 forces a synchronous
+execution of all tasks.
+</dd>
+
+<dt>STARPU_WORKERS_CUDAID</dt>
+<dd>
+\anchor STARPU_WORKERS_CUDAID
+\addindex __env__STARPU_WORKERS_CUDAID
+Similarly to the \ref STARPU_WORKERS_CPUID environment variable, it is
+possible to select which CUDA devices should be used by StarPU. On a machine
+equipped with 4 GPUs, setting <c>STARPU_WORKERS_CUDAID = "1 3"</c> and
+<c>STARPU_NCUDA=2</c> specifies that 2 CUDA workers should be created, and that
+they should use CUDA devices #1 and #3 (the logical ordering of the devices is
+the one reported by CUDA).
+
+This variable is ignored if the field
+starpu_conf::use_explicit_workers_cuda_gpuid passed to starpu_init()
+is set.
+</dd>
+
+<dt>STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY</dt>
+<dd>
+\anchor STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY
+\addindex __env__STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY
+Disable asynchronous copies between CPU and CUDA devices.
+
+See also \ref STARPU_DISABLE_ASYNCHRONOUS_COPY and \ref
+STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY.
+</dd>
+
+<dt>STARPU_ENABLE_CUDA_GPU_GPU_DIRECT</dt>
+<dd>
+\anchor STARPU_ENABLE_CUDA_GPU_GPU_DIRECT
+\addindex __env__STARPU_ENABLE_CUDA_GPU_GPU_DIRECT
+Enable (1) or Disable (0) direct CUDA transfers from GPU to GPU, without copying
+through RAM. The default is Enabled.
+This permits to test the performance effect of GPU-Direct.
 </dd>
 
 <dt>STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES</dt>
@@ -479,12 +333,153 @@ todo
 \addindex __env__STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES
 Specify if CUDA workers should do only fast allocations
 when running the datawizard progress of
-other memory nodes. This will pass STARPU_DATAWIZARD_ONLY_FAST_ALLOC.
+other memory nodes. This will pass the internal value
+_STARPU_DATAWIZARD_ONLY_FAST_ALLOC to allocation methods.
 Default value is 0, allowing CUDA workers to do slow allocations.
+
+This can also be specified with starpu_conf::cuda_only_fast_alloc_other_memnodes.
 </dd>
 
 </dl>
 
+\subsection openclWorkers OpenCL Workers
+<dl>
+<dt>STARPU_NOPENCL</dt>
+<dd>
+\anchor STARPU_NOPENCL
+\addindex __env__STARPU_NOPENCL
+Specify the number of OpenCL devices that StarPU can use. If
+\ref STARPU_NOPENCL is lower than the number of physical devices, it is
+possible to select which GPU devices should be used by the means of the
+environment variable \ref STARPU_WORKERS_OPENCLID. By default, StarPU will
+create as many OpenCL workers as there are GPU devices.
+
+Note that by default StarPU will launch CUDA workers on GPU devices.
+You need to disable CUDA to allow the creation of OpenCL workers.
+</dd>
+
+<dt>STARPU_WORKERS_OPENCLID</dt>
+<dd>
+\anchor STARPU_WORKERS_OPENCLID
+\addindex __env__STARPU_WORKERS_OPENCLID
+Similarly to the \ref STARPU_WORKERS_CPUID environment variable, it is
+possible to select which GPU devices should be used by StarPU. On a machine
+equipped with 4 GPUs, setting <c>STARPU_WORKERS_OPENCLID = "1 3"</c> and
+<c>STARPU_NOPENCL=2</c> specifies that 2 OpenCL workers should be
+created, and that they should use GPU devices #1 and #3.
+
+This variable is ignored if the field
+starpu_conf::use_explicit_workers_opencl_gpuid passed to starpu_init()
+is set.
+</dd>
+
+<dt>STARPU_OPENCL_PIPELINE</dt>
+<dd>
+\anchor STARPU_OPENCL_PIPELINE
+\addindex __env__STARPU_OPENCL_PIPELINE
+Specify how many asynchronous tasks are submitted in advance on OpenCL
+devices. This for instance permits to overlap task management with the execution
+of previous tasks, but it also allows concurrent execution on Fermi cards, which
+otherwise bring spurious synchronizations. The default is 2. Setting the value to 0 forces a synchronous
+execution of all tasks.
+</dd>
+
+<dt>STARPU_OPENCL_ON_CPUS</dt>
+<dd>
+\anchor STARPU_OPENCL_ON_CPUS
+\addindex __env__STARPU_OPENCL_ON_CPUS
+By default, the OpenCL driver only enables GPU and accelerator
+devices. By setting the environment variable \ref STARPU_OPENCL_ON_CPUS
+to 1, the OpenCL driver will also enable CPU devices.
+</dd>
+
+<dt>STARPU_OPENCL_ONLY_ON_CPUS</dt>
+<dd>
+\anchor STARPU_OPENCL_ONLY_ON_CPUS
+\addindex __env__STARPU_OPENCL_ONLY_ON_CPUS
+By default, the OpenCL driver enables GPU and accelerator
+devices. By setting the environment variable \ref STARPU_OPENCL_ONLY_ON_CPUS
+to 1, the OpenCL driver will ONLY enable CPU devices.
+</dd>
+
+<dt>STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY</dt>
+<dd>
+\anchor STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY
+\addindex __env__STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY
+Disable asynchronous copies between CPU and OpenCL devices.
+The AMD implementation of OpenCL is known to
+fail when copying data asynchronously. When using this implementation,
+it is therefore necessary to disable asynchronous data transfers.
+
+See also \ref STARPU_DISABLE_ASYNCHRONOUS_COPY and \ref
+STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY.
+</dd>
+</dl>
+
+
+\subsection mpimsWorkers MPI Master Slave Workers
+<dl>
+<dt>STARPU_NMPI_MS</dt>
+<dd>
+\anchor STARPU_NMPI_MS
+\addindex __env__STARPU_NMPI_MS
+Specify the number of MPI master slave devices that StarPU can use.
+</dd>
+
+<dt>STARPU_NMPIMSTHREADS</dt>
+<dd>
+\anchor STARPU_NMPIMSTHREADS
+\addindex __env__STARPU_NMPIMSTHREADS
+Number of threads to use on the MPI Slave devices.
+</dd>
+
+<dt>STARPU_MPI_MASTER_NODE</dt>
+<dd>
+\anchor STARPU_MPI_MASTER_NODE
+\addindex __env__STARPU_MPI_MASTER_NODE
+This variable allows to chose which MPI node (with the MPI ID) will be the master.
+</dd>
+
+<dt>STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY</dt>
+<dd>
+\anchor STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY
+\addindex __env__STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY
+Disable asynchronous copies between CPU and MPI Slave devices.
+</dd>
+
+</dl>
+
+\subsection mpiConf MPI Configuration
+<dl>
+
+<dt>STARPU_MPI_THREAD_CPUID</dt>
+<dd>
+\anchor STARPU_MPI_THREAD_CPUID
+\addindex __env__STARPU_MPI_THREAD_CPUID
+When defined, this make StarPU bind its MPI thread to the given CPU ID. Setting
+it to -1 (the default value) will use a reserved CPU, subtracted from the CPU
+workers.
+</dd>
+
+<dt>STARPU_MPI_THREAD_COREID</dt>
+<dd>
+\anchor STARPU_MPI_THREAD_COREID
+\addindex __env__STARPU_MPI_THREAD_COREID
+Same as \ref STARPU_MPI_THREAD_CPUID, but bind the MPI thread to the given core
+ID, instead of the PU (hyperthread).
+</dd>
+
+<dt>STARPU_MPI_NOBIND</dt>
+<dd>
+\anchor STARPU_MPI_NOBIND
+\addindex __env__STARPU_MPI_NOBIND
+Setting it to non-zero will prevent StarPU from binding the MPI to
+a separate core. This is for instance useful when running the testsuite on a single system.
+</dd>
+
+</dl>
+
+
 \section ConfiguringTheSchedulingEngine Configuring The Scheduling Engine
 
 <dl>
@@ -504,6 +499,7 @@ Use <c>STARPU_SCHED=help</c> to get the list of available schedulers.
 \anchor STARPU_MIN_PRIO_env
 \addindex __env__STARPU_MIN_PRIO
 Set the mininum priority used by priorities-aware schedulers.
+The flag can also be set through the field starpu_conf::global_sched_ctx_min_priority.
 </dd>
 
 <dt>STARPU_MAX_PRIO</dt>
@@ -511,6 +507,7 @@ Set the mininum priority used by priorities-aware schedulers.
 \anchor STARPU_MAX_PRIO_env
 \addindex __env__STARPU_MAX_PRIO
 Set the maximum priority used by priorities-aware schedulers.
+The flag can also be set through the field starpu_conf::global_sched_ctx_max_priority.
 </dd>
 
 <dt>STARPU_CALIBRATE</dt>
@@ -586,6 +583,22 @@ pick up a task which has most of its data already available. Setting this to 0
 disables this.
 </dd>
 
+<dt>STARPU_SCHED_SORTED_ABOVE</dt>
+<dd>
+\anchor STARPU_SCHED_SORTED_ABOVE
+\addindex __env__STARPU_SCHED_SORTED_ABOVE
+For a modular scheduler with queues above the decision component, it is
+usually sorted by priority. Setting this to 0 disables this.
+</dd>
+
+<dt>STARPU_SCHED_SORTED_BELOW</dt>
+<dd>
+\anchor STARPU_SCHED_SORTED_BELOW
+\addindex __env__STARPU_SCHED_SORTED_BELOW
+For a modular scheduler with queues below the decision component, they are
+usually sorted by priority. Setting this to 0 disables this.
+</dd>
+
 <dt>STARPU_IDLE_POWER</dt>
 <dd>
 \anchor STARPU_IDLE_POWER
@@ -845,13 +858,6 @@ and allows studying scheduling overhead of the runtime system. However,
 it also makes simulation non-deterministic.
 </dd>
 
-<dt>STARPU_SINK</dt>
-<dd>
-\anchor STARPU_SINK
-\addindex __env__STARPU_SINK
-Variable defined by StarPU when running MPI Xeon PHI on the sink.
-</dd>
-
 </dl>
 
 \section MiscellaneousAndDebug Miscellaneous And Debug
@@ -888,7 +894,7 @@ performance model files. The default is <c>$STARPU_HOME/.starpu/sampling</c>.
 <dd>
 \anchor STARPU_PERF_MODEL_HOMOGENEOUS_CPU
 \addindex __env__STARPU_PERF_MODEL_HOMOGENEOUS_CPU
-When this is set to 0, StarPU will assume that CPU devices do not have the same
+When set to 0, StarPU will assume that CPU devices do not have the same
 performance, and thus use different performance models for them, thus making
 kernel calibration much longer, since measurements have to be made for each CPU
 core.
@@ -898,7 +904,7 @@ core.
 <dd>
 \anchor STARPU_PERF_MODEL_HOMOGENEOUS_CUDA
 \addindex __env__STARPU_PERF_MODEL_HOMOGENEOUS_CUDA
-When this is set to 1, StarPU will assume that all CUDA devices have the same
+When set to 1, StarPU will assume that all CUDA devices have the same
 performance, and thus share performance models for them, thus allowing kernel
 calibration to be much faster, since measurements only have to be once for all
 CUDA GPUs.
@@ -908,27 +914,17 @@ CUDA GPUs.
 <dd>
 \anchor STARPU_PERF_MODEL_HOMOGENEOUS_OPENCL
 \addindex __env__STARPU_PERF_MODEL_HOMOGENEOUS_OPENCL
-When this is set to 1, StarPU will assume that all OPENCL devices have the same
+When set to 1, StarPU will assume that all OPENCL devices have the same
 performance, and thus share performance models for them, thus allowing kernel
 calibration to be much faster, since measurements only have to be once for all
 OPENCL GPUs.
 </dd>
 
-<dt>STARPU_PERF_MODEL_HOMOGENEOUS_MIC</dt>
-<dd>
-\anchor STARPU_PERF_MODEL_HOMOGENEOUS_MIC
-\addindex __env__STARPU_PERF_MODEL_HOMOGENEOUS_MIC
-When this is set to 1, StarPU will assume that all MIC devices have the same
-performance, and thus share performance models for them, thus allowing kernel
-calibration to be much faster, since measurements only have to be once for all
-MIC GPUs.
-</dd>
-
 <dt>STARPU_PERF_MODEL_HOMOGENEOUS_MPI_MS</dt>
 <dd>
 \anchor STARPU_PERF_MODEL_HOMOGENEOUS_MPI_MS
 \addindex __env__STARPU_PERF_MODEL_HOMOGENEOUS_MPI_MS
-When this is set to 1, StarPU will assume that all MPI Slave devices have the same
+When set to 1, StarPU will assume that all MPI Slave devices have the same
 performance, and thus share performance models for them, thus allowing kernel
 calibration to be much faster, since measurements only have to be once for all
 MPI Slaves.
@@ -1418,9 +1414,9 @@ accesses (see \ref ConcurrentDataAccess).
 When defined, NUMA nodes are taking into account by StarPU. Otherwise, memory
 is considered as only one node. This is experimental for now.
 
-When enabled, STARPU_MAIN_MEMORY is a pointer to the NUMA node associated to the
+When enabled, ::STARPU_MAIN_RAM is a pointer to the NUMA node associated to the
 first CPU worker if it exists, the NUMA node associated to the first GPU discovered otherwise.
-If StarPU doesn't find any NUMA node after these step, STARPU_MAIN_MEMORY is the first NUMA node
+If StarPU doesn't find any NUMA node after these step, ::STARPU_MAIN_RAM is the first NUMA node
 discovered by StarPU.
 </dd>
 
@@ -1428,15 +1424,18 @@ discovered by StarPU.
 <dd>
 \anchor STARPU_IDLE_FILE
 \addindex __env__STARPU_IDLE_FILE
-If the environment variable STARPU_IDLE_FILE is defined, a file named after its contents will be created at the end of the execution.
-The file will contain the sum of the idle times of all the workers.
+When defined, a file named after its contents will be created at the
+end of the execution. This file will contain the sum of the idle times
+of all the workers.
 </dd>
 
 <dt>STARPU_HWLOC_INPUT</dt>
 <dd>
 \anchor STARPU_HWLOC_INPUT
 \addindex __env__STARPU_HWLOC_INPUT
-If the environment variable STARPU_HWLOC_INPUT is defined to the path of an XML file, hwloc will be made to use it as input instead of detecting the current platform topology, which can save significant initialization time.
+When defined to the path of an XML file, \c hwloc will use this file
+as input instead of detecting the current platform topology, which can
+save significant initialization time.
 
 To produce this XML file, use <c>lstopo file.xml</c>
 </dd>
@@ -1445,7 +1444,7 @@ To produce this XML file, use <c>lstopo file.xml</c>
 <dd>
 \anchor STARPU_CATCH_SIGNALS
 \addindex __env__STARPU_CATCH_SIGNALS
-By default, StarPU catch signals SIGINT, SIGSEGV and SIGTRAP to
+By default, StarPU catch signals \c SIGINT, \c SIGSEGV and \c SIGTRAP to
 perform final actions such as dumping FxT trace files even though the
 application has crashed. Setting this variable to a value other than 1
 will disable this behaviour. This should be done on JVM systems which

+ 10 - 33
doc/doxygen/chapters/510_configure_options.doxy

@@ -124,7 +124,7 @@ machine which does not have the tools <c>doxygen</c> and <c>latex</c>
 <dd>
 \anchor enable-build-doc-pdf
 \addindex __configure__--enable-build-doc-pdf
-By default, ontly the HTML documentation is generated. Use this option
+By default, only the HTML documentation is generated. Use this option
 to also enable the generation of the PDF documentation. This should be
 done on a machine which does have the tools <c>doxygen</c> and <c>latex</c>
 (plus the packages <c>latex-xcolor</c> and
@@ -139,6 +139,13 @@ Disable the usage of the ICC compiler. When found, some specific ICC
 examples are compiled.
 </dd>
 
+<dt>--with-check-flags</dt>
+<dd>
+\anchor with-check-flags
+\addindex __configure__--with-check-flags
+Specify flags which will be given to C, CXX and Fortran compilers when valid
+</dd>
+
 </dl>
 
 Additionally, the script <c>configure</c> recognize many variables, which
@@ -340,20 +347,6 @@ fail when copying data asynchronously. When using this implementation,
 it is therefore necessary to disable asynchronous data transfers.
 </dd>
 
-<dt>--enable-maxmicthreads</dt>
-<dd>
-\anchor enable-maxmicthreads
-\addindex __configure__--enable-maxmicthreads
-Specify the maximum number of MIC threads
-</dd>
-
-<dt>--disable-asynchronous-mic-copy</dt>
-<dd>
-\anchor disable-asynchronous-mic-copy
-\addindex __configure__--disable-asynchronous-mic-copy
-Disable asynchronous copies between CPU and MIC devices.
-</dd>
-
 <dt>--disable-asynchronous-mpi-master-slave-copy</dt>
 <dd>
 \anchor disable-asynchronous-mpi-master-slave-copy
@@ -476,22 +469,6 @@ Disable the SOCL extension (\ref SOCLOpenclExtensions).  By
 default, it is enabled when an OpenCL implementation is found.
 </dd>
 
-<dt>--with-coi-dir</dt>
-<dd>
-\anchor with-coi-dir
-\addindex __configure__--with-coi-dir
-Specify the directory to the COI library for MIC support.
-The default value is <c>/opt/intel/mic/coi</c>
-</dd>
-
-<dt>--mic-host</dt>
-<dd>
-\anchor mic-host
-\addindex __configure__--mic-host
-Specify the precise MIC architecture host identifier.
-The default value is <c>x86_64-k1om-linux</c>
-</dd>
-
 <dt>--enable-openmp</dt>
 <dd>
 \anchor enable-openmp
@@ -597,14 +574,14 @@ Enable building HDF5 support.
 <dd>
 \anchor with-hdf5-include-dir
 \addindex __configure__--with-hdf5-include-dir
-Specify the directory where is stored the header hdf5.h.
+Specify the directory where is stored the header file \c hdf5.h.
 </dd>
 
 <dt>--with-hdf5-lib-dir=<c>path</c></dt>
 <dd>
 \anchor with-hdf5-lib-dir
 \addindex __configure__--with-hdf5-lib-dir
-Specify the directory where is stored the hdf5 library.
+Specify the directory where is stored the library \c hdf5.
 </dd>
 
 <dt>--disable-starpufft</dt>

+ 0 - 1
doc/doxygen/chapters/520_files.doxy

@@ -37,7 +37,6 @@
 \file starpu_hash.h
 \file starpu_helper.h
 \file starpu_heteroprio.h
-\file starpu_mic.h
 \file starpu_mpi_ms.h
 \file starpu_mod.f90
 \file starpu_opencl.h

+ 0 - 24
doc/doxygen/chapters/api/versioning.doxy

@@ -1,24 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2009-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-/*! \defgroup API_Versioning Versioning
-
-\fn void starpu_get_version(int *major, int *minor, int *release)
-\ingroup API_Versioning
-Return as 3 integers the version of StarPU used when running the application.
-
-*/
-

Tiedoston diff-näkymää rajattu, sillä se on liian suuri
+ 3446 - 1032
doc/doxygen/chapters/images/tasks_size_overhead.eps


BIN
doc/doxygen/chapters/images/tasks_size_overhead.pdf


BIN
doc/doxygen/chapters/images/tasks_size_overhead.png


+ 0 - 1
doc/doxygen/doxygen-config.cfg.in

@@ -37,7 +37,6 @@ INPUT                  = @top_srcdir@/doc/doxygen/chapters \
 			 @top_srcdir@/include/starpu.h \
 			 @top_srcdir@/include/starpu_hash.h \
 			 @top_srcdir@/include/starpu_helper.h \
-			 @top_srcdir@/include/starpu_mic.h \
 			 @top_srcdir@/include/starpu_mpi_ms.h \
 			 @top_srcdir@/include/starpu_mod.f90 \
 			 @top_srcdir@/include/starpu_opencl.h \

+ 2 - 3
doc/doxygen/doxygen.cfg

@@ -1613,10 +1613,9 @@ INCLUDE_FILE_PATTERNS  =
 # undefined via #undef or recursively expanded use the := operator
 # instead of the = operator.
 
-PREDEFINED             = STARPU_USE_CUDA=1 \
-                         STARPU_USE_OPENCL=1 \
+PREDEFINED             = STARPU_USE_OPENCL=1 \
+                         STARPU_USE_CUDA=1 \
                          STARPU_USE_FPGA=1 \
-                         STARPU_USE_MIC=1 \
 			 STARPU_USE_MPI=1 \
 			 STARPU_HAVE_HWLOC=1 \
 			 STARPU_USE_SC_HYPERVISOR=1 \

+ 1 - 1
doc/doxygen/doxygen_filter.sh.in

@@ -19,5 +19,5 @@ if [ "$(basename $1)" == "starpufft.h" ] ; then
 else
     # the macro STARPU_DEPRECATED needs to be removed as it is not properly processed by doxygen
     # lines starting with // in the doxygen input files are considered as comments to be removed
-    sed -e 's/STARPU_DEPRECATED//' $1 | sed -e 's/^\/\/.*//' | sed -e 's/STARPU_TASK_LIST_INLINE//'
+    sed -e 's/STARPU_DEPRECATED//' $1 | sed -e 's/^\/\/.*//' | sed -e 's/STARPU_TASK_LIST_INLINE//' | sed -e 's/STARPU_WARN_UNUSED_RESULT//'
 fi

+ 0 - 7
doc/doxygen/refman.tex

@@ -163,11 +163,6 @@ Documentation License”.
 \hypertarget{FFTSupport}{}
 \input{FFTSupport}
 
-\chapter{MIC Xeon Phi Support}
-\label{MICSupport}
-\hypertarget{MICSupport}{}
-\input{MICSupport}
-
 \chapter{FPGA Support}
 \label{FPGASupport}
 \hypertarget{FPGASupport}{}
@@ -247,7 +242,6 @@ Documentation License”.
 \input{group__API__CUDA__Extensions}
 \input{group__API__OpenCL__Extensions}
 \input{group__API__OpenMP__Runtime__Support}
-\input{group__API__MIC__Extensions}
 \input{group__API__FPGA__Extensions}
 \input{group__API__Miscellaneous__Helpers}
 \input{group__API__FxT__Support}
@@ -299,7 +293,6 @@ Documentation License”.
 \input{starpu__hash_8h}
 \input{starpu__helper_8h}
 \input{starpu__heteroprio_8h}
-\input{starpu__mic_8h}
 \input{starpu__mod_8f90}
 \input{starpu__mpi_8h}
 \input{starpu__mpi__lb_8h}

+ 1 - 3
doc/doxygen_dev/Makefile.am

@@ -146,9 +146,6 @@ dox_inputs = $(DOX_CONFIG) 				\
 	$(top_srcdir)/src/drivers/mp_common/mp_common.h	\
 	$(top_srcdir)/src/drivers/mp_common/source_common.h	\
 	$(top_srcdir)/src/drivers/driver_common/driver_common.h	\
-	$(top_srcdir)/src/drivers/mic/driver_mic_sink.h	\
-	$(top_srcdir)/src/drivers/mic/driver_mic_source.h	\
-	$(top_srcdir)/src/drivers/mic/driver_mic_common.h	\
 	$(top_srcdir)/src/profiling/profiling.h	\
 	$(top_srcdir)/src/profiling/bound.h	\
 	$(top_srcdir)/src/util/starpu_data_cpy.h	\
@@ -219,6 +216,7 @@ $(DOX_TAG): $(dox_inputs)
 
 $(DOX_PDF): $(DOX_TAG) refman.tex
 	@cp $(top_srcdir)/doc/doxygen_dev/chapters/version.sty $(DOX_LATEX_DIR)
+	@cp $(top_srcdir)/doc/doxygen_dev/modules.tex $(DOX_LATEX_DIR)
 	@echo $(PDFLATEX) $(DOX_LATEX_DIR)/refman.tex
 	@cd $(DOX_LATEX_DIR) ;\
 	rm -f *.aux *.toc *.idx *.ind *.ilg *.log *.out ;\

+ 0 - 3
doc/doxygen_dev/doxygen-config.cfg.in

@@ -47,9 +47,6 @@ INPUT                  = @top_srcdir@/doc/doxygen_dev/chapters         \
 			 @top_srcdir@/src/drivers/mp_common/mp_common.h \
 			 @top_srcdir@/src/drivers/mp_common/source_common.h \
 			 @top_srcdir@/src/drivers/driver_common/driver_common.h \
-			 @top_srcdir@/src/drivers/mic/driver_mic_sink.h \
-			 @top_srcdir@/src/drivers/mic/driver_mic_source.h \
-			 @top_srcdir@/src/drivers/mic/driver_mic_common.h \
 			 @top_srcdir@/src/profiling/profiling.h \
 			 @top_srcdir@/src/profiling/bound.h \
 			 @top_srcdir@/src/util/starpu_data_cpy.h \

+ 0 - 1
doc/doxygen_dev/doxygen.cfg

@@ -1615,7 +1615,6 @@ INCLUDE_FILE_PATTERNS  =
 
 PREDEFINED             = STARPU_USE_OPENCL=1 \
                          STARPU_USE_CUDA=1 \
-                         STARPU_USE_MIC=1 \
 			 STARPU_USE_MPI=1 \
 			 STARPU_HAVE_HWLOC=1 \
 			 STARPU_USE_SC_HYPERVISOR=1 \

+ 4 - 0
doc/doxygen_dev/modules.tex

@@ -0,0 +1,4 @@
+\section{Modules}
+Here is a list of all modules\+:\begin{DoxyCompactList}
+\item \contentsline{section}{Workers}{\pageref{group__workers}}{}
+\end{DoxyCompactList}

+ 0 - 3
doc/doxygen_dev/refman.tex

@@ -98,9 +98,6 @@ Documentation License”.
 \input{driver__cpu_8h}
 \input{driver__cuda_8h}
 \input{driver__disk_8h}
-\input{driver__mic__common_8h}
-\input{driver__mic__sink_8h}
-\input{driver__mic__source_8h}
 \input{driver__mpi__common_8h}
 \input{driver__mpi__sink_8h}
 \input{driver__mpi__source_8h}

+ 6 - 6
examples/cg/cg.c

@@ -76,12 +76,9 @@ static int copy_handle(starpu_data_handle_t dst, starpu_data_handle_t src, unsig
 
 #include "cg_kernels.c"
 
-
-
 static TYPE *A, *b, *x;
 static TYPE *r, *d, *q;
 
-
 static int copy_handle(starpu_data_handle_t dst, starpu_data_handle_t src, unsigned nblocks)
 {
 	unsigned b;
@@ -91,11 +88,9 @@ static int copy_handle(starpu_data_handle_t dst, starpu_data_handle_t src, unsig
 	return 0;
 }
 
-
 /*
  *	Generate Input data
  */
-
 static void generate_random_problem(void)
 {
 	int i, j;
@@ -314,6 +309,11 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	if (starpu_cpu_worker_get_count() + starpu_cuda_worker_get_count() + starpu_opencl_worker_get_count() == 0)
+	{
+		starpu_shutdown();
+		return 77;
+	}
 
 	starpu_cublas_init();
 
@@ -329,7 +329,7 @@ int main(int argc, char **argv)
 	partition_data();
 	end = starpu_timing_now();
 
-	FPRINTF(stderr, "Problem intialization timing : %2.2f seconds\n", (end-start)/10e6);
+	FPRINTF(stderr, "Problem intialization timing : %2.2f seconds\n", (end-start)/1e6);
 
 	ret = cg();
 	if (ret == -ENODEV)

+ 12 - 14
examples/cg/cg_kernels.c

@@ -91,7 +91,7 @@ static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nim
 	(void)task;
 	(void)nimpl;
 	enum starpu_worker_archtype type = starpu_worker_get_type(workerid);
-	if (type == STARPU_CPU_WORKER || type == STARPU_OPENCL_WORKER || type == STARPU_MIC_WORKER)
+	if (type == STARPU_CPU_WORKER || type == STARPU_OPENCL_WORKER)
 		return 1;
 
 #ifdef STARPU_USE_CUDA
@@ -354,8 +354,7 @@ static struct starpu_codelet dot_kernel_cl =
 int dot_kernel(HANDLE_TYPE_VECTOR v1,
 	       HANDLE_TYPE_VECTOR v2,
 	       starpu_data_handle_t s,
-	       unsigned nblocks,
-	       int use_reduction)
+	       unsigned nblocks)
 {
 	int ret;
 
@@ -520,8 +519,7 @@ int gemv_kernel(HANDLE_TYPE_VECTOR v1,
 		HANDLE_TYPE_MATRIX matrix,
 		HANDLE_TYPE_VECTOR v2,
 		TYPE p1, TYPE p2,
-		unsigned nblocks,
-		int use_reduction)
+		unsigned nblocks)
 {
 	unsigned b1, b2;
 	int ret;
@@ -737,7 +735,7 @@ int cg(void)
 	if (ret == -ENODEV) return ret;
 
 	/* r <- r - A x */
-	ret = gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks, use_reduction);
+	ret = gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks);
 	if (ret == -ENODEV) return ret;
 
 	/* d <- r */
@@ -745,7 +743,7 @@ int cg(void)
 	if (ret == -ENODEV) return ret;
 
 	/* delta_new = dot(r,r) */
-	ret = dot_kernel(r_handle, r_handle, rtr_handle, nblocks, use_reduction);
+	ret = dot_kernel(r_handle, r_handle, rtr_handle, nblocks);
 	if (ret == -ENODEV) return ret;
 
 	GET_DATA_HANDLE(rtr_handle);
@@ -767,10 +765,10 @@ int cg(void)
 		starpu_iteration_push(i);
 
 		/* q <- A d */
-		gemv_kernel(q_handle, A_handle, d_handle, 0.0, 1.0, nblocks, use_reduction);
+		gemv_kernel(q_handle, A_handle, d_handle, 0.0, 1.0, nblocks);
 
 		/* dtq <- dot(d,q) */
-		dot_kernel(d_handle, q_handle, dtq_handle, nblocks, use_reduction);
+		dot_kernel(d_handle, q_handle, dtq_handle, nblocks);
 
 		/* alpha = delta_new / dtq */
 		GET_DATA_HANDLE(dtq_handle);
@@ -787,7 +785,7 @@ int cg(void)
 			copy_handle(r_handle, b_handle, nblocks);
 
 			/* r <- r - A x */
-			gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks, use_reduction);
+			gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks);
 		}
 		else
 		{
@@ -796,7 +794,7 @@ int cg(void)
 		}
 
 		/* delta_new = dot(r,r) */
-		dot_kernel(r_handle, r_handle, rtr_handle, nblocks, use_reduction);
+		dot_kernel(r_handle, r_handle, rtr_handle, nblocks);
 
 		GET_DATA_HANDLE(rtr_handle);
 		starpu_data_acquire(rtr_handle, STARPU_R);
@@ -827,9 +825,9 @@ int cg(void)
 	error = sqrt(delta_new/delta_0)/(1.0*n);
 	FPRINTF_SERVER(stderr, "*****************************************\n");
 	FPRINTF_SERVER(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
-	FPRINTF_SERVER(stderr, "Total timing : %2.2f seconds\n", timing/10e6);
-	FPRINTF_SERVER(stderr, "Seconds per iteration : %2.2e seconds\n", timing/10e6/i);
-	FPRINTF_SERVER(stderr, "Number of iterations per second : %2.2e it/s\n", i/(timing/10e6));
+	FPRINTF_SERVER(stderr, "Total timing : %2.2f seconds\n", timing/1e6);
+	FPRINTF_SERVER(stderr, "Seconds per iteration : %2.2e seconds\n", timing/1e6/i);
+	FPRINTF_SERVER(stderr, "Number of iterations per second : %2.2e it/s\n", i/(timing/1e6));
 
 	return 0;
 }

+ 0 - 1
examples/cpp/add_vectors.cpp

@@ -61,7 +61,6 @@ int main(int argc, char **argv)
 
 	struct starpu_conf conf;
 	starpu_conf_init(&conf);
-	conf.nmic = 0;
 	conf.nmpi_ms = 0;
 
 	// initialize StarPU with default configuration

+ 0 - 1
examples/cpp/add_vectors_cpp11.cpp

@@ -66,7 +66,6 @@ int main(int argc, char **argv)
 
 	struct starpu_conf conf;
 	starpu_conf_init(&conf);
-	conf.nmic = 0;
 	conf.nmpi_ms = 0;
 
 	// initialize StarPU with default configuration

+ 0 - 12
examples/cpp/add_vectors_interface.cpp

@@ -172,7 +172,6 @@ static const struct starpu_data_copy_methods vector_cpp_copy_data_methods_s =
 	.ram_to_cuda = NULL,
 	.ram_to_opencl = NULL,
 	.ram_to_fpga = NULL,
-	.ram_to_mic = NULL,
 
 	.cuda_to_ram = NULL,
 	.cuda_to_cuda = NULL,
@@ -181,9 +180,6 @@ static const struct starpu_data_copy_methods vector_cpp_copy_data_methods_s =
 	.opencl_to_opencl = NULL,
 
 	.fpga_to_ram = NULL,
-
-	.mic_to_ram = NULL,
-
 	.ram_to_mpi_ms = NULL,
 	.mpi_ms_to_ram = NULL,
 	.mpi_ms_to_mpi_ms = NULL,
@@ -199,13 +195,6 @@ static const struct starpu_data_copy_methods vector_cpp_copy_data_methods_s =
 	.ram_to_fpga_async = NULL,
 	.fpga_to_ram_async = NULL,
 
-	.ram_to_mic_async = NULL,
-	.mic_to_ram_async = NULL,
-
-	.ram_to_mpi_ms_async = NULL,
-	.mpi_ms_to_ram_async = NULL,
-	.mpi_ms_to_mpi_ms_async = NULL,
-
 	.any_to_any = vector_interface_copy_any_to_any,
 };
 #else
@@ -598,7 +587,6 @@ int main(int argc, char **argv)
 	bool fail;
 
 	starpu_conf_init(&conf);
-	conf.nmic = 0;
 	conf.nmpi_ms = 0;
 
 	// initialize StarPU with default configuration

+ 0 - 1
examples/cpp/incrementer_cpp.cpp

@@ -51,7 +51,6 @@ int main(int argc, char **argv)
 
 	struct starpu_conf conf;
 	starpu_conf_init(&conf);
-	conf.nmic = 0;
 	conf.nmpi_ms = 0;
 
 	ret = starpu_init(&conf);

+ 0 - 1
examples/dependency/sequential_consistency.c

@@ -128,7 +128,6 @@ int main(void)
 	}
 
 	starpu_conf_init(&conf);
-	conf.nmic = 0;
 	conf.nmpi_ms = 0;
 
         ret = starpu_init(&conf);

+ 0 - 1
examples/dependency/task_end_dep.c

@@ -76,7 +76,6 @@ int main(void)
 	struct starpu_task *task;
 
 	starpu_conf_init(&conf);
-	conf.nmic = 0;
 	conf.nmpi_ms = 0;
 
         ret = starpu_init(&conf);

+ 0 - 1
examples/dependency/task_end_dep_add.c

@@ -69,7 +69,6 @@ int main(void)
 	struct starpu_conf conf;
 
 	starpu_conf_init(&conf);
-	conf.nmic = 0;
 	conf.nmpi_ms = 0;
 
         ret = starpu_init(&conf);

+ 0 - 6
examples/heat/heat.sh

@@ -20,12 +20,6 @@ set -e
 
 PREFIX=$(dirname $0)
 
-if [ -n "$STARPU_MIC_SINK_PROGRAM_PATH" ] ; then
-	STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/heat
-	# in case libtool got into play
-	[ -x "$STARPU_MIC_SINK_PROGRAM_PATH/.libs/heat" ] && STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/.libs/heat
-fi
-
 $STARPU_LAUNCH $PREFIX/heat -shape 0
 $STARPU_LAUNCH $PREFIX/heat -shape 1
 # sometimes lead to pivot being 0

+ 1 - 1
examples/interface/complex_codelet.h

@@ -68,7 +68,7 @@ void compare_complex_codelet(void *descr[], void *_args)
 struct starpu_codelet cl_compare =
 {
 	.cpu_funcs = {compare_complex_codelet},
-	/* dereferencing compare won't work on MIC */
+	/* dereferencing compare won't work on MPI Master Slave */
 	/* .cpu_funcs_name = {"compare_complex_codelet"}, */
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_R},

+ 5 - 5
examples/interface/complex_interface.c

@@ -185,9 +185,9 @@ static int complex_compare(void *data_interface_a, void *data_interface_b)
 	return (complex_a->nx == complex_b->nx);
 }
 
-static int copy_any_to_any(void *src_interface, unsigned src_node,
-			   void *dst_interface, unsigned dst_node,
-			   void *async_data)
+int copy_any_to_any(void *src_interface, unsigned src_node,
+		    void *dst_interface, unsigned dst_node,
+		    void *async_data)
 {
 	struct starpu_complex_interface *src_complex = src_interface;
 	struct starpu_complex_interface *dst_complex = dst_interface;
@@ -206,12 +206,12 @@ static int copy_any_to_any(void *src_interface, unsigned src_node,
 	return ret;
 }
 
-static const struct starpu_data_copy_methods complex_copy_methods =
+const struct starpu_data_copy_methods complex_copy_methods =
 {
 	.any_to_any = copy_any_to_any
 };
 
-static struct starpu_data_interface_ops interface_complex_ops =
+struct starpu_data_interface_ops interface_complex_ops =
 {
 	.register_data_handle = complex_register_data_handle,
 	.allocate_data_on_node = complex_allocate_data_on_node,

+ 0 - 1
examples/loader-cross.sh.in

@@ -1 +0,0 @@
-../tests/loader-cross.sh.in

+ 0 - 12
examples/lu/lu.sh

@@ -22,12 +22,6 @@ PREFIX=$(dirname $0)
 rm -rf $PREFIX/lu.traces
 mkdir -p $PREFIX/lu.traces
 
-if [ -n "$STARPU_MIC_SINK_PROGRAM_PATH" ] ; then
-	STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/lu_implicit_example_float
-	# in case libtool got into play
-	[ -x "$STARPU_MIC_SINK_PROGRAM_PATH/.libs/lu_implicit_example_float" ] && STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/.libs/lu_implicit_example_float
-fi
-
 export STARPU_FXT_PREFIX=$PREFIX/lu.traces
 
 $STARPU_LAUNCH $PREFIX/lu_implicit_example_float -size $((160 * 4)) -nblocks 4 -piv
@@ -36,12 +30,6 @@ $STARPU_LAUNCH $PREFIX/lu_implicit_example_float -size $((160 * 4)) -nblocks 4 -
 $STARPU_LAUNCH $PREFIX/lu_implicit_example_float -size $((160 * 2)) -nblocks 2 -bounddeps -directory $STARPU_FXT_PREFIX
 $STARPU_LAUNCH $PREFIX/lu_implicit_example_float -size $((160 * 2)) -nblocks 2 -bound -bounddeps -bounddepsprio -directory $STARPU_FXT_PREFIX
 
-if [ -n "$STARPU_MIC_SINK_PROGRAM_PATH" ] ; then
-	STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/lu_example_float
-	# in case libtool got into play
-	[ -x "$STARPU_MIC_SINK_PROGRAM_PATH/.libs/lu_example_float" ] && STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/.libs/lu_example_float
-fi
-
 $STARPU_LAUNCH $PREFIX/lu_example_float -size $((160 * 4)) -nblocks 4 -piv
 $STARPU_LAUNCH $PREFIX/lu_example_float -size $((160 * 4)) -nblocks 4 -no-stride
 $STARPU_LAUNCH $PREFIX/lu_example_float -size $((160 * 4)) -nblocks 4 -bound

+ 1 - 1
examples/lu/xlu_implicit_pivot.c

@@ -234,7 +234,7 @@ starpu_data_handle_t get_block_with_striding(starpu_data_handle_t *dataAp, unsig
 
 int STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks, unsigned no_prio)
 {
-	if (starpu_mic_worker_get_count() || starpu_mpi_ms_worker_get_count())
+	if (starpu_mpi_ms_worker_get_count())
 		/* These won't work with pivoting: we pass a pointer in cl_args */
 		return -ENODEV;
 

+ 1 - 1
examples/lu/xlu_kernels.c

@@ -118,7 +118,7 @@ static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nim
 	(void)task;
 	(void)nimpl;
 	enum starpu_worker_archtype type = starpu_worker_get_type(workerid);
-	if (type == STARPU_CPU_WORKER || type == STARPU_MIC_WORKER)
+	if (type == STARPU_CPU_WORKER)
 		return 1;
 
 #ifdef STARPU_SIMGRID

+ 1 - 1
examples/lu/xlu_pivot.c

@@ -417,7 +417,7 @@ starpu_data_handle_t get_block_with_no_striding(starpu_data_handle_t *dataAp, un
 int STARPU_LU(lu_decomposition_pivot_no_stride)(TYPE **matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks, unsigned no_prio)
 {
 	(void)ld;
-	if (starpu_mic_worker_get_count() || starpu_mpi_ms_worker_get_count())
+	if (starpu_mpi_ms_worker_get_count())
 		/* These won't work with pivoting: we pass a pointer in cl_args */
 		return -ENODEV;
 

+ 0 - 6
examples/mult/sgemm.sh

@@ -28,12 +28,6 @@ PREFIX=$(dirname $0)
 rm -rf $PREFIX/sgemm.traces
 mkdir -p $PREFIX/sgemm.traces
 
-if [ -n "$STARPU_MIC_SINK_PROGRAM_PATH" ] ; then
-	STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/sgemm
-	# in case libtool got into play
-	[ -x "$STARPU_MIC_SINK_PROGRAM_PATH/.libs/sgemm" ] && STARPU_MIC_SINK_PROGRAM_NAME=$STARPU_MIC_SINK_PROGRAM_PATH/.libs/sgemm
-fi
-
 export STARPU_FXT_PREFIX=$PREFIX/sgemm.traces
 
 STARPU_SCHED=dmdas $PREFIX/sgemm -check

+ 5 - 1
examples/native_fortran/nf_vector.f90

@@ -94,7 +94,11 @@ program nf_vector
         !     . . .
         !     C_NULL_PTR
         !   )/
-        call fstarpu_insert_task((/ cl_vec, FSTARPU_R, dh_va, FSTARPU_RW.ior.FSTARPU_LOCALITY, dh_vb, C_NULL_PTR /))
+        call fstarpu_insert_task((/ cl_vec, &
+                FSTARPU_R, dh_va, &
+                FSTARPU_RW.ior.FSTARPU_LOCALITY, dh_vb, &
+                FSTARPU_EXECUTE_WHERE, FSTARPU_CPU, & ! for illustration, not required here
+                C_NULL_PTR /))
 
         ! wait for task completion
         call fstarpu_task_wait_for_all()

+ 0 - 1
examples/perf_steering/perf_knobs_03.c

@@ -58,7 +58,6 @@ int main(int argc, char **argv)
 	if (starpu_cpu_worker_get_count() != 2
 		|| starpu_cuda_worker_get_count() != 0
 		|| starpu_opencl_worker_get_count() != 0
-		|| starpu_mic_worker_get_count() != 0
 		|| starpu_mpi_ms_worker_get_count() != 0)
 	{
 		starpu_shutdown();

+ 0 - 1
examples/pi/pi_redux.c

@@ -76,7 +76,6 @@ static void init_rng(void *arg)
 	switch (starpu_worker_get_type(workerid))
 	{
 		case STARPU_CPU_WORKER:
-		case STARPU_MIC_WORKER:
 			/* create a seed */
 			starpu_srand48_r((long int)workerid, &randbuffer[PADDING*workerid]);
 

+ 2 - 1
examples/reductions/dot_product.c

@@ -59,7 +59,7 @@ static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nim
 	(void)task;
 	(void)nimpl;
 	enum starpu_worker_archtype type = starpu_worker_get_type(workerid);
-	if (type == STARPU_CPU_WORKER || type == STARPU_OPENCL_WORKER || type == STARPU_MIC_WORKER)
+	if (type == STARPU_CPU_WORKER || type == STARPU_OPENCL_WORKER)
 		return 1;
 
 #ifdef STARPU_USE_CUDA
@@ -459,6 +459,7 @@ int main(void)
 	}
 
 enodev:
+	starpu_shutdown();
 	FPRINTF(stderr, "WARNING: No one can execute this task\n");
 	/* yes, we do not perform the computation but we did detect that no one
  	 * could perform the kernel, so this is not an error from StarPU */

+ 0 - 1
examples/sched_ctx/parallel_tasks_with_cluster_api.c

@@ -67,7 +67,6 @@ int main(void)
 	int ret, i;
 	struct starpu_cluster_machine *clusters;
 
-	setenv("STARPU_NMIC","0",1);
 	setenv("STARPU_NMPI_MS","0",1);
 
 	ret = starpu_init(NULL);

+ 6 - 1
examples/sched_ctx/two_cpu_contexts.c

@@ -44,16 +44,21 @@ int main(void)
 	int *procs2 = NULL;
 	int i;
 	int n = 20;
+
 	int ret = starpu_init(NULL);
 	if (ret == -ENODEV)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ncpu = starpu_cpu_worker_get_count();
 
+	ncpu = starpu_cpu_worker_get_count();
 	/* actually we really need at least 2 CPU workers such to allocate 2
 	 * non overlapping contexts */
 	if (ncpu < 2)
+	{
+		starpu_shutdown();
 		return 77;
+	}
+
 	procs = calloc(ncpu, sizeof(int));
 	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs, ncpu);
 

+ 1 - 1
examples/scheduler/heteroprio_test.c

@@ -121,7 +121,7 @@ int main(void)
 	assert(ret == 0);
 
 	conf.sched_policy_name = "heteroprio";
-	conf.sched_policy_init = &initSchedulerCallback;
+	conf.sched_policy_callback = &initSchedulerCallback;
 	ret = starpu_init(&conf);
 	if (ret == -ENODEV)
 		return 77;

+ 0 - 1
examples/stencil/loader-cross.sh.in

@@ -1 +0,0 @@
-../../tests/loader-cross.sh.in

+ 0 - 29
include/fstarpu_mod.f90

@@ -71,7 +71,6 @@ module fstarpu_mod
         type(c_ptr), bind(C) :: FSTARPU_CPU_WORKER
         type(c_ptr), bind(C) :: FSTARPU_CUDA_WORKER
         type(c_ptr), bind(C) :: FSTARPU_OPENCL_WORKER
-        type(c_ptr), bind(C) :: FSTARPU_MIC_WORKER
         type(c_ptr), bind(C) :: FSTARPU_ANY_WORKER
 
         integer(c_int), bind(C) :: FSTARPU_NMAXBUFS
@@ -90,7 +89,6 @@ module fstarpu_mod
         type(c_ptr), bind(C) :: FSTARPU_CPU
         type(c_ptr), bind(C) :: FSTARPU_CUDA
         type(c_ptr), bind(C) :: FSTARPU_OPENCL
-        type(c_ptr), bind(C) :: FSTARPU_MIC
 
         type(c_ptr), bind(C) :: FSTARPU_CODELET_SIMGRID_EXECUTE
         type(c_ptr), bind(C) :: FSTARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT
@@ -190,12 +188,6 @@ module fstarpu_mod
                         integer(c_int), value, intent(in) :: nopencl
                 end subroutine fstarpu_conf_set_nopencl
 
-                subroutine fstarpu_conf_set_nmic (conf, nmic) bind(C)
-                        use iso_c_binding, only: c_ptr, c_int
-                        type(c_ptr), value, intent(in) :: conf
-                        integer(c_int), value, intent(in) :: nmic
-                end subroutine fstarpu_conf_set_nmic
-
                 ! starpu_init: see fstarpu_init
                 ! starpu_initialize: see fstarpu_init
 
@@ -233,12 +225,6 @@ module fstarpu_mod
                         integer(c_int) :: fstarpu_asynchronous_opencl_copy_disabled
                 end function fstarpu_asynchronous_opencl_copy_disabled
 
-                ! int starpu_asynchronous_mic_copy_disabled(void);
-                function fstarpu_asynchronous_mic_copy_disabled() bind(C,name="starpu_asynchronous_mic_copy_disabled")
-                        use iso_c_binding, only: c_int
-                        integer(c_int) :: fstarpu_asynchronous_mic_copy_disabled
-                end function fstarpu_asynchronous_mic_copy_disabled
-
                 ! void starpu_display_stats();
                 subroutine fstarpu_display_stats() bind(C,name="starpu_display_stats")
                 end subroutine fstarpu_display_stats
@@ -289,12 +275,6 @@ module fstarpu_mod
                         integer(c_int)              :: fstarpu_opencl_worker_get_count
                 end function fstarpu_opencl_worker_get_count
 
-                ! unsigned starpu_mic_worker_get_count(void);
-                function fstarpu_mic_worker_get_count() bind(C,name="starpu_mic_worker_get_count")
-                        use iso_c_binding, only: c_int
-                        integer(c_int)              :: fstarpu_mic_worker_get_count
-                end function fstarpu_mic_worker_get_count
-
                 ! int starpu_worker_get_id(void);
                 function fstarpu_worker_get_id() bind(C,name="starpu_worker_get_id")
                         use iso_c_binding, only: c_int
@@ -704,12 +684,6 @@ module fstarpu_mod
                         type(c_ptr), value, intent(in) :: flags ! C function expects an intptr_t
                 end subroutine fstarpu_codelet_add_opencl_flags
 
-                subroutine fstarpu_codelet_add_mic_func (cl, f_ptr) bind(C)
-                        use iso_c_binding, only: c_ptr, c_funptr
-                        type(c_ptr), value, intent(in) :: cl
-                        type(c_funptr), value, intent(in) :: f_ptr
-                end subroutine fstarpu_codelet_add_mic_func
-
                 subroutine fstarpu_codelet_add_buffer (cl, mode) bind(C)
                         use iso_c_binding, only: c_ptr
                         type(c_ptr), value, intent(in) :: cl
@@ -2445,7 +2419,6 @@ module fstarpu_mod
                         FSTARPU_CPU_WORKER   = fstarpu_get_constant(C_CHAR_"FSTARPU_CPU_WORKER"//C_NULL_CHAR)
                         FSTARPU_CUDA_WORKER   = fstarpu_get_constant(C_CHAR_"FSTARPU_CUDA_WORKER"//C_NULL_CHAR)
                         FSTARPU_OPENCL_WORKER   = fstarpu_get_constant(C_CHAR_"FSTARPU_OPENCL_WORKER"//C_NULL_CHAR)
-                        FSTARPU_MIC_WORKER   = fstarpu_get_constant(C_CHAR_"FSTARPU_MIC_WORKER"//C_NULL_CHAR)
                         FSTARPU_ANY_WORKER   = fstarpu_get_constant(C_CHAR_"FSTARPU_ANY_WORKER"//C_NULL_CHAR)
 
                         FSTARPU_NMAXBUFS   = int(p_to_ip(fstarpu_get_constant(C_CHAR_"FSTARPU_NMAXBUFS"//C_NULL_CHAR)),c_int)
@@ -2477,8 +2450,6 @@ module fstarpu_mod
                             fstarpu_get_constant(C_CHAR_"FSTARPU_CUDA"//C_NULL_CHAR)
                         FSTARPU_OPENCL = &
                             fstarpu_get_constant(C_CHAR_"FSTARPU_OPENCL"//C_NULL_CHAR)
-                        FSTARPU_MIC = &
-                            fstarpu_get_constant(C_CHAR_"FSTARPU_MIC"//C_NULL_CHAR)
 
                         FSTARPU_CODELET_SIMGRID_EXECUTE = &
                              fstarpu_get_constant(C_CHAR_"FSTARPU_CODELET_SIMGRID_EXECUTE"//C_NULL_CHAR)

+ 72 - 82
include/starpu.h

@@ -100,7 +100,8 @@ extern "C"
    It has to be initialized with starpu_conf_init(). When the default
    value is used, StarPU automatically selects the number of
    processing units and takes the default scheduling policy. The
-   environment variables overwrite the equivalent parameters.
+   environment variables overwrite the equivalent parameters unless
+   starpu_conf::precedence_over_environment_variables is set.
 */
 struct starpu_conf
 {
@@ -130,7 +131,13 @@ struct starpu_conf
 	   (default = <c>NULL</c>)
 	*/
 	struct starpu_sched_policy *sched_policy;
-	void (*sched_policy_init)(unsigned);
+
+	/**
+	   Callback function that can later be used by the scheduler.
+	   The scheduler can retrieve this function by calling
+	   starpu_sched_ctx_get_sched_policy_callback()
+	*/
+	void (*sched_policy_callback)(unsigned);
 
 	/**
 	   For all parameters specified in this structure that can
@@ -146,7 +153,7 @@ struct starpu_conf
 	/**
 	   Number of CPU cores that StarPU can use. This can also be
 	   specified with the environment variable \ref STARPU_NCPU.
-	   (default = -1)
+	   (default = \c -1)
 	*/
 	int ncpus;
 
@@ -161,7 +168,7 @@ struct starpu_conf
 	   Number of CUDA devices that StarPU can use. This can also
 	   be specified with the environment variable \ref
 	   STARPU_NCUDA.
-	   (default = -1)
+	   (default = \c -1)
 	*/
 	int ncuda;
 
@@ -169,7 +176,7 @@ struct starpu_conf
 	   Number of OpenCL devices that StarPU can use. This can also
 	   be specified with the environment variable \ref
 	   STARPU_NOPENCL.
-	   (default = -1)
+	   (default = \c -1)
 	*/
 	int nopencl;
 
@@ -182,17 +189,10 @@ struct starpu_conf
 	int nfpga;
 
 	/**
-	   Number of MIC devices that StarPU can use. This can also be
-	   specified with the environment variable \ref STARPU_NMIC.
-	   (default = -1)
-	*/
-	int nmic;
-
-	/**
 	   Number of MPI Master Slave devices that StarPU can use.
 	   This can also be specified with the environment variable
 	   \ref STARPU_NMPI_MS.
-	   (default = -1)
+	   (default = \c -1)
 	*/
         int nmpi_ms;
 
@@ -202,7 +202,7 @@ struct starpu_conf
 	   StarPU automatically selects where to bind the different
 	   workers. This can also be specified with the environment
 	   variable \ref STARPU_WORKERS_CPUID.
-	   (default = 0)
+	   (default = \c 0)
 	*/
 	unsigned use_explicit_workers_bindid;
 
@@ -213,7 +213,7 @@ struct starpu_conf
 	   indicates the logical identifier of the processor which
 	   should execute the i-th worker. Note that the logical
 	   ordering of the CPUs is either determined by the OS, or
-	   provided by the hwloc library in case it is available.
+	   provided by the \c hwloc library in case it is available.
 	*/
 	unsigned workers_bindid[STARPU_NMAXWORKERS];
 
@@ -224,7 +224,7 @@ struct starpu_conf
 	   affects the CUDA devices in a round-robin fashion. This can
 	   also be specified with the environment variable \ref
 	   STARPU_WORKERS_CUDAID.
-	   (default = 0)
+	   (default = \c 0)
 	*/
 	unsigned use_explicit_workers_cuda_gpuid;
 
@@ -242,7 +242,7 @@ struct starpu_conf
 	   affects the OpenCL devices in a round-robin fashion. This
 	   can also be specified with the environment variable \ref
 	   STARPU_WORKERS_OPENCLID.
-	   (default = 0)
+	   (default = \c 0)
 	*/
 	unsigned use_explicit_workers_opencl_gpuid;
 
@@ -288,30 +288,12 @@ struct starpu_conf
 #endif
 
 	/**
-	   If this flag is set, the MIC workers will be attached to
-	   the MIC devices specified in the array
-	   starpu_conf::workers_mic_deviceid. Otherwise, StarPU
-	   affects the MIC devices in a round-robin fashion. This can
-	   also be specified with the environment variable \ref
-	   STARPU_WORKERS_MICID.
-	   (default = 0)
-	*/
-	unsigned use_explicit_workers_mic_deviceid;
-
-	/**
-	   If the flag starpu_conf::use_explicit_workers_mic_deviceid
-	   is set, the array contains the logical identifiers of the
-	   MIC devices to be used.
-	*/
-	unsigned workers_mic_deviceid[STARPU_NMAXWORKERS];
-
-	/**
 	   If this flag is set, the MPI Master Slave workers will be
 	   attached to the MPI Master Slave devices specified in the
 	   array starpu_conf::workers_mpi_ms_deviceid. Otherwise,
 	   StarPU affects the MPI Master Slave devices in a
 	   round-robin fashion.
-	   (default = 0)
+	   (default = \c 0)
 	*/
 	unsigned use_explicit_workers_mpi_ms_deviceid;
 
@@ -328,7 +310,7 @@ struct starpu_conf
 	   this value is equal to -1, the default value is used. This
 	   can also be specified with the environment variable \ref
 	   STARPU_BUS_CALIBRATE.
-	   (default = 0)
+	   (default = \c 0)
 	*/
 	int bus_calibrate;
 
@@ -340,7 +322,7 @@ struct starpu_conf
 	   2, the existing performance models will be overwritten.
 	   This can also be specified with the environment variable
 	   \ref STARPU_CALIBRATE.
-	   (default = 0)
+	   (default = \c 0)
 	*/
 	int calibrate;
 
@@ -355,19 +337,11 @@ struct starpu_conf
 	   worker sizes to look for the most efficient ones.
 	   This can also be specified with the environment variable
 	   \ref STARPU_SINGLE_COMBINED_WORKER.
-	   (default = 0)
+	   (default = \c 0)
 	*/
 	int single_combined_worker;
 
 	/**
-	   Path to the kernel to execute on the MIC device, compiled
-	   for MIC architecture. When set to <c>NULL</c>, StarPU
-	   automatically looks next to the host program location.
-	   (default = <c>NULL</c>)
-	*/
-	char *mic_sink_program_path;
-
-	/**
 	   This flag should be set to 1 to disable asynchronous copies
 	   between CPUs and all accelerators.
 	   The AMD implementation of OpenCL is known to fail when
@@ -379,7 +353,7 @@ struct starpu_conf
 	   This can also be specified at compilation time by giving to
 	   the configure script the option \ref
 	   disable-asynchronous-copy "--disable-asynchronous-copy".
-	   (default = 0)
+	   (default = \c 0)
 	*/
 	int disable_asynchronous_copy;
 
@@ -392,7 +366,7 @@ struct starpu_conf
 	   the configure script the option \ref
 	   disable-asynchronous-cuda-copy
 	   "--disable-asynchronous-cuda-copy".
-	   (default = 0)
+	   (default = \c 0)
 	*/
 	int disable_asynchronous_cuda_copy;
 
@@ -409,25 +383,12 @@ struct starpu_conf
 	   the configure script the option \ref
 	   disable-asynchronous-opencl-copy
 	   "--disable-asynchronous-opencl-copy".
-	   (default = 0)
+	   (default = \c 0)
 	*/
 	int disable_asynchronous_opencl_copy;
 
 	/**
 	   This flag should be set to 1 to disable asynchronous copies
-	   between CPUs and MIC accelerators.
-	   This can also be specified with the environment variable
-	   \ref STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY.
-	   This can also be specified at compilation time by giving to
-	   the configure script the option \ref
-	   disable-asynchronous-mic-copy
-	   "--disable-asynchronous-mic-copy".
-	   (default = 0).
-	*/
-	int disable_asynchronous_mic_copy;
-
-	/**
-	   This flag should be set to 1 to disable asynchronous copies
 	   between CPUs and MPI Master Slave devices.
 	   This can also be specified with the environment variable
 	   \ref STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY.
@@ -435,7 +396,7 @@ struct starpu_conf
 	   the configure script the option \ref
 	   disable-asynchronous-mpi-master-slave-copy
 	   "--disable-asynchronous-mpi-master-slave-copy".
-	   (default = 0).
+	   (default = \c 0).
 	*/
 	int disable_asynchronous_mpi_ms_copy;
 
@@ -478,7 +439,7 @@ struct starpu_conf
 	   The number of StarPU drivers that should not be launched by
 	   StarPU, i.e number of elements of the array
 	   starpu_conf::not_launched_drivers.
-	   (default = 0)
+	   (default = \c 0)
 	*/
 	unsigned n_not_launched_drivers;
 
@@ -491,8 +452,20 @@ struct starpu_conf
 	*/
 	uint64_t trace_buffer_size;
 
+	/**
+	   Set the mininum priority used by priorities-aware
+	   schedulers.
+	   This also can be specified with the environment variable \ref
+	   STARPU_MIN_PRIO
+	*/
 	int global_sched_ctx_min_priority;
 
+	/**
+	   Set the maxinum priority used by priorities-aware
+	   schedulers.
+	   This also can be specified with the environment variable \ref
+	   STARPU_MAX_PRIO
+	*/
 	int global_sched_ctx_max_priority;
 
 #ifdef STARPU_WORKER_CALLBACKS
@@ -501,14 +474,14 @@ struct starpu_conf
 #endif
 
 	/**
-	   Specify if StarPU should catch SIGINT, SIGSEGV and SIGTRAP
+	   Specify if StarPU should catch \c SIGINT, \c SIGSEGV and \c SIGTRAP
 	   signals to make sure final actions (e.g dumping FxT trace
 	   files) are done even though the application has crashed. By
 	   default (value = \c 1), signals are catched. It should be
 	   disabled on systems which already catch these signals for
 	   their own needs (e.g JVM)
 	   This can also be specified with the environment variable
-	   \ref STARPU_CATCH_SIGNALS
+	   \ref STARPU_CATCH_SIGNALS.
 	 */
 	int catch_signals;
 
@@ -519,20 +492,24 @@ struct starpu_conf
 	unsigned start_perf_counter_collection;
 
 	/**
-	   Minimum spinning backoff of drivers. Default value: \c 1
+	   Minimum spinning backoff of drivers (default = \c 1)
 	 */
 	unsigned driver_spinning_backoff_min;
 
 	/**
-	   Maximum spinning backoff of drivers. Default value: \c 32
+	   Maximum spinning backoff of drivers. (default = \c 32)
 	 */
 	unsigned driver_spinning_backoff_max;
 
 	/**
 	   Specify if CUDA workers should do only fast allocations
 	   when running the datawizard progress of
-	   other memory nodes. This will pass STARPU_DATAWIZARD_ONLY_FAST_ALLOC.
-	   Default value is 0, allowing CUDA workers to do slow allocations.
+	   other memory nodes. This will pass the interval value
+	   _STARPU_DATAWIZARD_ONLY_FAST_ALLOC to the allocation method.
+	   Default value is 0, allowing CUDA workers to do slow
+	   allocations.
+	   This can also be specified with the environment variable
+	   \ref STARPU_CUDA_ONLY_FAST_ALLOC_OTHER_MEMNODES.
 	 */
 	int cuda_only_fast_alloc_other_memnodes;
 };
@@ -554,9 +531,11 @@ int starpu_conf_init(struct starpu_conf *conf);
    starpu_conf::ncpus = 0, starpu_conf::ncuda = 0, etc.
 
    This allows to portably enable only a given type of worker:
-
-   starpu_conf_noworker(&conf);
+   <br/>
+   <c>
+   starpu_conf_noworker(&conf);<br/>
    conf.ncpus = -1;
+   </c>
 */
 int starpu_conf_noworker(struct starpu_conf *conf);
 
@@ -573,7 +552,8 @@ int starpu_init(struct starpu_conf *conf) STARPU_WARN_UNUSED_RESULT;
 
 /**
    Similar to starpu_init(), but also take the \p argc and \p argv as
-   defined by the application.
+   defined by the application, which is necessary when running in
+   Simgrid mode or MPI Master Slave mode.
    Do not call starpu_init() and starpu_initialize() in the same
    program.
 */
@@ -605,6 +585,7 @@ void starpu_shutdown(void);
    should be used to unfreeze the workers.
 */
 void starpu_pause(void);
+
 /**
    Symmetrical call to starpu_pause(), used to resume the workers
    polling for new tasks.
@@ -622,7 +603,7 @@ void starpu_resume(void);
 /**
    Return a PU binding ID which can be used to bind threads with
    starpu_bind_thread_on(). \p flags can be set to
-   STARPU_THREAD_ACTIVE or 0. When \p npreferred is set to non-zero,
+   ::STARPU_THREAD_ACTIVE or 0. When \p npreferred is set to non-zero,
    \p preferred is an array of size \p npreferred in which a
    preference of PU binding IDs can be set. By default StarPU will
    return the first PU available for binding.
@@ -638,7 +619,7 @@ unsigned starpu_get_next_bindid(unsigned flags, unsigned *preferred, unsigned np
    so the caller can tell the user how to avoid the issue.
 
    \p name should be set to a unique string so that different calls
-   with the same name for the same cpuid does not produce a warning.
+   with the same name for the same \p cpuid does not produce a warning.
 */
 int starpu_bind_thread_on(int cpuid, unsigned flags, const char *name);
 
@@ -672,19 +653,28 @@ int starpu_asynchronous_opencl_copy_disabled(void);
 int starpu_asynchronous_fpga_copy_disabled(void);
 
 /**
-   Return 1 if asynchronous data transfers between CPU and MIC devices
-   are disabled.
-*/
-int starpu_asynchronous_mic_copy_disabled(void);
-
-/**
    Return 1 if asynchronous data transfers between CPU and MPI Slave
    devices are disabled.
 */
 int starpu_asynchronous_mpi_ms_copy_disabled(void);
 
+/**
+   Call starpu_profiling_bus_helper_display_summary() and
+   starpu_profiling_worker_helper_display_summary()
+ */
 void starpu_display_stats(void);
 
+/** @} */
+
+/**
+   @defgroup API_Versioning Versioning
+   @{
+*/
+
+/**
+   Return as 3 integers the version of StarPU used when running the
+   application.
+*/
 void starpu_get_version(int *major, int *minor, int *release);
 
 /** @} */

+ 14 - 14
include/starpu_clusters.h

@@ -35,71 +35,71 @@ extern "C"
  */
 
 /**
-   Used when calling starpu_cluster_machine
+   Used when calling starpu_cluster_machine()
  */
 #define STARPU_CLUSTER_MIN_NB			(1<<STARPU_MODE_SHIFT)
 
 /**
-   Used when calling starpu_cluster_machine
+   Used when calling starpu_cluster_machine()
  */
 #define STARPU_CLUSTER_MAX_NB			(2<<STARPU_MODE_SHIFT)
 
 /**
-   Used when calling starpu_cluster_machine
+   Used when calling starpu_cluster_machine()
  */
 #define STARPU_CLUSTER_NB			(3<<STARPU_MODE_SHIFT)
 
 /**
-   Used when calling starpu_cluster_machine
+   Used when calling starpu_cluster_machine()
  */
 #define STARPU_CLUSTER_PREFERE_MIN		(4<<STARPU_MODE_SHIFT)
 
 /**
-   Used when calling starpu_cluster_machine
+   Used when calling starpu_cluster_machine()
  */
 #define STARPU_CLUSTER_KEEP_HOMOGENEOUS		(5<<STARPU_MODE_SHIFT)
 
 /**
-   Used when calling starpu_cluster_machine
+   Used when calling starpu_cluster_machine()
  */
 #define STARPU_CLUSTER_POLICY_NAME		(6<<STARPU_MODE_SHIFT)
 
 /**
-   Used when calling starpu_cluster_machine
+   Used when calling starpu_cluster_machine()
  */
 #define STARPU_CLUSTER_POLICY_STRUCT		(7<<STARPU_MODE_SHIFT)
 
 /**
-   Used when calling starpu_cluster_machine
+   Used when calling starpu_cluster_machine()
  */
 #define STARPU_CLUSTER_CREATE_FUNC		(8<<STARPU_MODE_SHIFT)
 
 /**
-   Used when calling starpu_cluster_machine
+   Used when calling starpu_cluster_machine()
  */
 #define STARPU_CLUSTER_CREATE_FUNC_ARG		(9<<STARPU_MODE_SHIFT)
 /**
-   Used when calling starpu_cluster_machine
+   Used when calling starpu_cluster_machine()
  */
 #define STARPU_CLUSTER_TYPE			(10<<STARPU_MODE_SHIFT)
 
 /**
-   Used when calling starpu_cluster_machine
+   Used when calling starpu_cluster_machine()
  */
 #define STARPU_CLUSTER_AWAKE_WORKERS		(11<<STARPU_MODE_SHIFT)
 
 /**
-   Used when calling starpu_cluster_machine
+   Used when calling starpu_cluster_machine()
  */
 #define STARPU_CLUSTER_PARTITION_ONE		(12<<STARPU_MODE_SHIFT)
 
 /**
-   Used when calling starpu_cluster_machine
+   Used when calling starpu_cluster_machine()
  */
 #define STARPU_CLUSTER_NEW			(13<<STARPU_MODE_SHIFT)
 
 /**
-   Used when calling starpu_cluster_machine
+   Used when calling starpu_cluster_machine()
  */
 #define STARPU_CLUSTER_NCORES			(14<<STARPU_MODE_SHIFT)
 

+ 9 - 14
include/starpu_config.h.in

@@ -57,6 +57,14 @@
 #undef STARPU_USE_CUDA
 
 /**
+   Defined when StarPU has been installed with
+   NVidia-ML support. It should be used in your code to detect the
+   availability of NVML-related functions.
+   @ingroup API_CUDA_Extensions
+*/
+#undef STARPU_HAVE_LIBNVIDIA_ML
+
+/**
    Defined when StarPU has been installed with OpenCL support. It
    should be used in your code to detect the availability of OpenCL as
    shown in \ref FullSourceCodeVectorScal.
@@ -72,13 +80,6 @@
 #undef STARPU_USE_FPGA
 
 /**
-   Defined when StarPU has been installed with MIC support. It should
-   be used in your code to detect the availability of MIC.
-   @ingroup API_MIC_Extensions
-*/
-#undef STARPU_USE_MIC
-
-/**
    Defined when StarPU has been installed with MPI Master Slave
    support. It should be used in your code to detect the availability
    of MPI Master Slave.
@@ -159,6 +160,7 @@
 #undef STARPU_HAVE_SYNC_FETCH_AND_OR
 #undef STARPU_HAVE_SYNC_LOCK_TEST_AND_SET
 #undef STARPU_HAVE_SYNC_SYNCHRONIZE
+#undef STARPU_HAVE_ATOMIC_EXCHANGE_N
 
 #undef STARPU_DEVEL
 #undef STARPU_MODEL_DEBUG
@@ -238,13 +240,6 @@
 #undef STARPU_MAXFPGADEVS
 
 /**
-   Define the maximum number of MIC devices that are supported by
-   StarPU.
-   @ingroup API_MIC_Extensions
-*/
-#undef STARPU_MAXMICDEVS
-
-/**
    Define the maximum number of workers managed by StarPU.
    @ingroup API_Workers_Properties
 */

+ 12 - 0
include/starpu_cuda.h

@@ -24,6 +24,10 @@
 #include <cuda_runtime.h>
 #include <cuda_runtime_api.h>
 
+#ifdef STARPU_HAVE_LIBNVIDIA_ML
+#include <nvml.h>
+#endif
+
 #ifdef __cplusplus
 extern "C"
 {
@@ -129,6 +133,14 @@ int starpu_cuda_copy3d_async_sync(void *src_ptr, unsigned src_node, void *dst_pt
 */
 void starpu_cuda_set_device(unsigned devid);
 
+#ifdef STARPU_HAVE_LIBNVIDIA_ML
+/**
+  Return the nvml device for a CUDA device
+*/
+nvmlDevice_t starpu_cuda_get_nvmldev(unsigned devid);
+#endif
+
+
 /** @} */
 
 #ifdef __cplusplus

+ 15 - 4
include/starpu_data.h

@@ -110,14 +110,14 @@ enum starpu_data_access_mode
 				   src/sched_policies/work_stealing_policy.c
 				   source code.
 				*/
-	STARPU_MPI_REDUX=(1<<7), /** Inter-node reduction only. Codelets 
+	STARPU_MPI_REDUX=(1<<7), /**< Inter-node reduction only. Codelets
 				    contributing to these reductions should
-				    be registered with STARPU_RW | STARPU_COMMUTE 
+				    be registered with ::STARPU_RW | ::STARPU_COMMUTE
 				    access modes.
 			            When inserting these tasks through the
 				    MPI layer however, the access mode needs
-				    to be STARPU_MPI_REDUX. */
-	STARPU_ACCESS_MODE_MAX=(1<<8) /** The purpose of ACCESS_MODE_MAX is to
+				    to be ::STARPU_MPI_REDUX. */
+	STARPU_ACCESS_MODE_MAX=(1<<8) /**< The purpose of ::STARPU_ACCESS_MODE_MAX is to
 					be the maximum of this enum. */
 };
 
@@ -560,6 +560,17 @@ unsigned starpu_data_get_ooc_flag(starpu_data_handle_t handle);
 
 /**
    Query the status of \p handle on the specified \p memory_node.
+
+   \p is_allocated tells whether memory was allocated there for the data.
+   \p is_valid tells whether the actual value is available there.
+   \p is_loading tells whether the actual value is getting loaded there.
+   \p is_requested tells whether the actual value is requested to be loaded
+   there by some fetch/prefetch/idlefetch request.
+*/
+void starpu_data_query_status2(starpu_data_handle_t handle, int memory_node, int *is_allocated, int *is_valid, int *is_loading, int *is_requested);
+
+/**
+   Same as starpu_data_query_status2(), but without the is_loading parameter.
 */
 void starpu_data_query_status(starpu_data_handle_t handle, int memory_node, int *is_allocated, int *is_valid, int *is_requested);
 

+ 0 - 42
include/starpu_data_interfaces.h

@@ -130,13 +130,6 @@ struct starpu_data_copy_methods
 
 	/**
 	   Define how to copy data from the \p src_interface interface on the
-	   \p src_node CPU node to the \p dst_interface interface on the \p
-	   dst_node MIC node. Return 0 on success.
-	*/
-	int (*ram_to_mic)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
-
-	/**
-	   Define how to copy data from the \p src_interface interface on the
 	   \p src_node CUDA node to the \p dst_interface interface on the \p
 	   dst_node CPU node. Return 0 on success.
 	*/
@@ -172,13 +165,6 @@ struct starpu_data_copy_methods
 
 	/**
 	   Define how to copy data from the \p src_interface interface on the
-	   \p src_node MIC node to the \p dst_interface interface on the \p
-	   dst_node CPU node. Return 0 on success.
-	*/
-	int (*mic_to_ram)(void *src_interface, unsigned srd_node, void *dst_interface, unsigned dst_node);
-
-	/**
-	   Define how to copy data from the \p src_interface interface on the
 	   \p src_node CPU node to the \p dst_interface interface on the \p
 	   dst_node MPI Slave node. Return 0 on success.
 	*/
@@ -294,26 +280,6 @@ struct starpu_data_copy_methods
 	/**
 	   Define how to copy data from the \p src_interface interface on the
 	   \p src_node CPU node to the \p dst_interface interface on the \p
-	   dst_node MIC node. Must return 0 if the transfer was actually
-	   completed completely synchronously, or <c>-EAGAIN</c> if at least
-	   some transfers are still ongoing and should be awaited for by the
-	   core.
-	*/
-	int (*ram_to_mic_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
-
-	/**
-	   Define how to copy data from the \p src_interface interface on the
-	   \p src_node MIC node to the \p dst_interface interface on the \p
-	   dst_node CPU node. Must return 0 if the transfer was actually
-	   completed completely synchronously, or <c>-EAGAIN</c> if at least
-	   some transfers are still ongoing and should be awaited for by the
-	   core.
-	*/
-	int (*mic_to_ram_async)(void *src_interface, unsigned srd_node, void *dst_interface, unsigned dst_node);
-
-	/**
-	   Define how to copy data from the \p src_interface interface on the
-	   \p src_node CPU node to the \p dst_interface interface on the \p
 	   dst_node MPI Slave node, with the given even. Must return 0 if the
 	   transfer was actually completed completely synchronously, or
 	   <c>-EAGAIN</c> if at least some transfers are still ongoing and
@@ -2077,9 +2043,6 @@ struct starpu_multiformat_data_interface_ops
 	size_t cuda_elemsize;                     /**< size of each element on CUDA devices */
 	struct starpu_codelet *cpu_to_cuda_cl;    /**< pointer to a codelet which converts from CPU to CUDA */
 	struct starpu_codelet *cuda_to_cpu_cl;    /**< pointer to a codelet which converts from CUDA to CPU */
-	size_t mic_elemsize;                      /**< size of each element on MIC devices */
-	struct starpu_codelet *cpu_to_mic_cl;     /**< pointer to a codelet which converts from CPU to MIC */
-	struct starpu_codelet *mic_to_cpu_cl;     /**< pointer to a codelet which converts from MIC to CPU */
 };
 
 struct starpu_multiformat_interface
@@ -2089,7 +2052,6 @@ struct starpu_multiformat_interface
 	void *cpu_ptr;
 	void *cuda_ptr;
 	void *opencl_ptr;
-	void *mic_ptr;
 	uint32_t nx;
 	struct starpu_multiformat_data_interface_ops *ops;
 };
@@ -2117,10 +2079,6 @@ void starpu_multiformat_data_register(starpu_data_handle_t *handle, int home_nod
 */
 #define STARPU_MULTIFORMAT_GET_OPENCL_PTR(interface) (((struct starpu_multiformat_interface *)(interface))->opencl_ptr)
 /**
-   Return the local pointer to the data with MIC format.
- */
-#define STARPU_MULTIFORMAT_GET_MIC_PTR(interface) (((struct starpu_multiformat_interface *)(interface))->mic_ptr)
-/**
    Return the number of elements in the data.
  */
 #define STARPU_MULTIFORMAT_GET_NX(interface)  (((struct starpu_multiformat_interface *)(interface))->nx)

+ 0 - 61
include/starpu_mic.h

@@ -1,61 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2012-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#ifndef __STARPU_MIC_H__
-#define __STARPU_MIC_H__
-
-#include <starpu_config.h>
-
-#ifdef STARPU_USE_MIC
-
-#ifdef __cplusplus
-extern "C"
-{
-#endif
-
-/**
-   @defgroup API_MIC_Extensions MIC Extensions
-   @{
-*/
-
-/**
-   Type for MIC function symbols
-*/
-typedef void *starpu_mic_func_symbol_t;
-
-/**
-   Initiate a lookup on each MIC device to find the address of the
-   function named \p func_name, store it in the global array kernels
-   and return the index in the array through \p symbol.
-*/
-int starpu_mic_register_kernel(starpu_mic_func_symbol_t *symbol, const char *func_name);
-
-/**
-   If successfull, return the pointer to the function defined by \p symbol on
-   the device linked to the called device. This can for instance be used
-   in a starpu_mic_func_t implementation.
-*/
-starpu_mic_kernel_t starpu_mic_get_kernel(starpu_mic_func_symbol_t symbol);
-
-/** @} */
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* STARPU_USE_MIC */
-
-#endif /* __STARPU_MIC_H__ */

+ 0 - 7
include/starpu_mod.f90

@@ -79,13 +79,6 @@ MODULE starpu_mod
      END SUBROUTINE starpu_asynchronous_opencl_copy_disabled
   END INTERFACE
 
-  ! starpu_asynchronous_mic_copy_disabled
-  INTERFACE
-     SUBROUTINE starpu_asynchronous_mic_copy_disabled() BIND(C)
-       USE iso_c_binding
-     END SUBROUTINE starpu_asynchronous_mic_copy_disabled
-  END INTERFACE
-
   ! starpu_display_stats
   INTERFACE
      SUBROUTINE starpu_display_stats() BIND(C)

+ 9 - 9
include/starpu_perf_monitoring.h

@@ -40,10 +40,10 @@ extern "C"
  */
 enum starpu_perf_counter_scope
 {
-	starpu_perf_counter_scope_undefined     = 0, /** undefined scope */
-	starpu_perf_counter_scope_global        = 2, /** global scope */
-	starpu_perf_counter_scope_per_worker    = 4, /** per-worker scope */
-	starpu_perf_counter_scope_per_codelet   = 6  /** per-codelet scope */
+	starpu_perf_counter_scope_undefined     = 0, /**< undefined scope */
+	starpu_perf_counter_scope_global        = 2, /**< global scope */
+	starpu_perf_counter_scope_per_worker    = 4, /**< per-worker scope */
+	starpu_perf_counter_scope_per_codelet   = 6  /**< per-codelet scope */
 };
 
 /**
@@ -51,11 +51,11 @@ enum starpu_perf_counter_scope
  */
 enum starpu_perf_counter_type
 {
-	starpu_perf_counter_type_undefined = 0, /** underfined value type */
-	starpu_perf_counter_type_int32     = 1, /** signed 32-bit integer value */
-	starpu_perf_counter_type_int64     = 2, /** signed 64-bit integer value */
-	starpu_perf_counter_type_float     = 3, /** 32-bit single precision floating-point value */
-	starpu_perf_counter_type_double    = 4  /** 64-bit double precision floating-point value */
+	starpu_perf_counter_type_undefined = 0, /**< underfined value type */
+	starpu_perf_counter_type_int32     = 1, /**< signed 32-bit integer value */
+	starpu_perf_counter_type_int64     = 2, /**< signed 64-bit integer value */
+	starpu_perf_counter_type_float     = 3, /**< 32-bit single precision floating-point value */
+	starpu_perf_counter_type_double    = 4  /**< 64-bit double precision floating-point value */
 };
 
 struct starpu_perf_counter_listener;

+ 109 - 84
include/starpu_perf_steering.h

@@ -35,183 +35,208 @@ extern "C"
    \anchor PM_API
    @{
 */
+
 /**
    Enum of all possible performance knob scopes.
  */
 enum starpu_perf_knob_scope
 {
-	starpu_perf_knob_scope_undefined     = 0, /** undefined scope */
-	starpu_perf_knob_scope_global        = 1, /** global scope */
-	starpu_perf_knob_scope_per_worker    = 3, /** per-worker scope */
-	starpu_perf_knob_scope_per_scheduler = 5  /** per-scheduler scope */
+	starpu_perf_knob_scope_undefined     = 0, /**< undefined scope */
+	starpu_perf_knob_scope_global        = 1, /**< global scope */
+	starpu_perf_knob_scope_per_worker    = 3, /**< per-worker scope */
+	starpu_perf_knob_scope_per_scheduler = 5  /**< per-scheduler scope */
 };
 
 /**
-  Enum of all possible performance knob value type.
+   Enum of all possible performance knob value type.
  */
 enum starpu_perf_knob_type
 {
-	starpu_perf_knob_type_undefined = 0, /** underfined value type */
-	starpu_perf_knob_type_int32     = 1, /** signed 32-bit integer value */
-	starpu_perf_knob_type_int64     = 2, /** signed 64-bit integer value */
-	starpu_perf_knob_type_float     = 3, /** 32-bit single precision floating-point value */
-	starpu_perf_knob_type_double    = 4  /** 64-bit double precision floating-point value */
+	starpu_perf_knob_type_undefined = 0, /**< underfined value type */
+	starpu_perf_knob_type_int32     = 1, /**< signed 32-bit integer value */
+	starpu_perf_knob_type_int64     = 2, /**< signed 64-bit integer value */
+	starpu_perf_knob_type_float     = 3, /**< 32-bit single precision floating-point value */
+	starpu_perf_knob_type_double    = 4  /**< 64-bit double precision floating-point value */
 };
 
 /**
-  Translate scope name constant string to scope id.
-  */
+   Translate scope name constant string to scope id.
+*/
 int starpu_perf_knob_scope_name_to_id(const char *name);
+
 /**
-  Translate scope id to scope name constant string.
-  */
+   Translate scope id to scope name constant string.
+*/
 const char *starpu_perf_knob_scope_id_to_name(enum starpu_perf_knob_scope scope);
 
 /**
-  Translate type name constant string to type id.
-  */
+   Translate type name constant string to type id.
+*/
 int starpu_perf_knob_type_name_to_id(const char *name);
+
 /**
-  Translate type id to type name constant string.
-  */
+   Translate type id to type name constant string.
+*/
 const char *starpu_perf_knob_type_id_to_name(enum starpu_perf_knob_type type);
 
 /**
-  Return the number of performance steering knobs for the given scope.
-  */
+   Return the number of performance steering knobs for the given scope.
+*/
 int starpu_perf_knob_nb(enum starpu_perf_knob_scope scope);
+
 /**
-  Translate a performance knob name to its id.
-  */
+   Translate a performance knob name to its id.
+*/
 int starpu_perf_knob_name_to_id(enum starpu_perf_knob_scope scope, const char *name);
+
 /**
-  Translate a performance knob name to its id.
-  */
+   Translate a performance knob name to its id.
+*/
 int starpu_perf_knob_nth_to_id(enum starpu_perf_knob_scope scope, int nth);
+
 /**
-  Translate a performance knob rank in its scope to its knob id.
-  */
+   Translate a performance knob rank in its scope to its knob id.
+*/
 const char *starpu_perf_knob_id_to_name(int id);
+
 /**
-  Translate a knob id to its name constant string.
-  */
+   Translate a knob id to its name constant string.
+*/
 int starpu_perf_knob_get_type_id(int id);
+
 /**
-  Return the knob's help string.
-  */
+   Return the knob's help string.
+*/
 const char *starpu_perf_knob_get_help_string(int id);
 
 /**
-  Display the list of knobs defined in the given scope.
-  */
+   Display the list of knobs defined in the given scope.
+*/
 void starpu_perf_knob_list_avail(enum starpu_perf_knob_scope scope);
+
 /**
-  Display the list of knobs defined in all scopes.
-  */
+   Display the list of knobs defined in all scopes.
+*/
 void starpu_perf_knob_list_all_avail(void);
 
 /**
-  Get knob value for Global scope.
-  */
+   Get knob value for Global scope.
+*/
 int32_t starpu_perf_knob_get_global_int32_value (const int knob_id);
+
 /**
-  Get knob value for Global scope.
-  */
+   Get knob value for Global scope.
+*/
 int64_t starpu_perf_knob_get_global_int64_value (const int knob_id);
+
 /**
-  Get knob value for Global scope.
-  */
+   Get knob value for Global scope.
+*/
 float   starpu_perf_knob_get_global_float_value (const int knob_id);
+
 /**
-  Get knob value for Global scope.
-  */
+   Get knob value for Global scope.
+*/
 double  starpu_perf_knob_get_global_double_value(const int knob_id);
 
 /**
-  Set int32 knob value for Global scope.
-  */
+   Set int32 knob value for Global scope.
+*/
 void starpu_perf_knob_set_global_int32_value (const int knob_id, int32_t new_value);
+
 /**
-  Set int64 knob value for Global scope.
-  */
+   Set int64 knob value for Global scope.
+*/
 void starpu_perf_knob_set_global_int64_value (const int knob_id, int64_t new_value);
+
 /**
-  Set float knob value for Global scope.
-  */
+   Set float knob value for Global scope.
+*/
 void starpu_perf_knob_set_global_float_value (const int knob_id, float   new_value);
+
 /**
-  Set double knob value for Global scope.
-  */
+   Set double knob value for Global scope.
+*/
 void starpu_perf_knob_set_global_double_value(const int knob_id, double  new_value);
 
-
 /**
- Get int32 value for Per_worker scope.
-  */
+   Get int32 value for Per_worker scope.
+*/
 int32_t starpu_perf_knob_get_per_worker_int32_value (const int knob_id, unsigned workerid);
+
 /**
- Get int64 value for Per_worker scope.
-  */
+   Get int64 value for Per_worker scope.
+*/
 int64_t starpu_perf_knob_get_per_worker_int64_value (const int knob_id, unsigned workerid);
+
 /**
- Get float value for Per_worker scope.
-  */
+   Get float value for Per_worker scope.
+*/
 float   starpu_perf_knob_get_per_worker_float_value (const int knob_id, unsigned workerid);
+
 /**
- Get double value for Per_worker scope.
-  */
+   Get double value for Per_worker scope.
+*/
 double  starpu_perf_knob_get_per_worker_double_value(const int knob_id, unsigned workerid);
 
 /**
- Set int32 value for Per_worker scope.
-  */
+   Set int32 value for Per_worker scope.
+*/
 void starpu_perf_knob_set_per_worker_int32_value (const int knob_id, unsigned workerid, int32_t new_value);
+
 /**
- Set int64 value for Per_worker scope.
-  */
+   Set int64 value for Per_worker scope.
+*/
 void starpu_perf_knob_set_per_worker_int64_value (const int knob_id, unsigned workerid, int64_t new_value);
+
 /**
- Set float value for Per_worker scope.
-  */
+   Set float value for Per_worker scope.
+*/
 void starpu_perf_knob_set_per_worker_float_value (const int knob_id, unsigned workerid, float   new_value);
+
 /**
- Set double value for Per_worker scope.
-  */
+   Set double value for Per_worker scope.
+*/
 void starpu_perf_knob_set_per_worker_double_value(const int knob_id, unsigned workerid, double  new_value);
 
-
 /**
- Get int32 value for per_scheduler scope.
-  */
+   Get int32 value for per_scheduler scope.
+*/
 int32_t starpu_perf_knob_get_per_scheduler_int32_value (const int knob_id, const char * sched_policy_name);
+
 /**
- Get int64 value for per_scheduler scope.
-  */
+   Get int64 value for per_scheduler scope.
+*/
 int64_t starpu_perf_knob_get_per_scheduler_int64_value (const int knob_id, const char * sched_policy_name);
+
 /**
- Get float value for per_scheduler scope.
-  */
+   Get float value for per_scheduler scope.
+*/
 float   starpu_perf_knob_get_per_scheduler_float_value (const int knob_id, const char * sched_policy_name);
+
 /**
- Get double value for per_scheduler scope.
-  */
+   Get double value for per_scheduler scope.
+*/
 double  starpu_perf_knob_get_per_scheduler_double_value(const int knob_id, const char * sched_policy_name);
 
 /**
- Set int32 value for per_scheduler scope.
-  */
+   Set int32 value for per_scheduler scope.
+*/
 void starpu_perf_knob_set_per_scheduler_int32_value (const int knob_id, const char * sched_policy_name, int32_t new_value);
+
 /**
- Set int64 value for per_scheduler scope.
-  */
+   Set int64 value for per_scheduler scope.
+*/
 void starpu_perf_knob_set_per_scheduler_int64_value (const int knob_id, const char * sched_policy_name, int64_t new_value);
+
 /**
- Set float value for per_scheduler scope.
-  */
+   Set float value for per_scheduler scope.
+*/
 void starpu_perf_knob_set_per_scheduler_float_value (const int knob_id, const char * sched_policy_name, float   new_value);
+
 /**
- Set double value for per_scheduler scope.
-  */
+   Set double value for per_scheduler scope.
+*/
 void starpu_perf_knob_set_per_scheduler_double_value(const int knob_id, const char * sched_policy_name, double  new_value);
 
 /** @} */

+ 11 - 0
include/starpu_sched_component.h

@@ -215,6 +215,10 @@ struct starpu_sched_tree *starpu_sched_tree_create(unsigned sched_ctx_id) STARPU
    destroy tree and free all non shared component in it.
 */
 void starpu_sched_tree_destroy(struct starpu_sched_tree *tree);
+/**
+   calls starpu_sched_tree_destroy, ready for use for starpu_sched_policy::deinit_sched field.
+ */
+void starpu_sched_tree_deinitialize(unsigned sched_ctx_id);
 struct starpu_sched_tree *starpu_sched_tree_get(unsigned sched_ctx_id);
 /**
    recursively set all starpu_sched_component::workers, do not take into account shared parts (except workers).
@@ -789,6 +793,13 @@ struct starpu_sched_tree *starpu_sched_component_make_scheduler(unsigned sched_c
 #define STARPU_SCHED_SIMPLE_FIFOS_BELOW_EXP	(1<<13)
 
 /**
+   Request to prepend a component before the decision component. This should be
+   used alone and followed by the component creation function pointer and its
+   data.
+*/
+#define STARPU_SCHED_SIMPLE_PRE_DECISION	(1<<14)
+
+/**
    Create a simple modular scheduler tree around a scheduling decision-making
    component \p component. The details of what should be built around \p component
    is described by \p flags. The different STARPU_SCHED_SIMPL_DECIDE_* flags are

+ 6 - 1
include/starpu_sched_ctx.h

@@ -336,7 +336,12 @@ void starpu_sched_ctx_move_task_to_ctx_locked(struct starpu_task *task, unsigned
 
 int starpu_sched_ctx_get_worker_rank(unsigned sched_ctx_id);
 
-void (*starpu_sched_ctx_get_sched_policy_init(unsigned sched_ctx_id))(unsigned);
+/**
+   Return the function associated with the scheduler context \p
+   sched_ctx_id which was given through the field
+   starpu_conf::sched_policy_callback
+*/
+void (*starpu_sched_ctx_get_sched_policy_callback(unsigned sched_ctx_id))(unsigned);
 
 unsigned starpu_sched_ctx_has_starpu_scheduler(unsigned sched_ctx_id, unsigned *awake_workers);
 

+ 13 - 39
include/starpu_task.h

@@ -82,13 +82,6 @@ extern "C"
 /**
    To be used when setting the field starpu_codelet::where (or
    starpu_task::where) to specify the codelet (or the task) may be
-   executed on a MIC processing unit.
-*/
-#define STARPU_MIC	STARPU_WORKER_TO_MASK(STARPU_MIC_WORKER)
-
-/**
-   To be used when setting the field starpu_codelet::where (or
-   starpu_task::where) to specify the codelet (or the task) may be
    executed on a MPI Slave processing unit.
 */
 #define STARPU_MPI_MS	STARPU_WORKER_TO_MASK(STARPU_MPI_MS_WORKER)
@@ -190,16 +183,6 @@ typedef void (*starpu_opencl_func_t)(void **, void*);
 typedef void (*starpu_fpga_func_t)(void **, void*);
 
 /**
-   MIC implementation of a codelet.
-*/
-typedef void (*starpu_mic_kernel_t)(void **, void*);
-
-/**
-  MIC kernel for a codelet
-*/
-typedef starpu_mic_kernel_t (*starpu_mic_func_t)(void);
-
-/**
    MPI Master Slave kernel for a codelet
 */
 typedef void (*starpu_mpi_ms_kernel_t)(void **, void*);
@@ -442,22 +425,6 @@ struct starpu_codelet
 
 	/**
 	   Optional array of function pointers to a function which
-	   returns the MIC implementation of the codelet. The
-	   functions prototype must be:
-	   \code{.c}
-	   starpu_mic_kernel_t mic_func(struct starpu_codelet *cl, unsigned nimpl)
-	   \endcode
-	   If the field starpu_codelet::where is set, then the field
-	   starpu_codelet::mic_funcs is ignored if ::STARPU_MIC does
-	   not appear in the field starpu_codelet::where. It can be
-	   <c>NULL</c> if starpu_codelet::cpu_funcs_name is
-	   non-<c>NULL</c>, in which case StarPU will simply make a
-	   symbol lookup to get the implementation.
-	*/
-	starpu_mic_func_t mic_funcs[STARPU_MAXIMPLEMENTATIONS];
-
-	/**
-	   Optional array of function pointers to a function which
 	   returns the MPI Master Slave implementation of the codelet.
 	   The functions prototype must be:
 	   \code{.c}
@@ -476,8 +443,8 @@ struct starpu_codelet
 	   Optional array of strings which provide the name of the CPU
 	   functions referenced in the array
 	   starpu_codelet::cpu_funcs. This can be used when running on
-	   MIC devices for StarPU to simply look
-	   up the MIC function implementation through its name.
+	   MPI MS devices for StarPU to simply look
+	   up the MPI MS function implementation through its name.
 	*/
 	const char *cpu_funcs_name[STARPU_MAXIMPLEMENTATIONS];
 
@@ -952,10 +919,7 @@ struct starpu_task
 	   by the application through <c>malloc()</c>, setting
 	   starpu_task::cl_arg_free to 1 makes StarPU automatically
 	   call <c>free(cl_arg)</c> when destroying the task. This
-	   saves the user from defining a callback just for that. This
-	   is mostly useful when targetting MIC, where the
-	   codelet does not execute in the same memory space as the
-	   main thread.
+	   saves the user from defining a callback just for that.
 
 	   With starpu_task_insert() and alike this is set to 1 when using
 	   ::STARPU_CL_ARGS.
@@ -1441,6 +1405,13 @@ struct starpu_task
    SettingManyDataHandlesForATask)
 */
 #define STARPU_TASK_GET_HANDLE(task, i) (((task)->dyn_handles) ? (task)->dyn_handles[i] : (task)->handles[i])
+
+/**
+   Return all the data handles of \p task. If \p task is defined
+   with a static or dynamic number of handles, will either return all
+   the element of the field starpu_task::handles or all the elements
+   of the field starpu_task::dyn_handles (see \ref SettingManyDataHandlesForATask)
+*/
 #define STARPU_TASK_GET_HANDLES(task) (((task)->dyn_handles) ? (task)->dyn_handles : (task)->handles)
 
 /**
@@ -1613,6 +1584,9 @@ int starpu_task_submit_nodeps(struct starpu_task *task) STARPU_WARN_UNUSED_RESUL
 */
 int starpu_task_submit_to_ctx(struct starpu_task *task, unsigned sched_ctx_id);
 
+/**
+   Return 1 if \p task is terminated
+*/
 int starpu_task_finished(struct starpu_task *task) STARPU_WARN_UNUSED_RESULT;
 
 /**

+ 3 - 0
include/starpu_task_dep.h

@@ -217,6 +217,9 @@ void starpu_tag_notify_from_apps(starpu_tag_t id);
 */
 void starpu_tag_notify_restart_from_apps(starpu_tag_t id);
 
+/**
+   Return the task associated to the tag \p id
+ */
 struct starpu_task *starpu_tag_get_task(starpu_tag_t id);
 
 /** @} */

+ 43 - 18
include/starpu_task_util.h

@@ -343,7 +343,23 @@ extern "C"
 */
 #define STARPU_TASK_LINE	 (43<<STARPU_MODE_SHIFT)
 
-#define STARPU_SHIFTED_MODE_MAX (44<<STARPU_MODE_SHIFT)
+/**
+   Used when calling starpu_task_insert(), must be followed by a
+   pointer to a epilogue callback function
+*/
+#define STARPU_EPILOGUE_CALLBACK   (44<<STARPU_MODE_SHIFT)
+
+/**
+   Used when calling starpu_task_insert(), must be followed by a
+   pointer to be given as an argument to the epilogue callback
+   function
+*/
+#define STARPU_EPILOGUE_CALLBACK_ARG   (45<<STARPU_MODE_SHIFT)
+
+/**
+   This has to be the last mode value plus 1
+*/
+#define STARPU_SHIFTED_MODE_MAX (46<<STARPU_MODE_SHIFT)
 
 /**
    Set the given \p task corresponding to \p cl with the following arguments.
@@ -470,6 +486,12 @@ void starpu_task_insert_data_process_mode_array_arg(struct starpu_codelet *cl, s
 */
 void starpu_codelet_pack_args(void **arg_buffer, size_t *arg_buffer_size, ...);
 
+/**
+   Structure to be used for starpu_codelet_pack_arg_init() & co, and
+   starpu_codelet_unpack_arg_init() & co. The contents is public,
+   however users should not directly access it, but only use as a
+   parameter to the appropriate functions.
+*/
 struct starpu_codelet_pack_arg_data
 {
 	char *arg_buffer;
@@ -510,35 +532,39 @@ void starpu_codelet_pack_arg_fini(struct starpu_codelet_pack_arg_data *state, vo
 void starpu_codelet_unpack_args(void *cl_arg, ...);
 
 /**
-   Initialize struct starpu_codelet_pack_arg_data before calling
-   starpu_codelet_unpack_arg(). This will pass the starpu_task->cl_arg 
-   and starpu_task->cl_arg_size to the content of struct starpu_codelet_pack_arg_data.
+   Initialize \p state with \p cl_arg and \p cl_arg_size. This has to
+   be called before calling starpu_codelet_unpack_arg().
 */
-void starpu_codelet_unpack_arg_init(struct starpu_codelet_pack_arg_data *state, void **cl_arg, size_t *cl_arg_size);
+void starpu_codelet_unpack_arg_init(struct starpu_codelet_pack_arg_data *state, void *cl_arg, size_t cl_arg_size);
 
 /**
-   Unpack one argument from struct starpu_codelet_pack_arg \p state into ptr with a copy. 
-   That structure has to be initialized before with starpu_codelet_unpack_arg_init().
-   Size is stored in starpu_task->cl_arg, and it is a known parameter in this function .
+   Unpack the next argument of size \p size from \p state into \p ptr with a copy.
+   \p state has to be initialized before with starpu_codelet_unpack_arg_init().
 */
 void starpu_codelet_unpack_arg(struct starpu_codelet_pack_arg_data *state, void *ptr, size_t size);
 
 /**
-   Unpack one argument from struct starpu_codelet_pack_arg \p state into ptr with a copy.
-   That structure has to be initialized before with starpu_codelet_unpack_arg_init(). 
-   Size is stored in starpu_task->cl_arg, and it is an unknown parameter in this function. 
-   It will be returned from starpu_task->cl_arg with a copy.
+   Unpack the next argument of unknown size from \p state into \p ptr
+   with a copy. \p ptr is allocated before copying in it the value of
+   the argument.
+   The size of the argument is returned in \p size.
+   \p has to be initialized before with starpu_codelet_unpack_arg_init().
 */
 void starpu_codelet_dup_arg(struct starpu_codelet_pack_arg_data *state, void **ptr, size_t *size);
 
 /**
-   Unpack one argument from struct starpu_codelet_pack_arg \p state into ptr, and the pointer of ptr will be returned.
-   That structure has to be initialized before with starpu_codelet_unpack_arg_init(). 
-   Size is stored in starpu_task->cl_arg, and it is an unknown parameter in this function. 
-   It will be returned from starpu_task->cl_arg with a copy.
+   Unpack the next argument of unknown size from \p state into \p ptr.
+   \p ptr will be a pointer to the memory of the argument.
+   The size of the argument is returned in \p size.
+   \p has to be initialized before with starpu_codelet_unpack_arg_init().
 */
-void starpu_codelet_pick_arg(struct starpu_codelet_pack_arg_data *state, void **ptr, size_t *size); 
+void starpu_codelet_pick_arg(struct starpu_codelet_pack_arg_data *state, void **ptr, size_t *size);
 
+/**
+   Finish unpacking data, after calling starpu_codelet_unpack_arg_init()
+   once and starpu_codelet_unpack_arg() or starpu_codelet_dup_arg() or
+   starpu_codelet_pick_arg() several times.
+*/
 void starpu_codelet_unpack_arg_fini(struct starpu_codelet_pack_arg_data *state);
 
 /**
@@ -546,7 +572,6 @@ void starpu_codelet_unpack_arg_fini(struct starpu_codelet_pack_arg_data *state);
 */
 void starpu_codelet_unpack_discard_arg(struct starpu_codelet_pack_arg_data *state);
 
-
 /**
    Similar to starpu_codelet_unpack_args(), but if any parameter is 0,
    copy the part of \p cl_arg that has not been read in \p buffer

+ 2 - 15
include/starpu_worker.h

@@ -49,9 +49,8 @@ enum starpu_node_kind
 	STARPU_OPENCL_RAM=3,
 	STARPU_FPGA_RAM=4,
 	STARPU_DISK_RAM=5,
-	STARPU_MIC_RAM=6,
-	STARPU_MPI_MS_RAM=7,
-	STARPU_MAX_RAM=7
+	STARPU_MPI_MS_RAM=6,
+	STARPU_MAX_RAM=6
 };
 
 /**
@@ -67,7 +66,6 @@ enum starpu_worker_archtype
 	STARPU_CUDA_WORKER=1,       /**< NVIDIA CUDA device */
 	STARPU_OPENCL_WORKER=2,     /**< OpenCL device */
 	STARPU_FPGA_WORKER=4,       /**< FPGA device */
-	STARPU_MIC_WORKER=3,        /**< Intel MIC device */
 	STARPU_MPI_MS_WORKER=5,     /**< MPI Slave device */
 	STARPU_NARCH = 6,           /**< Number of arch types */
 	STARPU_ANY_WORKER=255       /**< any worker, used in the hypervisor */
@@ -187,22 +185,11 @@ unsigned starpu_cuda_worker_get_count(void);
 unsigned starpu_opencl_worker_get_count(void);
 
 /**
-   Return the number of MIC workers controlled by StarPU.
-*/
-unsigned starpu_mic_worker_get_count(void);
-
-/**
    Return the number of MPI Master Slave workers controlled by StarPU.
 */
 unsigned starpu_mpi_ms_worker_get_count(void);
 
 /**
-   Return the number of MIC devices controlled by StarPU. The return
-   value should be at most \ref STARPU_MAXMICDEVS.
-*/
-unsigned starpu_mic_device_get_count(void);
-
-/**
    Return the identifier of the current worker, i.e the one associated
    to the calling thread. The return value is either \c -1 if the
    current context is not a StarPU worker (i.e. when called from the

+ 1 - 2
julia/src/StarPU.jl

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+# Copyright (C) 2020-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -116,6 +116,5 @@ export starpu_worker_get_count
 export starpu_cpu_worker_get_count
 export starpu_cuda_worker_get_count
 export starpu_opencl_worker_get_count
-export starpu_mic_worker_get_count
 
 end

+ 1 - 2
julia/src/translate_headers.jl

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+# Copyright (C) 2020-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -89,7 +89,6 @@ function starpu_translate_headers()
                                "starpu_cpu_worker_get_count",
                                "starpu_cuda_worker_get_count",
                                "starpu_opencl_worker_get_count",
-                               "starpu_mic_worker_get_count",
                                "STARPU_CPU",
                                "STARPU_CUDA",
                                "STARPU_CUDA_ASYNC",

+ 0 - 268
mic-configure

@@ -1,268 +0,0 @@
-#!/bin/bash
-# StarPU --- Runtime system for heterogeneous multicore architectures.
-#
-# Copyright (C) 2013-2021  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
-# Copyright (C) 2013       Thibaut Lambert
-#
-# StarPU is free software; you can redistribute it and/or modify
-# it under the terms of the GNU Lesser General Public License as published by
-# the Free Software Foundation; either version 2.1 of the License, or (at
-# your option) any later version.
-#
-# StarPU is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-#
-# See the GNU Lesser General Public License in COPYING.LGPL for more details.
-#
-ROOT_DIR=$(dirname $0)
-
-cat > ./mic-config.log << EOF
-This file was created by StarPU mic-configure
-
- $ $0 $*
-EOF
-
-prefix="/usr/local"
-coi_dir="/opt/intel/mic/coi"
-scif_dir="/opt/intel/mic/scif"
-mic_host="x86_64-k1om-linux"
-declare -a host_params
-declare -a mic_params
-unset host_params
-unset mic_params
-native_mic=0
-mpi=0
-for arg in "$@"
-do
-	case $arg in
-		--prefix=*)
-			prefix="${arg#--prefix=}"
-			;;
-		--with-coi-dir=*)
-			coi_dir="${arg#--with-coi-dir=}"
-			;;
-		--with-scif-dir=*)
-			scif_dir="${arg#--with-scif-dir=}"
-			;;
-		--mic-host=*)
-			mic_host="${arg#--mic-host=}"
-			;;
-	        --enable-native-mic)
-		        native_mic=1
-		        ;;
-		--with-compiler=*)
-			compiler="${arg#--with-compiler=}"
-			;;
-		--with-mic-param=*)
-			mic_params+=("${arg#--with-mic-param=}")
-			;;
-		--with-host-param=*)
-			host_params+=("${arg#--with-host-param=}")
-			;;
-		--with-mpi*)
-			mpi=1
-			;;
-		--help)
-			cat << EOF
-mic-configure specific options:
-
-  --with-coi-dir=DIR	Specify directory that contains
-			device-linux-release/lib/libcoi_device and 
-			host-linux-release/lib/libcoi_host and 
-  --with-scif-dir=DIR	Specify directory that contains
-			device-linux-release/lib/libscif_device and 
-			host-linux-release/lib/libscif_host and 
-  --mic-host=HOST	Specify the precise Phi host to build for
-			(default: k1om)
-  --with-compiler=[icc|gcc]
-			Specify whether to build with icc or with gcc
-  --enable-native-mic	Only build the Phi binaries
-  --with-mic-param=--OPTION
-			Pass --OPTION to the Phi configure script
-  --with-host-param=--OPTION
-			Pass --OPTION to the host configure script
-EOF
-			;;
-	esac
-done
-
-if [ -z "$compiler" ]
-then
-    # Test gcc compiler
-    x=$(type -t ${mic_host}-gcc)
-    if [ -z "$x" ]
-    then
-	# Test icc compiler
-	echo "int main(int argc, char **argv) { return 0; }" > /tmp/icc_$USER_$$.c
-	icc -mmic /tmp/icc_$USER_$$.c > /dev/null 2>/tmp/icc_$USER_$$.err
-	l=$(grep -c "invalid argument" /tmp/icc_$USER_$$.err)
-	if [ "$l" != "0" ]
-	then
-	    echo "[error] no compiler found. please add path to either ${mic_host}-gcc or to an enabled mic icc compiler in your PATH"
-	    exit 1
-	else
-	    compiler="icc"
-	fi
-    else
-	compiler="gcc"
-    fi
-fi
-
-dev_list="host mic"
-if [ "$native_mic" -eq "1" ]
-then
-    dev_list="mic"
-fi
-
-# prepend mic_params with "--with-mpicc=mpicc -mmic", to allow possible override by the user
-if [ $mpi = 1 ]
-then
-	mic_params=("--with-mpicc=mpicc -mmic" "${mic_params[@]}")
-	mic_params=("--with-mpifort=mpifort -mmic" "${mic_params[@]}")
-fi
-
-for arch in $dev_list #host mic
-do
-	# We call the configure script from a build directory further in the
-	# arborescence
-
-	case $ROOT_DIR in
-		/*) command="${ROOT_DIR}/configure";;
-		*) command="../${ROOT_DIR}/configure";;
-	esac
-
-	declare -a params
-	params=("--prefix=$prefix/$arch" "--disable-fstack-protector-all")
-
-	if [ "$arch" = mic ] ; then
-		if [ $compiler = "icc" ] ; then
-		    export CC="icc -mmic"
-		    export CXX="icc -mmic"
-		    export LD="icc -mmic"
-		    export CXXLD="icc -mmic"
-		    export F77="ifort -mmic"
-		    export FC="ifort -mmic"
-		else
-		    # let configure auto-detect GNU cross-building tools
-		    unset CC
-		    unset CXX
-		    unset LD
-		    unset CXXLD
-		    unset F77
-		    unset FC
-		    params+=(--disable-fortran)
-		fi
-	fi
-
-	if [ "$native_mic" -eq "0" ]
-	then
-		params+=(--enable-mic "--with-coi-dir=$coi_dir" "--with-scif-dir=$scif_dir")
-	fi
-
-	if test x$arch = xmic ; then
-	    params+=(--host=$mic_host --disable-build-doc)
-	    if [ "$native_mic" -eq "1" ]
-	    then
-		params+=(--enable-maxcpus=250)
-	    else
-		params+=("--with-coi-lib-dir=$coi_dir/device-linux-release/lib" "--with-scif-lib-dir=$scif_dir/device-linux-release/lib")
-	    fi
-	else
-	    params+=("--with-coi-lib-dir=$coi_dir/host-linux-release/lib" "--with-scif-lib-dir=$scif_dir/host-linux-release/lib")
-	fi
-
-	# If the build directory doesn't exist yet, create it
-	if [ ! -d "build_${arch}" ] ; then
-		mkdir "build_${arch}"
-	fi
-
-	cd "build_${arch}"
-
-	if test x$arch = xmic ; then
-		LIBRARY_PATH=$SINK_LIBRARY_PATH:$MIC_LIBRARY_PATH \
-		INCLUDE=$SINK_INCLUDE \
-		C_INCLUDE_PATH=$SINK_C_INCLUDE_PATH \
-		CPLUS_INCLUDE_PATH=$SINK_CPLUS_INCLUDE_PATH \
-		PKG_CONFIG_PATH=$SINK_PKG_CONFIG_PATH \
-		$command "$@" "${params[@]}" "${mic_params[@]}"
-		MIC_BUILD_ENV="\
-LIBRARY_PATH=$SINK_LIBRARY_PATH:$MIC_LIBRARY_PATH \\
-	INCLUDE=$SINK_INCLUDE \\
-	C_INCLUDE_PATH=$SINK_C_INCLUDE_PATH \\
-	CPLUS_INCLUDE_PATH=$SINK_CPLUS_INCLUDE_PATH \\
-	PKG_CONFIG_PATH=$SINK_PKG_CONFIG_PATH \\\
-"
-	else
-		$command "$@" "${params[@]}""${host_params[@]}"
-	fi
-	if [ "$?" != 0 ]
-	then
-		exit $?
-	fi
-	cd ..
-done
-if [ "$native_mic" -eq "1" ]
-then
-cat > Makefile << EOF
-all:
-	$MIC_BUILD_ENV
-	\$(MAKE) \$(MFLAGS) -C build_mic
-
-clean:
-	\$(MAKE) \$(MFLAGS) -C build_mic clean
-
-distclean: clean
-	rm -f Makefile
-
-check:
-	$MIC_BUILD_ENV
-	\$(MAKE) \$(MFLAGS) -C build_mic check
-
-showfailed:
-	@\$(MAKE) \$(MFLAGS) -C build_mic showfailed
-
-showcheck:
-	\$(MAKE) \$(MFLAGS) -C build_mic showcheck
-
-install:
-	$MIC_BUILD_ENV
-	\$(MAKE) \$(MFLAGS) -C build_mic install
-	ln -sf "${prefix}/mic/lib/pkgconfig/starpu-1.3.pc" "${prefix}/mic/lib/pkgconfig/starpu-1.3-mic.pc"
-EOF
-else
-cat > Makefile << EOF
-all:
-	\$(MAKE) \$(MFLAGS) -C build_host
-	$MIC_BUILD_ENV
-	\$(MAKE) \$(MFLAGS) -C build_mic
-
-clean:
-	\$(MAKE) \$(MFLAGS) -C build_host clean
-	\$(MAKE) \$(MFLAGS) -C build_mic clean
-
-distclean: clean
-	rm -f Makefile
-
-check:
-	\$(MAKE) \$(MFLAGS) -C build_host check
-	$MIC_BUILD_ENV
-	\$(MAKE) \$(MFLAGS) -C build_mic check ; \
-	RET=\$\$? ; \
-	STARPU_NCPUS=0 \$(MAKE) \$(MFLAGS) -C build_mic check && [ \$\$RET == 0 ]
-
-showfailed:
-	@\$(MAKE) \$(MFLAGS) -C build_host showfailed
-	@\$(MAKE) \$(MFLAGS) -C build_mic showfailed
-
-showcheck:
-	\$(MAKE) \$(MFLAGS) -C build_host showcheck
-	\$(MAKE) \$(MFLAGS) -C build_mic showcheck
-
-install:
-	\$(MAKE) \$(MFLAGS) -C build_host install
-	$MIC_BUILD_ENV
-	\$(MAKE) \$(MFLAGS) -C build_mic install
-	ln -sf "${prefix}/mic/lib/pkgconfig/starpu-1.3.pc" "${prefix}/mic/lib/pkgconfig/starpu-1.3-mic.pc"
-EOF
-fi

+ 1 - 1
mpi/examples/cg/cg.c

@@ -397,7 +397,7 @@ int main(int argc, char **argv)
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 	end = starpu_timing_now();
 
-	FPRINTF_SERVER(stderr, "Problem initialization timing : %2.2f seconds\n", (end-start)/10e6);
+	FPRINTF_SERVER(stderr, "Problem initialization timing : %2.2f seconds\n", (end-start)/1e6);
 
 	ret = cg();
 	if (ret == -ENODEV)

+ 26 - 18
mpi/src/starpu_mpi_task_insert.c

@@ -386,6 +386,14 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
 		{
 			(void)va_arg(varg_list_copy, void *);
 		}
+		else if (arg_type==STARPU_EPILOGUE_CALLBACK)
+		{
+			(void)va_arg(varg_list_copy, _starpu_callback_func_t);
+		}
+		else if (arg_type==STARPU_EPILOGUE_CALLBACK_ARG)
+		{
+			(void)va_arg(varg_list_copy, void *);
+		}
 		else if (arg_type==STARPU_PRIORITY)
 		{
 			prio = va_arg(varg_list_copy, int);
@@ -411,28 +419,28 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
 			(void)va_arg(varg_list_copy, unsigned);
 		}
 		else if (arg_type==STARPU_PROLOGUE_CALLBACK)
-                {
+		{
 			(void)va_arg(varg_list_copy, _starpu_callback_func_t);
 		}
-                else if (arg_type==STARPU_PROLOGUE_CALLBACK_ARG)
-                {
-                        (void)va_arg(varg_list_copy, void *);
-                }
-                else if (arg_type==STARPU_PROLOGUE_CALLBACK_ARG_NFREE)
-                {
-                        (void)va_arg(varg_list_copy, void *);
-                }
-                else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP)
-                {
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK_ARG)
+		{
+			(void)va_arg(varg_list_copy, void *);
+		}
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK_ARG_NFREE)
+		{
+			(void)va_arg(varg_list_copy, void *);
+		}
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP)
+		{
 			(void)va_arg(varg_list_copy, _starpu_callback_func_t);
-                }
-                else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP_ARG)
-                {
-                        (void)va_arg(varg_list_copy, void *);
 		}
-                else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE)
-                {
-                        (void)va_arg(varg_list_copy, void *);
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP_ARG)
+		{
+			(void)va_arg(varg_list_copy, void *);
+		}
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE)
+		{
+			(void)va_arg(varg_list_copy, void *);
 		}
 		else if (arg_type==STARPU_EXECUTE_WHERE)
 		{

+ 24 - 14
mpi/src/starpu_mpi_task_insert_fortran.c

@@ -225,6 +225,16 @@ int _fstarpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_no
 			arg_i++;
 			/* void* */
 		}
+		else if (arg_type==STARPU_EPILOGUE_CALLBACK)
+		{
+			arg_i++;
+			/* _starpu_callback_func_t */
+		}
+		else if (arg_type==STARPU_EPILOGUE_CALLBACK_ARG)
+		{
+			arg_i++;
+			/* void* */
+		}
 		else if (arg_type==STARPU_PRIORITY)
 		{
 			arg_i++;
@@ -256,32 +266,32 @@ int _fstarpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_no
 			/* unsigned* */
 		}
 		else if (arg_type==STARPU_PROLOGUE_CALLBACK)
-                {
+		{
 			arg_i++;
 			/* _starpu_callback_func_t */
 		}
-                else if (arg_type==STARPU_PROLOGUE_CALLBACK_ARG)
-                {
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK_ARG)
+		{
 			arg_i++;
 			/* void* */
-                }
-                else if (arg_type==STARPU_PROLOGUE_CALLBACK_ARG_NFREE)
-                {
+		}
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK_ARG_NFREE)
+		{
 			arg_i++;
 			/* void* */
-                }
-                else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP)
-                {
+		}
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP)
+		{
 			arg_i++;
 			/* _starpu_callback_func_t */
-                }
-                else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP_ARG)
-                {
+		}
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP_ARG)
+		{
 			arg_i++;
 			/* void* */
 		}
-                else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE)
-                {
+		else if (arg_type==STARPU_PROLOGUE_CALLBACK_POP_ARG_NFREE)
+		{
 			arg_i++;
 			/* void* */
 		}

+ 1 - 18
src/Makefile.am

@@ -19,7 +19,7 @@ include $(top_srcdir)/starpu-notests.mk
 
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -DBUILDING_STARPU -DSTARPU_DATADIR='"$(datadir)"'
 AM_CPPFLAGS += $(STARPU_H_CPPFLAGS)
-AM_CPPFLAGS += $(FXT_CFLAGS) $(STARPU_COI_CPPFLAGS) $(STARPU_SCIF_CPPFLAGS) $(STARPU_RCCE_CFLAGS) $(STARPU_RCCE_CPPFLAGS)
+AM_CPPFLAGS += $(FXT_CFLAGS)
 LIBS += -lm $(LIBSTARPU_LDFLAGS)
 if STARPU_USE_MPI_MASTER_SLAVE
 LIBS += $(MPICC_LDFLAGS)
@@ -144,9 +144,6 @@ noinst_HEADERS = 						\
 	drivers/opencl/driver_opencl_utils.h			\
 	drivers/max/driver_fpga.h				\
 	debug/starpu_debug_helpers.h				\
-	drivers/mic/driver_mic_common.h				\
-	drivers/mic/driver_mic_source.h				\
-	drivers/mic/driver_mic_sink.h				\
 	drivers/mpi/driver_mpi_common.h				\
 	drivers/mpi/driver_mpi_source.h				\
 	drivers/mpi/driver_mpi_sink.h				\
@@ -386,20 +383,6 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mp_common/sink_common
 endif
 
 #########################################
-#										#
-#	     MIC compilation				#
-#										#
-#########################################
-
-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mic/driver_mic_init.c
-if STARPU_USE_MIC
-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mic/driver_mic_common.c
-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mic/driver_mic_source.c
-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mic/driver_mic_sink.c
-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mic/driver_mic_utils.c
-endif
-
-#########################################
 #                                       #
 #     MPI Master/Slave compilation      #
 #                                       #

+ 0 - 4
src/common/utils.h

@@ -71,11 +71,7 @@
 #define STARPU_HG_DISABLE_CHECKING(variable) VALGRIND_HG_DISABLE_CHECKING(&(variable), sizeof(variable))
 #define STARPU_HG_ENABLE_CHECKING(variable)  VALGRIND_HG_ENABLE_CHECKING(&(variable), sizeof(variable))
 
-#if defined(__KNC__) || defined(__KNF__)
-#define STARPU_DEBUG_PREFIX "[starpu-mic]"
-#else
 #define STARPU_DEBUG_PREFIX "[starpu]"
-#endif
 
 /* This is needed in some places to make valgrind yield to another thread to be
  * able to progress.  */

+ 0 - 5
src/core/combined_workers.c

@@ -72,14 +72,9 @@ int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[])
 		if ((id < 0) || (id >= basic_worker_count))
 			return -EINVAL;
 
-#ifdef STARPU_USE_MIC
-		STARPU_ASSERT(config->workers[id].arch == STARPU_CPU_WORKER || config->workers[id].arch == STARPU_MIC_WORKER);
-		STARPU_ASSERT(config->workers[id].worker_mask == STARPU_CPU || config->workers[id].worker_mask == STARPU_MIC);
-#else/* STARPU_USE_MIC */
 		/* We only combine CPUs */
 		STARPU_ASSERT(config->workers[id].arch == STARPU_CPU_WORKER);
 		STARPU_ASSERT(config->workers[id].worker_mask == STARPU_CPU);
-#endif /* STARPU_USE_MIC */
 	}
 
 	/* Get an id for that combined worker. Note that this is not thread

+ 1 - 1
src/core/dependencies/data_arbiter_concurrency.c

@@ -286,7 +286,7 @@ unsigned _starpu_attempt_to_submit_arbitered_data_request(unsigned request_from_
 		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 		{
 			cpt++;
-			_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_NOT_ALLOC);
+			_starpu_datawizard_progress(_STARPU_DATAWIZARD_DO_NOT_ALLOC);
 		}
 		if (cpt == STARPU_SPIN_MAXTRY)
 			_starpu_spin_lock(&handle->header_lock);

+ 2 - 2
src/core/dependencies/data_concurrency.c

@@ -132,7 +132,7 @@ static unsigned _starpu_attempt_to_submit_data_request(unsigned request_from_cod
 		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 		{
 			cpt++;
-			_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_NOT_ALLOC);
+			_starpu_datawizard_progress(_STARPU_DATAWIZARD_DO_NOT_ALLOC);
 		}
 		if (cpt == STARPU_SPIN_MAXTRY)
 			_starpu_spin_lock(&handle->header_lock);
@@ -266,7 +266,7 @@ static void _starpu_take_data(unsigned request_from_codelet,
 		while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 		{
 			cpt++;
-			_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_NOT_ALLOC);
+			_starpu_datawizard_progress(_STARPU_DATAWIZARD_DO_NOT_ALLOC);
 		}
 		if (cpt == STARPU_SPIN_MAXTRY)
 			_starpu_spin_lock(&handle->header_lock);

+ 0 - 53
src/core/detect_combined_workers.c

@@ -241,55 +241,19 @@ static void find_and_assign_combinations_without_hwloc(int *workerids, int nwork
 	if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
 		sched_ctx_id = 0;
 	int min, max;
-#ifdef STARPU_USE_MIC
-	unsigned j;
-	int mic_min, mic_max;
-#endif
-
 	struct starpu_worker_collection* workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
 
 	/* We put the id of all CPU workers in this array */
 	int cpu_workers[STARPU_NMAXWORKERS];
 	unsigned ncpus = 0;
-#ifdef STARPU_USE_MIC
-	unsigned nb_mics = _starpu_get_machine_config()->topology.ndevices[STARPU_MIC_WORKER];
-	unsigned * nmics_table;
-	int * mic_id;
-	int ** mic_workers;
-	_STARPU_MALLOC(mic_id, sizeof(int)*nb_mics);
-	_STARPU_MALLOC(nmics_table, sizeof(unsigned)*nb_mics);
-	_STARPU_MALLOC(mic_workers, sizeof(int*)*nb_mics);
-	for(j=0; j<nb_mics; j++)
-	{
-		mic_id[j] = -1;
-		nmics_table[j] = 0;
-		_STARPU_MALLOC(mic_workers[j], sizeof(int)*STARPU_NMAXWORKERS);
-	}
-#endif /* STARPU_USE_MIC */
 
 	for (i = 0; i < nworkers; i++)
 	{
 		struct _starpu_worker *worker = _starpu_get_worker_struct(workerids[i]);
 		if (worker->arch == STARPU_CPU_WORKER)
 			cpu_workers[ncpus++] = i;
-#ifdef STARPU_USE_MIC
-		else if(worker->arch == STARPU_MIC_WORKER)
-		{
-			for(j=0; j<nb_mics && mic_id[j] != worker->devid && mic_id[j] != -1; j++);
-			if(j<nb_mics)
-			{
-				if(mic_id[j] == -1)
-				{
-					mic_id[j] = worker->devid;
-				}
-				mic_workers[j][nmics_table[j]++] = i;
-			}
-		}
-#endif /* STARPU_USE_MIC */
-
 	}
 
-
 	min = starpu_get_env_number("STARPU_MIN_WORKERSIZE");
 	if (min < 2)
 		min = 2;
@@ -298,23 +262,6 @@ static void find_and_assign_combinations_without_hwloc(int *workerids, int nwork
 		max = ncpus;
 
 	assign_combinations_without_hwloc(workers,cpu_workers,ncpus,min,max);
-#ifdef STARPU_USE_MIC
-	mic_min = starpu_get_env_number("STARPU_MIN_WORKERSIZE");
-	mic_max = starpu_get_env_number("STARPU_MAX_WORKERSIZE");
-	if (mic_min < 2)
-		mic_min = 2;
-	for(j=0; j<nb_mics; j++)
-	{
-		int _mic_max = mic_max;
-		if (_mic_max == -1 || _mic_max > (int) nmics_table[j])
-			_mic_max = nmics_table[j];
-		assign_combinations_without_hwloc(workers,mic_workers[j],nmics_table[j],mic_min,_mic_max);
-		free(mic_workers[j]);
-	}
-	free(mic_id);
-	free(nmics_table);
-	free(mic_workers);
-#endif /* STARPU_USE_MIC */
 }
 
 #endif /* STARPU_HAVE_HWLOC */

+ 2 - 2
src/core/perfmodel/energy_model.c

@@ -169,7 +169,7 @@ int starpu_energy_start(int workerid STARPU_ATTRIBUTE_UNUSED, enum starpu_worker
 int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task, unsigned nimpl, unsigned ntasks, int workerid, enum starpu_worker_archtype archi)
 {
 	double energy = 0.;
-	int retval;
+	int retval = 0;
 	unsigned cpuid = 0;
 	double t2 = starpu_timing_now();
 	double t STARPU_ATTRIBUTE_UNUSED = t2 - t1;
@@ -221,7 +221,7 @@ int starpu_energy_stop(struct starpu_perfmodel *model, struct starpu_task *task,
 	case STARPU_CUDA_WORKER:
 	{
 		STARPU_ASSERT_MSG(workerid != -1, "For CUDA GPUs we measure each GPU separately, please specify a worker\n");
-		int ret = nvmlDeviceGetTotalEnergyConsumption(device, &energy_end );
+		int ret = nvmlDeviceGetTotalEnergyConsumption(device, &energy_end);
 		if (ret != NVML_SUCCESS)
 			return -1;
 		energy = (energy_end - energy_begin) / 1000.;

+ 154 - 66
src/core/perfmodel/perfmodel_bus.c

@@ -49,6 +49,9 @@
 
 #ifdef STARPU_HAVE_HWLOC
 #include <hwloc.h>
+#ifdef STARPU_HAVE_LIBNVIDIA_ML
+#include <hwloc/nvml.h>
+#endif
 #ifndef HWLOC_API_VERSION
 #define HWLOC_OBJ_PU HWLOC_OBJ_PROC
 #endif
@@ -91,7 +94,6 @@ static unsigned nnumas = 0;
 static unsigned ncuda = 0;
 static unsigned nopencl = 0;
 #ifndef STARPU_SIMGRID
-static unsigned nmic = 0;
 static unsigned nmpi_ms = 0;
 
 /* Benchmarking the performance of the bus */
@@ -130,11 +132,6 @@ static struct dev_timing opencldev_timing_per_numa[STARPU_MAXOPENCLDEVS*STARPU_M
 static unsigned fpga_affinity_matrix[STARPU_MAXFPGADEVS][STARPU_MAXCPUS];
 #endif
 
-#ifdef STARPU_USE_MIC
-static double mic_time_host_to_device[STARPU_MAXNODES] = {0.0};
-static double mic_time_device_to_host[STARPU_MAXNODES] = {0.0};
-#endif /* STARPU_USE_MIC */
-
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
 static double mpi_time_device_to_device[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS] = {{0.0}};
 static double mpi_latency_device_to_device[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS] = {{0.0}};
@@ -829,23 +826,8 @@ static void benchmark_all_gpu_devices(void)
 	}
 #endif
 
-#ifdef STARPU_USE_MIC
-	/* TODO: implement real calibration ! For now we only put an arbitrary
-	 * value for each device during at the declaration as a bug fix, else
-	 * we get problems on heft scheduler */
-	nmic = _starpu_mic_src_get_device_count();
-
-	for (i = 0; i < STARPU_MAXNODES; i++)
-	{
-		mic_time_host_to_device[i] = 0.1;
-		mic_time_device_to_host[i] = 0.1;
-	}
-#endif /* STARPU_USE_MIC */
-
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
-
 	_starpu_mpi_common_measure_bandwidth_latency(mpi_time_device_to_device, mpi_latency_device_to_device);
-
 #endif /* STARPU_USE_MPI_MASTER_SLAVE */
 
 #ifdef STARPU_HAVE_HWLOC
@@ -1365,9 +1347,6 @@ static void write_bus_latency_file_content(void)
 #ifdef STARPU_USE_OPENCL
 	maxnode += nopencl;
 #endif
-#ifdef STARPU_USE_MIC
-	maxnode += nmic;
-#endif
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
 	maxnode += nmpi_ms;
 #endif
@@ -1441,11 +1420,6 @@ static void write_bus_latency_file_content(void)
 						latency += search_bus_best_latency(dst-b_low, "OpenCL", 1);
 				b_low += nopencl;
 #endif
-#ifdef STARPU_USE_MIC
-				b_up += nmic;
-				/* TODO Latency MIC */
-				b_low += nmic;
-#endif
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
 				b_up += nmpi_ms;
 				/* Modify MPI src and MPI dst if they contain the master node or not
@@ -1717,9 +1691,6 @@ static void write_bus_bandwidth_file_content(void)
 #ifdef STARPU_USE_OPENCL
 	maxnode += nopencl;
 #endif
-#ifdef STARPU_USE_MIC
-	maxnode += nmic;
-#endif
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
 	maxnode += nmpi_ms;
 #endif
@@ -1789,14 +1760,6 @@ static void write_bus_bandwidth_file_content(void)
 					slowness += search_bus_best_timing(dst-b_low, "OpenCL", 1);
 				b_low += nopencl;
 #endif
-#ifdef STARPU_USE_MIC
-				b_up += nmic;
-				if (src >= b_low && src < b_up)
-					slowness += mic_time_device_to_host[src-b_low];
-				if (dst >= b_low && dst < b_up)
-					slowness += mic_time_host_to_device[dst-b_low];
-				b_low += nmic;
-#endif
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
 				b_up += nmpi_ms;
 				/* Modify MPI src and MPI dst if they contain the master node or not
@@ -2062,7 +2025,7 @@ static void check_bus_config_file(void)
 	{
 		FILE *f;
 		int ret;
-		unsigned read_cuda = -1, read_opencl = -1, read_mic = -1, read_mpi_ms = -1;
+		unsigned read_cuda = -1, read_opencl = -1, read_mpi_ms = -1;
 		unsigned read_cpus = -1, read_numa = -1;
 		int locked;
 
@@ -2088,11 +2051,6 @@ static void check_bus_config_file(void)
 		STARPU_ASSERT_MSG(ret == 1, "Error when reading from file '%s'", path);
 		_starpu_drop_comments(f);
 
-		ret = fscanf(f, "%u\t", &read_mic);
-		if (ret == 0)
-			read_mic = 0;
-		_starpu_drop_comments(f);
-
 		ret = fscanf(f, "%u\t", &read_mpi_ms);
 		if (ret == 0)
 			read_mpi_ms = 0;
@@ -2111,9 +2069,6 @@ static void check_bus_config_file(void)
 #ifdef STARPU_USE_OPENCL
 		nopencl = _starpu_opencl_get_device_count();
 #endif
-#ifdef STARPU_USE_MIC
-		nmic = _starpu_mic_src_get_device_count();
-#endif /* STARPU_USE_MIC */
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
 		nmpi_ms = _starpu_mpi_src_get_device_count();
 #endif /* STARPU_USE_MPI_MASTER_SLAVE */
@@ -2123,7 +2078,6 @@ static void check_bus_config_file(void)
 		compare_value_and_recalibrate("NUMA", read_numa, nnumas);
 		compare_value_and_recalibrate("CUDA", read_cuda, ncuda);
 		compare_value_and_recalibrate("OpenCL", read_opencl, nopencl);
-		compare_value_and_recalibrate("MIC", read_mic, nmic);
 		compare_value_and_recalibrate("MPI Master-Slave", read_mpi_ms, nmpi_ms);
 	}
 }
@@ -2150,7 +2104,6 @@ static void write_bus_config_file_content(void)
 	fprintf(f, "%u # Number of NUMA nodes\n", nnumas);
 	fprintf(f, "%u # Number of CUDA devices\n", ncuda);
 	fprintf(f, "%u # Number of OpenCL devices\n", nopencl);
-	fprintf(f, "%u # Number of MIC devices\n", nmic);
 	fprintf(f, "%u # Number of MPI devices\n", nmpi_ms);
 
 	if (locked)
@@ -2379,12 +2332,51 @@ static int find_platform_path_up(hwloc_obj_t obj1, hwloc_obj_t obj2, double band
 	return ret;
 }
 
+static hwloc_obj_t get_hwloc_cuda_obj(hwloc_topology_t topology, unsigned devid)
+{
+	hwloc_obj_t res;
+	struct cudaDeviceProp props;
+	cudaError_t cures;
+
+	res = hwloc_cuda_get_device_osdev_by_index(topology, devid);
+	if (res)
+		return res;
+
+	cures = cudaGetDeviceProperties(&props, devid);
+	if (cures == cudaSuccess)
+	{
+		res = hwloc_get_pcidev_by_busid(topology, props.pciDomainID, props.pciBusID, props.pciDeviceID, 0);
+		if (res)
+			return res;
+
+#ifdef STARPU_HAVE_LIBNVIDIA_ML
+		nvmlDevice_t nvmldev = _starpu_cuda_get_nvmldev(&props);
+
+		if (nvmldev)
+		{
+			unsigned int index;
+			if (nvmlDeviceGetIndex(nvmldev, &index) == NVML_SUCCESS)
+			{
+				res = hwloc_nvml_get_device_osdev_by_index(topology, index);
+				if (res)
+					return res;
+			}
+
+			res = hwloc_nvml_get_device_osdev(topology, nvmldev);
+			if (res)
+				return res;
+		}
+#endif
+	}
+	return NULL;
+}
+
 /* find the path between cuda i and cuda j, and update the maximum bandwidth along the path */
 static int find_platform_cuda_path(hwloc_topology_t topology, unsigned i, unsigned j, double bandwidth)
 {
 	hwloc_obj_t cudai, cudaj;
-	cudai = hwloc_cuda_get_device_osdev_by_index(topology, i);
-	cudaj = hwloc_cuda_get_device_osdev_by_index(topology, j);
+	cudai = get_hwloc_cuda_obj(topology, i);
+	cudaj = get_hwloc_cuda_obj(topology, j);
 
 	if (!cudai || !cudaj)
 		return 0;
@@ -2830,34 +2822,118 @@ static void write_bus_platform_file_content(int version)
 #warning TODO: use libnvml to get NVLink links, otherwise numbers will be bogusly propagated through PCI topology
 #endif
 	/* If we have enough hwloc information, write PCI bandwidths and routes */
-	if (!starpu_get_env_number_default("STARPU_PCI_FLAT", 0))
+	if (!starpu_get_env_number_default("STARPU_PCI_FLAT", 0) && ncuda > 0)
 	{
 		hwloc_topology_t topology;
 		hwloc_topology_init(&topology);
 		_starpu_topology_filter(topology);
 		hwloc_topology_load(topology);
 
-		/* First find paths and record measured bandwidth along the path */
+		char nvlink[ncuda][ncuda];
+		char nvlinkhost[ncuda];
+		memset(nvlink, 0, sizeof(nvlink));
+		memset(nvlinkhost, 0, sizeof(nvlinkhost));
+
+#ifdef STARPU_HAVE_LIBNVIDIA_ML
+		/* First find NVLinks */
+		struct cudaDeviceProp props[ncuda];
+
+		for (i = 0; i < ncuda; i++)
+		{
+			cudaError_t cures = cudaGetDeviceProperties(&props[i], i);
+			if (cures != cudaSuccess)
+				props[i].name[0] = 0;
+		}
+
 		for (i = 0; i < ncuda; i++)
 		{
 			unsigned j;
+
+			if (!props[i].name[0])
+				continue;
+
+			nvmlDevice_t nvmldev;
+			nvmldev = _starpu_cuda_get_nvmldev(&props[i]);
+			if (!nvmldev)
+				continue;
+
+			for (j = 0; j < NVML_NVLINK_MAX_LINKS; j++)
+			{
+				nvmlEnableState_t active;
+				nvmlReturn_t ret;
+				nvmlPciInfo_t pci;
+				unsigned k;
+
+				ret = nvmlDeviceGetNvLinkState(nvmldev, j, &active);
+				if (ret != NVML_SUCCESS)
+					continue;
+				if (active != NVML_FEATURE_ENABLED)
+					continue;
+				ret = nvmlDeviceGetNvLinkRemotePciInfo(nvmldev, j, &pci);
+				if (ret != NVML_SUCCESS)
+					continue;
+
+				hwloc_obj_t obj = hwloc_get_pcidev_by_busid(topology,
+						pci.domain, pci.bus, pci.device, 0);
+				if (obj && obj->type == HWLOC_OBJ_PCI_DEVICE && (obj->attr->pcidev.class_id >> 8 == 0x06))
+				{
+					switch (obj->attr->pcidev.vendor_id)
+					{
+					case 0x1014:
+						/* IBM OpenCAPI port, direct CPU-GPU NVLink */
+						/* TODO: NUMA affinity */
+						nvlinkhost[i] = 1;
+						continue;
+					case 0x10de:
+						/* TODO: NVIDIA NVSwitch */
+						continue;
+					}
+				}
+
+				/* Otherwise, link to another GPU? */
+				for (k = i+1; k < ncuda; k++)
+				{
+					if ((int) pci.domain == props[k].pciDomainID
+					 && (int) pci.bus == props[k].pciBusID
+					 && (int) pci.device == props[k].pciDeviceID)
+					{
+						nvlink[i][k] = 1;
+						nvlink[k][i] = 1;
+						break;
+					}
+				}
+			}
+		}
+#endif
+
+		/* Find paths and record measured bandwidth along the path */
+		for (i = 0; i < ncuda; i++)
+		{
+			unsigned j;
+
 			for (j = 0; j < ncuda; j++)
-				if (i != j)
+				if (i != j && !nvlink[i][j] && !nvlinkhost[i] && !nvlinkhost[j])
 					if (!find_platform_cuda_path(topology, i, j, 1000000. / cudadev_timing_dtod[i][j]))
 					{
+						_STARPU_DISP("Warning: could not get CUDA location from hwloc\n");
 						clean_topology(hwloc_get_root_obj(topology));
 						hwloc_topology_destroy(topology);
 						goto flat_cuda;
 					}
+
 			/* Record RAM/CUDA bandwidths */
-			find_platform_forward_path(hwloc_cuda_get_device_osdev_by_index(topology, i), 1000000. / search_bus_best_timing(i, "CUDA", 0));
-			find_platform_backward_path(hwloc_cuda_get_device_osdev_by_index(topology, i), 1000000. / search_bus_best_timing(i, "CUDA", 1));
+			if (!nvlinkhost[i])
+			{
+				find_platform_forward_path(get_hwloc_cuda_obj(topology, i), 1000000. / search_bus_best_timing(i, "CUDA", 0));
+				find_platform_backward_path(get_hwloc_cuda_obj(topology, i), 1000000. / search_bus_best_timing(i, "CUDA", 1));
+			}
 		}
 
 		/* Ok, found path in all cases, can emit advanced platform routes */
 		fprintf(f, "\n");
 		emit_topology_bandwidths(f, hwloc_get_root_obj(topology), Bps, s);
 		fprintf(f, "\n");
+
 		for (i = 0; i < ncuda; i++)
 		{
 			unsigned j;
@@ -2866,20 +2942,35 @@ static void write_bus_platform_file_content(int version)
 				{
 					fprintf(f, "   <route src=\"CUDA%u\" dst=\"CUDA%u\" symmetrical=\"NO\">\n", i, j);
 					fprintf(f, "    <link_ctn id=\"CUDA%u-CUDA%u\"/>\n", i, j);
-					emit_platform_path_up(f,
-							hwloc_cuda_get_device_osdev_by_index(topology, i),
-							hwloc_cuda_get_device_osdev_by_index(topology, j));
+					if (!nvlink[i][j])
+					{
+						if (nvlinkhost[i] && nvlinkhost[j])
+							/* TODO: NUMA affinity */
+							fprintf(f, "    <link_ctn id=\"Host\"/>\n");
+						else
+							emit_platform_path_up(f,
+									get_hwloc_cuda_obj(topology, i),
+									get_hwloc_cuda_obj(topology, j));
+					}
 					fprintf(f, "   </route>\n");
 				}
 
 			fprintf(f, "   <route src=\"CUDA%u\" dst=\"RAM\" symmetrical=\"NO\">\n", i);
 			fprintf(f, "    <link_ctn id=\"CUDA%u-RAM\"/>\n", i);
-			emit_platform_forward_path(f, hwloc_cuda_get_device_osdev_by_index(topology, i));
+			if (nvlinkhost[i])
+				/* TODO: NUMA affinity */
+				fprintf(f, "    <link_ctn id=\"Host\"/>\n");
+			else
+				emit_platform_forward_path(f, get_hwloc_cuda_obj(topology, i));
 			fprintf(f, "   </route>\n");
 
 			fprintf(f, "   <route src=\"RAM\" dst=\"CUDA%u\" symmetrical=\"NO\">\n", i);
 			fprintf(f, "    <link_ctn id=\"RAM-CUDA%u\"/>\n", i);
-			emit_platform_backward_path(f, hwloc_cuda_get_device_osdev_by_index(topology, i));
+			if (nvlinkhost[i])
+				/* TODO: NUMA affinity */
+				fprintf(f, "    <link_ctn id=\"Host\"/>\n");
+			else
+				emit_platform_backward_path(f, get_hwloc_cuda_obj(topology, i));
 			fprintf(f, "   </route>\n");
 		}
 
@@ -3014,9 +3105,6 @@ void _starpu_load_bus_performance_files(void)
 #if defined(STARPU_USE_MPI_MASTER_SLAVE) || defined(STARPU_USE_SIMGRID)
 	nmpi_ms = _starpu_mpi_src_get_device_count();
 #endif
-#if defined(STARPU_USE_MIC) || defined(STARPU_USE_SIMGRID)
-	nmic = _starpu_mic_src_get_device_count();
-#endif
 
 #ifndef STARPU_SIMGRID
 	check_bus_config_file();

+ 7 - 12
src/core/perfmodel/perfmodel_history.c

@@ -99,24 +99,19 @@ void _starpu_initialize_registered_performance_models(void)
 	unsigned ncores = conf->topology.nhwworker[STARPU_CPU_WORKER][0];
 	unsigned ncuda =  conf->topology.nhwdevices[STARPU_CUDA_WORKER];
 	unsigned nopencl = conf->topology.nhwdevices[STARPU_OPENCL_WORKER];
-	unsigned nmic = 0;
 	enum starpu_worker_archtype archtype;
-#if STARPU_MAXMICDEVS > 0 || STARPU_MAXMPIDEVS > 0
+#if STARPU_MAXMPIDEVS > 0
 	unsigned i;
 #endif
-#if STARPU_MAXMICDEVS > 0
-	for(i = 0; i < conf->topology.nhwdevices[STARPU_MIC_WORKER]; i++)
-		nmic += conf->topology.nhwworker[STARPU_MIC_WORKER][i];
-#endif
 	unsigned nmpi = 0;
 #if STARPU_MAXMPIDEVS > 0
 	for(i = 0; i < conf->topology.nhwdevices[STARPU_MPI_MS_WORKER]; i++)
 		nmpi += conf->topology.nhwworker[STARPU_MPI_MS_WORKER][i];
 #endif
 
-	// We used to allocate 2**(ncores + ncuda + nopencl + nmic + nmpi), this is too big
-	// We now allocate only 2*(ncores + ncuda + nopencl + nmic + nmpi), and reallocate when necessary in starpu_perfmodel_arch_comb_add
-	nb_arch_combs = 2 * (ncores + ncuda + nopencl + nmic + nmpi);
+	// We used to allocate 2**(ncores + ncuda + nopencl + nmpi), this is too big
+	// We now allocate only 2*(ncores + ncuda + nopencl + nmpi), and reallocate when necessary in starpu_perfmodel_arch_comb_add
+	nb_arch_combs = 2 * (ncores + ncuda + nopencl + nmpi);
 	_STARPU_MALLOC(arch_combs, nb_arch_combs*sizeof(struct starpu_perfmodel_arch*));
 	current_arch_comb = 0;
 	historymaxerror = starpu_get_env_number_default("STARPU_HISTORY_MAX_ERROR", STARPU_HISTORYMAXERROR);
@@ -942,8 +937,8 @@ static void dump_model_file(FILE *f, struct starpu_perfmodel *model)
 		{
 			fprintf(f, "####################\n");
 			fprintf(f, "# DEV_%d\n", dev);
-			fprintf(f, "# device type (CPU - %d, CUDA - %d, OPENCL - %d, MIC - %d, MPI_MS - %d)\n",
-				STARPU_CPU_WORKER, STARPU_CUDA_WORKER, STARPU_OPENCL_WORKER, STARPU_MIC_WORKER, STARPU_MPI_MS_WORKER);
+			fprintf(f, "# device type (CPU - %d, CUDA - %d, OPENCL - %d, MPI_MS - %d)\n",
+				STARPU_CPU_WORKER, STARPU_CUDA_WORKER, STARPU_OPENCL_WORKER, STARPU_MPI_MS_WORKER);
 			fprintf(f, "%u\n", arch_combs[comb]->devices[dev].type);
 
 			fprintf(f, "####################\n");
@@ -1256,7 +1251,7 @@ static void _starpu_dump_registered_models(void)
 	     node != _starpu_perfmodel_list_end(&registered_models);
 	     node  = _starpu_perfmodel_list_next(node))
 	{
-		if (node->model->is_init)
+		if (node->model->is_init && (node->model->type != STARPU_PER_WORKER && node->model->type != STARPU_PER_ARCH && node->model->type != STARPU_COMMON))
 			starpu_save_history_based_model(node->model);
 	}
 

+ 6 - 6
src/core/sched_ctx.c

@@ -532,7 +532,7 @@ struct _starpu_sched_ctx* _starpu_create_sched_ctx(struct starpu_sched_policy *p
 						   int min_prio_set, int min_prio,
 						   int max_prio_set, int max_prio,
 						   unsigned awake_workers,
-						   void (*sched_policy_init)(unsigned),
+						   void (*sched_policy_callback)(unsigned),
 						   void * user_data,
 						   int nsub_ctxs, int *sub_ctxs, int nsms)
 {
@@ -589,7 +589,7 @@ struct _starpu_sched_ctx* _starpu_create_sched_ctx(struct starpu_sched_policy *p
 	sched_ctx->main_master = -1;
 	sched_ctx->perf_arch.devices = NULL;
 	sched_ctx->perf_arch.ndevices = 0;
-	sched_ctx->init_sched = sched_policy_init;
+	sched_ctx->callback_sched = sched_policy_callback;
 	sched_ctx->user_data = user_data;
 	sched_ctx->sms_start_idx = 0;
 	sched_ctx->sms_end_idx = STARPU_NMAXSMS;
@@ -2673,10 +2673,10 @@ int starpu_sched_ctx_get_worker_rank(unsigned sched_ctx_id)
 	return -1;
 }
 
-void (*starpu_sched_ctx_get_sched_policy_init(unsigned sched_ctx_id))(unsigned)
+void (*starpu_sched_ctx_get_sched_policy_callback(unsigned sched_ctx_id))(unsigned)
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
-	return sched_ctx->init_sched;
+	return sched_ctx->callback_sched;
 }
 
 unsigned starpu_sched_ctx_has_starpu_scheduler(unsigned sched_ctx_id, unsigned *awake_workers)
@@ -2782,7 +2782,7 @@ static void _get_workers(int min, int max, int *workers, int *nw, enum starpu_wo
 	if(config->topology.nsched_ctxs == 1)
 	{
 		/*we have all available resources */
-		npus = starpu_worker_get_nids_by_type(arch, pus, max);
+		npus = _starpu_worker_get_nids_by_type(arch, pus, max);
 /*TODO: hierarchical ctxs: get max good workers: close one to another */
 		for(i = 0; i < npus; i++)
 			workers[(*nw)++] = pus[i];
@@ -2790,7 +2790,7 @@ static void _get_workers(int min, int max, int *workers, int *nw, enum starpu_wo
 	else
 	{
 		unsigned enough_ressources = 0;
-		npus = starpu_worker_get_nids_ctx_free_by_type(arch, pus, max);
+		npus = _starpu_worker_get_nids_ctx_free_by_type(arch, pus, max);
 
 		for(i = 0; i < npus; i++)
 			workers[(*nw)++] = pus[i];

+ 4 - 4
src/core/sched_ctx.h

@@ -149,8 +149,8 @@ struct _starpu_sched_ctx
 	   them awake & use them in the parallel code*/
 	unsigned awake_workers;
 
-	/** function called when initializing the scheduler */
-	void (*init_sched)(unsigned);
+	/** callback function called when initializing the scheduler */
+	void (*callback_sched)(unsigned);
 
 	int sub_ctxs[STARPU_NMAXWORKERS];
 	int nsub_ctxs;
@@ -184,8 +184,8 @@ void _starpu_init_all_sched_ctxs(struct _starpu_machine_config *config);
 /** allocate all structures belonging to a context */
 struct _starpu_sched_ctx*  _starpu_create_sched_ctx(struct starpu_sched_policy *policy, int *workerid, int nworkerids, unsigned is_init_sched, const char *sched_name,
 						    int min_prio_set, int min_prio,
-						    int max_prio_set, int max_prio, unsigned awake_workers, void (*sched_policy_init)(unsigned), void *user_data,
-							int nsub_ctxs, int *sub_ctxs, int nsms);
+						    int max_prio_set, int max_prio, unsigned awake_workers, void (*sched_policy_callback)(unsigned), void *user_data,
+						    int nsub_ctxs, int *sub_ctxs, int nsms);
 
 /** delete all sched_ctx */
 void _starpu_delete_all_sched_ctxs();

+ 2 - 20
src/core/sched_policy.c

@@ -712,7 +712,7 @@ struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t
 {
 	struct starpu_task *conversion_task;
 
-#if defined(STARPU_USE_OPENCL) || defined(STARPU_USE_CUDA) || defined(STARPU_USE_MIC) || defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_OPENCL) || defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 	struct starpu_multiformat_interface *format_interface;
 #endif
 
@@ -721,7 +721,7 @@ struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t
 	conversion_task->synchronous = 0;
 	STARPU_TASK_SET_HANDLE(conversion_task, handle, 0);
 
-#if defined(STARPU_USE_OPENCL) || defined(STARPU_USE_CUDA) || defined(STARPU_USE_MIC) || defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_OPENCL) || defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 	/* The node does not really matter here */
 	format_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
 #endif
@@ -756,15 +756,6 @@ struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t
 			break;
 		}
 #endif
-#ifdef STARPU_USE_MIC
-		case STARPU_MIC_RAM:
-		{
-			struct starpu_multiformat_data_interface_ops *mf_ops;
-			mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
-			conversion_task->cl = mf_ops->mic_to_cpu_cl;
-			break;
-		}
-#endif
 		default:
 			_STARPU_ERROR("Oops : %u\n", handle->mf_node);
 		}
@@ -787,15 +778,6 @@ struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t
 		break;
 	}
 #endif
-#ifdef STARPU_USE_MIC
-	case STARPU_MIC_RAM:
-	{
-		struct starpu_multiformat_data_interface_ops *mf_ops;
-		mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
-		conversion_task->cl = mf_ops->cpu_to_mic_cl;
-		break;
-	}
-#endif
 	default:
 		STARPU_ABORT();
 	}

+ 1 - 16
src/core/task.c

@@ -724,18 +724,6 @@ void _starpu_codelet_check_deprecated_fields(struct starpu_codelet *cl)
 
 	some_impl = 0;
 	for (i = 0; i < STARPU_MAXIMPLEMENTATIONS; i++)
-		if (cl->mic_funcs[i])
-		{
-			some_impl = 1;
-			break;
-		}
-	if (some_impl && is_where_unset)
-	{
-		where |= STARPU_MIC;
-	}
-
-	some_impl = 0;
-	for (i = 0; i < STARPU_MAXIMPLEMENTATIONS; i++)
 		if (cl->mpi_ms_funcs[i])
 		{
 			some_impl = 1;
@@ -755,7 +743,7 @@ void _starpu_codelet_check_deprecated_fields(struct starpu_codelet *cl)
 		}
 	if (some_impl && is_where_unset)
 	{
-		where |= STARPU_MIC|STARPU_MPI_MS;
+		where |= STARPU_MPI_MS;
 	}
 	cl->where = where;
 
@@ -1492,12 +1480,10 @@ _starpu_handle_needs_conversion_task_for_arch(starpu_data_handle_t handle,
 	switch (node_kind)
 	{
 		case STARPU_CPU_RAM:
-		case STARPU_MIC_RAM:
 		case STARPU_MPI_MS_RAM:
 			switch(starpu_node_get_kind(handle->mf_node))
 			{
 				case STARPU_CPU_RAM:
-				case STARPU_MIC_RAM:
                                 case STARPU_MPI_MS_RAM:
 					return 0;
 				default:
@@ -1508,7 +1494,6 @@ _starpu_handle_needs_conversion_task_for_arch(starpu_data_handle_t handle,
 			switch(starpu_node_get_kind(handle->mf_node))
 			{
 				case STARPU_CPU_RAM:
-				case STARPU_MIC_RAM:
                                 case STARPU_MPI_MS_RAM:
 					return 1;
 				default:

+ 0 - 5
src/core/task.h

@@ -107,11 +107,6 @@ static inline starpu_fpga_func_t _starpu_task_get_fpga_nth_implementation(struct
 	return cl->fpga_funcs[nimpl];
 }
 
-static inline starpu_mic_func_t _starpu_task_get_mic_nth_implementation(struct starpu_codelet *cl, unsigned nimpl)
-{
-	return cl->mic_funcs[nimpl];
-}
-
 static inline starpu_mpi_ms_func_t _starpu_task_get_mpi_ms_nth_implementation(struct starpu_codelet *cl, unsigned nimpl)
 {
 	return cl->mpi_ms_funcs[nimpl];

+ 13 - 302
src/core/topology.c

@@ -28,7 +28,6 @@
 #include <drivers/cuda/driver_cuda.h>
 #include <drivers/cpu/driver_cpu.h>
 #include <drivers/max/driver_fpga.h>
-#include <drivers/mic/driver_mic_source.h>
 #include <drivers/mpi/driver_mpi_source.h>
 #include <drivers/mpi/driver_mpi_common.h>
 #include <drivers/mp_common/source_common.h>
@@ -85,7 +84,6 @@ static int _starpu_get_logical_numa_node_worker(unsigned workerid);
 #define STARPU_NUMA_MAIN_RAM (-1)
 
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_FPGA) || defined(STARPU_SIMGRID) || defined(STARPU_USE_MPI_MASTER_SLAVE)
-
 struct handle_entry
 {
 	UT_hash_handle hh;
@@ -97,16 +95,15 @@ struct handle_entry
 static struct handle_entry *devices_using_cuda;
 #  endif
 
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_FPGA) || defined(STARPU_SIMGRID)
 static unsigned may_bind_automatically[STARPU_NARCH] = { 0 };
+#endif
 
 #endif // defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 static struct _starpu_worker_set cuda_worker_set[STARPU_MAXCUDADEVS];
 #endif
-#ifdef STARPU_USE_MIC
-static struct _starpu_worker_set mic_worker_set[STARPU_MAXMICDEVS];
-#endif
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
 struct _starpu_worker_set mpi_worker_set[STARPU_MAXMPIDEVS];
 #endif
@@ -677,36 +674,6 @@ static inline int _starpu_get_next_fpga_deviceid (struct _starpu_machine_config
 }
 #endif
 
-#if 0
-#if defined(STARPU_USE_MIC) || defined(STARPU_SIMGRID)
-static void _starpu_initialize_workers_mic_deviceid(struct _starpu_machine_config *config)
-{
-	struct _starpu_machine_topology *topology = &config->topology;
-	struct starpu_conf *uconf = &config->conf;
-
-	_starpu_initialize_workers_deviceid(uconf->use_explicit_workers_mic_deviceid == 0
-					    ? NULL
-					    : (int *)config->user_conf->workers_mic_deviceid,
-					    &(config->current_mic_deviceid),
-					    (int *)topology->workers_mic_deviceid,
-					    "STARPU_WORKERS_MICID",
-					    topology->nhwdevices[STARPU_MIC_WORKER],
-					    STARPU_MIC_WORKER);
-}
-#endif
-#endif
-
-#if 0
-#ifdef STARPU_USE_MIC
-static inline int _starpu_get_next_mic_deviceid(struct _starpu_machine_config *config)
-{
-	unsigned i = ((config->current_mic_deviceid++) % config->topology.ndevices[STARPU_MIC_WORKER]);
-
-	return (int)config->topology.workers_mic_deviceid[i];
-}
-#endif
-#endif
-
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
 static inline int _starpu_get_next_mpi_deviceid(struct _starpu_machine_config *config)
 {
@@ -731,89 +698,6 @@ static void _starpu_init_mpi_topology(struct _starpu_machine_config *config, lon
 
 #endif /* STARPU_USE_MPI_MASTER_SLAVE */
 
-#ifdef STARPU_USE_MIC
-static void _starpu_init_mic_topology(struct _starpu_machine_config *config, long mic_idx)
-{
-	/* Discover the topology of the mic node identifier by MIC_IDX. That
-	 * means, make this StarPU instance aware of the number of cores available
-	 * on this MIC device. Update the `nhwworker[STARPU_MIC_WORKER]' topology field
-	 * accordingly. */
-
-	struct _starpu_machine_topology *topology = &config->topology;
-
-	int nbcores;
-	_starpu_src_common_sink_nbcores(_starpu_mic_nodes[mic_idx], &nbcores);
-	topology->nhwworker[STARPU_MIC_WORKER][mic_idx] = nbcores;
-}
-
-static int _starpu_init_mic_node(struct _starpu_machine_config *config, int mic_idx,
-				 COIENGINE *coi_handle, COIPROCESS *coi_process)
-{
-	/* Initialize the MIC node of index MIC_IDX. */
-
-	struct starpu_conf *user_conf = &config->conf;
-
-	char ***argv = _starpu_get_argv();
-	const char *suffixes[] = {"-mic", "_mic", NULL};
-
-	/* Environment variables to send to the Sink, it informs it what kind
-	 * of node it is (architecture and type) as there is no way to discover
-	 * it itself */
-	char mic_idx_env[32];
-	snprintf(mic_idx_env, sizeof(mic_idx_env), "_STARPU_MIC_DEVID=%d", mic_idx);
-
-	/* XXX: this is currently necessary so that the remote process does not
-	 * segfault. */
-	char nb_mic_env[32];
-	snprintf(nb_mic_env, sizeof(nb_mic_env), "_STARPU_MIC_NB=%d", 2);
-
-	const char *mic_sink_env[] = {"STARPU_SINK=STARPU_MIC", mic_idx_env, nb_mic_env, NULL};
-
-	char mic_sink_program_path[1024];
-	/* Let's get the helper program to run on the MIC device */
-	int mic_file_found = _starpu_src_common_locate_file(mic_sink_program_path,
-							    sizeof(mic_sink_program_path),
-							    starpu_getenv("STARPU_MIC_SINK_PROGRAM_NAME"),
-							    starpu_getenv("STARPU_MIC_SINK_PROGRAM_PATH"),
-							    user_conf->mic_sink_program_path,
-							    (argv ? (*argv)[0] : NULL),
-							    suffixes);
-
-	if (0 != mic_file_found)
-	{
-		_STARPU_MSG("No MIC program specified, use the environment\n"
-			    "variable STARPU_MIC_SINK_PROGRAM_NAME or the environment\n"
-			    "or the field 'starpu_conf.mic_sink_program_path'\n"
-			    "to define it.\n");
-
-		return -1;
-	}
-
-	COIRESULT res;
-	/* Let's get the handle which let us manage the remote MIC device */
-	res = COIEngineGetHandle(COI_ISA_MIC, mic_idx, coi_handle);
-	if (STARPU_UNLIKELY(res != COI_SUCCESS))
-		STARPU_MIC_SRC_REPORT_COI_ERROR(res);
-
-	/* We launch the helper on the MIC device, which will wait for us
-	 * to give it work to do.
-	 * As we will communicate further with the device throught scif we
-	 * don't need to keep the process pointer */
-	res = COIProcessCreateFromFile(*coi_handle, mic_sink_program_path, 0, NULL, 0,
-				       mic_sink_env, 1, NULL, 0, NULL,
-				       coi_process);
-	if (STARPU_UNLIKELY(res != COI_SUCCESS))
-		STARPU_MIC_SRC_REPORT_COI_ERROR(res);
-
-	/* Let's create the node structure, we'll communicate with the peer
-	 * through scif thanks to it */
-	_starpu_mic_nodes[mic_idx] =
-		_starpu_mp_common_node_create(STARPU_NODE_MIC_SOURCE, mic_idx);
-
-	return 0;
-}
-#endif
-
 #ifndef STARPU_SIMGRID
 #ifdef STARPU_HAVE_HWLOC
 static void _starpu_allocate_topology_userdata(hwloc_obj_t obj)
@@ -1279,10 +1163,10 @@ unsigned _starpu_topology_get_nnumanodes(struct _starpu_machine_config *config S
 void _starpu_topology_filter(hwloc_topology_t topology)
 {
 #if HWLOC_API_VERSION >= 0x20000
-	hwloc_topology_set_io_types_filter(topology, HWLOC_TYPE_FILTER_KEEP_IMPORTANT);
+	hwloc_topology_set_io_types_filter(topology, HWLOC_TYPE_FILTER_KEEP_ALL);
 	hwloc_topology_set_flags(topology, HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM);
 #else
-	hwloc_topology_set_flags(topology, HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM | HWLOC_TOPOLOGY_FLAG_IO_DEVICES | HWLOC_TOPOLOGY_FLAG_IO_BRIDGES);
+	hwloc_topology_set_flags(topology, HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM | HWLOC_TOPOLOGY_FLAG_WHOLE_IO);
 #endif
 #ifdef HAVE_HWLOC_TOPOLOGY_SET_COMPONENTS
 #  ifndef STARPU_USE_CUDA
@@ -1296,73 +1180,6 @@ void _starpu_topology_filter(hwloc_topology_t topology)
 }
 #endif
 
-#ifdef STARPU_USE_MIC
-static void _starpu_init_mic_config(struct _starpu_machine_config *config,
-				    struct starpu_conf *user_conf,
-				    unsigned mic_idx)
-{
-	// Configure the MIC device of index MIC_IDX.
-
-	struct _starpu_machine_topology *topology = &config->topology;
-
-	topology->nhwworker[STARPU_MIC_WORKER][mic_idx] = 0;
-
-	_starpu_init_mic_topology(config, mic_idx);
-
-	int nmiccores;
-	nmiccores = starpu_get_env_number("STARPU_NMICTHREADS");
-
-	STARPU_ASSERT_MSG(nmiccores >= -1, "nmiccores can not be negative and different from -1 (is is %d)", nmiccores);
-	if (nmiccores == -1)
-	{
-		/* Nothing was specified, so let's use the number of
-		 * detected mic cores. ! */
-		nmiccores = topology->nhwworker[STARPU_MIC_WORKER][mic_idx];
-	}
-	else
-	{
-		if ((unsigned) nmiccores > topology->nhwworker[STARPU_MIC_WORKER][mic_idx])
-		{
-			/* The user requires more MIC cores than there is available */
-			_STARPU_MSG("# Warning: %d MIC cores requested. Only %u available.\n", nmiccores, topology->nhwworker[STARPU_MIC_WORKER][mic_idx]);
-			nmiccores = topology->nhwworker[STARPU_MIC_WORKER][mic_idx];
-		}
-	}
-
-	topology->nworker[STARPU_MIC_WORKER][mic_idx] = nmiccores;
-	STARPU_ASSERT_MSG(topology->nworker[STARPU_MIC_WORKER][mic_idx] + topology->nworkers <= STARPU_NMAXWORKERS,
-			  "topology->nworker[STARPU_MIC_WORKER][mic_idx(%u)] (%u) + topology->nworkers (%u) <= STARPU_NMAXWORKERS (%d)",
-			  mic_idx, topology->nworker[STARPU_MIC_WORKER][mic_idx], topology->nworkers, STARPU_NMAXWORKERS);
-
-	/* _starpu_initialize_workers_mic_deviceid (config); */
-
-	mic_worker_set[mic_idx].workers = &config->workers[topology->nworkers];
-	mic_worker_set[mic_idx].nworkers = topology->nworker[STARPU_MIC_WORKER][mic_idx];
-	unsigned miccore_id;
-	for (miccore_id = 0; miccore_id < topology->nworker[STARPU_MIC_WORKER][mic_idx]; miccore_id++)
-	{
-		int worker_idx = topology->nworkers + miccore_id;
-		config->workers[worker_idx].set = &mic_worker_set[mic_idx];
-		config->workers[worker_idx].arch = STARPU_MIC_WORKER;
-		_STARPU_MALLOC(config->workers[worker_idx].perf_arch.devices, sizeof(struct starpu_perfmodel_device));
-		config->workers[worker_idx].perf_arch.ndevices = 1;
-		config->workers[worker_idx].perf_arch.devices[0].type = STARPU_MIC_WORKER;
-		config->workers[worker_idx].perf_arch.devices[0].devid = mic_idx;
-		config->workers[worker_idx].perf_arch.devices[0].ncores = 1;
-		config->workers[worker_idx].devid = mic_idx;
-		config->workers[worker_idx].subworkerid = miccore_id;
-		config->workers[worker_idx].worker_mask = STARPU_MIC;
-		config->worker_mask |= STARPU_MIC;
-	}
-	_starpu_mic_nodes[mic_idx]->baseworkerid = topology->nworkers;
-
-	topology->nworkers += topology->nworker[STARPU_MIC_WORKER][mic_idx];
-}
-
-static COIENGINE mic_handles[STARPU_MAXMICDEVS];
-COIPROCESS _starpu_mic_process[STARPU_MAXMICDEVS];
-#endif
-
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
 static void _starpu_init_mpi_config(struct _starpu_machine_config *config,
 				    struct starpu_conf *user_conf,
@@ -1423,7 +1240,7 @@ static void _starpu_init_mpi_config(struct _starpu_machine_config *config,
 }
 #endif
 
-#if defined(STARPU_USE_MIC) || defined(STARPU_USE_MPI_MASTER_SLAVE)
+#if defined(STARPU_USE_MPI_MASTER_SLAVE)
 static void _starpu_init_mp_config(struct _starpu_machine_config *config,
 				   struct starpu_conf *user_conf, int no_mp_config)
 {
@@ -1434,44 +1251,6 @@ static void _starpu_init_mp_config(struct _starpu_machine_config *config,
 	 * - configure the workers accordingly.
 	 */
 
-#ifdef STARPU_USE_MIC
-	if (!no_mp_config)
-	{
-		struct _starpu_machine_topology *topology = &config->topology;
-
-		/* Discover and initialize the number of MIC nodes through the mp
-		 * infrastructure. */
-		unsigned nhwmicdevices = _starpu_mic_src_get_device_count();
-
-		int reqmicdevices = starpu_get_env_number("STARPU_NMIC");
-		if (reqmicdevices == -1 && user_conf)
-			reqmicdevices = user_conf->nmic;
-		if (reqmicdevices == -1)
-			/* Nothing was specified, so let's use the number of
-			 * detected mic devices. ! */
-			reqmicdevices = nhwmicdevices;
-
-		STARPU_ASSERT_MSG(reqmicdevices >= -1, "nmic can not be negative and different from -1 (is is %d)", reqmicdevices);
-		if (reqmicdevices != -1)
-		{
-			if ((unsigned) reqmicdevices > nhwmicdevices)
-			{
-				/* The user requires more MIC devices than there is available */
-				_STARPU_MSG("# Warning: %d MIC devices requested. Only %u available.\n", reqmicdevices, nhwmicdevices);
-				reqmicdevices = nhwmicdevices;
-			}
-		}
-
-		topology->ndevices[STARPU_MIC_WORKER] = 0;
-		unsigned i;
-		for (i = 0; i < (unsigned) reqmicdevices; i++)
-			if (0 == _starpu_init_mic_node(config, i, &mic_handles[i], &_starpu_mic_process[i]))
-				topology->ndevices[STARPU_MIC_WORKER]++;
-
-		for (i = 0; i < topology->ndevices[STARPU_MIC_WORKER]; i++)
-			_starpu_init_mic_config(config, user_conf, i);
-	}
-#endif
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
 	{
 		struct _starpu_machine_topology *topology = &config->topology;
@@ -1522,17 +1301,6 @@ static void _starpu_init_mp_config(struct _starpu_machine_config *config,
 }
 #endif
 
-#ifdef STARPU_USE_MIC
-static void _starpu_deinit_mic_node(unsigned mic_idx)
-{
-	_starpu_mp_common_send_command(_starpu_mic_nodes[mic_idx], STARPU_MP_COMMAND_EXIT, NULL, 0);
-
-	COIProcessDestroy(_starpu_mic_process[mic_idx], -1, 0, NULL, NULL);
-
-	_starpu_mp_common_node_destroy(_starpu_mic_nodes[mic_idx]);
-}
-#endif
-
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
 static void _starpu_deinit_mpi_node(int devid)
 {
@@ -1543,17 +1311,12 @@ static void _starpu_deinit_mpi_node(int devid)
 #endif
 
 
-#if defined(STARPU_USE_MIC) || defined(STARPU_USE_MPI_MASTER_SLAVE)
+#if defined(STARPU_USE_MPI_MASTER_SLAVE)
 static void _starpu_deinit_mp_config(struct _starpu_machine_config *config)
 {
 	struct _starpu_machine_topology *topology = &config->topology;
 	unsigned i;
 
-#ifdef STARPU_USE_MIC
-	for (i = 0; i < topology->ndevices[STARPU_MIC_WORKER]; i++)
-		_starpu_deinit_mic_node(i);
-	_starpu_mic_clear_kernels();
-#endif
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
 	for (i = 0; i < topology->ndevices[STARPU_MPI_MS_WORKER]; i++)
 		_starpu_deinit_mpi_node(i);
@@ -1615,10 +1378,6 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config, in
 	for (i = 0; i < (int) (sizeof(cuda_worker_set)/sizeof(cuda_worker_set[0])); i++)
 		cuda_worker_set[i].workers = NULL;
 #endif
-#ifdef STARPU_USE_MIC
-	for (i = 0; i < (int) (sizeof(mic_worker_set)/sizeof(mic_worker_set[0])); i++)
-		mic_worker_set[i].workers = NULL;
-#endif
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
 	for (i = 0; i < (int) (sizeof(mpi_worker_set)/sizeof(mpi_worker_set[0])); i++)
 		mpi_worker_set[i].workers = NULL;
@@ -1848,7 +1607,6 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config, in
 	topology->nworkers += topology->ndevices[STARPU_OPENCL_WORKER];
 #endif
 
-
 #if defined(STARPU_USE_FPGA)
 	int nfpga = config->conf.nfpga;
 	if (nfpga != 0)
@@ -1920,7 +1678,7 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config, in
 	topology->nworkers += topology->ndevices[STARPU_FPGA_WORKER];
 #endif
 
-#if defined(STARPU_USE_MIC) || defined(STARPU_USE_MPI_MASTER_SLAVE)
+#if defined(STARPU_USE_MPI_MASTER_SLAVE)
 	    _starpu_init_mp_config(config, &config->conf, no_mp_config);
 #endif
 
@@ -1934,11 +1692,6 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config, in
 		STARPU_ASSERT_MSG(ncpu >= -1, "ncpus can not be negative and different from -1 (is is %d)", ncpu);
 		if (ncpu == -1)
 		{
-			unsigned mic_busy_cpus = 0;
-			int j = 0;
-			for (j = 0; j < STARPU_MAXMICDEVS; j++)
-				mic_busy_cpus += (topology->nworker[STARPU_MIC_WORKER][j] ? 1 : 0);
-
 			unsigned mpi_ms_busy_cpus = 0;
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
 #ifdef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
@@ -1954,7 +1707,7 @@ static int _starpu_init_machine_config(struct _starpu_machine_config *config, in
 				topology->cuda_th_per_dev == 0 && topology->cuda_th_per_stream == 0 ? (topology->ndevices[STARPU_CUDA_WORKER] ? 1 : 0) :
 				topology->cuda_th_per_stream ? (nworker_per_cuda * topology->ndevices[STARPU_CUDA_WORKER]) : topology->ndevices[STARPU_CUDA_WORKER];
 #endif
-			unsigned already_busy_cpus = mpi_ms_busy_cpus + mic_busy_cpus
+			unsigned already_busy_cpus = mpi_ms_busy_cpus
 				+ cuda_busy_cpus
 				+ topology->ndevices[STARPU_OPENCL_WORKER]
 				+ topology->ndevices[STARPU_FPGA_WORKER];
@@ -2346,6 +2099,9 @@ static size_t _starpu_cpu_get_global_mem_size(int nodeid, struct _starpu_machine
 		}
 	}
 
+	/* Don't eat all memory for ourself */
+	global_mem *= 0.9;
+
 	if (limit < 0)
 		// No limit is defined, we return the global memory size
 		return global_mem;
@@ -2676,11 +2432,6 @@ static void _starpu_init_workers_binding_and_memory(struct _starpu_machine_confi
 	unsigned fpga_bindid[STARPU_MAXFPGADEVS];
 #endif
 
-#ifdef STARPU_USE_MIC
-	unsigned mic_init[STARPU_MAXMICDEVS] = { };
-	unsigned mic_memory_nodes[STARPU_MAXMICDEVS];
-	unsigned mic_bindid[STARPU_MAXMICDEVS];
-#endif
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
 	unsigned mpi_init[STARPU_MAXMPIDEVS] = { };
 	unsigned mpi_memory_nodes[STARPU_MAXMPIDEVS];
@@ -2710,7 +2461,7 @@ static void _starpu_init_workers_binding_and_memory(struct _starpu_machine_confi
 		struct _starpu_worker *workerarg = &config->workers[worker];
 		unsigned devid STARPU_ATTRIBUTE_UNUSED = workerarg->devid;
 
-#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_MIC) || defined(STARPU_SIMGRID) || defined(STARPU_USE_MPI_MASTER_SLAVE) || defined(STARPU_USE_FPGA)
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID) || defined(STARPU_USE_MPI_MASTER_SLAVE) || defined(STARPU_USE_FPGA)
 		/* Perhaps the worker has some "favourite" bindings  */
 		unsigned *preferred_binding = NULL;
 		unsigned npreferred = 0;
@@ -2945,46 +2696,6 @@ static void _starpu_init_workers_binding_and_memory(struct _starpu_machine_confi
 			}
 #endif
 
-#ifdef STARPU_USE_MIC
-		        case STARPU_MIC_WORKER:
-			{
-				unsigned numa;
-				if (mic_init[devid])
-				{
-					memory_node = mic_memory_nodes[devid];
-				}
-				else
-				{
-					mic_init[devid] = 1;
-					/* TODO */
-					//if (may_bind_automatically)
-					//{
-					//	/* StarPU is allowed to bind threads automatically */
-						//	preferred_binding = _starpu_get_mic_affinity_vector(devid);
-					//	npreferred = config->topology.nhwpus;
-					//}
-					mic_bindid[devid] = _starpu_get_next_bindid(config, STARPU_THREAD_ACTIVE, preferred_binding, npreferred);
-					memory_node = mic_memory_nodes[devid] = _starpu_memory_node_register(STARPU_MIC_RAM, devid, &_starpu_driver_mic_node_ops);
-
-					for (numa = 0; numa < nb_numa_nodes; numa++)
-					{
-						_starpu_register_bus(numa, memory_node);
-						_starpu_register_bus(memory_node, numa);
-					}
-
-				}
-				workerarg->bindid = mic_bindid[devid];
-				_starpu_memory_node_add_nworkers(memory_node);
-
-				//This worker can manage transfers on NUMA nodes
-				for (numa = 0; numa < nb_numa_nodes; numa++)
-						_starpu_worker_drives_memory_node(&workerarg->set->workers[0], numa);
-
-				_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
-				break;
-			}
-#endif /* STARPU_USE_MIC */
-
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
 			case STARPU_MPI_MS_WORKER:
 			{
@@ -3167,7 +2878,7 @@ int _starpu_build_topology(struct _starpu_machine_config *config, int no_mp_conf
 
 void _starpu_destroy_topology(struct _starpu_machine_config *config STARPU_ATTRIBUTE_UNUSED)
 {
-#if defined(STARPU_USE_MIC) || defined(STARPU_USE_MPI_MASTER_SLAVE)
+#if defined(STARPU_USE_MPI_MASTER_SLAVE)
 	_starpu_deinit_mp_config(config);
 #endif
 

+ 25 - 85
src/core/workers.c

@@ -46,7 +46,6 @@
 #include <drivers/cuda/driver_cuda.h>
 #include <drivers/opencl/driver_opencl.h>
 #include <drivers/max/driver_fpga.h>
-#include <drivers/mic/driver_mic_source.h>
 #include <drivers/mpi/driver_mpi_source.h>
 #include <drivers/disk/driver_disk.h>
 
@@ -207,16 +206,16 @@ void _starpu__workers_c__register_kobs(void)
 	/* TODO */
 }
 
-struct starpu_driver_info starpu_driver_info[STARPU_NARCH];
+struct _starpu_driver_info starpu_driver_info[STARPU_NARCH];
 
-void starpu_driver_info_register(enum starpu_worker_archtype archtype, const struct starpu_driver_info *info)
+void _starpu_driver_info_register(enum starpu_worker_archtype archtype, const struct _starpu_driver_info *info)
 {
 	starpu_driver_info[archtype] = *info;
 }
 
-struct starpu_memory_driver_info starpu_memory_driver_info[STARPU_MAX_RAM+1];
+struct _starpu_memory_driver_info starpu_memory_driver_info[STARPU_MAX_RAM+1];
 
-void starpu_memory_driver_info_register(enum starpu_node_kind kind, const struct starpu_memory_driver_info *info)
+void _starpu_memory_driver_info_register(enum starpu_node_kind kind, const struct _starpu_memory_driver_info *info)
 {
 	starpu_memory_driver_info[kind] = *info;
 }
@@ -295,10 +294,6 @@ static uint32_t _starpu_worker_exists_and_can_execute(struct starpu_task *task,
 				if (task->cl->opencl_funcs[impl] != NULL)
 					test_implementation = 1;
 				break;
-			case STARPU_MIC_WORKER:
-				if (task->cl->cpu_funcs_name[impl] != NULL || task->cl->mic_funcs[impl] != NULL)
-					test_implementation = 1;
-				break;
                         case STARPU_MPI_MS_WORKER:
                                 if (task->cl->cpu_funcs_name[impl] != NULL || task->cl->mpi_ms_funcs[impl] != NULL)
                                         test_implementation = 1;
@@ -357,11 +352,6 @@ uint32_t _starpu_worker_exists(struct starpu_task *task)
 	    _starpu_worker_exists_and_can_execute(task, STARPU_OPENCL_WORKER))
 		return 1;
 #endif
-#ifdef STARPU_USE_MIC
-	if ((task->where & STARPU_MIC) &&
-	    _starpu_worker_exists_and_can_execute(task, STARPU_MIC_WORKER))
-		return 1;
-#endif
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
 	if ((task->where & STARPU_MPI_MS) &&
 	    _starpu_worker_exists_and_can_execute(task, STARPU_MPI_MS_WORKER))
@@ -393,7 +383,6 @@ static inline int _starpu_can_use_nth_implementation(enum starpu_worker_archtype
 	case STARPU_ANY_WORKER:
 	{
 		int cpu_func_enabled=1, cuda_func_enabled=1, opencl_func_enabled=1;
-		/* TODO: MIC */
 
 #if defined(STARPU_USE_CPU) || defined(STARPU_SIMGRID)
 		starpu_cpu_func_t cpu_func = _starpu_task_get_cpu_nth_implementation(cl, nimpl);
@@ -430,13 +419,6 @@ static inline int _starpu_can_use_nth_implementation(enum starpu_worker_archtype
 		starpu_fpga_func_t func = _starpu_task_get_fpga_nth_implementation(cl, nimpl);
 		return func != NULL;
 	}
-	case STARPU_MIC_WORKER:
-	{
-		starpu_mic_func_t func = _starpu_task_get_mic_nth_implementation(cl, nimpl);
-		const char *func_name = _starpu_task_get_cpu_name_nth_implementation(cl, nimpl);
-
-		return func != NULL || func_name != NULL;
-	}
 	case STARPU_MPI_MS_WORKER:
 	{
 		starpu_mpi_ms_func_t func = _starpu_task_get_mpi_ms_nth_implementation(cl, nimpl);
@@ -836,21 +818,20 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 	for (worker = 0; worker < nworkers; worker++)
 	{
 		struct _starpu_worker *workerarg = &pconfig->workers[worker];
-		unsigned devid = workerarg->devid;
 		workerarg->wait_for_worker_initialization = 0;
 
 		_STARPU_DEBUG("initialising worker %u/%u\n", worker, nworkers);
 
 		_starpu_init_worker_queue(workerarg);
 
-		struct starpu_driver driver;
-		driver.type = workerarg->arch;
 		switch (workerarg->arch)
 		{
 #if defined(STARPU_USE_CPU) || defined(STARPU_SIMGRID)
 			case STARPU_CPU_WORKER:
 			{
-				driver.id.cpu_id = devid;
+				struct starpu_driver driver;
+				driver.type = workerarg->arch;
+				driver.id.cpu_id = workerarg->devid;
 				workerarg->driver_ops = &_starpu_driver_cpu_ops;
 				workerarg->wait_for_worker_initialization = 1;
 
@@ -875,7 +856,9 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 			case STARPU_CUDA_WORKER:
 			{
-				driver.id.cuda_id = devid;
+				struct starpu_driver driver;
+				driver.type = workerarg->arch;
+				driver.id.cuda_id = workerarg->devid;
 				workerarg->driver_ops = &_starpu_driver_cuda_ops;
 				struct _starpu_worker_set *worker_set = workerarg->set;
 
@@ -910,7 +893,9 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 			case STARPU_OPENCL_WORKER:
 			{
 #ifndef STARPU_SIMGRID
-				starpu_opencl_get_device(devid, &driver.id.opencl_id);
+				struct starpu_driver driver;
+				driver.type = workerarg->arch;
+				starpu_opencl_get_device(workerarg->devid, &driver.id.opencl_id);
 				workerarg->driver_ops = &_starpu_driver_opencl_ops;
 				workerarg->wait_for_worker_initialization = 1;
 
@@ -935,6 +920,9 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 
 #if defined(STARPU_USE_FPGA)
 			case STARPU_FPGA_WORKER:
+			{
+				struct starpu_driver driver;
+				driver.type = workerarg->arch;
 				if (!_starpu_may_launch_driver(&pconfig->conf, &driver))
 				{
 					workerarg->run_by_starpu = 0;
@@ -954,33 +942,8 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 				STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
 #endif
 				break;
-#endif
-
-#ifdef STARPU_USE_MIC
-			case STARPU_MIC_WORKER:
-			{
-				/* We spawn only one thread
-				 * per MIC device, which will control all MIC
-				 * workers of this device. (by using a worker set). */
-				struct _starpu_worker_set *worker_set = workerarg->set;
-				if (worker_set->workers != workerarg)
-					break;
-
-				worker_set->set_is_initialized = 0;
-				worker_set->wait_for_set_initialization = 1;
-				workerarg->wait_for_worker_initialization = 0;
-
-				STARPU_PTHREAD_CREATE_ON(
-						"MIC",
-						&worker_set->worker_thread,
-						NULL,
-						_starpu_mic_src_worker,
-						worker_set,
-						_starpu_simgrid_get_host_by_worker(workerarg));
-
-				break;
 			}
-#endif /* STARPU_USE_MIC */
+#endif
 
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
 			case STARPU_MPI_MS_WORKER:
@@ -1129,11 +1092,9 @@ int starpu_conf_init(struct starpu_conf *conf)
 	conf->ncuda = starpu_get_env_number("STARPU_NCUDA");
 	conf->nopencl = starpu_get_env_number("STARPU_NOPENCL");
 	conf->nfpga = starpu_get_env_number("STARPU_NFPGA");
-	conf->nmic = starpu_get_env_number("STARPU_NMIC");
 	conf->nmpi_ms = starpu_get_env_number("STARPU_NMPI_MS");
 	conf->calibrate = starpu_get_env_number("STARPU_CALIBRATE");
 	conf->bus_calibrate = starpu_get_env_number("STARPU_BUS_CALIBRATE");
-	conf->mic_sink_program_path = starpu_getenv("STARPU_MIC_PROGRAM_PATH");
 
 	if (conf->calibrate == -1)
 	     conf->calibrate = 0;
@@ -1145,7 +1106,6 @@ int starpu_conf_init(struct starpu_conf *conf)
 	conf->use_explicit_workers_cuda_gpuid = 0; /* TODO */
 	conf->use_explicit_workers_opencl_gpuid = 0; /* TODO */
 	conf->use_explicit_workers_fpga_deviceid = 0; /* TODO */
-	conf->use_explicit_workers_mic_deviceid = 0; /* TODO */
 	conf->use_explicit_workers_mpi_ms_deviceid = 0; /* TODO */
 
 	conf->single_combined_worker = starpu_get_env_number("STARPU_SINGLE_COMBINED_WORKER");
@@ -1184,14 +1144,6 @@ int starpu_conf_init(struct starpu_conf *conf)
 		conf->disable_asynchronous_fpga_copy = 0;
 #endif
 
-#if defined(STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY)
-	conf->disable_asynchronous_mic_copy = 1;
-#else
-	conf->disable_asynchronous_mic_copy = starpu_get_env_number("STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY");
-	if (conf->disable_asynchronous_mic_copy == -1)
-		conf->disable_asynchronous_mic_copy = 0;
-#endif
-
 #if defined(STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY)
 	conf->disable_asynchronous_mpi_ms_copy = 1;
 #else
@@ -1219,7 +1171,6 @@ int starpu_conf_noworker(struct starpu_conf *conf)
 	conf->ncuda = 0;
 	conf->nopencl = 0;
 	conf->nfpga = 0;
-	conf->nmic = 0;
 	conf->nmpi_ms = 0;
 	return 0;
 }
@@ -1271,8 +1222,10 @@ void _starpu_conf_check_environment(struct starpu_conf *conf)
 	_starpu_conf_set_value_against_environment("STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY", &conf->disable_asynchronous_cuda_copy, conf->precedence_over_environment_variables);
 	_starpu_conf_set_value_against_environment("STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY", &conf->disable_asynchronous_opencl_copy, conf->precedence_over_environment_variables);
 	_starpu_conf_set_value_against_environment("STARPU_DISABLE_ASYNCHRONOUS_FPGA_COPY", &conf->disable_asynchronous_fpga_copy, conf->precedence_over_environment_variables);
-	_starpu_conf_set_value_against_environment("STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY", &conf->disable_asynchronous_mic_copy, conf->precedence_over_environment_variables);
 	_starpu_conf_set_value_against_environment("STARPU_DISABLE_ASYNCHRONOUS_MPI_MS_COPY", &conf->disable_asynchronous_mpi_ms_copy, conf->precedence_over_environment_variables);
+	_starpu_conf_set_value_against_environment("STARPU_MIN_PRIO", &conf->global_sched_ctx_min_priority, conf->precedence_over_environment_variables);
+	_starpu_conf_set_value_against_environment("STARPU_MAX_PRIO", &conf->global_sched_ctx_max_priority, conf->precedence_over_environment_variables);
+	_starpu_conf_set_value_against_environment("STARPU_CATCH_SIGNALS", &conf->catch_signals, conf->precedence_over_environment_variables);
 }
 
 struct starpu_tree* starpu_workers_get_tree(void)
@@ -1453,7 +1406,6 @@ void starpu_drivers_preinit(void)
 	_starpu_cuda_preinit();
 	_starpu_opencl_preinit();
 	_starpu_fpga_preinit();
-	_starpu_mic_preinit();
 	_starpu_mpi_ms_preinit();
 	_starpu_disk_preinit();
 }
@@ -1469,9 +1421,10 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 			    * used, we cannot be a sink. */
 	unsigned worker;
 
+#if !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MP)
 	(void)argc;
 	(void)argv;
-
+#endif
 	/* This initializes _starpu_silent, thus needs to be early */
 	_starpu_util_init();
 
@@ -1609,8 +1562,6 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 	/* Make a copy of arrays */
 	if (_starpu_config.conf.sched_policy_name)
 		_starpu_config.conf.sched_policy_name = strdup(_starpu_config.conf.sched_policy_name);
-	if (_starpu_config.conf.mic_sink_program_path)
-		_starpu_config.conf.mic_sink_program_path = strdup(_starpu_config.conf.mic_sink_program_path);
 	if (_starpu_config.conf.n_cuda_opengl_interoperability)
 	{
 		size_t size = _starpu_config.conf.n_cuda_opengl_interoperability * sizeof(*_starpu_config.conf.cuda_opengl_interoperability);
@@ -1705,7 +1656,7 @@ int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
 	if (!is_a_sink)
 	{
 		struct starpu_sched_policy *selected_policy = _starpu_select_sched_policy(&_starpu_config, _starpu_config.conf.sched_policy_name);
-		_starpu_create_sched_ctx(selected_policy, NULL, -1, 1, "init", (_starpu_config.conf.global_sched_ctx_min_priority != -1), _starpu_config.conf.global_sched_ctx_min_priority, (_starpu_config.conf.global_sched_ctx_max_priority != -1), _starpu_config.conf.global_sched_ctx_max_priority, 1, _starpu_config.conf.sched_policy_init, NULL,  0, NULL, 0);
+		_starpu_create_sched_ctx(selected_policy, NULL, -1, 1, "init", (_starpu_config.conf.global_sched_ctx_min_priority != -1), _starpu_config.conf.global_sched_ctx_min_priority, (_starpu_config.conf.global_sched_ctx_max_priority != -1), _starpu_config.conf.global_sched_ctx_max_priority, 1, _starpu_config.conf.sched_policy_callback, NULL,  0, NULL, 0);
 	}
 
 	_starpu_initialize_registered_performance_models();
@@ -2063,7 +2014,6 @@ void starpu_shutdown(void)
 
 	/* Clear memory */
 	free((char*) _starpu_config.conf.sched_policy_name);
-	free(_starpu_config.conf.mic_sink_program_path);
 	if (_starpu_config.conf.n_cuda_opengl_interoperability)
 		free(_starpu_config.conf.cuda_opengl_interoperability);
 	if (_starpu_config.conf.n_not_launched_drivers)
@@ -2219,21 +2169,11 @@ int starpu_asynchronous_fpga_copy_disabled(void)
 	return _starpu_config.conf.disable_asynchronous_fpga_copy;
 }
 
-int starpu_asynchronous_mic_copy_disabled(void)
-{
-	return _starpu_config.conf.disable_asynchronous_mic_copy;
-}
-
 int starpu_asynchronous_mpi_ms_copy_disabled(void)
 {
         return _starpu_config.conf.disable_asynchronous_mpi_ms_copy;
 }
 
-unsigned starpu_mic_worker_get_count(void)
-{
-	return starpu_worker_get_count_by_type(STARPU_MIC_WORKER);
-}
-
 unsigned starpu_mpi_ms_worker_get_count(void)
 {
 	return starpu_worker_get_count_by_type(STARPU_MPI_MS_WORKER);
@@ -2549,7 +2489,7 @@ int starpu_wake_worker_no_relax(int workerid)
 	return starpu_wakeup_worker_no_relax(workerid, sched_cond, sched_mutex);
 }
 
-int starpu_worker_get_nids_by_type(enum starpu_worker_archtype type, int *workerids, int maxsize)
+int _starpu_worker_get_nids_by_type(enum starpu_worker_archtype type, int *workerids, int maxsize)
 {
 	unsigned nworkers = starpu_worker_get_count();
 
@@ -2571,7 +2511,7 @@ int starpu_worker_get_nids_by_type(enum starpu_worker_archtype type, int *worker
 	return cnt;
 }
 
-int starpu_worker_get_nids_ctx_free_by_type(enum starpu_worker_archtype type, int *workerids, int maxsize)
+int _starpu_worker_get_nids_ctx_free_by_type(enum starpu_worker_archtype type, int *workerids, int maxsize)
 {
 	unsigned nworkers = starpu_worker_get_count();
 	int cnt = 0;

+ 8 - 23
src/core/workers.h

@@ -47,10 +47,6 @@
 #include <drivers/cuda/driver_cuda.h>
 #include <drivers/opencl/driver_opencl.h>
 
-#ifdef STARPU_USE_MIC
-#include <drivers/mic/driver_mic_source.h>
-#endif /* STARPU_USE_MIC */
-
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
 #include <drivers/mpi/driver_mpi_source.h>
 #endif
@@ -327,14 +323,6 @@ struct _starpu_machine_topology
 	 */
 	unsigned workers_fpga_deviceid[STARPU_NMAXWORKERS];
 
-	/*** Indicates the successive MIC devices that should be used
-	 * by the MIC driver.  It is either filled according to the
-	 * user's explicit parameters (from starpu_conf) or according
-	 * to the STARPU_WORKERS_MICID env. variable. Otherwise, they
-	 * are taken in ID order. */
-	/** TODO */
-	/** unsigned workers_mic_deviceid[STARPU_NMAXWORKERS]; */
-
 	unsigned workers_mpi_ms_deviceid[STARPU_NMAXWORKERS];
 
 };
@@ -362,9 +350,6 @@ struct _starpu_machine_config
         /* Which FPGA(s) do we use for FPGA? */
 	int current_fpga_deviceid;
 
-	/** Which MIC do we use? */
-	int current_mic_deviceid;
-
 	/** Which MPI do we use? */
 	int current_mpi_deviceid;
 
@@ -425,7 +410,7 @@ struct _starpu_machine_config
 };
 
 /** Provides information for a device driver */
-struct starpu_driver_info
+struct _starpu_driver_info
 {
 	const char *name_upper;	/**< Name of worker type in upper case */
 	const char *name_var;	/**< Name of worker type for environment variables */
@@ -435,21 +420,21 @@ struct starpu_driver_info
 };
 
 /** Device driver information, indexed by enum starpu_worker_archtype */
-extern struct starpu_driver_info starpu_driver_info[STARPU_NARCH];
+extern struct _starpu_driver_info starpu_driver_info[STARPU_NARCH];
 
-void starpu_driver_info_register(enum starpu_worker_archtype archtype, const struct starpu_driver_info *info);
+void _starpu_driver_info_register(enum starpu_worker_archtype archtype, const struct _starpu_driver_info *info);
 
 /** Provides information for a memory node driver */
-struct starpu_memory_driver_info
+struct _starpu_memory_driver_info
 {
 	const char *name_upper;	/**< Name of memory in upper case */
 	enum starpu_worker_archtype worker_archtype;	/**< Kind of device */
 };
 
 /** Memory driver information, indexed by enum starpu_node_kind */
-extern struct starpu_memory_driver_info starpu_memory_driver_info[STARPU_MAX_RAM+1];
+extern struct _starpu_memory_driver_info starpu_memory_driver_info[STARPU_MAX_RAM+1];
 
-void starpu_memory_driver_info_register(enum starpu_node_kind kind, const struct starpu_memory_driver_info *info);
+void _starpu_memory_driver_info_register(enum starpu_node_kind kind, const struct _starpu_memory_driver_info *info);
 
 extern int _starpu_worker_parallel_blocks;
 
@@ -603,11 +588,11 @@ static inline struct _starpu_sched_ctx* _starpu_get_initial_sched_ctx(void)
 	return &_starpu_config.sched_ctxs[STARPU_GLOBAL_SCHED_CTX];
 }
 
-int starpu_worker_get_nids_by_type(enum starpu_worker_archtype type, int *workerids, int maxsize);
+int _starpu_worker_get_nids_by_type(enum starpu_worker_archtype type, int *workerids, int maxsize);
 
 /** returns workers not belonging to any context, be careful no mutex is used,
    the list might not be updated */
-int starpu_worker_get_nids_ctx_free_by_type(enum starpu_worker_archtype type, int *workerids, int maxsize);
+int _starpu_worker_get_nids_ctx_free_by_type(enum starpu_worker_archtype type, int *workerids, int maxsize);
 
 static inline unsigned _starpu_worker_mutex_is_sched_mutex(int workerid, starpu_pthread_mutex_t *mutex)
 {

+ 2 - 1
src/datawizard/coherency.c

@@ -81,6 +81,7 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 				unsigned handling_node;
 
 				/* Avoid indirect transfers */
+				/* TODO: but with NVLink, that might be better than a "direct" transfer that actually goes through the Host! */
 				if (!link_supports_direct_transfers(handle, i, destination, &handling_node))
 					continue;
 
@@ -911,7 +912,7 @@ void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_
 	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
 	{
 		cpt++;
-		_starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC);
+		_starpu_datawizard_progress(_STARPU_DATAWIZARD_DO_ALLOC);
 	}
 	if (cpt == STARPU_SPIN_MAXTRY)
 		_starpu_spin_lock(&handle->header_lock);

+ 0 - 0
src/datawizard/copy_driver.c


Kaikkia tiedostoja ei voida näyttää, sillä liian monta tiedostoa muuttui tässä diffissä