Browse Source

nmad/ merge trunk in branch

Guillaume Beauchamp 8 years ago
parent
commit
16ba2daebd
100 changed files with 1752 additions and 696 deletions
  1. 5 0
      AUTHORS
  2. 40 1
      ChangeLog
  3. 2 1
      Makefile.am
  4. 108 33
      configure.ac
  5. 3 2
      doc/doxygen/chapters/110_basic_examples.doxy
  6. 29 6
      doc/doxygen/chapters/210_check_list_performance.doxy
  7. 15 8
      doc/doxygen/chapters/301_tasks.doxy
  8. 130 3
      doc/doxygen/chapters/310_data_management.doxy
  9. 11 4
      doc/doxygen/chapters/320_scheduling.doxy
  10. 3 3
      doc/doxygen/chapters/330_scheduling_contexts.doxy
  11. 5 1
      doc/doxygen/chapters/350_modularized_scheduler.doxy
  12. 6 3
      doc/doxygen/chapters/370_online_performance_tools.doxy
  13. 5 1
      doc/doxygen/chapters/380_offline_performance_tools.doxy
  14. 19 10
      doc/doxygen/chapters/390_faq.doxy
  15. 1 1
      doc/doxygen/chapters/401_out_of_core.doxy
  16. 38 19
      doc/doxygen/chapters/410_mpi_support.doxy
  17. 3 2
      doc/doxygen/chapters/430_mic_scc_support.doxy
  18. 4 3
      doc/doxygen/chapters/470_simgrid.doxy
  19. 31 11
      doc/doxygen/chapters/501_environment_variables.doxy
  20. 15 0
      doc/doxygen/chapters/510_configure_options.doxy
  21. 21 12
      doc/doxygen/chapters/api/codelet_and_tasks.doxy
  22. 17 0
      doc/doxygen/chapters/api/cuda_extensions.doxy
  23. 5 2
      doc/doxygen/chapters/api/data_out_of_core.doxy
  24. 4 2
      doc/doxygen/chapters/api/data_partition.doxy
  25. 1 1
      doc/doxygen/chapters/api/insert_task.doxy
  26. 5 2
      doc/doxygen/chapters/api/modularized_scheduler.doxy
  27. 1 1
      doc/doxygen/chapters/api/mpi.doxy
  28. 4 3
      doc/doxygen/chapters/api/scheduling_contexts.doxy
  29. 23 6
      doc/doxygen/chapters/api/scheduling_policy.doxy
  30. 7 7
      doc/doxygen/chapters/api/task_lists.doxy
  31. 66 0
      doc/doxygen/chapters/api/workers.doxy
  32. 6 3
      doc/doxygen/chapters/code/multiformat.c
  33. 2 1
      doc/doxygen/chapters/code/vector_scal_c.c
  34. 2 1
      doc/doxygen/chapters/code/vector_scal_opencl_codelet.cl
  35. 13 8
      examples/Makefile.am
  36. 2 2
      examples/basic_examples/mult.c
  37. 5 0
      examples/cg/cg.c
  38. 7 7
      examples/cholesky/cholesky_models.c
  39. 2 2
      examples/cpp/add_vectors_cpp11.cpp
  40. 1 1
      examples/filters/fblock_opencl_kernel.cl
  41. 5 0
      examples/filters/fmultiple_manual.c
  42. 105 79
      examples/heat/dw_sparse_cg.c
  43. 7 3
      examples/heat/heat.c
  44. 44 0
      examples/heat/heat.sh
  45. 19 2
      examples/interface/complex_codelet.h
  46. 34 0
      examples/lu/lu.sh
  47. 3 1
      examples/lu/lu_example.c
  48. 4 0
      examples/lu/xlu_implicit_pivot.c
  49. 5 0
      examples/lu/xlu_pivot.c
  50. 3 0
      examples/mandelbrot/mandelbrot.c
  51. 45 20
      examples/mlr/mlr.c
  52. 8 2
      examples/sched_ctx/dummy_sched_with_ctx.c
  53. 5 3
      examples/sched_ctx/gpu_partition.c
  54. 5 1
      examples/sched_ctx/nested_sched_ctxs.c
  55. 8 6
      examples/sched_ctx/parallel_code.c
  56. 15 11
      examples/sched_ctx/parallel_tasks_reuse_handle.c
  57. 19 10
      examples/sched_ctx/sched_ctx.c
  58. 2 2
      examples/sched_ctx/two_cpu_contexts.c
  59. 4 2
      examples/scheduler/dummy_sched.c
  60. 6 5
      examples/spmv/spmv.c
  61. 2 3
      examples/stencil/stencil-blocks.c
  62. 12 27
      include/fstarpu_mod.f90
  63. 9 1
      include/starpu.h
  64. 3 1
      include/starpu_config.h.in
  65. 5 1
      include/starpu_cublas_v2.h
  66. 41 0
      include/starpu_cusparse.h
  67. 2 0
      include/starpu_data.h
  68. 57 16
      include/starpu_data_interfaces.h
  69. 2 0
      include/starpu_disk.h
  70. 2 0
      include/starpu_fxt.h
  71. 2 1
      include/starpu_perfmodel.h
  72. 19 0
      include/starpu_sched_component.h
  73. 7 9
      include/starpu_sched_ctx.h
  74. 5 6
      include/starpu_scheduler.h
  75. 6 4
      include/starpu_task.h
  76. 33 125
      include/starpu_task_list.h
  77. 10 9
      include/starpu_task_util.h
  78. 1 0
      include/starpu_thread_util.h
  79. 15 10
      include/starpu_util.h
  80. 24 2
      include/starpu_worker.h
  81. 109 0
      mpi/dev/starpu_mpi_comm_check.sh
  82. 26 3
      mpi/examples/Makefile.am
  83. 17 1
      mpi/examples/complex/mpi_complex.c
  84. 2 2
      mpi/examples/matrix_decomposition/mpi_decomposition_params.c
  85. 17 1
      mpi/examples/stencil/stencil5.c
  86. 27 17
      mpi/src/starpu_mpi.c
  87. 37 56
      mpi/src/starpu_mpi_collective.c
  88. 1 1
      mpi/src/starpu_mpi_comm.c
  89. 23 3
      mpi/src/starpu_mpi_datatype.c
  90. 1 1
      mpi/src/starpu_mpi_init.c
  91. 2 2
      mpi/src/starpu_mpi_private.c
  92. 19 19
      mpi/src/starpu_mpi_private.h
  93. 7 1
      mpi/src/starpu_mpi_task_insert.c
  94. 5 0
      mpi/src/starpu_mpi_task_insert_fortran.c
  95. 52 29
      mpi/tests/Makefile.am
  96. 9 6
      mpi/tests/block_interface.c
  97. 11 7
      mpi/tests/block_interface_pinned.c
  98. 107 6
      mpi/tests/datatypes.c
  99. 10 6
      mpi/tests/early_request.c
  100. 0 0
      mpi/tests/gather.c

+ 5 - 0
AUTHORS

@@ -6,10 +6,12 @@ Berenger Bramas <berenger.bramas@inria.fr>
 Alfredo Buttari <alfredo.buttari@enseeiht.fr>
 Adrien Cassagne <adrien.cassagne@inria.fr>
 Jérôme Clet-Ortega <jerome.clet-ortega@labri.fr>
+Terry Cojean <terry.cojean@inria.fr>
 Nicolas Collin <nicolas.collin@inria.fr>
 Ludovic Courtès <ludovic.courtes@inria.fr>
 Yann Courtois <yann.courtois33@gmail.com>
 Jean-Marie Couteyen <jm.couteyen@gmail.com>
+Lionel Eyraud-Dubois <lionel.eyraud-dubois@inria.fr>
 Nathalie Furmento <nathalie.furmento@labri.fr>
 David Gómez <david_gomez1380@yahoo.com.mx>
 Sylvain Henry <sylvain.henry@inria.fr>
@@ -22,9 +24,11 @@ Brice Mortier <brice.mortier@etu.u-bordeaux1.fr>
 Stojce Nakov <stojce.nakov@inria.fr>
 Joris Pablo <joris.pablo@orange.fr>
 Damien Pasqualinotto <dam.pasqualinotto@wanadoo.fr>
+Samuel Pitoiset <samuel.pitoiset@inria.fr>
 Nguyen Quôc-Dinh <nguyen.quocdinh@gmail.com>
 Cyril Roelandt <cyril.roelandt@inria.fr>
 Anthony Roy <theanthony33@gmail.com>
+Chiheb Sakka <chiheb.sakka@inria.fr>
 Corentin Salingue <corentin.salingue@gmail.com>
 Marc Sergent <marc.sergent@inria.fr>
 Anthony Simonet <anthony.simonet@etu.u-bordeaux.fr>
@@ -33,4 +37,5 @@ Ludovic Stordeur <ludovic.stordeur@inria.fr>
 Guillaume Sylvand <guillaume.sylvand@airbus.com>
 François Tessier <francois.tessier@inria.fr>
 Samuel Thibault <samuel.thibault@labri.fr>
+Leo Villeveygoux <leo.villeveygoux@inria.fr>
 Pierre-André Wacrenier <wacrenier@labri.fr>

+ 40 - 1
ChangeLog

@@ -43,12 +43,17 @@ Small features:
   * New function starpu_worker_display_names to display the names of
     all the workers of a specified type.
   * Arbiters now support concurrent read access.
+  * Add a field starpu_task::where similar to starpu_codelet::where
+    which allows to restrict where to execute a task. Also add
+    STARPU_TASK_WHERE to be used when calling starpu_task_insert().
 
 Changes:
   * Vastly improve simgrid simulation time.
 
 Small changes:
  * Use asynchronous transfers for task data fetches which were not prefetched.
+  * Allow to call starpu_sched_ctx_set_policy_data on the main
+    scheduler context
 
 StarPU 1.2.2 (svn revision xxx)
 ==============================================
@@ -56,6 +61,39 @@ StarPU 1.2.2 (svn revision xxx)
 New features:
   * Add starpu_data_acquire_try and starpu_data_acquire_on_node_try.
   * Add NVCC_CC environment variable.
+  * Add -no-flops and -no-events options to starpu_fxt_tool to make
+    traces lighter
+  * Add starpu_cusparse_init/shutdown/get_local_handle for proper CUDA
+    overlapping with cusparse.
+  * Allow precise debugging by setting STARPU_TASK_BREAK_ON_PUSH,
+    STARPU_TASK_BREAK_ON_SCHED, STARPU_TASK_BREAK_ON_POP, and
+    STARPU_TASK_BREAK_ON_EXEC environment variables, with the job_id
+    of a task. StarPU will raise SIGTRAP when the task is being
+    scheduled, pushed, or popped by the scheduler.
+
+Small features:
+  * New function starpu_worker_get_job_id(struct starpu_task *task)
+    which returns the job identifier for a given task
+  * Show package/numa topology in starpu_machine_display
+  * MPI: Add mpi communications in dag.dot
+  * Add STARPU_PERF_MODEL_HOMOGENEOUS_CPU environment variable to
+    allow having one perfmodel per CPU core
+
+Small changes:
+  * Output generated through STARPU_MPI_COMM has been modified to
+    allow easier automated checking
+  * MPI: Fix reactivity of the beginning of the application, when a
+    lot of ready requests have to be processed at the same time, we
+    want to poll the pending requests from time to time.
+  * MPI: Fix gantt chart for starpu_mpi_irecv: it should use the
+    termination time of the request, not the submission time.
+  * MPI: Modify output generated through STARPU_MPI_COMM to allow
+    easier automated checking
+  * MPI: enable more tests in simgrid mode
+  * Use assumed-size instead of assumed-shape arrays for native
+    fortran API, for better backward compatibility.
+  * Fix odd ordering of CPU workers on CPUs due to GPUs stealing some
+    cores
 
 StarPU 1.2.1 (svn revision 20299)
 ==============================================
@@ -268,6 +306,7 @@ Small features:
     allows to copy in a new buffer values which have not been unpacked by
     the current call
   * Add STARPU_CODELET_SIMGRID_EXECUTE flag.
+  * Add STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT flag.
   * Add STARPU_CL_ARGS flag to starpu_task_insert() and
     starpu_mpi_task_insert() functions call
 
@@ -480,7 +519,7 @@ New features:
   * Add the Hypervisor to manage the Scheduling Contexts automatically
     	- The Contexts can be registered to the Hypervisor
 	- Only the registered contexts are managed by the Hypervisor
-	- The Hypervisor can detect the initial distribution of resources of 
+	- The Hypervisor can detect the initial distribution of resources of
 	a context and constructs it consequently (the cost of execution is required)
     	- Several policies can adapt dynamically the distribution of resources
 	in contexts if the initial one was not appropriate

+ 2 - 1
Makefile.am

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009-2016  Université de Bordeaux
+# Copyright (C) 2009-2017  Université de Bordeaux
 # Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
 # Copyright (C) 2014  INRIA
 # Copyright (C) 2016  Inria
@@ -103,6 +103,7 @@ versinclude_HEADERS = 				\
 	include/starpu_disk.h			\
 	include/starpu_cublas.h			\
 	include/starpu_cublas_v2.h		\
+	include/starpu_cusparse.h		\
 	include/starpu_driver.h			\
 	include/starpu_stdlib.h			\
 	include/starpu_thread.h			\

+ 108 - 33
configure.ac

@@ -3,7 +3,7 @@
 # Copyright (C) 2009-2017  Université de Bordeaux
 # Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
 # Copyright (C) 2011  Télécom-SudParis
-# Copyright (C) 2011, 2012, 2014-2016  INRIA
+# Copyright (C) 2011, 2012, 2014-2017  INRIA
 #
 # StarPU is free software; you can redistribute it and/or modify
 # it under the terms of the GNU Lesser General Public License as published by
@@ -87,6 +87,7 @@ AC_OPENMP
 if test x$enable_perf_debug = xyes; then
     enable_shared=no
 fi
+default_enable_mpi_check=maybe
 
 ###############################################################################
 #                                                                             #
@@ -185,6 +186,10 @@ if test x$enable_simgrid = xyes ; then
 	# We won't bind or detect anything
 	with_hwloc=no
 
+        # disable mpi checks by default, they require static linking, we don't
+        # want that by default
+	default_enable_mpi_check=no
+
 	# Simgrid 3.12 & 3.13 need -std=c++11 to be able to build anything in C++...
 	AC_LANG_PUSH([C++])
 	AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
@@ -198,7 +203,15 @@ if test x$enable_simgrid = xyes ; then
 			  CXXFLAGS="-std=c++11 $CXXFLAGS"
 			  NVCCFLAGS="-std=c++11 $NVCCFLAGS")
 	AC_LANG_POP([C++])
+	AC_ARG_ENABLE(simgrid-mc, [AS_HELP_STRING([--enable-simgrid-mc],
+				[Enable using Model Checker of simgrid])],
+				enable_simgrid_mc=$enableval, enable_simgrid_mc=no)
+	if test x$enable_simgrid_mc = xyes ; then
+		AC_DEFINE(STARPU_SIMGRID_MC, [1], [Define this to enable Model Checker in simgrid execution])
+		AC_PATH_PROG([SIMGRID_MC], [simgrid-mc], [no], [$simgrid_dir/bin:$PATH])
+	fi
 fi
+AM_CONDITIONAL(STARPU_SIMGRID_MC, test x$enable_simgrid_mc = xyes)
 AM_CONDITIONAL(STARPU_SIMGRID, test x$enable_simgrid = xyes)
 AC_SUBST(SIMGRID_CFLAGS)
 AC_SUBST(SIMGRID_LIBS)
@@ -348,9 +361,10 @@ else
     build_mpi_master_slave=no
 fi
 
-#Warn users that they cannot use both at the same time
+#users cannot use both at the same time
 if test x$build_mpi_master_slave = xyes -a x$enable_mpi = xyes; then
-    AC_MSG_WARN(StarPU-MPI and MPI Master-Slave cannot be used at the same time !)
+    AC_MSG_WARN(StarPU-MPI and MPI Master-Slave cannot be used at the same time ! Disabling StarPU-MPI...)
+	enable_mpi=no
 fi
 
 if test x$build_mpi_master_slave = xyes; then
@@ -489,7 +503,8 @@ AM_CONDITIONAL([STARPU_CROSS_COMPILING], [test "x$cross_compiling" = "xyes"])
 ###############################################################################
 
 # If the user specifically asks for it, or if we are in a developer checkout, we enable mpi check
-AC_ARG_ENABLE(mpi-check, AC_HELP_STRING([--enable-mpi-check], [Enable execution of MPI testcases]))
+AC_ARG_ENABLE(mpi-check, AC_HELP_STRING([--enable-mpi-check], [Enable execution of MPI testcases]),
+	      [enable_mpi_check=$enableval], [enable_mpi_check=$default_enable_mpi_check])
 running_mpi_check=no
 if test $svndir = 1 -o -d "$srcdir/.git" ; then
     running_mpi_check=yes
@@ -502,37 +517,42 @@ if test x$enable_mpi_check = xno ; then
 fi
 
 
-# Check if mpiexec is available
-AC_ARG_WITH(mpiexec, [AS_HELP_STRING([--with-mpiexec[=<path to mpiexec>]],
-            [Path of mpiexec])],
-    [
-        if test x$withval = xyes; then
-            AC_MSG_ERROR(--with-mpiexec must be given a pathname)
-        else
-            mpiexec_path=$withval
-        fi
-    ],
-    [
-        # nothing was specified: look in the path
-        AC_PATH_PROG(mpiexec_path, mpiexec, [no], [$(dirname $mpicc_path):$PATH])
-    ])
-
-AC_MSG_CHECKING(whether mpiexec is available)
-AC_MSG_RESULT($mpiexec_path)
-
-# We test if MPIEXEC exists
-if test ! -x $mpiexec_path; then
-    #MPIEXEC does not exists or is not executable
-    AC_MSG_RESULT(The mpiexec script is not valid)
-        running_mpi_check=no
-        mpiexec_path=""
+if test x$enable_simgrid = xno ; then
+    # Check if mpiexec is available
+    AC_ARG_WITH(mpiexec, [AS_HELP_STRING([--with-mpiexec[=<path to mpiexec>]],
+                [Path of mpiexec])],
+        [
+            if test x$withval = xyes; then
+                AC_MSG_ERROR(--with-mpiexec must be given a pathname)
+            else
+                mpiexec_path=$withval
+            fi
+        ],
+        [
+            # nothing was specified: look in the path
+            AC_PATH_PROG(mpiexec_path, mpiexec, [no], [$(dirname $mpicc_path):$PATH])
+        ])
+    
+    AC_MSG_CHECKING(whether mpiexec is available)
+    AC_MSG_RESULT($mpiexec_path)
+    
+    # We test if MPIEXEC exists
+    if test ! -x $mpiexec_path; then
+        #MPIEXEC does not exists or is not executable
+        AC_MSG_RESULT(The mpiexec script is not valid)
+            running_mpi_check=no
+            mpiexec_path=""
+    fi
+    AC_SUBST(MPIEXEC,$mpiexec_path)
 fi
 
 AM_CONDITIONAL(STARPU_MPI_CHECK, test x$running_mpi_check = xyes)
+if test x$running_mpi_check = xyes -a x$enable_simgrid = xyes -a x$enable_shared = xyes ; then
+    AC_MSG_ERROR([MPI with simgrid can not work with shared libraries, use --disable-shared to fix this])
+fi
 if test x$use_mpi = xyes ; then
     AC_MSG_CHECKING(whether MPI tests should be run)
     AC_MSG_RESULT($running_mpi_check)
-    AC_SUBST(MPIEXEC,$mpiexec_path)
 fi
 
 #We can only build StarPU MPI Library if User wants it and MPI is available
@@ -775,9 +795,12 @@ fi
 AC_CHECK_FUNC([sched_yield], [AC_DEFINE([STARPU_HAVE_SCHED_YIELD], [1], [Define to 1 if the function sched_yield is available.])])
 
 AC_CHECK_HEADERS([aio.h])
+AC_CHECK_HEADERS([libaio.h])
 AC_CHECK_LIB([rt], [aio_read])
+AC_CHECK_LIB([aio], [io_setup])
 
 AC_CHECK_FUNCS([mkostemp])
+AC_CHECK_FUNCS([mkdtemp])
 
 AC_CHECK_FUNCS([pread pwrite])
 
@@ -808,6 +831,18 @@ if test x$enable_libnuma = xyes; then
 	AC_DEFINE(STARPU_HAVE_LIBNUMA,[1],[libnuma is available])
 fi
 
+AC_MSG_CHECKING(whether statement expressions are available)
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+#define maxint(a,b) ({int _a = (a), _b = (b); _a > _b ? _a : _b; })
+]],
+		[[ int x=maxint(12,42); ]])],
+		[statement_expressions="yes"],
+		[statement_expressions="no"])
+AC_MSG_RESULT($statement_expressions)
+if test x$statement_expressions = xyes; then
+	AC_DEFINE(STARPU_HAVE_STATEMENT_EXPRESSIONS,[1],[statement expressions are available])
+fi
+
 ###############################################################################
 #									      #
 #				SCHED_CTX settings			      #
@@ -1139,6 +1174,9 @@ if test x$enable_cuda = xyes; then
 	fi
 
 	AC_CHECK_HEADERS([cuda_gl_interop.h])
+
+	AC_CHECK_LIB([cusparse], [cusparseCreate])
+	AC_CHECK_DECLS([cusparseSetStream], [], [], [[#include <cusparse.h>]])
 fi
 
 dnl Hey dude, are you around?
@@ -1482,7 +1520,14 @@ AC_ARG_WITH(host-param)
 AC_MSG_CHECKING(maximum number of MIC devices)
 AC_ARG_ENABLE(maxmicdev, [AS_HELP_STRING([--enable-maxmicdev=<number>],
 			[maximum number of MIC devices])],
-			nmaxmicdev=$enableval, nmaxmicdev=4)
+			nmaxmicdev=$enableval,
+            [
+             if test x$enable_mic = xyes; then
+                 nmaxmicdev=4
+             else
+                 nmaxmicdev=0
+             fi
+            ])
 AC_MSG_RESULT($nmaxmicdev)
 
 AC_DEFINE_UNQUOTED(STARPU_MAXMICDEVS, [$nmaxmicdev],
@@ -1941,6 +1986,10 @@ if test x$use_fxt = xyes; then
 	##########################################
 	PKG_CHECK_MODULES([POTI], [poti], [
 		AC_DEFINE(STARPU_HAVE_POTI, [1], [Define to 1 if you have libpoti])
+		save_LIBS="$LIBS"
+		LIBS="$LIBS $POTI_LIBS"
+		AC_CHECK_FUNCS([poti_init_custom])
+		LIBS="$save_LIBS"
 	], [:])
 	FXT_CFLAGS="$FXT_CFLAGS $POTI_CFLAGS"
 	FXT_LIBS="$FXT_LIBS $POTI_LIBS"
@@ -1985,8 +2034,13 @@ if test x$enable_memory_stats = xyes; then
         AC_DEFINE(STARPU_MEMORY_STATS, [1], [enable memory stats])
 fi
 
-AC_CHECK_HEADERS([glpk.h], [AC_DEFINE([STARPU_HAVE_GLPK_H], [1], [Define to 1 if you have the <glpk.h> header file.])])
-STARPU_HAVE_LIBRARY(GLPK, [glpk])
+AC_ARG_ENABLE(glpk, [AS_HELP_STRING([--disable-glpk],
+			     [disable using glpk for bound computation])],
+			     enable_glpk=$enableval, enable_glpk=yes)
+if test x$enable_glpk = xyes; then
+	AC_CHECK_HEADERS([glpk.h], [AC_DEFINE([STARPU_HAVE_GLPK_H], [1], [Define to 1 if you have the <glpk.h> header file.])])
+	STARPU_HAVE_LIBRARY(GLPK, [glpk])
+fi
 
 AC_ARG_WITH(ayudame1-include-dir,
 	[AS_HELP_STRING([--with-ayudame1-include-dir=<path>],
@@ -2606,6 +2660,16 @@ if test "x$enable_socl" = "xyes" -a "$have_valid_opencl" = "no" ; then
     AC_MSG_ERROR([SOCL cannot be enabled without OpenCL])
 fi
 
+# MPI Master Slave and SOCL are not compatible
+if test "x$use_mpi_master_slave" = "xyes" ; then
+   if test "x$enable_socl" = "xyes" ; then
+      AC_MSG_ERROR([MPI Master-Slave and SOCL can not be used at the same time !])
+   fi
+   if test "x$enable_socl" = "xmaybe" ; then
+     enable_socl=no 
+   fi
+fi
+
 # now we enable SOCL if and only if a proper setup is available
 if test "x$enable_socl" = "xyes" -o "x$enable_socl" = "xmaybe" ; then
    build_socl=$have_valid_opencl
@@ -2954,7 +3018,7 @@ AS_IF([test "$use_hwloc" = "yes" -a "$have_valid_hwloc" = "no"],
      )
 # in case hwloc is not available but was not explicitely disabled, this is an error
 AS_IF([test "$have_valid_hwloc" = "no" -a "$use_hwloc" != "no"],
-      [AC_MSG_ERROR([libhwloc was not found on your system. If the target machine is hyperthreaded the performance may be impacted a lot.  It is strongly recommended to install libhwloc. However, if you really want to use StarPU without enabling libhwloc, please restart configure by specifying the option '--without-hwloc'.])]
+      [AC_MSG_ERROR([libhwloc or pkg-config was not found on your system. If the target machine is hyperthreaded the performance may be impacted a lot.  It is strongly recommended to install libhwloc and pkg-config. However, if you really want to use StarPU without enabling libhwloc, please restart configure by specifying the option '--without-hwloc'.])]
      )
 
 LDFLAGS="${HWLOC_LIBS} ${SAVED_LDFLAGS}"
@@ -3107,6 +3171,7 @@ AC_SUBST(STARPU_EXPORTED_LIBS)
 AC_CONFIG_COMMANDS([executable-scripts], [
   chmod +x tests/regression/regression.sh
   chmod +x tests/loader-cross.sh
+  chmod +x tests/model-checking/starpu-mc.sh
   chmod +x examples/loader-cross.sh
   chmod +x examples/stencil/loader-cross.sh
   chmod +x gcc-plugin/tests/run-test
@@ -3131,6 +3196,15 @@ AC_CONFIG_COMMANDS([executable-scripts], [
   test -e tests/microbenchs/parallel_independent_heterogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_heterogeneous_tasks.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_independent_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks_data.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_independent_homogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks.sh tests/microbenchs/
+  mkdir -p tests/datawizard
+  test -e tests/datawizard/locality.sh || ln -sf $ac_abs_top_srcdir/tests/datawizard/locality.sh tests/datawizard/
+  mkdir -p tests/model-checking
+  test -e tests/model-checking/prio_list.sh || ln -sf $ac_abs_top_srcdir/tests/model-checking/prio_list.sh tests/model-checking/
+  test -e tests/model-checking/barrier.sh || ln -sf $ac_abs_top_srcdir/tests/model-checking/barrier.sh tests/model-checking/
+  mkdir -p examples/heat
+  test -e examples/heat/heat.sh || ln -sf $ac_abs_top_srcdir/examples/heat/heat.sh examples/heat/
+  mkdir -p examples/lu
+  test -e examples/lu/lu.sh || ln -sf $ac_abs_top_srcdir/examples/lu/lu.sh examples/lu/
 ])
 
 # Create links to ICD files in build/socl/vendors directory. SOCL will use this
@@ -3210,6 +3284,7 @@ AC_OUTPUT([
 	examples/stencil/Makefile
 	tests/Makefile
 	tests/loader-cross.sh
+	tests/model-checking/starpu-mc.sh
 	examples/loader-cross.sh
 	examples/stencil/loader-cross.sh
 	mpi/Makefile

+ 3 - 2
doc/doxygen/chapters/110_basic_examples.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -632,7 +632,8 @@ tools to compile a OpenCL kernel stored in a file.
 __kernel void vector_mult_opencl(int nx, __global float* val, float factor)
 {
         const int i = get_global_id(0);
-        if (i < nx) {
+        if (i < nx)
+	{
                 val[i] *= factor;
         }
 }

+ 29 - 6
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
  * Copyright (C) 2011, 2012, 2017  INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -65,9 +65,13 @@ Calling starpu_cublas_init() makes StarPU already do appropriate calls for the
 CUBLAS library. Some libraries like Magma may however change the current stream of CUBLAS v1,
 one then has to call <c>cublasSetKernelStream(starpu_cuda_get_local_stream())</c> at
 the beginning of the codelet to make sure that CUBLAS is really using the proper
-stream. When using CUBLAS v2, starpu_cublas_local_handle() can be called to queue CUBLAS
+stream. When using CUBLAS v2, starpu_cublas_get_local_handle() can be called to queue CUBLAS
 kernels with the proper configuration.
 
+Similarly, calling starpu_cusparse_init() makes StarPU create CUSPARSE handles
+on each CUDA device, starpu_cusparse_get_local_handle() can then be used to
+queue CUSPARSE kernels with the proper configuration.
+
 If the kernel can be made to only use this local stream or other self-allocated
 streams, i.e. the whole kernel submission can be made asynchronous, then
 one should enable asynchronous execution of the kernel.  That means setting
@@ -78,6 +82,21 @@ able to submit and complete data transfers while kernels are executing, instead
 kernel submission. The kernel just has to make sure that StarPU can use the
 local stream to synchronize with the kernel startup and completion.
 
+If the kernel uses its own non-default stream, one can synchronize that stream
+with the StarPU-provided stream this way:
+
+\code{.c}
+cudaEvent_t event;
+call_kernel_with_its_own_stream()
+cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
+cudaEventRecord(event, get_kernel_stream());
+cudaStreamWaitEvent(starpu_cuda_get_local_stream(), event, 0);
+cudaEventDestroy(event);
+\endcode
+
+That code makes the StarPU-provided stream wait for a new event, which will be
+triggered by the completion of the kernel.
+
 Using the flag ::STARPU_CUDA_ASYNC also permits to enable concurrent kernel
 execution, on cards which support it (Kepler and later, notably). This is
 enabled by setting the environment variable \ref STARPU_NWORKER_PER_CUDA to the
@@ -315,26 +334,30 @@ measurements. This can be done by using starpu_perfmodel_update_history(),
 for instance:
 
 \code{.c}
-static struct starpu_perfmodel perf_model = {
+static struct starpu_perfmodel perf_model =
+{
     .type = STARPU_HISTORY_BASED,
     .symbol = "my_perfmodel",
 };
 
-struct starpu_codelet cl = {
+struct starpu_codelet cl =
+{
     .cuda_funcs = { cuda_func1, cuda_func2 },
     .nbuffers = 1,
     .modes = {STARPU_W},
     .model = &perf_model
 };
 
-void feed(void) {
+void feed(void)
+{
     struct my_measure *measure;
     struct starpu_task task;
     starpu_task_init(&task);
 
     task.cl = &cl;
 
-    for (measure = &measures[0]; measure < measures[last]; measure++) {
+    for (measure = &measures[0]; measure < measures[last]; measure++)
+    {
         starpu_data_handle_t handle;
 	starpu_vector_data_register(&handle, -1, 0, measure->size, sizeof(float));
 	task.handles[0] = handle;

+ 15 - 8
doc/doxygen/chapters/301_tasks.doxy

@@ -56,7 +56,8 @@ the field starpu_task::dyn_handles when defining a task and the field
 starpu_codelet::dyn_modes when defining the corresponding codelet.
 
 \code{.c}
-enum starpu_data_access_mode modes[STARPU_NMAXBUFS+1] = {
+enum starpu_data_access_mode modes[STARPU_NMAXBUFS+1] =
+{
 	STARPU_R, STARPU_R, ...
 };
 
@@ -132,7 +133,8 @@ void scal_sse_func(void *buffers[], void *cl_arg)
 \endcode
 
 \code{.c}
-struct starpu_codelet cl = {
+struct starpu_codelet cl =
+{
     .cpu_funcs = { scal_cpu_func, scal_sse_func },
     .cpu_funcs_name = { "scal_cpu_func", "scal_sse_func" },
     .nbuffers = 1,
@@ -167,7 +169,8 @@ static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nim
   return 0;
 }
 
-struct starpu_codelet cl = {
+struct starpu_codelet cl =
+{
     .can_execute = can_execute,
     .cpu_funcs = { cpu_func },
     .cpu_funcs_name = { "cpu_func" },
@@ -212,7 +215,8 @@ static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nim
   return 0;
 }
 
-struct starpu_codelet cl = {
+struct starpu_codelet cl =
+{
     .can_execute = can_execute,
     .cpu_funcs = { cpu_func },
     .cpu_funcs_name = { "cpu_func" },
@@ -247,7 +251,8 @@ static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nim
   }
 }
 
-struct starpu_codelet cl = {
+struct starpu_codelet cl =
+{
     .can_execute = can_execute,
     .cpu_funcs = { cpu_func },
     .cpu_funcs_name = { "cpu_func" },
@@ -280,7 +285,8 @@ void func_cpu(void *descr[], void *_args)
         *x1 = *x1 * ffactor;
 }
 
-struct starpu_codelet mycodelet = {
+struct starpu_codelet mycodelet =
+{
         .cpu_funcs = { func_cpu },
         .cpu_funcs_name = { "func_cpu" },
         .nbuffers = 2,
@@ -544,14 +550,15 @@ An intermediate solution is to define a codelet with its
 starpu_codelet::where field set to \ref STARPU_NOWHERE, for instance:
 
 \code{.c}
-struct starpu_codelet {
+struct starpu_codelet cl =
+{
 	.where = STARPU_NOWHERE,
 	.nbuffers = 1,
 	.modes = { STARPU_R },
 }
 
 task = starpu_task_create();
-task->cl = starpu_codelet;
+task->cl = &cl;
 task->handles[0] = handle;
 starpu_task_submit(task);
 \endcode

+ 130 - 3
doc/doxygen/chapters/310_data_management.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -10,6 +10,131 @@
 
 TODO: intro qui parle de coherency entre autres
 
+\section DataInterface Data Interface
+
+StarPU provides several data interfaces for programmers to describe the data layout of their application. There are predefined interfaces already available in StarPU. Users can define new data interfaces as explained in \ref DefiningANewDataInterface. All functions provided by StarPU are documented in \ref API_Data_Interfaces. You will find a short list below.
+
+\subsection VariableDataInterface Variable Data Interface
+
+A variable is a given size byte element, typically a scalar. Here an
+example of how to register a variable data to StarPU by using
+starpu_variable_data_register().
+
+
+\code{.c}
+float var = 42.0;
+starpu_data_handle_t var_handle;
+starpu_variable_data_register(&var_handle, STARPU_MAIN_RAM, (uintptr_t)&var, sizeof(var));
+\endcode
+
+\subsection VectorDataInterface Vector Data Interface
+
+A vector is a fixed number of elements of a given size. Here an
+example of how to register a vector data to StarPU by using
+starpu_vector_data_register().
+
+\code{.c}
+float vector[NX];
+starpu_data_handle_t vector_handle;
+starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));
+\endcode
+
+\subsection MatrixDataInterface Matrix Data Interface
+
+To register 2-D matrices with a potential padding, one can use the
+matrix data interface. Here an example of how to register a matrix
+data to StarPU by using starpu_matrix_data_register().
+
+\code{.c}
+float *matrix;
+starpu_data_handle_t matrix_handle;
+matrix = (float*)malloc(width * height * sizeof(float));
+starpu_matrix_data_register(&matrix_handle, STARPU_MAIN_RAM, (uintptr_t)matrix, width, width, height, sizeof(float));
+\endcode
+
+\subsection BlockDataInterface Block Data Interface
+
+To register 3-D blocks with potential paddings on Y and Z dimensions,
+one can use the block data interface. Here an example of how to
+register a block data to StarPU by using starpu_block_data_register().
+
+\code{.c}
+float *block;
+starpu_data_handle_t block_handle;
+block = (float*)malloc(nx*ny*nz*sizeof(float));
+starpu_block_data_register(&block_handle, STARPU_MAIN_RAM, (uintptr_t)block, nx, nx*ny, nx, ny, nz, sizeof(float));
+\endcode
+
+\subsection BCSRDataInterface BCSR Data Interface
+
+BCSR (Blocked Compressed Sparse Row Representation) sparse matrix data
+can be registered to StarPU using the bcsr data interface. Here an
+example on how to do so by using starpu_bcsr_data_register().
+
+\code{.c}
+/*
+ * We use the following matrix:
+ *
+ *   +----------------+
+ *   |  0   1   0   0 |
+ *   |  2   3   0   0 |
+ *   |  4   5   8   9 |
+ *   |  6   7  10  11 |
+ *   +----------------+
+ *
+ * nzval  = [0, 1, 2, 3] ++ [4, 5, 6, 7] ++ [8, 9, 10, 11]
+ * colind = [0, 0, 1]
+ * rowptr = [0, 1 ]
+ * r = c = 2
+ */
+
+/* Size of the blocks */
+int R = 2;
+int C = 2;
+
+int NROW = 2;
+int NNZ_BLOCKS = 3;    /* out of 4 */
+int NZVAL_SIZE = (R*C*NNZ_BLOCKS);
+
+int nzval[NZVAL_SIZE]  =
+{
+	0, 1, 2, 3,    /* First block  */
+	4, 5, 6, 7,    /* Second block */
+	8, 9, 10, 11   /* Third block  */
+};
+uint32_t colind[NNZ_BLOCKS] =
+{
+	0, /* block-column index for first block in nzval */
+	0, /* block-column index for second block in nzval */
+	1  /* block-column index for third block in nzval */
+};
+uint32_t rowptr[NROW] =
+{
+	0, /* block-index in nzval of the first block of the first row. */
+	1  /* block-index in nzval of the first block of the second row. */
+};
+
+starpu_data_handle_t bcsr_handle;
+starpu_bcsr_data_register(&bcsr_handle,
+			  STARPU_MAIN_RAM,
+			  NNZ_BLOCKS,
+			  NROW,
+			  (uintptr_t) nzval,
+			  colind,
+			  rowptr,
+			  0, /* firstentry */
+			  R,
+			  C,
+			  sizeof(nzval[0]));
+\endcode
+
+StarPU provides an example on how to deal with such matrices in
+<c>examples/spmv</c>.
+
+\subsection CSRDataInterface CSR Data Interface
+
+TODO
+
 \section DataManagement Data Management
 
 When the application allocates data, whenever possible it should use
@@ -140,7 +265,8 @@ to retrieve the sub-handles to be passed as tasks parameters.
 
 \code{.c}
 /* Submit a task on each sub-vector */
-for (i=0; i<starpu_data_get_nb_children(handle); i++) {
+for (i=0; i<starpu_data_get_nb_children(handle); i++)
+{
     /* Get subdata number i (there is only 1 dimension) */
     starpu_data_handle_t sub_handle = starpu_data_get_sub_data(handle, 1, i);
     struct starpu_task *task = starpu_task_create();
@@ -404,7 +530,8 @@ into data <c>res</c>, then uses it for other computation, before looping again
 with a new reduction:
 
 \code{.c}
-for (i = 0; i < 100; i++) {
+for (i = 0; i < 100; i++)
+{
     starpu_mpi_task_insert(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);
     starpu_mpi_task_insert(MPI_COMM_WORLD, &work, STARPU_RW, A,
                STARPU_R, B, STARPU_REDUX, res, 0);

+ 11 - 4
doc/doxygen/chapters/320_scheduling.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -95,6 +95,11 @@ except that it sorts tasks by priority order, which allows to become even closer
 to HEFT by respecting priorities after having made the scheduling decision (but
 it still schedules tasks in the order they become available).
 
+The <b>dmdasd</b> (deque model data aware sorted decision) scheduler is similar
+to dmdas, except that when scheduling a task, it takes into account its priority
+when computing the minimum completion time, since this task may get executed
+before others, and thus the latter should be ignored.
+
 The <b>heft</b> (heterogeneous earliest finish time) scheduler is a deprecated
 alias for <b>dmda</b>.
 
@@ -200,7 +205,8 @@ the StarPU sources in the directory <c>examples/scheduler/</c>.
 The scheduler has to provide methods:
 
 \code{.c}
-static struct starpu_sched_policy dummy_sched_policy = {
+static struct starpu_sched_policy dummy_sched_policy =
+{
     .init_sched = init_dummy_sched,
     .deinit_sched = deinit_dummy_sched,
     .add_workers = dummy_sched_add_workers,
@@ -285,8 +291,9 @@ be used to get information about how well the execution proceeded, and thus the
 overall quality of the execution.
 
 Precise debugging can also be performed by using the
-\ref STARPU_TASK_BREAK_ON_SCHED, \ref STARPU_TASK_BREAK_ON_PUSH, and
-\ref STARPU_TASK_BREAK_ON_POP environment variables. By setting the job_id of a task
+\ref STARPU_TASK_BREAK_ON_PUSH, \ref STARPU_TASK_BREAK_ON_SCHED,
+\ref STARPU_TASK_BREAK_ON_POP, and \ref STARPU_TASK_BREAK_ON_EXEC environment variables.
+By setting the job_id of a task
 in these environment variables, StarPU will raise <c>SIGTRAP</c> when the task is being
 scheduled, pushed, or popped by the scheduler. That means that when one notices
 that a task is being scheduled in a seemingly odd way, one can just reexecute

+ 3 - 3
doc/doxygen/chapters/330_scheduling_contexts.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
 //  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * Copyright (C) 2016 Uppsala University
  * See the file version.doxy for copying conditions.
@@ -84,7 +84,7 @@ contexts in this traditional way.
 
 To create a <b>context</b> with the default scheduler, that is either
 controlled through the environment variable <c>STARPU_SCHED</c> or the
-StarPU default scheduler, one can explicitly use the option <c>STARPU_SCHED_CTX_POLICY_NAME, NULL</c> as in the following example:
+StarPU default scheduler, one can explicitly use the option <c>STARPU_SCHED_CTX_POLICY_NAME, ""</c> as in the following example:
 
 \code{.c}
 /* the list of resources the context will manage */
@@ -92,7 +92,7 @@ int workerids[3] = {1, 3, 10};
 
 /* indicate the list of workers assigned to it, the number of workers,
 and use the default scheduling policy. */
-int id_ctx = starpu_sched_ctx_create(workerids, 3, "my_ctx", STARPU_SCHED_CTX_POLICY_NAME, NULL, 0);
+int id_ctx = starpu_sched_ctx_create(workerids, 3, "my_ctx", STARPU_SCHED_CTX_POLICY_NAME, "", 0);
 
 /* .... */
 \endcode

+ 5 - 1
doc/doxygen/chapters/350_modularized_scheduler.doxy

@@ -112,19 +112,23 @@ to be able to interact with other Scheduling Components.
 	Child Component. When the Push function returns, the task no longer
 	belongs to the calling Component. The Modularized Schedulers'
 	model relies on this function to perform prefetching.
+	See starpu_sched_component::push_task for more details
 
 	- Pull (Caller_Component, Parent_Component)  ->  Task \n
 	The calling Scheduling Component requests a task from
 	its Parent Component. When the Pull function ends, the returned
 	task belongs to the calling Component.
+	See starpu_sched_component::pull_task for more details
 
 	- Can_Push (Caller_Component, Parent_Component) \n
 	The calling Scheduling Component notifies its Parent Component that
 	it is ready to accept new tasks.
+	See starpu_sched_component::can_push for more details
 
 	- Can_Pull (Caller_Component, Child_Component) \n
 	The calling Scheduling Component notifies its Child Component
 	that it is ready to give new tasks.
+	See starpu_sched_component::can_pull for more details
 
 
 \section BuildAModularizedScheduler Building a Modularized Scheduler
@@ -225,7 +229,7 @@ static void initialize_eager_prefetching_center_policy(unsigned sched_ctx_id)
     /* Each Worker Component has a Flow-control Fifo Component as
      * father */
     struct starpu_sched_component * worker_component =
-	  starpu_sched_component_worker_get(i);
+	  starpu_sched_component_worker_new(i);
     struct starpu_sched_component * fifo_component =
 	  starpu_sched_component_fifo_create(&fifo_data);
     fifo_component->add_child

+ 6 - 3
doc/doxygen/chapters/370_online_performance_tools.doxy

@@ -198,7 +198,8 @@ starpu_top_register_parameter_float("alpha", &alpha, 0, 10, modif_hook);
 <c>modif_hook</c> is a function which will be called when the parameter is being modified, it can for instance print the new value:
 
 \code{.c}
-void modif_hook(struct starpu_top_param *d) {
+void modif_hook(struct starpu_top_param *d)
+{
     fprintf(stderr,"%s has been modified: %f\n", d->name, alpha);
 }
 \endcode
@@ -341,12 +342,14 @@ be constructed dynamically at execution time, as long as this is done before
 submitting any task using it.
 
 \code{.c}
-static struct starpu_perfmodel mult_perf_model = {
+static struct starpu_perfmodel mult_perf_model =
+{
     .type = STARPU_HISTORY_BASED,
     .symbol = "mult_perf_model"
 };
 
-struct starpu_codelet cl = {
+struct starpu_codelet cl =
+{
     .cpu_funcs = { cpu_mult },
     .cpu_funcs_name = { "cpu_mult" },
     .nbuffers = 3,

+ 5 - 1
doc/doxygen/chapters/380_offline_performance_tools.doxy

@@ -129,12 +129,16 @@ collect the trace files from the MPI nodes, and
 specify them all on the command <c>starpu_fxt_tool</c>, for instance:
 
 \verbatim
-$ starpu_fxt_tool -i /tmp/prof_file_something1 -i /tmp/prof_file_something2
+$ starpu_fxt_tool -i /tmp/prof_file_something*
 \endverbatim
 
 By default, all tasks are displayed using a green color. To display tasks with
 varying colors, pass option <c>-c</c> to <c>starpu_fxt_tool</c>.
 
+By default, the trace contains all information. To reduce the trace size,
+various <c>-no-foo</c> options can be passed to <c>starpu_fxt_tool</c>, see
+<c>starpu_fxt_tool --help</c>.
+
 To identify tasks precisely, the application can set the starpu_task::tag_id field of the
 task (or use \ref STARPU_TAG_ONLY when using starpu_task_insert()), and
 the value of the tag will show up in the trace.

+ 19 - 10
doc/doxygen/chapters/390_faq.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -27,8 +27,10 @@ At initialisation time of libstarpu, the objects are initialized:
 
 \code{.c}
 int workerid;
-for (workerid = 0; workerid < starpu_worker_get_count(); workerid++) {
-    switch (starpu_worker_get_type(workerid)) {
+for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
+{
+    switch (starpu_worker_get_type(workerid))
+    {
         case STARPU_CPU_WORKER:
             plan_cpu[workerid] = fftw_plan(...);
             break;
@@ -75,14 +77,16 @@ void starpufft_plan(void)
 
 \code{.c}
 int ret;
-struct starpu_driver = {
+struct starpu_driver =
+{
     .type = STARPU_CUDA_WORKER,
     .id.cuda_id = 0
 };
 ret = starpu_driver_init(&d);
 if (ret != 0)
     error();
-while (some_condition) {
+while (some_condition)
+{
     ret = starpu_driver_run_once(&d);
     if (ret != 0)
         error();
@@ -258,7 +262,8 @@ run the following program with -lcudart:
 #include <cuda.h>
 #include <cuda_runtime.h>
 
-int main(void) {
+int main(void)
+{
 	int n, i, version;
 	cudaError_t err;
 
@@ -274,7 +279,8 @@ int main(void) {
 	printf("runtime version %d\n", version);
 	printf("\n");
 
-	for (i = 0; i < n; i++) {
+	for (i = 0; i < n; i++)
+	{
 		struct cudaDeviceProp props;
 		printf("CUDA%d\n", i);
 		err = cudaGetDeviceProperties(&props, i);
@@ -306,7 +312,8 @@ run the following program with -lOpenCL:
 #include <stdio.h>
 #include <assert.h>
 
-int main(void) {
+int main(void)
+{
     cl_device_id did[16];
     cl_int err;
     cl_platform_id pid, pids[16];
@@ -318,7 +325,8 @@ int main(void) {
     err = clGetPlatformIDs(sizeof(pids)/sizeof(pids[0]), pids, &nbplat);
     assert(err == CL_SUCCESS);
     printf("%u platforms\n", nbplat);
-    for (j = 0; j < nbplat; j++) {
+    for (j = 0; j < nbplat; j++)
+    {
         pid = pids[j];
         printf("    platform %d\n", j);
         err = clGetPlatformInfo(pid, CL_PLATFORM_VERSION, sizeof(buf)-1, buf, &size);
@@ -329,7 +337,8 @@ int main(void) {
         err = clGetDeviceIDs(pid, CL_DEVICE_TYPE_ALL, sizeof(did)/sizeof(did[0]), did, &nb);
         assert(err == CL_SUCCESS);
         printf("%d devices\n", nb);
-        for (i = 0; i < nb; i++) {
+        for (i = 0; i < nb; i++)
+	{
             err = clGetDeviceInfo(did[i], CL_DEVICE_VERSION, sizeof(buf)-1, buf, &size);
             buf[size] = 0;
             printf("    device %d version %s\n", i, buf);

+ 1 - 1
doc/doxygen/chapters/401_out_of_core.doxy

@@ -51,7 +51,7 @@ export STARPU_DISK_SWAP_SIZE=200
 When the register function is called, StarPU will benchmark the disk. This can
 take some time.
 
-<strong>Warning: the size thus has to be at least 1 MB!</strong> 
+<strong>Warning: the size thus has to be at least \ref STARPU_DISK_SIZE_MIN bytes!</strong>
 
 StarPU will automatically try to evict unused data to this new disk. One can
 also use the standard StarPU memory node API, see the \ref API_Standard_Memory_Library

+ 38 - 19
doc/doxygen/chapters/410_mpi_support.doxy

@@ -28,7 +28,8 @@ initializes a token on node 0, and the token is passed from node to node,
 incremented by one on each step. The code is not using StarPU yet.
 
 \code{.c}
-    for (loop = 0; loop < nloops; loop++) {
+    for (loop = 0; loop < nloops; loop++)
+    {
         int tag = loop*size + rank;
 
         if (loop == 0 && rank == 0)
@@ -62,7 +63,8 @@ execution to StarPU.  This is possible by just using starpu_data_acquire(), for
 instance:
 
 \code{.c}
-    for (loop = 0; loop < nloops; loop++) {
+    for (loop = 0; loop < nloops; loop++)
+    {
         int tag = loop*size + rank;
 
 	/* Acquire the data to be able to write to it */
@@ -412,7 +414,8 @@ communication cache when unregistering the data.
 
 \code{.c}
 /* Returns the MPI node number where data is */
-int my_distrib(int x, int y, int nb_nodes) {
+int my_distrib(int x, int y, int nb_nodes)
+{
   /* Block distrib */
   return ((int)(x / sqrt(nb_nodes) + (y / sqrt(nb_nodes)) * sqrt(nb_nodes))) % nb_nodes;
 
@@ -439,8 +442,10 @@ data which will be needed by the tasks that we will execute.
     unsigned matrix[X][Y];
     starpu_data_handle_t data_handles[X][Y];
 
-    for(x = 0; x < X; x++) {
-        for (y = 0; y < Y; y++) {
+    for(x = 0; x < X; x++)
+    {
+        for (y = 0; y < Y; y++)
+	{
             int mpi_rank = my_distrib(x, y, size);
             if (mpi_rank == my_rank)
                 /* Owning data */
@@ -454,7 +459,8 @@ data which will be needed by the tasks that we will execute.
             else
                 /* I know it's useless to allocate anything for this */
                 data_handles[x][y] = NULL;
-            if (data_handles[x][y]) {
+            if (data_handles[x][y])
+	    {
                 starpu_mpi_data_register(data_handles[x][y], x*X+y, mpi_rank);
             }
         }
@@ -604,8 +610,10 @@ register any data that wasn't registered already and will be needed, then
 migrate the data, and register the new location.
 
 \code{.c}
-    for(x = 0; x < X; x++) {
-        for (y = 0; y < Y; y++) {
+    for(x = 0; x < X; x++)
+    {
+        for (y = 0; y < Y; y++)
+	{
             int mpi_rank = my_distrib2(x, y, size);
             if (!data_handles[x][y] && (mpi_rank == my_rank
                   || my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
@@ -613,7 +621,8 @@ migrate the data, and register the new location.
                 /* Register newly-needed data */
                 starpu_variable_data_register(&data_handles[x][y], -1,
                                               (uintptr_t)NULL, sizeof(unsigned));
-            if (data_handles[x][y]) {
+            if (data_handles[x][y])
+	    {
                 /* Migrate the data */
                 starpu_mpi_data_migrate(MPI_COMM_WORLD, data_handles[x][y], mpi_rank);
             }
@@ -636,9 +645,12 @@ resides. Otherwise the unregistration will complain that it does not have the
 latest value on the original home node.
 
 \code{.c}
-    for(x = 0; x < X; x++) {
-        for (y = 0; y < Y; y++) {
-            if (data_handles[x][y]) {
+    for(x = 0; x < X; x++)
+    {
+        for (y = 0; y < Y; y++)
+	{
+            if (data_handles[x][y])
+	    {
                 int mpi_rank = my_distrib(x, y, size);
                 /* Get back data to original place where the user-provided buffer is.  */
                 starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[x][y], mpi_rank, NULL, NULL);
@@ -669,20 +681,24 @@ data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
 for(x = 0; x < nblocks ;  x++)
 {
     int mpi_rank = my_distrib(x, nodes);
-    if (rank == root) {
+    if (rank == root)
+    {
         starpu_vector_data_register(&data_handles[x], STARPU_MAIN_RAM, (uintptr_t)vector[x],
                                     blocks_size, sizeof(float));
     }
-    else if ((mpi_rank == rank) || ((rank == mpi_rank+1 || rank == mpi_rank-1))) {
+    else if ((mpi_rank == rank) || ((rank == mpi_rank+1 || rank == mpi_rank-1)))
+    {
         /* I own that index, or i will need it for my computations */
         starpu_vector_data_register(&data_handles[x], -1, (uintptr_t)NULL,
                                    block_size, sizeof(float));
     }
-    else {
+    else
+    {
         /* I know it's useless to allocate anything for this */
         data_handles[x] = NULL;
     }
-    if (data_handles[x]) {
+    if (data_handles[x])
+    {
         starpu_mpi_data_register(data_handles[x], x*nblocks+y, mpi_rank);
     }
 }
@@ -691,10 +707,13 @@ for(x = 0; x < nblocks ;  x++)
 starpu_mpi_scatter_detached(data_handles, nblocks, root, MPI_COMM_WORLD);
 
 /* Calculation */
-for(x = 0; x < nblocks ;  x++) {
-    if (data_handles[x]) {
+for(x = 0; x < nblocks ;  x++)
+{
+    if (data_handles[x])
+    {
         int owner = starpu_data_get_rank(data_handles[x]);
-        if (owner == rank) {
+        if (owner == rank)
+	{
             starpu_task_insert(&cl, STARPU_RW, data_handles[x], 0);
         }
     }

+ 3 - 2
doc/doxygen/chapters/430_mic_scc_support.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
  * Copyright (C) 2011, 2012 INRIA
  * See the file version.doxy for copying conditions.
  */
@@ -49,7 +49,8 @@ starpu_codelet::cpu_funcs_name, to provide StarPU with the function
 name of the CPU implementation, so for instance:
 
 \verbatim
-struct starpu_codelet cl = {
+struct starpu_codelet cl =
+{
     .cpu_funcs = {myfunc},
     .cpu_funcs_name = {"myfunc"},
     .nbuffers = 1,

+ 4 - 3
doc/doxygen/chapters/470_simgrid.doxy

@@ -9,8 +9,8 @@
 /*! \page SimGridSupport SimGrid Support
 
 StarPU can use Simgrid in order to simulate execution on an arbitrary
-platform. This was tested with simgrid 3.11, 3.12, 3.13, 3.14, and 3.14.159, other versions may have
-compatibility issues.
+platform. This was tested with simgrid from 3.11 to 3.15,
+other versions may have compatibility issues.
 
 \section Preparing Preparing Your Application For Simulation
 
@@ -36,7 +36,8 @@ To be able to run the application with e.g. CUDA simulation on a system which
 does not have CUDA installed, one can fill the cuda_funcs with (void*)1, to
 express that there is a CUDA implementation, even if one does not actually
 provide it. StarPU will not actually run it in Simgrid mode anyway by default
-(unless the ::STARPU_CODELET_SIMGRID_EXECUTE flag is set in the codelet)
+(unless the ::STARPU_CODELET_SIMGRID_EXECUTE or ::STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT
+flags are set in the codelet)
 
 \snippet simgrid.c To be included. You should update doxygen if you see this text.
 

+ 31 - 11
doc/doxygen/chapters/501_environment_variables.doxy

@@ -642,9 +642,10 @@ especially regarding data transfers.
 <dd>
 \anchor STARPU_SIMGRID_SCHED_COST
 \addindex __env__STARPU_SIMGRID_SCHED_COST
-When set to 1 (which is the default), scheduling costs are taken into
+When set to 1 (0 is the default), scheduling costs are taken into
 account in simgrid mode. This provides more accurate simgrid predictions,
-and allows studying scheduling overhead of the runtime system.
+and allows studying scheduling overhead of the runtime system. However,
+it also makes simulation non-deterministic.
 </dd>
 
 </dl>
@@ -679,6 +680,16 @@ This specifies the main directory in which StarPU stores its
 performance model files. The default is <c>$STARPU_HOME/.starpu/sampling</c>.
 </dd>
 
+<dt>STARPU_PERF_MODEL_HOMOGENEOUS_CPU</dt>
+<dd>
+\anchor STARPU_PERF_MODEL_HOMOGENEOUS_CPU
+\addindex __env__STARPU_PERF_MODEL_HOMOGENEOUS_CPU
+When this is set to 0, StarPU will assume that CPU devices do not have the same
+performance, and thus use different performance models for them, which makes
+kernel calibration much longer, since measurements have to be made for each CPU
+core.
+</dd>
+
 <dt>STARPU_PERF_MODEL_HOMOGENEOUS_CUDA</dt>
 <dd>
 \anchor STARPU_PERF_MODEL_HOMOGENEOUS_CUDA
@@ -830,7 +841,7 @@ that have a limited amount of memory.
 \anchor STARPU_LIMIT_CPU_MEM
 \addindex __env__STARPU_LIMIT_CPU_MEM
 This variable specifies the maximum number of megabytes that should be
-available to the application on each CPU device. Setting it enables allocation
+available to the application in the main CPU memory. Setting it enables allocation
 cache in main memory
 </dd>
 
@@ -1013,6 +1024,15 @@ dog is reached, thus allowing to catch the situation in gdb, etc
 (see \ref DetectionStuckConditions)
 </dd>
 
+<dt>STARPU_TASK_BREAK_ON_PUSH</dt>
+<dd>
+\anchor STARPU_TASK_BREAK_ON_PUSH
+\addindex __env__STARPU_TASK_BREAK_ON_PUSH
+When this variable contains a job id, StarPU will raise SIGTRAP when the task
+with that job id is being pushed to the scheduler, which will be nicely caught by debuggers
+(see \ref DebuggingScheduling)
+</dd>
+
 <dt>STARPU_TASK_BREAK_ON_SCHED</dt>
 <dd>
 \anchor STARPU_TASK_BREAK_ON_SCHED
@@ -1024,21 +1044,21 @@ This only works for schedulers which have such a scheduling point defined
 (see \ref DebuggingScheduling)
 </dd>
 
-<dt>STARPU_TASK_BREAK_ON_PUSH</dt>
+<dt>STARPU_TASK_BREAK_ON_POP</dt>
 <dd>
-\anchor STARPU_TASK_BREAK_ON_PUSH
-\addindex __env__STARPU_TASK_BREAK_ON_PUSH
+\anchor STARPU_TASK_BREAK_ON_POP
+\addindex __env__STARPU_TASK_BREAK_ON_POP
 When this variable contains a job id, StarPU will raise SIGTRAP when the task
-with that job id is being pushed to the scheduler, which will be nicely catched by debuggers
+with that job id is being popped from the scheduler, which will be nicely caught by debuggers
 (see \ref DebuggingScheduling)
 </dd>
 
-<dt>STARPU_TASK_BREAK_ON_POP</dt>
+<dt>STARPU_TASK_BREAK_ON_EXEC</dt>
 <dd>
-\anchor STARPU_TASK_BREAK_ON_POP
-\addindex __env__STARPU_TASK_BREAK_ON_POP
+\anchor STARPU_TASK_BREAK_ON_EXEC
+\addindex __env__STARPU_TASK_BREAK_ON_EXEC
 When this variable contains a job id, StarPU will raise SIGTRAP when the task
-with that job id is being popped from the scheduler, which will be nicely catched by debuggers
+with that job id is being executed, which will be nicely caught by debuggers
 (see \ref DebuggingScheduling)
 </dd>
 

+ 15 - 0
doc/doxygen/chapters/510_configure_options.doxy

@@ -553,6 +553,13 @@ that the MKL website
 provides a script to determine the linking flags.
 </dd>
 
+<dt>--disable-glpk</dt>
+<dd>
+\anchor disable-glpk
+\addindex __configure__--disable-glpk
+Disable the use of libglpk for computing area bounds.
+</dd>
+
 <dt>--disable-build-tests</dt>
 <dd>
 \anchor disable-build-tests
@@ -638,6 +645,14 @@ allows to specify the location to the SimGrid lib directory.
 Use the smpirun at <c>path</c>
 </dd>
 
+<dt>--enable-simgrid-mc</dt>
+<dd>
+\anchor enable-simgrid-mc
+\addindex __configure__--enable-simgrid-mc
+Enable the Model Checker in simulation of execution in simgrid, to allow
+exploring various execution paths.
+</dd>
+
 <dt>--enable-calibration-heuristic</dt>
 <dd>
 \anchor enable-calibration-heuristic

+ 21 - 12
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -56,33 +56,33 @@ essentially used for synchronization tasks.
 
 \def STARPU_CPU
 \ingroup API_Codelet_And_Tasks
-This macro is used when setting the field starpu_codelet::where
-to specify the codelet may be executed on a CPU processing unit.
+This macro is used when setting the field starpu_codelet::where (or starpu_task::where)
+to specify the codelet (or the task) may be executed on a CPU processing unit.
 
 \def STARPU_CUDA
 \ingroup API_Codelet_And_Tasks
-This macro is used when setting the field starpu_codelet::where
-to specify the codelet may be executed on a CUDA processing unit.
+This macro is used when setting the field starpu_codelet::where (or starpu_task::where)
+to specify the codelet (or the task) may be executed on a CUDA processing unit.
 
 \def STARPU_OPENCL
 \ingroup API_Codelet_And_Tasks
-This macro is used when setting the field starpu_codelet::where to
-specify the codelet may be executed on a OpenCL processing unit.
+This macro is used when setting the field starpu_codelet::where (or starpu_task::where) to
+specify the codelet (or the task) may be executed on a OpenCL processing unit.
 
 \def STARPU_MIC
 \ingroup API_Codelet_And_Tasks
-This macro is used when setting the field starpu_codelet::where to
-specify the codelet may be executed on a MIC processing unit.
+This macro is used when setting the field starpu_codelet::where (or starpu_task::where) to
+specify the codelet (or the task) may be executed on a MIC processing unit.
 
 \def STARPU_MPI_MS
 \ingroup API_Codelet_And_Tasks
-This macro is used when setting the field starpu_codelet::where to
-specify the codelet may be executed on a MPI Slave processing unit.
+This macro is used when setting the field starpu_codelet::where (or starpu_task::where) to
+specify the codelet (or the task) may be executed on a MPI Slave processing unit.
 
 \def STARPU_SCC
 \ingroup API_Codelet_And_Tasks
-This macro is used when setting the field starpu_codelet::where to
-specify the codelet may be executed on an SCC processing unit.
+This macro is used when setting the field starpu_codelet::where (or starpu_task::where) to
+specify the codelet (or the task) may be executed on an SCC processing unit.
 
 \def STARPU_MAIN_RAM
 \ingroup API_Codelet_And_Tasks
@@ -135,6 +135,11 @@ Value to be set in starpu_codelet::opencl_flags to allow asynchronous OpenCL ker
 \ingroup API_Codelet_And_Tasks
 Value to be set in starpu_codelet::flags to execute the codelet functions even in simgrid mode.
 
+\def STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT
+\ingroup API_Codelet_And_Tasks
+Value to be set in starpu_codelet::flags to execute the codelet functions even in simgrid mode,
+and later inject the measured timing inside the simulation.
+
 \typedef starpu_cpu_func_t
 \ingroup API_Codelet_And_Tasks
 CPU implementation of a codelet.
@@ -438,6 +443,10 @@ the configuration of a task allocated with starpu_task_create().
     the task. The access modes are now defined in the field
     starpu_codelet::modes.
 
+\var uint32_t starpu_task::where
+    When set, specifies where the task is allowed to be executed.
+    When unset, it takes the value of starpu_codelet::where.
+
 \var int starpu_task::nbuffers
     Specifies the number of buffers. This is only used when
     starpu_codelet::nbuffers is \ref STARPU_VARIABLE_NBUFFERS.

+ 17 - 0
doc/doxygen/chapters/api/cuda_extensions.doxy

@@ -95,4 +95,21 @@ Report a cublas error.
 Calls starpu_cublas_report_error(), passing the current
 function, file and line position.
 
+\fn void starpu_cusparse_init(void)
+\ingroup API_CUDA_Extensions
+Calling starpu_cusparse_init() will initialize CUSPARSE on every CUDA device
+controlled by StarPU. This call blocks until CUSPARSE has been properly
+initialized on every device.
+
+\fn cusparseHandle_t starpu_cusparse_get_local_handle(void)
+\ingroup API_CUDA_Extensions
+This function returns the CUSPARSE handle to be used to queue CUSPARSE
+kernels. It is properly initialized and configured for multistream by
+starpu_cusparse_init().
+
+\fn void starpu_cusparse_shutdown(void)
+\ingroup API_CUDA_Extensions
+This function synchronously deinitializes the CUSPARSE library on
+every CUDA device.
+
 */

+ 5 - 2
doc/doxygen/chapters/api/data_out_of_core.doxy

@@ -7,6 +7,10 @@
 
 /*! \defgroup API_Out_Of_Core Out Of Core
 
+\def STARPU_DISK_SIZE_MIN
+\ingroup API_Out_Of_Core
+Minimum size of a registered disk. The size of a disk is the last parameter of the function starpu_disk_register().
+
 \struct starpu_disk_ops
 \ingroup API_Out_Of_Core
 This is a set of functions to manipulate datas on disk.
@@ -88,8 +92,7 @@ Register a disk memory node with a set of functions to manipulate datas. The \c
 plug member of \p func will be passed \p parameter, and return a \c base which will be passed to all \p func methods. <br />
 SUCCESS: return the disk node. <br />
 FAIL: return an error code. <br />
-The \p size must be at least 1 MB !
-\p size being negative means infinite size.
+\p size must be at least \ref STARPU_DISK_SIZE_MIN bytes! \p size being negative means infinite size.
 
 \fn void *starpu_disk_open(unsigned node, void *pos, size_t size)
 \ingroup API_Out_Of_Core

+ 4 - 2
doc/doxygen/chapters/api/data_partition.doxy

@@ -42,7 +42,8 @@ according to the filter \p f.
 
 Here an example of how to use the function.
 \code{.c}
-struct starpu_data_filter f = {
+struct starpu_data_filter f =
+{
         .filter_func = starpu_matrix_filter_block,
         .nchildren = nslicesx
 };
@@ -119,7 +120,8 @@ Here is an example of how to use the function:
 
 \code{.c}
 starpu_data_handle_t children[nslicesx];
-struct starpu_data_filter f = {
+struct starpu_data_filter f =
+{
         .filter_func = starpu_matrix_filter_block,
         .nchildren = nslicesx
 };

+ 1 - 1
doc/doxygen/chapters/api/insert_task.doxy

@@ -6,7 +6,7 @@
  * See the file version.doxy for copying conditions.
  */
 
-/*! \defgroup API_Insert_Task Insert_Task
+/*! \defgroup API_Insert_Task Task Insert Utility
 
 \fn int starpu_insert_task(struct starpu_codelet *cl, ...)
 \ingroup API_Insert_Task

+ 5 - 2
doc/doxygen/chapters/api/modularized_scheduler.doxy

@@ -69,6 +69,9 @@ like <c>component->push_task(component,task)</c>
      recursive call on a child or store the task in the component,
      then it will be returned by a further pull_task call.
      the caller must ensure that component is able to execute task.
+     This method must either return 0 if the task was properly stored or
+     passed over to a child component, or return a value different from 0 if the
+     task could not be consumed (e.g. the queue is full).
 \var struct starpu_task * (*starpu_sched_component::pull_task)(struct starpu_sched_component *)
      pop a task from the scheduler module. this function is called by workers to get a task from their
      parents. this function should first return a locally stored task
@@ -79,13 +82,13 @@ like <c>component->push_task(component,task)</c>
      This function is called by a component which implements a queue,
      allowing it to signify to its parents that an empty slot is
      available in its queue. The basic implementation of this function
-     is a recursive call to its parents, the user have to specify a
+     is a recursive call to its parents, the user has to specify a
      personally-made function to catch those calls.
 \var void (*starpu_sched_component::can_pull)(struct starpu_sched_component *component)
      This function allow a component to wake up a worker. It is
      currently called by component which implements a queue, to
      signify to its children that a task have been pushed in its local
-     queue, and is available to been popped by a worker, for example.
+     queue, and is available to be popped by a worker, for example.
      The basic implementation of this function is a recursive call to
      its children, until at least one worker have been woken up.
 

+ 1 - 1
doc/doxygen/chapters/api/mpi.doxy

@@ -503,7 +503,7 @@ with the argument \p rarg on the process root, the \p scallback
 function is called with the argument \p sarg on any other process.
 
 @name MPI Master Slave
-\anchor MPIMasterSlave
+\anchor MPIMasterSlaveSupport
 \ingroup API_MPI_Support
 
 \def STARPU_USE_MPI_MASTER_SLAVE

+ 4 - 3
doc/doxygen/chapters/api/scheduling_contexts.doxy

@@ -56,7 +56,8 @@ The arguments following the name of the scheduling context can be of
 the following types:
 <ul>
 <li> ::STARPU_SCHED_CTX_POLICY_NAME, followed by the name of a
-predefined scheduling policy
+predefined scheduling policy. Use an empty string to create the
+context with the default scheduling policy.
 </li>
 <li> ::STARPU_SCHED_CTX_POLICY_STRUCT, followed by a pointer to a
 custom scheduling policy (struct starpu_sched_policy *)
@@ -132,13 +133,13 @@ Create a context indicating an approximate interval of resources
 Execute the callback whenever the last task of the context finished executing, it is called with the parameters \p sched_ctx and any other parameter needed
 by the application (packed in \p args)
 
-\fn void starpu_sched_ctx_add_workers(int *workerids_ctx, int nworkers_ctx, unsigned sched_ctx_id)
+\fn void starpu_sched_ctx_add_workers(int *workerids_ctx, unsigned nworkers_ctx, unsigned sched_ctx_id)
 \ingroup API_Scheduling_Contexts
 Add dynamically the workers in \p workerids_ctx to the
 context \p sched_ctx_id. The last argument cannot be greater than
 \ref STARPU_NMAX_SCHED_CTXS.
 
-\fn void starpu_sched_ctx_remove_workers(int *workerids_ctx, int nworkers_ctx, unsigned sched_ctx_id)
+\fn void starpu_sched_ctx_remove_workers(int *workerids_ctx, unsigned nworkers_ctx, unsigned sched_ctx_id)
 \ingroup API_Scheduling_Contexts
 Remove the workers in \p workerids_ctx from the context
 \p sched_ctx_id. The last argument cannot be greater than

+ 23 - 6
doc/doxygen/chapters/api/scheduling_policy.doxy

@@ -60,12 +60,23 @@ For each task not going through the scheduler (because starpu_task::execute_on_a
 	of the scheduler coherent even when StarPU bypasses the
 	scheduling strategy.
 \var struct starpu_task *(*starpu_sched_policy::pop_task)(unsigned sched_ctx_id)
-        Get a task from the scheduler. The mutex associated to the
-	worker is already taken when this method is called. If this
-	method is defined as <c>NULL</c>, the worker will only execute tasks
-	from its local queue. In this case, the push_task method
-	should use the starpu_push_local_task method to assign tasks
-	to the different workers.
+        Get a task from the scheduler.
+        If this method returns NULL, the worker will start sleeping. If later on
+        some task are pushed for this worker, starpu_wake_worker() must be
+        called to wake the worker so it can call the pop_task() method again.
+	
+        The mutex associated to the worker is already taken when this method
+        is called. This method may release it (e.g. for scalability reasons
+        when doing work stealing), but it must acquire it again before taking
+        the decision whether to return a task or NULL, so the atomicity of
+        deciding to return NULL and making the worker actually sleep is
+        preserved. Otherwise in simgrid or blocking driver mode the worker might start
+        sleeping while a task has just been pushed for it.
+
+        If this method is defined as <c>NULL</c>, the worker will only execute
+        tasks from its local queue. In this case, the push_task method should
+        use the starpu_push_local_task method to assign tasks to the different
+        workers.
 \var struct starpu_task *(*starpu_sched_policy::pop_every_task)(unsigned sched_ctx_id)
         Remove all available tasks from the scheduler (tasks are
 	chained by the means of the field starpu_task::prev and
@@ -106,6 +117,12 @@ condition variable. For instance, in the case of a scheduling strategy
 with a single task queue, the same condition variable would be used to
 block and wake up all workers.
 
+\fn int starpu_wake_worker(int workerid)
+\ingroup API_Scheduling_Policy
+In simgrid or blocking driver mode, 
+this should be called by push functions to wake the potential workers that are
+supposed to pick up the tasks which just have been pushed, otherwise they may
+remain sleeping.
 
 \fn int starpu_sched_set_min_priority(int min_prio)
 \ingroup API_Scheduling_Policy

+ 7 - 7
doc/doxygen/chapters/api/task_lists.doxy

@@ -28,15 +28,15 @@ Push \p task at the front of \p list
 \ingroup API_Task_Lists
 Push \p task at the back of \p list
 
-\fn struct starpu_task *starpu_task_list_front(struct starpu_task_list *list)
+\fn struct starpu_task *starpu_task_list_front(const struct starpu_task_list *list)
 \ingroup API_Task_Lists
 Get the front of \p list (without removing it)
 
-\fn struct starpu_task *starpu_task_list_back(struct starpu_task_list *list)
+\fn struct starpu_task *starpu_task_list_back(const struct starpu_task_list *list)
 \ingroup API_Task_Lists
 Get the back of \p list (without removing it)
 
-\fn int starpu_task_list_empty(struct starpu_task_list *list)
+\fn int starpu_task_list_empty(const struct starpu_task_list *list)
 \ingroup API_Task_Lists
 Test if \p list is empty
 
@@ -52,19 +52,19 @@ Remove the element at the front of \p list
 \ingroup API_Task_Lists
 Remove the element at the back of \p list
 
-\fn struct starpu_task *starpu_task_list_begin(struct starpu_task_list *list)
+\fn struct starpu_task *starpu_task_list_begin(const struct starpu_task_list *list)
 \ingroup API_Task_Lists
 Get the first task of \p list.
 
-\fn struct starpu_task *starpu_task_list_end(struct starpu_task_list *list)
+\fn struct starpu_task *starpu_task_list_end(const struct starpu_task_list *list)
 \ingroup API_Task_Lists
 Get the end of \p list.
 
-\fn struct starpu_task *starpu_task_list_next(struct starpu_task *task)
+\fn struct starpu_task *starpu_task_list_next(const struct starpu_task *task)
 \ingroup API_Task_Lists
 Get the next task of \p list. This is not erase-safe.
 
-\fn int starpu_task_list_ismember(struct starpu_task_list *list, struct starpu_task *look)
+\fn int starpu_task_list_ismember(const struct starpu_task_list *list, const struct starpu_task *look)
 \ingroup API_Task_Lists
 Test whether the given task \p look is contained in the \p list.
 

+ 66 - 0
doc/doxygen/chapters/api/workers.doxy

@@ -254,4 +254,70 @@ on which device the memory needs to be allocated.
 \ingroup API_Workers_Properties
 Return worker \p type as a string.
 
+\fn int starpu_worker_sched_op_pending(void)
+\ingroup API_Workers_Properties
+Return \c !0 if current worker has a scheduling operation in progress,
+and \c 0 otherwise.
+
+\fn void starpu_worker_relax_on(void)
+\ingroup API_Workers_Properties
+Allow other threads and workers to temporarily observe the current
+worker state, even though it is performing a scheduling operation.
+Must be called by a worker before performing a potentially blocking
+call such as acquiring a mutex other than its own sched_mutex. This
+function increases \c state_relax_refcnt from the current worker. No
+more than <c>UINT_MAX-1</c> nested relax_on calls should be performed on
+the same worker. This function is automatically called by \ref
+starpu_worker_lock to relax the caller worker state while attempting
+to lock the target worker.
+
+\fn void starpu_worker_relax_off(void)
+\ingroup API_Workers_Properties
+Must be called after a potentially blocking call is complete, to
+restore the relax state in place before the corresponding relax_on.
+Decreases \c state_relax_refcnt. Calls to \ref starpu_worker_relax_on
+and \c starpu_worker_relax_off must be well parenthesized. This
+function is automatically called by \ref starpu_worker_unlock after the 
+target worker has been unlocked.
+
+\fn int starpu_worker_get_relax_state(void)
+\ingroup API_Workers_Properties
+Returns \c !0 if the current worker \c state_relax_refcnt!=0 and \c 0
+otherwise.
+
+\fn void starpu_worker_lock(int workerid)
+\ingroup API_Workers_Properties
+Acquire the sched mutex of \p workerid. If the caller is a worker,
+distinct from \p workerid, the caller worker automatically enters relax
+state while acquiring the target worker lock.
+
+\fn int starpu_worker_trylock(int workerid)
+\ingroup API_Workers_Properties
+Attempt to acquire the sched mutex of \p workerid. Returns \c 0 if
+successful, \c !0 if \p workerid sched mutex is held or the
+corresponding worker is not in relaxed state.
+If the caller is a worker, distinct from \p workerid, the caller
+worker automatically enters relax state if successfully acquiring the target
+worker lock.
+
+\fn void starpu_worker_unlock(int workerid)
+\ingroup API_Workers_Properties
+Release the previously acquired sched mutex of \p workerid. Restore
+the relaxed state of the caller worker if needed.
+
+\fn void starpu_worker_lock_self(void)
+\ingroup API_Workers_Properties
+Acquire the current worker sched mutex.
+
+\fn void starpu_worker_unlock_self(void)
+\ingroup API_Workers_Properties
+Release the current worker sched mutex.
+
+\fn int starpu_wake_worker_relax(int workerid)
+\ingroup API_Workers_Properties
+Wake up \p workerid while temporarily entering the current worker relaxed state
+if needed during the waiting process. Returns 1 if \p workerid has been woken
+up or its state_keep_awake flag has been set to 1, and 0 otherwise (if \p
+workerid was not in the STATE_SLEEPING or in the STATE_SCHEDULING).
+
 */

+ 6 - 3
doc/doxygen/chapters/code/multiformat.c

@@ -30,7 +30,8 @@ starpu_data_handle_t handle;
  */
 #ifdef STARPU_USE_OPENCL
 void cpu_to_opencl_opencl_func(void *buffers[], void *args);
-struct starpu_codelet cpu_to_opencl_cl = {
+struct starpu_codelet cpu_to_opencl_cl =
+{
     .where = STARPU_OPENCL,
     .opencl_funcs = { cpu_to_opencl_opencl_func },
     .nbuffers = 1,
@@ -38,7 +39,8 @@ struct starpu_codelet cpu_to_opencl_cl = {
 };
 
 void opencl_to_cpu_func(void *buffers[], void *args);
-struct starpu_codelet opencl_to_cpu_cl = {
+struct starpu_codelet opencl_to_cpu_cl =
+{
     .where = STARPU_CPU,
     .cpu_funcs = { opencl_to_cpu_func },
     .cpu_funcs_name = { "opencl_to_cpu_func" },
@@ -47,7 +49,8 @@ struct starpu_codelet opencl_to_cpu_cl = {
 };
 #endif
 
-struct starpu_multiformat_data_interface_ops format_ops = {
+struct starpu_multiformat_data_interface_ops format_ops =
+{
 #ifdef STARPU_USE_OPENCL
     .opencl_elemsize = 2 * sizeof(float),
     .cpu_to_opencl_cl = &cpu_to_opencl_cl,

+ 2 - 1
doc/doxygen/chapters/code/vector_scal_c.c

@@ -32,7 +32,8 @@ extern void scal_sse_func(void *buffers[], void *_args);
 extern void scal_cuda_func(void *buffers[], void *_args);
 extern void scal_opencl_func(void *buffers[], void *_args);
 
-static struct starpu_codelet cl = {
+static struct starpu_codelet cl =
+{
     .where = STARPU_CPU | STARPU_CUDA | STARPU_OPENCL,
     /* CPU implementation of the codelet */
     .cpu_funcs = { scal_cpu_func, scal_sse_func },

+ 2 - 1
doc/doxygen/chapters/code/vector_scal_opencl_codelet.cl

@@ -18,7 +18,8 @@
 __kernel void vector_mult_opencl(int nx, __global float* val, float factor)
 {
         const int i = get_global_id(0);
-        if (i < nx) {
+        if (i < nx)
+	{
                 val[i] *= factor;
         }
 }

+ 13 - 8
examples/Makefile.am

@@ -77,11 +77,13 @@ EXTRA_DIST = 					\
 	scheduler/schedulers.sh				\
 	scheduler/schedulers_context.sh			\
 	fortran/Makefile				\
-	sched_ctx/axpy_partition_gpu.h				\
-	sched_ctx/axpy_partition_gpu.cu
+	sched_ctx/axpy_partition_gpu.h			\
+	sched_ctx/axpy_partition_gpu.cu			\
+	heat/heat.sh					\
+	lu/lu.sh
 
 
-CLEANFILES = *.gcno *.gcda *.linkinfo *.mod starpu_idle_microsec.log
+CLEANFILES = *.gcno *.gcda *.linkinfo *.mod starpu_idle_microsec.log *.mps *.dot *.pl
 
 if STARPU_USE_CUDA
 
@@ -300,6 +302,13 @@ STARPU_EXAMPLES +=				\
 	heat/heat				\
 	cg/cg					\
 	pipeline/pipeline
+
+if !STARPU_USE_MPI_MASTER_SLAVE
+TESTS += \
+	heat/heat.sh				\
+	lu/lu.sh
+
+endif
 endif
 endif
 
@@ -335,12 +344,8 @@ STARPU_EXAMPLES +=				\
 	sched_ctx/sched_ctx_without_sched_policy\
 	sched_ctx/nested_sched_ctxs		\
 	sched_ctx/sched_ctx_without_sched_policy_awake\
-	sched_ctx/parallel_tasks_reuse_handle
-
-if STARPU_LONG_CHECK
-STARPU_EXAMPLES +=				\
+	sched_ctx/parallel_tasks_reuse_handle	\
 	sched_ctx/parallel_code
-endif
 
 if STARPU_HAVE_HWLOC
 if STARPU_HWLOC_HAVE_TOPOLOGY_DUP

+ 2 - 2
examples/basic_examples/mult.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2010-2011, 2013, 2015  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011, 2012, 2013  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -136,7 +136,7 @@ static void init_problem_data(void)
 	C = (float *) malloc(xdim*ydim*sizeof(float));
 
 	/* fill the A and B matrices */
-	srand(2009);
+	starpu_srand48(2009);
 	for (j=0; j < ydim; j++)
 	{
 		for (i=0; i < zdim; i++)

+ 5 - 0
examples/cg/cg.c

@@ -389,6 +389,11 @@ static void parse_args(int argc, char **argv)
 	        if (strcmp(argv[i], "-maxiter") == 0)
 		{
 			i_max = atoi(argv[++i]);
+			if (i_max <= 0)
+			{
+				FPRINTF(stderr, "the number of iterations must be positive, not %d\n", i_max);
+				exit(EXIT_FAILURE);
+			}
 			continue;
 		}
 

+ 7 - 7
examples/cholesky/cholesky_models.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010-2011, 2015  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2017  CNRS
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -51,7 +51,7 @@ double cpu_chol_task_11_cost(struct starpu_task *task, struct starpu_perfmodel_a
 	double cost = (((double)(n)*n*n)/1000.0f*0.894/0.79176);
 
 #ifdef STARPU_MODEL_DEBUG
-	FPRINTF(stdout, "cpu_chol_task_11_cost n %d cost %e\n", n, cost);
+	FPRINTF(stdout, "cpu_chol_task_11_cost n %u cost %e\n", n, cost);
 #endif
 
 	return PERTURBATE(cost);
@@ -66,7 +66,7 @@ double cuda_chol_task_11_cost(struct starpu_task *task, struct starpu_perfmodel_
 	double cost = (((double)(n)*n*n)/50.0f/10.75/5.088633/0.9883);
 
 #ifdef STARPU_MODEL_DEBUG
-	FPRINTF(stdout, "cuda_chol_task_11_cost n %d cost %e\n", n, cost);
+	FPRINTF(stdout, "cuda_chol_task_11_cost n %u cost %e\n", n, cost);
 #endif
 
 	return PERTURBATE(cost);
@@ -81,7 +81,7 @@ double cpu_chol_task_21_cost(struct starpu_task *task, struct starpu_perfmodel_a
 	double cost = (((double)(n)*n*n)/7706.674/0.95/0.9965);
 
 #ifdef STARPU_MODEL_DEBUG
-	FPRINTF(stdout, "cpu_chol_task_21_cost n %d cost %e\n", n, cost);
+	FPRINTF(stdout, "cpu_chol_task_21_cost n %u cost %e\n", n, cost);
 #endif
 
 	return PERTURBATE(cost);
@@ -96,7 +96,7 @@ double cuda_chol_task_21_cost(struct starpu_task *task, struct starpu_perfmodel_
 	double cost = (((double)(n)*n*n)/50.0f/10.75/87.29520);
 
 #ifdef STARPU_MODEL_DEBUG
-	FPRINTF(stdout, "cuda_chol_task_21_cost n %d cost %e\n", n, cost);
+	FPRINTF(stdout, "cuda_chol_task_21_cost n %u cost %e\n", n, cost);
 #endif
 
 	return PERTURBATE(cost);
@@ -111,7 +111,7 @@ double cpu_chol_task_22_cost(struct starpu_task *task, struct starpu_perfmodel_a
 	double cost = (((double)(n)*n*n)/50.0f/10.75/8.0760);
 
 #ifdef STARPU_MODEL_DEBUG
-	FPRINTF(stdout, "cpu_chol_task_22_cost n %d cost %e\n", n, cost);
+	FPRINTF(stdout, "cpu_chol_task_22_cost n %u cost %e\n", n, cost);
 #endif
 
 	return PERTURBATE(cost);
@@ -126,7 +126,7 @@ double cuda_chol_task_22_cost(struct starpu_task *task, struct starpu_perfmodel_
 	double cost = (((double)(n)*n*n)/50.0f/10.75/76.30666);
 
 #ifdef STARPU_MODEL_DEBUG
-	FPRINTF(stdout, "cuda_chol_task_22_cost n %d cost %e\n", n, cost);
+	FPRINTF(stdout, "cuda_chol_task_22_cost n %u cost %e\n", n, cost);
 #endif
 
 	return PERTURBATE(cost);

+ 2 - 2
examples/cpp/add_vectors_cpp11.cpp

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009, 2010-2011, 2013-2015  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
  * Copyright (C) 2012 INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -32,7 +32,7 @@
 #if !defined(STARPU_HAVE_CXX11)
 int main(int argc, char **argv)
 {
-	return STARPU_TEST_SKIPPED;
+	return 77;
 }
 #else
 void cpu_kernel_add_vectors(void *buffers[], void *cl_arg)

+ 1 - 1
examples/filters/fblock_opencl_kernel.cl

@@ -19,7 +19,7 @@
 __kernel void fblock_opencl(__global int* block, unsigned offset, int nx, int ny, int nz, unsigned ldy, unsigned ldz, int factor)
 {
         int i, j, k;
-        block = (__global void *)block + offset;
+        block = (__global char *)block + offset;
         for(k=0; k<nz ; k++)
 	{
                 for(j=0; j<ny ; j++)

+ 5 - 0
examples/filters/fmultiple_manual.c

@@ -130,6 +130,11 @@ int main(int argc, char **argv)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+	/* force to execute task on the home_node, here it is STARPU_MAIN_RAM */
+	cl_switch.specific_nodes = 1;
+	for (i = 0; i < STARPU_NMAXBUFS; i++)
+		cl_switch.nodes[i] = STARPU_MAIN_RAM;
+
 	/* Declare the whole matrix to StarPU */
 	starpu_matrix_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)matrix, NX, NX, NY, sizeof(matrix[0][0]));
 

+ 105 - 79
examples/heat/dw_sparse_cg.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011, 2015  Université de Bordeaux
+ * Copyright (C) 2009, 2010, 2011, 2015, 2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -25,11 +25,7 @@
 
 static struct starpu_task *create_task(starpu_tag_t id)
 {
-	struct starpu_codelet *cl = calloc(1,sizeof(struct starpu_codelet));
-
 	struct starpu_task *task = starpu_task_create();
-		task->cl = cl;
-		task->cl_arg = NULL;
 		task->use_tag = 1;
 		task->tag_id = id;
 
@@ -131,6 +127,30 @@ void init_problem(void)
  *	cg initialization phase
  */
 
+static struct starpu_codelet cl1 = {
+	.cpu_funcs = { cpu_codelet_func_1 },
+	.cpu_funcs_name = { "cpu_codelet_func_1" },
+	.nbuffers = 4,
+	.modes = { STARPU_R, STARPU_R, STARPU_W, STARPU_R },
+};
+
+static struct starpu_codelet cl2 = {
+	.cpu_funcs = { cpu_codelet_func_2 },
+	.cpu_funcs_name = { "cpu_codelet_func_2" },
+	.nbuffers = 2,
+	.modes = { STARPU_W, STARPU_R },
+};
+
+static struct starpu_codelet cl3 = {
+	.cpu_funcs = { cpu_codelet_func_3 },
+	.cpu_funcs_name = { "cpu_codelet_func_3" },
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = { cublas_codelet_func_3 },
+#endif
+	.nbuffers = 1,
+	.modes = { STARPU_R },
+};
+
 void init_cg(struct cg_problem *problem)
 {
 	int ret;
@@ -139,14 +159,7 @@ void init_cg(struct cg_problem *problem)
 
 	/* r = b  - A x */
 	struct starpu_task *task1 = create_task(1UL);
-	task1->cl->cpu_funcs[0] = cpu_codelet_func_1;
-	task1->cl->cpu_funcs_name[0] = "cpu_codelet_func_1";
-	task1->cl->nbuffers = 4;
-	task1->cl->modes[0] = STARPU_R;
-	task1->cl->modes[1] = STARPU_R;
-	task1->cl->modes[2] = STARPU_W;
-	task1->cl->modes[3] = STARPU_R;
-
+	task1->cl = &cl1;
 	task1->handles[0] = problem->ds_matrixA;
 	task1->handles[1] = problem->ds_vecx;
 	task1->handles[2] = problem->ds_vecr;
@@ -154,12 +167,7 @@ void init_cg(struct cg_problem *problem)
 
 	/* d = r */
 	struct starpu_task *task2 = create_task(2UL);
-	task2->cl->cpu_funcs[0] = cpu_codelet_func_2;
-	task2->cl->cpu_funcs_name[0] = "cpu_codelet_func_2";
-	task2->cl->nbuffers = 2;
-	task2->cl->modes[0] = STARPU_W;
-	task2->cl->modes[1] = STARPU_R;
-
+	task2->cl = &cl2;
 	task2->handles[0] = problem->ds_vecd;
 	task2->handles[1] = problem->ds_vecr;
 
@@ -167,15 +175,9 @@ void init_cg(struct cg_problem *problem)
 
 	/* delta_new = trans(r) r */
 	struct starpu_task *task3 = create_task(3UL);
-#ifdef STARPU_USE_CUDA
-	task3->cl->cuda_funcs[0] = cublas_codelet_func_3;
-#endif
-	task3->cl->cpu_funcs[0] = cpu_codelet_func_3;
-	task3->cl->cpu_funcs_name[0] = "cpu_codelet_func_3";
+	task3->cl = &cl3;
 	task3->cl_arg = problem;
 	task3->cl_arg_size = sizeof(*problem);
-	task3->cl->nbuffers = 1;
-	task3->cl->modes[0] = STARPU_R;
 	task3->handles[0] = problem->ds_vecr;
 
 	task3->callback_func = iteration_cg;
@@ -186,6 +188,11 @@ void init_cg(struct cg_problem *problem)
 
 	/* launch the computation now */
 	ret = starpu_task_submit(task1);
+	if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
+		FPRINTF(stderr, "No worker may execute this task\n");
+		exit(0);
+	}
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	ret = starpu_task_submit(task2);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
@@ -198,6 +205,66 @@ void init_cg(struct cg_problem *problem)
  *		the codelet code launcher is its own callback !
  */
 
+static struct starpu_codelet cl4 = {
+	.cpu_funcs = { cpu_codelet_func_4 },
+	.cpu_funcs_name = { "cpu_codelet_func_4" },
+	.nbuffers = 3,
+	.modes = { STARPU_R, STARPU_R, STARPU_W },
+};
+
+static struct starpu_codelet cl5 = {
+	.cpu_funcs = { cpu_codelet_func_5 },
+	.cpu_funcs_name = { "cpu_codelet_func_5" },
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = { cublas_codelet_func_5 },
+#endif
+	.nbuffers = 2,
+	.modes = { STARPU_R, STARPU_R },
+};
+
+static struct starpu_codelet cl6 = {
+	.cpu_funcs = { cpu_codelet_func_6 },
+	.cpu_funcs_name = { "cpu_codelet_func_6" },
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = { cublas_codelet_func_6 },
+	.cuda_flags = { STARPU_CUDA_ASYNC },
+#endif
+	.nbuffers = 2,
+	.modes = { STARPU_RW, STARPU_R },
+};
+
+static struct starpu_codelet cl7 = {
+	.cpu_funcs = { cpu_codelet_func_7 },
+	.cpu_funcs_name = { "cpu_codelet_func_7" },
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = { cublas_codelet_func_7 },
+	.cuda_flags = { STARPU_CUDA_ASYNC },
+#endif
+	.nbuffers = 2,
+	.modes = { STARPU_RW, STARPU_R },
+};
+
+static struct starpu_codelet cl8 = {
+	.cpu_funcs = { cpu_codelet_func_8 },
+	.cpu_funcs_name = { "cpu_codelet_func_8" },
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = { cublas_codelet_func_8 },
+#endif
+	.nbuffers = 1,
+	.modes = { STARPU_R },
+};
+
+static struct starpu_codelet cl9 = {
+	.cpu_funcs = { cpu_codelet_func_9 },
+	.cpu_funcs_name = { "cpu_codelet_func_9" },
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = { cublas_codelet_func_9 },
+	.cuda_flags = { STARPU_CUDA_ASYNC },
+#endif
+	.nbuffers = 2,
+	.modes = { STARPU_RW, STARPU_R },
+};
+
 void launch_new_cg_iteration(struct cg_problem *problem)
 {
 	int ret;
@@ -208,30 +275,16 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
 	/* q = A d */
 	struct starpu_task *task4 = create_task(maskiter | 4UL);
-	task4->cl->cpu_funcs[0] = cpu_codelet_func_4;
-	task4->cl->cpu_funcs_name[0] = "cpu_codelet_func_4";
-	task4->cl->nbuffers = 3;
-	task4->cl->modes[0] = STARPU_R;
-	task4->cl->modes[1] = STARPU_R;
-	task4->cl->modes[2] = STARPU_W;
-
+	task4->cl = &cl4;
 	task4->handles[0] = problem->ds_matrixA;
 	task4->handles[1] = problem->ds_vecd;
 	task4->handles[2] = problem->ds_vecq;
 
 	/* alpha = delta_new / ( trans(d) q )*/
 	struct starpu_task *task5 = create_task(maskiter | 5UL);
-#ifdef STARPU_USE_CUDA
-	task5->cl->cuda_funcs[0] = cublas_codelet_func_5;
-#endif
-	task5->cl->cpu_funcs[0] = cpu_codelet_func_5;
-	task5->cl->cpu_funcs_name[0] = "cpu_codelet_func_5";
+	task5->cl = &cl5;
 	task5->cl_arg = problem;
 	task5->cl_arg_size = sizeof(*problem);
-	task5->cl->nbuffers = 2;
-	task5->cl->modes[0] = STARPU_R;
-	task5->cl->modes[1] = STARPU_R;
-
 	task5->handles[0] = problem->ds_vecd;
 	task5->handles[1] = problem->ds_vecq;
 
@@ -239,18 +292,9 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
 	/* x = x + alpha d */
 	struct starpu_task *task6 = create_task(maskiter | 6UL);
-#ifdef STARPU_USE_CUDA
-	task6->cl->cuda_funcs[0] = cublas_codelet_func_6;
-	task6->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
-#endif
-	task6->cl->cpu_funcs[0] = cpu_codelet_func_6;
-	task6->cl->cpu_funcs_name[0] = "cpu_codelet_func_6";
+	task6->cl = &cl6;
 	task6->cl_arg = problem;
 	task6->cl_arg_size = sizeof(*problem);
-	task6->cl->nbuffers = 2;
-	task6->cl->modes[0] = STARPU_RW;
-	task6->cl->modes[1] = STARPU_R;
-
 	task6->handles[0] = problem->ds_vecx;
 	task6->handles[1] = problem->ds_vecd;
 
@@ -258,18 +302,9 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
 	/* r = r - alpha q */
 	struct starpu_task *task7 = create_task(maskiter | 7UL);
-#ifdef STARPU_USE_CUDA
-	task7->cl->cuda_funcs[0] = cublas_codelet_func_7;
-	task7->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
-#endif
-	task7->cl->cpu_funcs[0] = cpu_codelet_func_7;
-	task7->cl->cpu_funcs_name[0] = "cpu_codelet_func_7";
+	task7->cl = &cl7;
 	task7->cl_arg = problem;
 	task7->cl_arg_size = sizeof(*problem);
-	task7->cl->nbuffers = 2;
-	task7->cl->modes[0] = STARPU_RW;
-	task7->cl->modes[1] = STARPU_R;
-
 	task7->handles[0] = problem->ds_vecr;
 	task7->handles[1] = problem->ds_vecq;
 
@@ -277,33 +312,18 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
 	/* update delta_* and compute beta */
 	struct starpu_task *task8 = create_task(maskiter | 8UL);
-#ifdef STARPU_USE_CUDA
-	task8->cl->cuda_funcs[0] = cublas_codelet_func_8;
-#endif
-	task8->cl->cpu_funcs[0] = cpu_codelet_func_8;
-	task8->cl->cpu_funcs_name[0] = "cpu_codelet_func_8";
+	task8->cl = &cl8;
 	task8->cl_arg = problem;
 	task8->cl_arg_size = sizeof(*problem);
-	task8->cl->nbuffers = 1;
-	task8->cl->modes[0] = STARPU_R;
 	task8->handles[0] = problem->ds_vecr;
 
 	starpu_tag_declare_deps((starpu_tag_t)(maskiter | 8UL), 1, (starpu_tag_t)(maskiter | 7UL));
 
 	/* d = r + beta d */
 	struct starpu_task *task9 = create_task(maskiter | 9UL);
-#ifdef STARPU_USE_CUDA
-	task9->cl->cuda_funcs[0] = cublas_codelet_func_9;
-	task9->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
-#endif
-	task9->cl->cpu_funcs[0] = cpu_codelet_func_9;
-	task9->cl->cpu_funcs_name[0] = "cpu_codelet_func_9";
+	task9->cl = &cl9;
 	task9->cl_arg = problem;
 	task9->cl_arg_size = sizeof(*problem);
-	task9->cl->nbuffers = 2;
-	task9->cl->modes[0] = STARPU_RW;
-	task9->cl->modes[1] = STARPU_R;
-
 	task9->handles[0] = problem->ds_vecd;
 	task9->handles[1] = problem->ds_vecr;
 
@@ -427,6 +447,10 @@ void conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz,
 	starpu_data_unregister(ds_vecr);
 	starpu_data_unregister(ds_vecd);
 	starpu_data_unregister(ds_vecq);
+
+	free(ptr_vecr);
+	free(ptr_vecd);
+	free(ptr_vecq);
 }
 
 
@@ -444,4 +468,6 @@ void do_conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz
 	starpu_cublas_init();
 
 	conjugate_gradient(nzvalA, vecb, vecx, nnz, nrow, colind, rowptr);
+
+	starpu_shutdown();
 }

+ 7 - 3
examples/heat/heat.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2012, 2015  Université de Bordeaux
+ * Copyright (C) 2009, 2010, 2012, 2015, 2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -121,9 +121,9 @@ static void parse_args(int argc, char **argv)
 			STARPU_ASSERT((nthick - 2)*(ntheta - 2) == size);
 		}
 
-		if (strcmp(argv[i], "-h") == 0)
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-help") == 0)
 		{
-			printf("usage : %s [-v1|-v2|-v3] [-pin] [-nthick number] [-ntheta number] [-shape [0|1|2]] [-cg] [-size number] [-no-prio]\n", argv[0]);
+			printf("usage : %s [-v1|-v2|-v3|-v4] [-pin] [-nthick number] [-ntheta number] [-shape [0|1|2]] [-cg] [-size number] [-no-prio]\n", argv[0]);
 		}
 	}
 }
@@ -751,6 +751,10 @@ int main(int argc, char **argv)
 			result[TRANSLATE(i)] = Bformer[TRANSLATE(i)];
 		}
 
+		free(nzval);
+		free(colind);
+		free(rowptr);
+		free(B);
 	}
 	else
 	{

+ 44 - 0
examples/heat/heat.sh

@@ -0,0 +1,44 @@
+#!/bin/bash
+#
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2017  Université de Bordeaux
+# Copyright (C) 2017  Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+# Test various heat options
+
+set -e
+
+PREFIX=$(dirname $0)
+
+$PREFIX/heat -shape 0
+$PREFIX/heat -shape 1
+# sometimes leads to the pivot being 0
+#$PREFIX/heat -shape 2
+
+$PREFIX/heat -cg
+
+# TODO: FIXME
+
+# segfault
+#$PREFIX/heat -v1
+
+# (actually the default...)
+$PREFIX/heat -v2
+
+# hang
+#$PREFIX/heat -v3
+
+# hang
+#$PREFIX/heat -v4

+ 19 - 2
examples/interface/complex_codelet.h

@@ -22,6 +22,21 @@
 
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
 
+/* Dumb performance model for simgrid */
+static double complex_cost_function(struct starpu_task *task, unsigned nimpl)
+{
+	(void) task;
+	(void) nimpl;
+	return 0.000001;
+}
+
+static struct starpu_perfmodel complex_model =
+{
+	.type = STARPU_COMMON,
+	.cost_function = complex_cost_function,
+	.symbol = "complex"
+};
+
 void compare_complex_codelet(void *descr[], void *_args)
 {
 	int nx1 = STARPU_COMPLEX_GET_NX(descr[0]);
@@ -57,7 +72,8 @@ struct starpu_codelet cl_compare =
 	/* .cpu_funcs_name = {"compare_complex_codelet"}, */
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_R},
-	.name = "cl_compare"
+	.name = "cl_compare",
+	.model = &complex_model
 };
 
 void display_complex_codelet(void *descr[], void *_args)
@@ -83,7 +99,8 @@ struct starpu_codelet cl_display =
 	.cpu_funcs_name = {"display_complex_codelet"},
 	.nbuffers = 1,
 	.modes = {STARPU_R},
-	.name = "cl_display"
+	.name = "cl_display",
+	.model = &complex_model
 };
 
 #endif /* __COMPLEX_CODELET_H */

+ 34 - 0
examples/lu/lu.sh

@@ -0,0 +1,34 @@
+#!/bin/bash
+#
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2017  Université de Bordeaux
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+# Test various LU options
+
+set -e
+
+PREFIX=$(dirname $0)
+
+$PREFIX/lu_implicit_example_float -size $((960 * 4)) -nblocks 4 -piv
+$PREFIX/lu_implicit_example_float -size $((960 * 4)) -nblocks 4 -no-stride
+$PREFIX/lu_implicit_example_float -size $((960 * 4)) -nblocks 4 -bound
+$PREFIX/lu_implicit_example_float -size $((960 * 2)) -nblocks 2 -bounddeps
+$PREFIX/lu_implicit_example_float -size $((960 * 2)) -nblocks 2 -bound -bounddeps -bounddepsprio
+
+$PREFIX/lu_example_float -size $((960 * 4)) -nblocks 4 -piv
+$PREFIX/lu_example_float -size $((960 * 4)) -nblocks 4 -no-stride
+$PREFIX/lu_example_float -size $((960 * 4)) -nblocks 4 -bound
+$PREFIX/lu_example_float -size $((960 * 2)) -nblocks 2 -bounddeps
+$PREFIX/lu_example_float -size $((960 * 2)) -nblocks 2 -bound -bounddeps -bounddepsprio

+ 3 - 1
examples/lu/lu_example.c

@@ -422,13 +422,15 @@ int main(int argc, char **argv)
 		if (pivot)
 		{
 			pivot_saved_matrix(ipiv);
-			free(ipiv);
 		}
 
 		check_result();
 	}
 #endif
 
+	if (pivot)
+		free(ipiv);
+
 	starpu_free_flags(A, (size_t)size*size*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
 
 	starpu_cublas_shutdown();

+ 4 - 0
examples/lu/xlu_implicit_pivot.c

@@ -232,6 +232,10 @@ starpu_data_handle_t get_block_with_striding(starpu_data_handle_t *dataAp,
 
 int STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks)
 {
+	if (starpu_mic_worker_get_count() || starpu_scc_worker_get_count() || starpu_mpi_ms_worker_get_count())
+		/* These won't work with pivoting: we pass a pointer in cl_args */
+		return -ENODEV;
+
 	starpu_data_handle_t dataA;
 
 	/* monitor and partition the A matrix into blocks :

+ 5 - 0
examples/lu/xlu_pivot.c

@@ -399,6 +399,7 @@ int STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size,
 
 	/* gather all the data */
 	starpu_data_unpartition(dataA, STARPU_MAIN_RAM);
+	starpu_data_unregister(dataA);
 	free(piv_description);
 
 	return ret;
@@ -413,6 +414,10 @@ starpu_data_handle_t get_block_with_no_striding(starpu_data_handle_t *dataAp, un
 
 int STARPU_LU(lu_decomposition_pivot_no_stride)(TYPE **matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks)
 {
+	if (starpu_mic_worker_get_count() || starpu_scc_worker_get_count() || starpu_mpi_ms_worker_get_count())
+		/* These won't work with pivoting: we pass a pointer in cl_args */
+		return -ENODEV;
+
 	starpu_data_handle_t *dataAp = malloc(nblocks*nblocks*sizeof(starpu_data_handle_t));
 
 	/* monitor and partition the A matrix into blocks :

+ 3 - 0
examples/mandelbrot/mandelbrot.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010, 2011, 2014-2015, 2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
+ * Copyright (C) 2017  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -497,7 +498,9 @@ int main(int argc, char **argv)
 	conf.ncuda = 0;
 
 	if (use_spmd_p)
+	{
 		conf.sched_policy_name = "peager";
+	}
 
 	ret = starpu_init(&conf);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

+ 45 - 20
examples/mlr/mlr.c

@@ -50,7 +50,14 @@ static long sum;
 static void cl_params(struct starpu_task *task, double *parameters)
 {
 	int m, n, k;
-	starpu_codelet_unpack_args(task->cl_arg, &m, &n, &k);
+	int* vector_mn;
+
+	vector_mn = (int*)STARPU_VECTOR_GET_PTR(task->interfaces[0]);
+	m = vector_mn[0];
+	n = vector_mn[1];
+
+	starpu_codelet_unpack_args(task->cl_arg, &k);
+
 	parameters[0] = m;
 	parameters[1] = n;
 	parameters[2] = k;
@@ -61,10 +68,13 @@ void cpu_func(void *buffers[], void *cl_arg)
 {
 	long i;
 	int m,n,k;
-	starpu_codelet_unpack_args(cl_arg,
-			     	  &m,
-     			     	  &n,
-     			     	  &k);
+	int* vector_mn;
+
+	vector_mn = (int*)STARPU_VECTOR_GET_PTR(buffers[0]);
+	m = vector_mn[0];
+	n = vector_mn[1];
+
+	starpu_codelet_unpack_args(cl_arg, &k);
 
 	for(i=0; i < (long) (m*m*n); i++)
 		sum+=i;
@@ -123,7 +133,8 @@ static struct starpu_codelet cl_init =
 {
 	.cpu_funcs = { cpu_func },
 	.cpu_funcs_name = { "cpu_func" },
-	.nbuffers = 0,
+	.nbuffers = 1,
+	.modes = {STARPU_R},
 	.model = &cl_model_init,
 };
 
@@ -131,7 +142,8 @@ static struct starpu_codelet cl_final =
 {
 	.cpu_funcs = { cpu_func },
 	.cpu_funcs_name = { "cpu_func" },
-	.nbuffers = 0,
+	.nbuffers = 1,
+	.modes = {STARPU_R},
 	.model = &cl_model_final,
 };
 
@@ -147,29 +159,42 @@ int main(int argc, char **argv)
 
 	sum=0;
 	int m,n,k;
+	int* vector_mn = calloc( 2, sizeof(int) );
+	starpu_data_handle_t vector_mn_handle;
+
+	starpu_vector_data_register( &vector_mn_handle,
+				     STARPU_MAIN_RAM,
+				     (uintptr_t)vector_mn, 2,
+				     sizeof(int) );
 
-        /* Giving pseudo-random values to the M,N,K parameters and inserting tasks */
-	for(i=0; i < 42; i++)
+	/* Giving pseudo-random values to the M,N,K parameters and inserting tasks */
+	for ( i = 0; i < 42; i++)
 	{
 		m = (int) ((rand() % 10)+1);
 		n = (int) ((rand() % 10)+1);
 		k = (int) ((rand() % 10)+1);
 
-		for(j=0; j < 42; j++)
+		/* To illustrate the usage, M and N are stored in a data handle */
+		starpu_data_acquire(vector_mn_handle, STARPU_W);
+		vector_mn[0] = m;
+		vector_mn[1] = n;
+		starpu_data_release(vector_mn_handle);
+
+		for ( j = 0; j < 42; j++)
 		{
-			starpu_insert_task(&cl_init,
-				   STARPU_VALUE, &m, sizeof(int),
-				   STARPU_VALUE, &n, sizeof(int),
-				   STARPU_VALUE, &k, sizeof(int),
-				   0);
-			starpu_insert_task(&cl_final,
-				   STARPU_VALUE, &m, sizeof(int),
-				   STARPU_VALUE, &n, sizeof(int),
-				   STARPU_VALUE, &k, sizeof(int),
-				   0);
+			starpu_insert_task( &cl_init,
+					    STARPU_R, vector_mn_handle,
+					    STARPU_VALUE, &k, sizeof(int),
+					    0 );
+			starpu_insert_task( &cl_final,
+					    STARPU_R, vector_mn_handle,
+					    STARPU_VALUE, &k, sizeof(int),
+					    0 );
 		}
 	}
 
+	starpu_data_unregister(vector_mn_handle);
+	free(vector_mn);
 	starpu_shutdown();
 
 	return 0;

+ 8 - 2
examples/sched_ctx/dummy_sched_with_ctx.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2016  Université de Bordeaux
+ * Copyright (C) 2010-2017  Université de Bordeaux
  * Copyright (C) 2010-2013, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -118,8 +118,14 @@ static struct starpu_task *pop_task_dummy(unsigned sched_ctx_id)
 	 * the calling worker. So we just take the head of the list and give it
 	 * to the worker. */
 	struct dummy_sched_data *data = (struct dummy_sched_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
+#ifdef STARPU_NON_BLOCKING_DRIVERS
+	if (starpu_task_list_empty(&data->sched_list))
+		return NULL;
+#endif
 	STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
-	struct starpu_task *task = starpu_task_list_pop_back(&data->sched_list);
+	struct starpu_task *task = NULL;
+	if (!starpu_task_list_empty(&data->sched_list))
+		task = starpu_task_list_pop_back(&data->sched_list);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 	return task;
 }

+ 5 - 3
examples/sched_ctx/gpu_partition.c

@@ -105,7 +105,9 @@ int main(int argc, char **argv)
 	int ncuda = 0;
 	int gpu_devid = -1;
 
+#ifdef STARPU_DEVEL
 #warning temporary fix: skip test as cuda computation fails
+#endif
  	return 77;
 
 #ifndef STARPU_HAVE_SETENV
@@ -172,8 +174,8 @@ int main(int argc, char **argv)
 	int ncpus = starpu_cpu_worker_get_count();
 	int workers[ncpus+nstreams];
 	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, workers, ncpus);
-	
-	int sched_ctxs[nstreams];
+
+	unsigned sched_ctxs[nstreams];
 	int nsms[nstreams];
 	nsms[0] = 6;
 	nsms[1] = 7;
@@ -185,7 +187,7 @@ int main(int argc, char **argv)
 	}
 	unsigned sched_ctx1 = starpu_sched_ctx_create(workers, ncpus+nstreams, "ctx1", STARPU_SCHED_CTX_SUB_CTXS, sched_ctxs, nstreams, STARPU_SCHED_CTX_POLICY_NAME, "dmdas", 0);
 
-	FPRINTF(stderr, "parent ctx %d\n", sched_ctx1);
+	FPRINTF(stderr, "parent ctx %u\n", sched_ctx1);
 	starpu_sched_ctx_set_context(&sched_ctx1);
 
 #endif

+ 5 - 1
examples/sched_ctx/nested_sched_ctxs.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010-2015  Université de Bordeaux
  * Copyright (C) 2010-2014, 2016, 2017  CNRS
+ * Copyright (C) 2017  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -53,7 +54,10 @@ static void sched_ctx_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
 	unsigned sched_ctx = (uintptr_t)arg;
 	int t = parallel_code(sched_ctx);
 	if (sched_ctx > 0 && sched_ctx < 3)
-		tasks_executed[sched_ctx-1] += t;
+	{
+		STARPU_ATOMIC_ADD(&tasks_executed[sched_ctx-1], t);
+	}
+
 	//printf("w %d executed %d it \n", w, n);
 }
 

+ 8 - 6
examples/sched_ctx/parallel_code.c

@@ -16,6 +16,7 @@
  */
 
 #include <starpu.h>
+#ifdef STARPU_USE_CPU
 #include <omp.h>
 
 #ifdef STARPU_QUICK_CHECK
@@ -71,16 +72,10 @@ int main(int argc, char **argv)
 	int nprocs1;
 	int *procs1;
 
-#ifdef STARPU_USE_CPU
 	unsigned ncpus =  starpu_cpu_worker_get_count();
 	procs1 = (int*)malloc(ncpus*sizeof(int));
 	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, ncpus);
 	nprocs1 = ncpus;
-#else
-	nprocs1 = 1;
-	procs1 = (int*)malloc(nprocs1*sizeof(int));
-	procs1[0] = 0;
-#endif
 
 	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", STARPU_SCHED_CTX_POLICY_NAME, "dmda", 0);
 
@@ -100,3 +95,10 @@ int main(int argc, char **argv)
 
 	return 0;
 }
+#else /* STARPU_USE_CPU */
+int main(int argc, char **argv)
+{
+	/* starpu_sched_ctx_exec_parallel_code() requires a CPU worker as parallel region master */
+	return 77; /* STARPU_TEST_SKIPPED */
+}
+#endif /* STARPU_USE_CPU */

+ 15 - 11
examples/sched_ctx/parallel_tasks_reuse_handle.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2015 INRIA
+ * Copyright (C) 2015, 2017 INRIA
  * Copyright (C) 2015, 2016 CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -17,6 +17,7 @@
 
 #include <starpu.h>
 #include <omp.h>
+#include <pthread.h>
 
 #ifdef STARPU_QUICK_CHECK
 #define NTASKS 64
@@ -28,6 +29,8 @@
 #define LOOPS  10
 #endif
 
+#define N_NESTED_CTXS 2
+
 struct context
 {
 	int ncpus;
@@ -38,6 +41,7 @@ struct context
 /* Helper for the task that will initiate everything */
 void parallel_task_prologue_init_once_and_for_all(void * sched_ctx_)
 {
+	fprintf(stderr, "%p: %s -->\n", (void*)pthread_self(), __func__);
 	int sched_ctx = *(int *)sched_ctx_;
 	int *cpuids = NULL;
 	int ncpuids = 0;
@@ -50,6 +54,7 @@ void parallel_task_prologue_init_once_and_for_all(void * sched_ctx_)
 
 	omp_set_num_threads(ncpuids);
 	free(cpuids);
+	fprintf(stderr, "%p: %s <--\n", (void*)pthread_self(), __func__);
 	return;
 }
 
@@ -101,25 +106,24 @@ void parallel_task_init()
 						  0);
 
 	/* Initialize nested contexts */
-	/* WARNING : the number of contexts must be a divisor of the number of available cpus*/
-
-	contexts = malloc(sizeof(struct context)*2);
-	int cpus_per_context = main_context.ncpus/2;
+	contexts = malloc(sizeof(struct context)*N_NESTED_CTXS);
+	int cpus_per_context = main_context.ncpus/N_NESTED_CTXS;
 	int i;
-	for(i = 0; i < 2; i++)
+	for(i = 0; i < N_NESTED_CTXS; i++)
 	{
-		fprintf(stderr, "ncpus %d for context %d \n",cpus_per_context, i);
 		contexts[i].ncpus = cpus_per_context;
+		if (i == N_NESTED_CTXS-1)
+			contexts[i].ncpus += main_context.ncpus%N_NESTED_CTXS;
 		contexts[i].cpus = main_context.cpus+i*cpus_per_context;
 	}
 
-	for(i = 0; i < 2; i++)
+	for(i = 0; i < N_NESTED_CTXS; i++)
 		contexts[i].id = starpu_sched_ctx_create(contexts[i].cpus,
 							 contexts[i].ncpus,"nested_ctx",
 							 STARPU_SCHED_CTX_NESTED,main_context.id,
 							 0);
 
-	for (i = 0; i < 2; i++)
+	for (i = 0; i < N_NESTED_CTXS; i++)
 	{
 		parallel_task_init_one_context(&contexts[i].id);
 	}
@@ -131,7 +135,7 @@ void parallel_task_init()
 void parallel_task_deinit()
 {
 	int i;
-	for (i=0; i<2;i++)
+	for (i=0; i<N_NESTED_CTXS;i++)
 		starpu_sched_ctx_delete(contexts[i].id);
 	free(contexts);
 	free(main_context.cpus);
@@ -174,7 +178,7 @@ int main(int argc, char **argv)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	if (starpu_cpu_worker_get_count() < 2)
+	if (starpu_cpu_worker_get_count() < N_NESTED_CTXS)
 	{
 		starpu_shutdown();
 		return 77;

+ 19 - 10
examples/sched_ctx/sched_ctx.c

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2010-2014  Université de Bordeaux
  * Copyright (C) 2010-2014, 2016  CNRS
+ * Copyright (C) 2017  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -25,17 +26,25 @@
 #endif
 
 int tasks_executed = 0;
-starpu_pthread_mutex_t mut;
+int ctx1_tasks_executed = 0;
+int ctx2_tasks_executed = 0;
 
 static void sched_ctx_cpu_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg STARPU_ATTRIBUTE_UNUSED)
 {
-	STARPU_PTHREAD_MUTEX_LOCK(&mut);
-	tasks_executed++;
-	STARPU_PTHREAD_MUTEX_UNLOCK(&mut);
+	(void)STARPU_ATOMIC_ADD(&tasks_executed,1);
+	(void)STARPU_ATOMIC_ADD(&ctx1_tasks_executed,1);
 }
 
-static void sched_ctx_cuda_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg STARPU_ATTRIBUTE_UNUSED)
+static void sched_ctx2_cpu_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg STARPU_ATTRIBUTE_UNUSED)
 {
+	(void)STARPU_ATOMIC_ADD(&tasks_executed,1);
+	(void)STARPU_ATOMIC_ADD(&ctx2_tasks_executed,1);
+}
+
+static void sched_ctx2_cuda_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg STARPU_ATTRIBUTE_UNUSED)
+{
+	(void)STARPU_ATOMIC_ADD(&tasks_executed,1);
+	(void)STARPU_ATOMIC_ADD(&ctx2_tasks_executed,1);
 }
 
 static struct starpu_codelet sched_ctx_codelet1 =
@@ -48,8 +57,8 @@ static struct starpu_codelet sched_ctx_codelet1 =
 
 static struct starpu_codelet sched_ctx_codelet2 =
 {
-	.cpu_funcs = {sched_ctx_cpu_func},
-	.cuda_funcs = {sched_ctx_cuda_func},
+	.cpu_funcs = {sched_ctx2_cpu_func},
+	.cuda_funcs = {sched_ctx2_cuda_func},
 	.model = NULL,
 	.nbuffers = 0,
 	.name = "sched_ctx"
@@ -71,8 +80,6 @@ int main(int argc, char **argv)
 		return 77;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
-	STARPU_PTHREAD_MUTEX_INIT(&mut, NULL);
-
 #ifdef STARPU_USE_CPU
 	nprocs1 = starpu_cpu_worker_get_count();
 	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, nprocs1);
@@ -155,7 +162,9 @@ int main(int argc, char **argv)
 	starpu_sched_ctx_add_workers(procs1, nprocs1, sched_ctx2);
 	starpu_sched_ctx_delete(sched_ctx1);
 	starpu_sched_ctx_delete(sched_ctx2);
-	printf("tasks executed %d out of %d\n", tasks_executed, ntasks/2);
+	printf("tasks executed %d out of %d\n", tasks_executed, ntasks+1);
+	printf("tasks executed on ctx1: %d\n", ctx1_tasks_executed);
+	printf("tasks executed on ctx2: %d\n", ctx2_tasks_executed);
 
 enodev:
 	starpu_shutdown();

+ 2 - 2
examples/sched_ctx/two_cpu_contexts.c

@@ -72,8 +72,8 @@ int main(int argc, char **argv)
 		procs2[i] = procs[i+nprocs1];
 	}
 
-        /* create sched context 1 with default policy, by giving a NULL policy name */
-	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", STARPU_SCHED_CTX_POLICY_NAME, NULL, 0);
+        /* create sched context 1 with default policy, by giving an empty policy name */
+	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", STARPU_SCHED_CTX_POLICY_NAME, "", 0);
         /* create sched context 2 with a user selected policy name */
 	unsigned sched_ctx2 = starpu_sched_ctx_create(procs2, nprocs2, "ctx2", STARPU_SCHED_CTX_POLICY_NAME, "eager", 0);
 

+ 4 - 2
examples/scheduler/dummy_sched.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2016  Université de Bordeaux
+ * Copyright (C) 2010-2017  Université de Bordeaux
  * Copyright (C) 2010-2013, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -122,7 +122,9 @@ static struct starpu_task *pop_task_dummy(unsigned sched_ctx_id)
 		return NULL;
 #endif
 	STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
-	struct starpu_task *task = starpu_task_list_pop_back(&data->sched_list);
+	struct starpu_task *task = NULL;
+	if (!starpu_task_list_empty(&data->sched_list))
+		task = starpu_task_list_pop_back(&data->sched_list);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
 	return task;
 }

+ 6 - 5
examples/spmv/spmv.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2009, 2010, 2011, 2013-2015  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
- * Copyright (C) 2010, 2011, 2012  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -64,15 +64,16 @@ static void csr_filter_func(void *father_interface, void *child_interface, struc
 
 	uint32_t first_index = id*chunk_size - firstentry;
 	uint32_t local_firstentry = rowptr[first_index];
-	
+
 	uint32_t child_nrow = STARPU_MIN(chunk_size, nrow - id*chunk_size);
-	uint32_t local_nnz = rowptr[first_index + child_nrow] - rowptr[first_index]; 
-	
+	uint32_t local_nnz = rowptr[first_index + child_nrow] - rowptr[first_index];
+
+	csr_child->id = csr_father->id;
 	csr_child->nnz = local_nnz;
 	csr_child->nrow = child_nrow;
 	csr_child->firstentry = local_firstentry;
 	csr_child->elemsize = elemsize;
-	
+
 	if (csr_father->nzval)
 	{
 		csr_child->rowptr = &csr_father->rowptr[first_index];

+ 2 - 3
examples/stencil/stencil-blocks.c

@@ -297,11 +297,10 @@ void allocate_memory_on_node(int rank)
 
 		int node = block->mpi_node;
 
-		unsigned size_bz = block_sizes_z[bz];
-
 		/* Main blocks */
 		if (node == rank)
 		{
+			unsigned size_bz = block_sizes_z[bz];
 			allocate_block_on_node(&block->layers_handle[0], bz, &block->layers[0],
 						(sizex + 2*K), (sizey + 2*K), (size_bz + 2*K));
 #ifndef STARPU_SIMGRID
@@ -389,8 +388,8 @@ void check(int rank)
 		/* Main blocks */
 		if (node == rank)
 		{
-			unsigned size_bz = block_sizes_z[bz];
 #ifdef LIFE
+			unsigned size_bz = block_sizes_z[bz];
 			unsigned x, y, z;
 			unsigned sum = 0;
 			for (x = 0; x < sizex; x++)

+ 12 - 27
include/fstarpu_mod.f90

@@ -1,6 +1,6 @@
 ! StarPU --- Runtime system for heterogeneous multicore architectures.
 !
-! Copyright (C) 2016  Inria
+! Copyright (C) 2016-2017  Inria
 !
 ! StarPU is free software; you can redistribute it and/or modify
 ! it under the terms of the GNU Lesser General Public License as published by
@@ -44,6 +44,7 @@ module fstarpu_mod
         type(c_ptr), bind(C) :: FSTARPU_EXECUTE_ON_DATA
         type(c_ptr), bind(C) :: FSTARPU_EXECUTE_ON_WORKER
         type(c_ptr), bind(C) :: FSTARPU_WORKER_ORDER
+        type(c_ptr), bind(C) :: FSTARPU_EXECUTE_WHERE
         type(c_ptr), bind(C) :: FSTARPU_HYPERVISOR_TAG
         type(c_ptr), bind(C) :: FSTARPU_POSSIBLY_PARALLEL
         type(c_ptr), bind(C) :: FSTARPU_FLOPS
@@ -82,6 +83,7 @@ module fstarpu_mod
         type(c_ptr), bind(C) :: FSTARPU_SCC
 
         type(c_ptr), bind(C) :: FSTARPU_CODELET_SIMGRID_EXECUTE
+        type(c_ptr), bind(C) :: FSTARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT
         type(c_ptr), bind(C) :: FSTARPU_CUDA_ASYNC
         type(c_ptr), bind(C) :: FSTARPU_OPENCL_ASYNC
 
@@ -1580,7 +1582,7 @@ module fstarpu_mod
                 end subroutine fstarpu_memchunk_tidy
 
                 ! == starpu_task_util.h ==
-                ! struct starpu_data_handle *fstarpu_data_handle_array_alloc(int nb);
+                ! starpu_data_handle_t *fstarpu_data_handle_array_alloc(int nb);
                 function fstarpu_data_handle_array_alloc (nb) bind(C)
                         use iso_c_binding, only: c_ptr, c_int
                         type(c_ptr) :: fstarpu_data_handle_array_alloc
@@ -1646,17 +1648,17 @@ module fstarpu_mod
 
                 subroutine fstarpu_task_insert(arglist) bind(C)
                         use iso_c_binding, only: c_ptr
-                        type(c_ptr), dimension(:), intent(in) :: arglist
+                        type(c_ptr), dimension(*), intent(in) :: arglist
                 end subroutine fstarpu_task_insert
                 subroutine fstarpu_insert_task(arglist) bind(C,name="fstarpu_task_insert")
                         use iso_c_binding, only: c_ptr
-                        type(c_ptr), dimension(:), intent(in) :: arglist
+                        type(c_ptr), dimension(*), intent(in) :: arglist
                 end subroutine fstarpu_insert_task
 
                 subroutine fstarpu_unpack_arg(cl_arg,bufferlist) bind(C)
                         use iso_c_binding, only: c_ptr
                         type(c_ptr), value, intent(in) :: cl_arg
-                        type(c_ptr), dimension(:), intent(in) :: bufferlist
+                        type(c_ptr), dimension(*), intent(in) :: bufferlist
                 end subroutine fstarpu_unpack_arg
 
                 ! == starpu_sched_ctx.h ==
@@ -1668,7 +1670,7 @@ module fstarpu_mod
                         integer(c_int), intent(in) :: workers_array(*)
                         integer(c_int), value, intent(in) :: nworkers
                         character(c_char), intent(in) :: ctx_name
-                        type(c_ptr), dimension(:), intent(in) :: arglist
+                        type(c_ptr), dimension(*), intent(in) :: arglist
                 end function fstarpu_sched_ctx_create
 
                 ! unsigned starpu_sched_ctx_create_inside_interval(const char *policy_name, const char *sched_ctx_name, int min_ncpus, int max_ncpus, int min_ngpus, int max_ngpus, unsigned allow_overlap);
@@ -2021,26 +2023,6 @@ module fstarpu_mod
                         integer(c_int), value, intent(in) :: sched_ctx_id
                 end subroutine fstarpu_sched_ctx_list_task_counters_reset_all
 
-                ! void starpu_sched_ctx_set_priority(int *workers, int nworkers, unsigned sched_ctx_id, unsigned priority);
-                subroutine fstarpu_sched_ctx_set_priority (workers, nworkers,  sched_ctx_id, priority) &
-                                bind(c,name="starpu_sched_ctx_set_priority")
-                        use iso_c_binding, only: c_int
-                        integer(c_int), intent(in) :: workers(*)
-                        integer(c_int), value, intent(in) :: nworkers
-                        integer(c_int), value, intent(in) :: sched_ctx_id
-                        integer(c_int), value, intent(in) :: priority
-                end subroutine fstarpu_sched_ctx_set_priority
-
-                ! void starpu_sched_ctx_set_priority_on_level(int* workers_to_add, unsigned nworkers_to_add, unsigned sched_ctx, unsigned priority);
-                subroutine fstarpu_sched_ctx_set_priority_on_level ( workers_to_add, nworkers_to_add, sched_ctx, priority) &
-                                bind(c,name="starpu_sched_ctx_set_priority_on_level")
-                        use iso_c_binding, only: c_int
-                        integer(c_int), intent(in) :: workers_to_add(*)
-                        integer(c_int), value, intent(in) :: nworkers_to_add
-                        integer(c_int), value, intent(in) :: sched_ctx
-                        integer(c_int), value, intent(in) :: priority
-                end subroutine fstarpu_sched_ctx_set_priority_on_level
-
                 ! unsigned starpu_sched_ctx_get_priority(int worker, unsigned sched_ctx_id);
                 function fstarpu_sched_ctx_get_priority (worker, sched_ctx_id) &
                                 bind(c,name="starpu_sched_ctx_get_priority")
@@ -2279,6 +2261,7 @@ module fstarpu_mod
                         FSTARPU_EXECUTE_ON_DATA = fstarpu_get_constant(C_CHAR_"FSTARPU_EXECUTE_ON_DATA"//C_NULL_CHAR)
                         FSTARPU_EXECUTE_ON_WORKER       = fstarpu_get_constant(C_CHAR_"FSTARPU_EXECUTE_ON_WORKER"//C_NULL_CHAR)
                         FSTARPU_WORKER_ORDER    = fstarpu_get_constant(C_CHAR_"FSTARPU_WORKER_ORDER"//C_NULL_CHAR)
+                        FSTARPU_EXECUTE_WHERE       = fstarpu_get_constant(C_CHAR_"FSTARPU_EXECUTE_WHERE"//C_NULL_CHAR)
                         FSTARPU_HYPERVISOR_TAG  = fstarpu_get_constant(C_CHAR_"FSTARPU_HYPERVISOR_TAG"//C_NULL_CHAR)
                         FSTARPU_POSSIBLY_PARALLEL       = fstarpu_get_constant(C_CHAR_"FSTARPU_POSSIBLY_PARALLEL"//C_NULL_CHAR)
                         FSTARPU_FLOPS   = fstarpu_get_constant(C_CHAR_"FSTARPU_FLOPS"//C_NULL_CHAR)
@@ -2331,7 +2314,9 @@ module fstarpu_mod
                             fstarpu_get_constant(C_CHAR_"FSTARPU_SCC"//C_NULL_CHAR)
 
                         FSTARPU_CODELET_SIMGRID_EXECUTE = &
-                            fstarpu_get_constant(C_CHAR_"FSTARPU_CODELET_SIMGRID_EXECUTE"//C_NULL_CHAR)
+                             fstarpu_get_constant(C_CHAR_"FSTARPU_CODELET_SIMGRID_EXECUTE"//C_NULL_CHAR)
+                        FSTARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT = &
+                             fstarpu_get_constant(C_CHAR_"FSTARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT"//C_NULL_CHAR)
                         FSTARPU_CUDA_ASYNC = &
                             fstarpu_get_constant(C_CHAR_"FSTARPU_CUDA_ASYNC"//C_NULL_CHAR)
                         FSTARPU_OPENCL_ASYNC = &

+ 9 - 1
include/starpu.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2014, 2016  Université de Bordeaux
+ * Copyright (C) 2009-2014, 2016-2017  Université de Bordeaux
  * Copyright (C) 2010-2015  CNRS
  * Copyright (C) 2014, 2016  INRIA
  *
@@ -30,6 +30,11 @@ typedef unsigned short uint16_t;
 typedef unsigned int uint32_t;
 typedef unsigned long long uint64_t;
 typedef UINT_PTR uintptr_t;
+typedef char int8_t;
+typedef short int16_t;
+typedef int int32_t;
+typedef long long int64_t;
+typedef INT_PTR intptr_t;
 #endif
 
 #include <starpu_config.h>
@@ -53,13 +58,16 @@ typedef UINT_PTR uintptr_t;
 #include <starpu_perfmodel.h>
 #include <starpu_worker.h>
 #include <starpu_task.h>
+#ifndef BUILDING_STARPU
 #include <starpu_task_list.h>
+#endif
 #include <starpu_task_util.h>
 #include <starpu_sched_ctx.h>
 #include <starpu_expert.h>
 #include <starpu_rand.h>
 #include <starpu_cuda.h>
 #include <starpu_cublas.h>
+#include <starpu_cusparse.h>
 #include <starpu_bound.h>
 #include <starpu_hash.h>
 #include <starpu_profiling.h>

+ 3 - 1
include/starpu_config.h.in

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2016  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016, 2017  CNRS
  * Copyright (C) 2014  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -150,5 +150,7 @@ typedef ssize_t starpu_ssize_t;
 #undef STARPU_HAVE_DARWIN
 
 #undef STARPU_HAVE_CXX11
+#undef STARPU_HAVE_STRERROR_R
+#undef STARPU_HAVE_STATEMENT_EXPRESSIONS
 
 #endif

+ 5 - 1
include/starpu_cublas_v2.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012  Université de Bordeaux
+ * Copyright (C) 2010-2012, 2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -18,6 +18,8 @@
 #ifndef __STARPU_CUBLAS_V2_H__
 #define __STARPU_CUBLAS_V2_H__
 
+#if defined STARPU_USE_CUDA && !defined STARPU_DONT_INCLUDE_CUDA_HEADERS
+
 #include <cublas_v2.h>
 
 #ifdef __cplusplus
@@ -31,4 +33,6 @@ cublasHandle_t starpu_cublas_get_local_handle(void);
 }
 #endif
 
+#endif
+
 #endif /* __STARPU_CUBLAS_V2_H__ */

+ 41 - 0
include/starpu_cusparse.h

@@ -0,0 +1,41 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2010-2012, 2017  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013  CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_CUSPARSE_H__
+#define __STARPU_CUSPARSE_H__
+
+#if defined STARPU_USE_CUDA && !defined STARPU_DONT_INCLUDE_CUDA_HEADERS
+#include <cusparse.h>
+#endif
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+void starpu_cusparse_init(void);
+void starpu_cusparse_shutdown(void);
+
+#if defined STARPU_USE_CUDA && !defined STARPU_DONT_INCLUDE_CUDA_HEADERS
+cusparseHandle_t starpu_cusparse_get_local_handle(void);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __STARPU_CUSPARSE_H__ */

+ 2 - 0
include/starpu_data.h

@@ -156,6 +156,8 @@ void starpu_memchunk_tidy(unsigned memory_node);
 void starpu_data_set_user_data(starpu_data_handle_t handle, void* user_data);
 void *starpu_data_get_user_data(starpu_data_handle_t handle);
 
+int starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_handle, int asynchronous, void (*callback_func)(void*), void *callback_arg);
+
 #ifdef __cplusplus
 }
 #endif

+ 57 - 16
include/starpu_data_interfaces.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2016  Université de Bordeaux
- * Copyright (C) 2010-2014  CNRS
+ * Copyright (C) 2010-2014, 2017  CNRS
  * Copyright (C) 2011-2012, 2016  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -173,13 +173,24 @@ uint32_t starpu_matrix_get_local_ld(starpu_data_handle_t handle);
 uintptr_t starpu_matrix_get_local_ptr(starpu_data_handle_t handle);
 size_t starpu_matrix_get_elemsize(starpu_data_handle_t handle);
 
+#if defined(STARPU_HAVE_STATEMENT_EXPRESSIONS) && defined(STARPU_DEBUG)
+#define STARPU_MATRIX_CHECK(interface)          STARPU_ASSERT_MSG((((struct starpu_matrix_interface *)(interface))->id) == STARPU_MATRIX_INTERFACE_ID, "Error. The given data is not a matrix.")
+#define STARPU_MATRIX_GET_PTR(interface)	({ STARPU_MATRIX_CHECK(interface); (((struct starpu_matrix_interface *)(interface))->ptr) ; })
+#define STARPU_MATRIX_GET_DEV_HANDLE(interface)	({ STARPU_MATRIX_CHECK(interface); (((struct starpu_matrix_interface *)(interface))->dev_handle) ; })
+#define STARPU_MATRIX_GET_OFFSET(interface)	({ STARPU_MATRIX_CHECK(interface); (((struct starpu_matrix_interface *)(interface))->offset) ; })
+#define STARPU_MATRIX_GET_NX(interface)	        ({ STARPU_MATRIX_CHECK(interface); (((struct starpu_matrix_interface *)(interface))->nx) ; })
+#define STARPU_MATRIX_GET_NY(interface)	        ({ STARPU_MATRIX_CHECK(interface); (((struct starpu_matrix_interface *)(interface))->ny) ; })
+#define STARPU_MATRIX_GET_LD(interface)	        ({ STARPU_MATRIX_CHECK(interface); (((struct starpu_matrix_interface *)(interface))->ld) ; })
+#define STARPU_MATRIX_GET_ELEMSIZE(interface)	({ STARPU_MATRIX_CHECK(interface); (((struct starpu_matrix_interface *)(interface))->elemsize) ; })
+#else
 #define STARPU_MATRIX_GET_PTR(interface)	(((struct starpu_matrix_interface *)(interface))->ptr)
 #define STARPU_MATRIX_GET_DEV_HANDLE(interface)	(((struct starpu_matrix_interface *)(interface))->dev_handle)
 #define STARPU_MATRIX_GET_OFFSET(interface)	(((struct starpu_matrix_interface *)(interface))->offset)
-#define STARPU_MATRIX_GET_NX(interface)	(((struct starpu_matrix_interface *)(interface))->nx)
-#define STARPU_MATRIX_GET_NY(interface)	(((struct starpu_matrix_interface *)(interface))->ny)
-#define STARPU_MATRIX_GET_LD(interface)	(((struct starpu_matrix_interface *)(interface))->ld)
+#define STARPU_MATRIX_GET_NX(interface)	        (((struct starpu_matrix_interface *)(interface))->nx)
+#define STARPU_MATRIX_GET_NY(interface)	        (((struct starpu_matrix_interface *)(interface))->ny)
+#define STARPU_MATRIX_GET_LD(interface)	        (((struct starpu_matrix_interface *)(interface))->ld)
 #define STARPU_MATRIX_GET_ELEMSIZE(interface)	(((struct starpu_matrix_interface *)(interface))->elemsize)
+#endif
 
 extern struct starpu_data_interface_ops starpu_interface_coo_ops;
 
@@ -248,15 +259,28 @@ uint32_t starpu_block_get_local_ldz(starpu_data_handle_t handle);
 uintptr_t starpu_block_get_local_ptr(starpu_data_handle_t handle);
 size_t starpu_block_get_elemsize(starpu_data_handle_t handle);
 
-#define STARPU_BLOCK_GET_PTR(interface)	(((struct starpu_block_interface *)(interface))->ptr)
+#if defined(STARPU_HAVE_STATEMENT_EXPRESSIONS) && defined(STARPU_DEBUG)
+#define STARPU_BLOCK_CHECK(interface)           STARPU_ASSERT_MSG((((struct starpu_block_interface *)(interface))->id) == STARPU_BLOCK_INTERFACE_ID, "Error. The given data is not a block.")
+#define STARPU_BLOCK_GET_PTR(interface)	        ({ STARPU_BLOCK_CHECK(interface); (((struct starpu_block_interface *)(interface))->ptr) ; })
+#define STARPU_BLOCK_GET_DEV_HANDLE(interface)	({ STARPU_BLOCK_CHECK(interface); (((struct starpu_block_interface *)(interface))->dev_handle) ; })
+#define STARPU_BLOCK_GET_OFFSET(interface)	({ STARPU_BLOCK_CHECK(interface); (((struct starpu_block_interface *)(interface))->offset) ; })
+#define STARPU_BLOCK_GET_NX(interface)	        ({ STARPU_BLOCK_CHECK(interface); (((struct starpu_block_interface *)(interface))->nx) ; })
+#define STARPU_BLOCK_GET_NY(interface)	        ({ STARPU_BLOCK_CHECK(interface); (((struct starpu_block_interface *)(interface))->ny) ; })
+#define STARPU_BLOCK_GET_NZ(interface)	        ({ STARPU_BLOCK_CHECK(interface); (((struct starpu_block_interface *)(interface))->nz) ; })
+#define STARPU_BLOCK_GET_LDY(interface)	        ({ STARPU_BLOCK_CHECK(interface); (((struct starpu_block_interface *)(interface))->ldy) ; })
+#define STARPU_BLOCK_GET_LDZ(interface)	        ({ STARPU_BLOCK_CHECK(interface); (((struct starpu_block_interface *)(interface))->ldz) ; })
+#define STARPU_BLOCK_GET_ELEMSIZE(interface)	({ STARPU_BLOCK_CHECK(interface); (((struct starpu_block_interface *)(interface))->elemsize) ; })
+#else
+#define STARPU_BLOCK_GET_PTR(interface)	        (((struct starpu_block_interface *)(interface))->ptr)
 #define STARPU_BLOCK_GET_DEV_HANDLE(interface)	(((struct starpu_block_interface *)(interface))->dev_handle)
 #define STARPU_BLOCK_GET_OFFSET(interface)	(((struct starpu_block_interface *)(interface))->offset)
-#define STARPU_BLOCK_GET_NX(interface)	(((struct starpu_block_interface *)(interface))->nx)
-#define STARPU_BLOCK_GET_NY(interface)	(((struct starpu_block_interface *)(interface))->ny)
-#define STARPU_BLOCK_GET_NZ(interface)	(((struct starpu_block_interface *)(interface))->nz)
-#define STARPU_BLOCK_GET_LDY(interface)	(((struct starpu_block_interface *)(interface))->ldy)
-#define STARPU_BLOCK_GET_LDZ(interface)	(((struct starpu_block_interface *)(interface))->ldz)
+#define STARPU_BLOCK_GET_NX(interface)	        (((struct starpu_block_interface *)(interface))->nx)
+#define STARPU_BLOCK_GET_NY(interface)	        (((struct starpu_block_interface *)(interface))->ny)
+#define STARPU_BLOCK_GET_NZ(interface)	        (((struct starpu_block_interface *)(interface))->nz)
+#define STARPU_BLOCK_GET_LDY(interface)	        (((struct starpu_block_interface *)(interface))->ldy)
+#define STARPU_BLOCK_GET_LDZ(interface)	        (((struct starpu_block_interface *)(interface))->ldz)
 #define STARPU_BLOCK_GET_ELEMSIZE(interface)	(((struct starpu_block_interface *)(interface))->elemsize)
+#endif
 
 extern struct starpu_data_interface_ops starpu_interface_vector_ops;
 
@@ -279,12 +303,22 @@ uint32_t starpu_vector_get_nx(starpu_data_handle_t handle);
 size_t starpu_vector_get_elemsize(starpu_data_handle_t handle);
 uintptr_t starpu_vector_get_local_ptr(starpu_data_handle_t handle);
 
+#if defined(STARPU_HAVE_STATEMENT_EXPRESSIONS) && defined(STARPU_DEBUG)
+#define STARPU_VECTOR_CHECK(interface)          STARPU_ASSERT_MSG((((struct starpu_vector_interface *)(interface))->id) == STARPU_VECTOR_INTERFACE_ID, "Error. The given data is not a vector.")
+#define STARPU_VECTOR_GET_PTR(interface)	({ STARPU_VECTOR_CHECK(interface); (((struct starpu_vector_interface *)(interface))->ptr); })
+#define STARPU_VECTOR_GET_DEV_HANDLE(interface)	({ STARPU_VECTOR_CHECK(interface); (((struct starpu_vector_interface *)(interface))->dev_handle); })
+#define STARPU_VECTOR_GET_OFFSET(interface)	({ STARPU_VECTOR_CHECK(interface); (((struct starpu_vector_interface *)(interface))->offset); })
+#define STARPU_VECTOR_GET_NX(interface)	        ({ STARPU_VECTOR_CHECK(interface); (((struct starpu_vector_interface *)(interface))->nx); })
+#define STARPU_VECTOR_GET_ELEMSIZE(interface)	({ STARPU_VECTOR_CHECK(interface); (((struct starpu_vector_interface *)(interface))->elemsize); })
+#define STARPU_VECTOR_GET_SLICE_BASE(interface)	({ STARPU_VECTOR_CHECK(interface); (((struct starpu_vector_interface *)(interface))->slice_base); })
+#else
 #define STARPU_VECTOR_GET_PTR(interface)	(((struct starpu_vector_interface *)(interface))->ptr)
 #define STARPU_VECTOR_GET_DEV_HANDLE(interface)	(((struct starpu_vector_interface *)(interface))->dev_handle)
 #define STARPU_VECTOR_GET_OFFSET(interface)	(((struct starpu_vector_interface *)(interface))->offset)
-#define STARPU_VECTOR_GET_NX(interface)	(((struct starpu_vector_interface *)(interface))->nx)
+#define STARPU_VECTOR_GET_NX(interface)	        (((struct starpu_vector_interface *)(interface))->nx)
 #define STARPU_VECTOR_GET_ELEMSIZE(interface)	(((struct starpu_vector_interface *)(interface))->elemsize)
 #define STARPU_VECTOR_GET_SLICE_BASE(interface)	(((struct starpu_vector_interface *)(interface))->slice_base)
+#endif
 
 extern struct starpu_data_interface_ops starpu_interface_variable_ops;
 
@@ -303,11 +337,18 @@ void starpu_variable_ptr_register(starpu_data_handle_t handle, unsigned node, ui
 size_t starpu_variable_get_elemsize(starpu_data_handle_t handle);
 uintptr_t starpu_variable_get_local_ptr(starpu_data_handle_t handle);
 
-#define STARPU_VARIABLE_GET_PTR(interface)	(((struct starpu_variable_interface *)(interface))->ptr)
-#define STARPU_VARIABLE_GET_OFFSET(interface)	(((struct starpu_variable_interface *)(interface))->offset)
-#define STARPU_VARIABLE_GET_ELEMSIZE(interface)	(((struct starpu_variable_interface *)(interface))->elemsize)
-#define STARPU_VARIABLE_GET_DEV_HANDLE(interface) \
-	(((struct starpu_variable_interface *)(interface))->ptr)
+#if defined(STARPU_HAVE_STATEMENT_EXPRESSIONS) && defined(STARPU_DEBUG)
+#define STARPU_VARIABLE_CHECK(interface)          STARPU_ASSERT_MSG((((struct starpu_variable_interface *)(interface))->id) == STARPU_VARIABLE_INTERFACE_ID, "Error. The given data is not a variable.")
+#define STARPU_VARIABLE_GET_PTR(interface)	  ({ STARPU_VARIABLE_CHECK(interface); (((struct starpu_variable_interface *)(interface))->ptr) ; })
+#define STARPU_VARIABLE_GET_OFFSET(interface)	  ({ STARPU_VARIABLE_CHECK(interface); (((struct starpu_variable_interface *)(interface))->offset) ; })
+#define STARPU_VARIABLE_GET_ELEMSIZE(interface)	  ({ STARPU_VARIABLE_CHECK(interface); (((struct starpu_variable_interface *)(interface))->elemsize) ; })
+#define STARPU_VARIABLE_GET_DEV_HANDLE(interface) ({ STARPU_VARIABLE_CHECK(interface); (((struct starpu_variable_interface *)(interface))->ptr) ; })
+#else
+#define STARPU_VARIABLE_GET_PTR(interface)	  (((struct starpu_variable_interface *)(interface))->ptr)
+#define STARPU_VARIABLE_GET_OFFSET(interface)	  (((struct starpu_variable_interface *)(interface))->offset)
+#define STARPU_VARIABLE_GET_ELEMSIZE(interface)	  (((struct starpu_variable_interface *)(interface))->elemsize)
+#define STARPU_VARIABLE_GET_DEV_HANDLE(interface) (((struct starpu_variable_interface *)(interface))->ptr)
+#endif
 
 extern struct starpu_data_interface_ops starpu_interface_void_ops;
 

+ 2 - 0
include/starpu_disk.h

@@ -67,6 +67,8 @@ void *starpu_disk_open(unsigned node, void *pos, size_t size);
 
 int starpu_disk_register(struct starpu_disk_ops *func, void *parameter, starpu_ssize_t size);
 
+#define STARPU_DISK_SIZE_MIN (64*1024*1024)
+
 extern int starpu_disk_swap_node;
 
 #endif /* __STARPU_DISK_H__ */

+ 2 - 0
include/starpu_fxt.h

@@ -40,8 +40,10 @@ struct starpu_fxt_codelet_event
 struct starpu_fxt_options
 {
 	unsigned per_task_colour;
+	unsigned no_events;
 	unsigned no_counter;
 	unsigned no_bus;
+	unsigned no_flops;
 	unsigned ninputfiles;
 	unsigned no_smooth;
 	char *filenames[STARPU_FXT_MAX_FILES];

+ 2 - 1
include/starpu_perfmodel.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2014, 2016  Université de Bordeaux
+ * Copyright (C) 2010-2014, 2016-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2016  Inria
@@ -25,6 +25,7 @@
 
 #include <starpu_util.h>
 #include <starpu_worker.h>
+#include <starpu_task.h>
 
 #ifdef __cplusplus
 extern "C"

+ 19 - 0
include/starpu_sched_component.h

@@ -2,6 +2,7 @@
  *
  * Copyright (C) 2013  Simon Archipoff
  * Copyright (C) 2014  CNRS
+ * Copyright (C) 2017  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -92,6 +93,9 @@ int starpu_sched_tree_push_task(struct starpu_task *task);
 int starpu_sched_component_push_task(struct starpu_sched_component *from, struct starpu_sched_component *to, struct starpu_task *task);
 struct starpu_task *starpu_sched_tree_pop_task(unsigned sched_ctx);
 struct starpu_task *starpu_sched_component_pull_task(struct starpu_sched_component *from, struct starpu_sched_component *to);
+struct starpu_task* starpu_sched_component_pump_downstream(struct starpu_sched_component *component, int* success);
+void starpu_sched_component_send_can_push_to_parents(struct starpu_sched_component * component);
+
 void starpu_sched_tree_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers);
 void starpu_sched_tree_remove_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers);
 
@@ -107,6 +111,7 @@ void starpu_sched_component_prefetch_on_node(struct starpu_sched_component *comp
 void starpu_sched_component_connect(struct starpu_sched_component *parent, struct starpu_sched_component *child);
 
 struct starpu_sched_component *starpu_sched_component_worker_get(unsigned sched_ctx, int workerid);
+struct starpu_sched_component *starpu_sched_component_worker_new(unsigned sched_ctx, int workerid);
 int starpu_sched_component_worker_get_workerid(struct starpu_sched_component *worker_component);
 int starpu_sched_component_is_worker(struct starpu_sched_component *component);
 int starpu_sched_component_is_simple_worker(struct starpu_sched_component *component);
@@ -196,6 +201,20 @@ struct starpu_sched_component_specs
 struct starpu_sched_tree *starpu_sched_component_make_scheduler(unsigned sched_ctx_id, struct starpu_sched_component_specs s);
 #endif /* STARPU_HAVE_HWLOC */
 
+#define STARPU_COMPONENT_MUTEX_LOCK(m) \
+do \
+{ \
+	const int _relaxed_state = _starpu_worker_get_relax_state(); \
+	if (!_relaxed_state) \
+		_starpu_worker_relax_on(); \
+	STARPU_PTHREAD_MUTEX_LOCK((m)); \
+	if (!_relaxed_state) \
+		_starpu_worker_relax_off(); \
+} \
+while(0)
+
+#define STARPU_COMPONENT_MUTEX_UNLOCK(m) STARPU_PTHREAD_MUTEX_UNLOCK((m))
+
 #ifdef __cplusplus
 }
 #endif

+ 7 - 9
include/starpu_sched_ctx.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010 - 2012  INRIA
+ * Copyright (C) 2010 - 2012, 2017  INRIA
  * Copyright (C) 2016  Uppsala University
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -43,9 +43,9 @@ unsigned starpu_sched_ctx_create_inside_interval(const char *policy_name, const
 
 void starpu_sched_ctx_register_close_callback(unsigned sched_ctx_id, void (*close_callback)(unsigned sched_ctx_id, void* args), void *args);
 
-void starpu_sched_ctx_add_workers(int *workerids_ctx, int nworkers_ctx, unsigned sched_ctx_id);
+void starpu_sched_ctx_add_workers(int *workerids_ctx, unsigned nworkers_ctx, unsigned sched_ctx_id);
 
-void starpu_sched_ctx_remove_workers(int *workerids_ctx, int nworkers_ctx, unsigned sched_ctx_id);
+void starpu_sched_ctx_remove_workers(int *workerids_ctx, unsigned nworkers_ctx, unsigned sched_ctx_id);
 
 void starpu_sched_ctx_display_workers(unsigned sched_ctx_id, FILE *f);
 
@@ -133,16 +133,14 @@ void starpu_sched_ctx_list_task_counters_decrement(unsigned sched_ctx_id, int wo
 
 void starpu_sched_ctx_list_task_counters_reset(unsigned sched_ctx_id, int workerid);
 
-void starpu_sched_ctx_list_task_counters_increment_all(struct starpu_task *task, unsigned sched_ctx_id);
+void starpu_sched_ctx_list_task_counters_increment_all_ctx_locked(struct starpu_task *task, unsigned sched_ctx_id);
 
-void starpu_sched_ctx_list_task_counters_decrement_all(struct starpu_task *task, unsigned sched_ctx_id);
+void starpu_sched_ctx_list_task_counters_decrement_all_ctx_locked(struct starpu_task *task, unsigned sched_ctx_id);
 
 void starpu_sched_ctx_list_task_counters_reset_all(struct starpu_task *task, unsigned sched_ctx_id);
 
 void starpu_sched_ctx_set_priority(int *workers, int nworkers, unsigned sched_ctx_id, unsigned priority);
 
-void starpu_sched_ctx_set_priority_on_level(int* workers_to_add, unsigned nworkers_to_add, unsigned sched_ctx, unsigned priority);
-
 unsigned starpu_sched_ctx_get_priority(int worker, unsigned sched_ctx_id);
 
 void starpu_sched_ctx_get_available_cpuids(unsigned sched_ctx_id, int **cpuids, int *ncpuids);
@@ -160,9 +158,9 @@ unsigned starpu_sched_ctx_worker_is_master_for_child_ctx(int workerid, unsigned
 /* If not, returns STARPU_NMAX_SCHED_CTXS. */
 unsigned starpu_sched_ctx_master_get_context(int masterid);
 
-void starpu_sched_ctx_revert_task_counters(unsigned sched_ctx_id, double flops);
+void starpu_sched_ctx_revert_task_counters_ctx_locked(unsigned sched_ctx_id, double flops);
 
-void starpu_sched_ctx_move_task_to_ctx(struct starpu_task *task, unsigned sched_ctx, unsigned manage_mutex, unsigned with_repush);
+void starpu_sched_ctx_move_task_to_ctx_locked(struct starpu_task *task, unsigned sched_ctx, unsigned with_repush);
 
 int starpu_sched_ctx_get_worker_rank(unsigned sched_ctx_id);
 

+ 5 - 6
include/starpu_scheduler.h

@@ -1,8 +1,9 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2016  Université de Bordeaux
+ * Copyright (C) 2010-2017  Université de Bordeaux
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2016  Uppsala University
+ * Copyright (C) 2017  Inria
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -57,15 +58,13 @@ struct starpu_sched_policy
 struct starpu_sched_policy **starpu_sched_get_predefined_policies();
 
 void starpu_worker_get_sched_condition(int workerid, starpu_pthread_mutex_t **sched_mutex, starpu_pthread_cond_t **sched_cond);
+unsigned long starpu_task_get_job_id(struct starpu_task *task);
 
 /* This function must be called to wake up a worker that is sleeping on the cond. 
- * It returns 0 whenever the worker is not in a sleeping state */
-int starpu_wake_worker(int workerid);
-int starpu_wakeup_worker(int workerid, starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex);
+ * It returns 0 whenever the worker is not in a sleeping state or has the state_keep_awake flag on */
+int starpu_wake_worker_no_relax(int workerid);
 /* This is a version of starpu_wake_worker which assumes that the sched mutex is locked */
 int starpu_wake_worker_locked(int workerid);
-/* This is a version of starpu_wakeup_worker which assumes that the sched mutex is locked */
-int starpu_wakeup_worker_locked(int workerid, starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex);
 
 int starpu_worker_can_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
 int starpu_worker_can_execute_task_impl(unsigned workerid, struct starpu_task *task, unsigned *impl_mask);

+ 6 - 4
include/starpu_task.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2017  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011, 2014, 2016  INRIA
  * Copyright (C) 2016  Uppsala University
@@ -46,6 +46,7 @@ extern "C"
 #define STARPU_MPI_MS	((1ULL)<<9)
 
 #define STARPU_CODELET_SIMGRID_EXECUTE	(1<<0)
+#define STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT	(1<<1)
 #define STARPU_CUDA_ASYNC	(1<<0)
 #define STARPU_OPENCL_ASYNC	(1<<0)
 
@@ -135,6 +136,7 @@ struct starpu_task
 	const char *name;
 
 	struct starpu_codelet *cl;
+	int32_t where;
 
 	int nbuffers;
 
@@ -180,13 +182,13 @@ struct starpu_task
 	unsigned destroy:1;
 	unsigned regenerate:1;
 
-	unsigned workerid;
-	unsigned workerorder;
-
 	unsigned scheduled:1;
 
 	unsigned int mf_skip:1;
 
+	unsigned workerid;
+	unsigned workerorder;
+
 	int priority;
 
 	enum starpu_task_status status;

+ 33 - 125
include/starpu_task_list.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012, 2016  Université de Bordeaux
+ * Copyright (C) 2010-2012, 2016-2017  Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -25,148 +25,56 @@ extern "C"
 {
 #endif
 
+	/* NOTE: this needs to have at least the same size as lists in src/common/list.h */
+#ifdef BUILDING_STARPU
+#define STARPU_TASK_LIST_INLINE extern inline
+#else
 struct starpu_task_list
 {
 	struct starpu_task *head;
 	struct starpu_task *tail;
 };
+#define STARPU_TASK_LIST_INLINE extern
+#endif
 
-static STARPU_INLINE
-void starpu_task_list_init(struct starpu_task_list *list)
-{
-	list->head = NULL;
-	list->tail = NULL;
-}
-
-static STARPU_INLINE
-void starpu_task_list_push_front(struct starpu_task_list *list, struct starpu_task *task)
-{
-	if (list->tail == NULL)
-	{
-		list->tail = task;
-	}
-	else
-	{
-		list->head->prev = task;
-	}
-
-	task->prev = NULL;
-	task->next = list->head;
-	list->head = task;
-}
-
-static STARPU_INLINE
-void starpu_task_list_push_back(struct starpu_task_list *list, struct starpu_task *task)
-{
-	if (list->head == NULL)
-	{
-		list->head = task;
-	}
-	else
-	{
-		list->tail->next = task;
-	}
-
-	task->next = NULL;
-	task->prev = list->tail;
-	list->tail = task;
-}
-
-static STARPU_INLINE
-struct starpu_task *starpu_task_list_front(struct starpu_task_list *list)
-{
-	return list->head;
-}
-
-static STARPU_INLINE
-struct starpu_task *starpu_task_list_back(struct starpu_task_list *list)
-{
-	return list->tail;
-}
-
-static STARPU_INLINE
-int starpu_task_list_empty(struct starpu_task_list *list)
-{
-	return (list->head == NULL);
-}
+STARPU_TASK_LIST_INLINE
+void starpu_task_list_init(struct starpu_task_list *list);
 
-static STARPU_INLINE
-void starpu_task_list_erase(struct starpu_task_list *list, struct starpu_task *task)
-{
-	struct starpu_task *p = task->prev;
-
-	if (p)
-	{
-		p->next = task->next;
-	}
-	else
-	{
-		list->head = task->next;
-	}
-
-	if (task->next)
-	{
-		task->next->prev = p;
-	}
-	else
-	{
-		list->tail = p;
-	}
-
-	task->prev = NULL;
-	task->next = NULL;
-}
+STARPU_TASK_LIST_INLINE
+void starpu_task_list_push_front(struct starpu_task_list *list, struct starpu_task *task);
 
-static STARPU_INLINE
-struct starpu_task *starpu_task_list_pop_front(struct starpu_task_list *list)
-{
-	struct starpu_task *task = list->head;
+STARPU_TASK_LIST_INLINE
+void starpu_task_list_push_back(struct starpu_task_list *list, struct starpu_task *task);
 
-	if (task)
-		starpu_task_list_erase(list, task);
+STARPU_TASK_LIST_INLINE
+struct starpu_task *starpu_task_list_front(const struct starpu_task_list *list);
 
-	return task;
-}
+STARPU_TASK_LIST_INLINE
+struct starpu_task *starpu_task_list_back(const struct starpu_task_list *list);
 
-static STARPU_INLINE
-struct starpu_task *starpu_task_list_pop_back(struct starpu_task_list *list)
-{
-	struct starpu_task *task = list->tail;
+STARPU_TASK_LIST_INLINE
+int starpu_task_list_empty(const struct starpu_task_list *list);
 
-	if (task)
-		starpu_task_list_erase(list, task);
+STARPU_TASK_LIST_INLINE
+void starpu_task_list_erase(struct starpu_task_list *list, struct starpu_task *task);
 
-	return task;
-}
+STARPU_TASK_LIST_INLINE
+struct starpu_task *starpu_task_list_pop_front(struct starpu_task_list *list);
 
-static STARPU_INLINE
-struct starpu_task *starpu_task_list_begin(struct starpu_task_list *list)
-{
-	return list->head;
-}
+STARPU_TASK_LIST_INLINE
+struct starpu_task *starpu_task_list_pop_back(struct starpu_task_list *list);
 
-static STARPU_INLINE
-struct starpu_task *starpu_task_list_end(struct starpu_task_list *list STARPU_ATTRIBUTE_UNUSED)
-{
-	return NULL;
-}
+STARPU_TASK_LIST_INLINE
+struct starpu_task *starpu_task_list_begin(const struct starpu_task_list *list);
 
-static STARPU_INLINE
-struct starpu_task *starpu_task_list_next(struct starpu_task *task)
-{
-	return task->next;
-}
+STARPU_TASK_LIST_INLINE
+struct starpu_task *starpu_task_list_end(const struct starpu_task_list *list STARPU_ATTRIBUTE_UNUSED);
 
-static STARPU_INLINE
-int starpu_task_list_ismember(struct starpu_task_list *list, struct starpu_task *look)
-{
-	struct starpu_task *task;
+STARPU_TASK_LIST_INLINE
+struct starpu_task *starpu_task_list_next(const struct starpu_task *task);
 
-	for (task  = list->head; task != NULL; task  = task->next)
-		if (task == look)
-			return 1;
-	return 0;
-}
+STARPU_TASK_LIST_INLINE
+int starpu_task_list_ismember(const struct starpu_task_list *list, const struct starpu_task *look);
 
 #ifdef __cplusplus
 }

+ 10 - 9
include/starpu_task_util.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010-2015  Université de Bordeaux
- * Copyright (C) 2010-2014, 2016  CNRS
+ * Copyright (C) 2010-2014, 2016, 2017  CNRS
  * Copyright (C) 2014       INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -34,7 +34,7 @@ void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t
 
 /* NOTE: when adding a value here, please make sure to update both
  * src/util/starpu_task_insert_utils.c (in two places) and
- * mpi/src/starpu_mpi_task_insert.c */
+ * mpi/src/starpu_mpi_task_insert.c and mpi/src/starpu_mpi_task_insert_fortran.c */
 #define STARPU_MODE_SHIFT	17
 #define STARPU_VALUE		 (1<<STARPU_MODE_SHIFT)
 #define STARPU_CALLBACK		 (2<<STARPU_MODE_SHIFT)
@@ -54,13 +54,14 @@ void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t
 #define STARPU_PROLOGUE_CALLBACK_POP   (16<<STARPU_MODE_SHIFT)
 #define STARPU_PROLOGUE_CALLBACK_POP_ARG (17<<STARPU_MODE_SHIFT)
 #define STARPU_EXECUTE_ON_WORKER (18<<STARPU_MODE_SHIFT)
-#define STARPU_TAG_ONLY          (19<<STARPU_MODE_SHIFT)
-#define STARPU_POSSIBLY_PARALLEL    (20<<STARPU_MODE_SHIFT)
-#define STARPU_WORKER_ORDER      (21<<STARPU_MODE_SHIFT)
-#define STARPU_NODE_SELECTION_POLICY (22<<STARPU_MODE_SHIFT)
-#define STARPU_NAME		 (23<<STARPU_MODE_SHIFT)
-#define STARPU_CL_ARGS		(24<<STARPU_MODE_SHIFT)
-#define STARPU_SHIFTED_MODE_MAX (25<<STARPU_MODE_SHIFT)
+#define STARPU_EXECUTE_WHERE     (19<<STARPU_MODE_SHIFT)
+#define STARPU_TAG_ONLY          (20<<STARPU_MODE_SHIFT)
+#define STARPU_POSSIBLY_PARALLEL    (21<<STARPU_MODE_SHIFT)
+#define STARPU_WORKER_ORDER      (22<<STARPU_MODE_SHIFT)
+#define STARPU_NODE_SELECTION_POLICY (23<<STARPU_MODE_SHIFT)
+#define STARPU_NAME		 (24<<STARPU_MODE_SHIFT)
+#define STARPU_CL_ARGS		(25<<STARPU_MODE_SHIFT)
+#define STARPU_SHIFTED_MODE_MAX (26<<STARPU_MODE_SHIFT)
 
 struct starpu_task *starpu_task_build(struct starpu_codelet *cl, ...);
 int starpu_task_insert(struct starpu_codelet *cl, ...);

+ 1 - 0
include/starpu_thread_util.h

@@ -19,6 +19,7 @@
 #define __STARPU_THREAD_UTIL_H__
 
 #include <starpu_util.h>
+#include <starpu_thread.h>
 #include <errno.h>
 
 #if !(defined(_MSC_VER) && !defined(BUILDING_STARPU))

+ 15 - 10
include/starpu_util.h

@@ -20,6 +20,7 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include <stdint.h>
 #include <string.h>
 #include <assert.h>
 
@@ -29,6 +30,10 @@
 #include <execinfo.h>
 #endif
 
+#ifdef STARPU_SIMGRID_MC
+#include <simgrid/modelchecker.h>
+#endif
+
 #ifdef __cplusplus
 extern "C"
 {
@@ -111,17 +116,23 @@ extern "C"
 #  define STARPU_DUMP_BACKTRACE() do { } while (0)
 #endif
 
+#ifdef STARPU_SIMGRID_MC
+#define STARPU_SIMGRID_ASSERT(x) MC_assert(!!(x))
+#else
+#define STARPU_SIMGRID_ASSERT(x)
+#endif
+
 #ifdef STARPU_NO_ASSERT
 #define STARPU_ASSERT(x)		do { if (0) { (void) (x); } } while(0)
 #define STARPU_ASSERT_ACCESSIBLE(x)	do { if (0) { (void) (x); } } while(0)
 #define STARPU_ASSERT_MSG(x, msg, ...)	do { if (0) { (void) (x); (void) msg; } } while(0)
 #else
 #  if defined(__CUDACC__) || defined(STARPU_HAVE_WINDOWS)
-#    define STARPU_ASSERT(x)		do { if (STARPU_UNLIKELY(!(x))) { STARPU_DUMP_BACKTRACE(); *(int*)NULL = 0; } } while(0)
-#    define STARPU_ASSERT_MSG(x, msg, ...)	do { if (STARPU_UNLIKELY(!(x))) { STARPU_DUMP_BACKTRACE(); fprintf(stderr, "\n[starpu][%s][assert failure] " msg "\n\n", __starpu_func__, ## __VA_ARGS__); *(int*)NULL = 0; }} while(0)
+#    define STARPU_ASSERT(x)		do { if (STARPU_UNLIKELY(!(x))) { STARPU_DUMP_BACKTRACE(); STARPU_SIMGRID_ASSERT(x); *(int*)NULL = 0; } } while(0)
+#    define STARPU_ASSERT_MSG(x, msg, ...)	do { if (STARPU_UNLIKELY(!(x))) { STARPU_DUMP_BACKTRACE(); fprintf(stderr, "\n[starpu][%s][assert failure] " msg "\n\n", __starpu_func__, ## __VA_ARGS__); STARPU_SIMGRID_ASSERT(x); *(int*)NULL = 0; }} while(0)
 #  else
-#    define STARPU_ASSERT(x)		do { if (STARPU_UNLIKELY(!(x))) { STARPU_DUMP_BACKTRACE(); assert(x); } } while (0)
-#    define STARPU_ASSERT_MSG(x, msg, ...)	do { if (STARPU_UNLIKELY(!(x))) { STARPU_DUMP_BACKTRACE(); fprintf(stderr, "\n[starpu][%s][assert failure] " msg "\n\n", __starpu_func__, ## __VA_ARGS__); assert(x); } } while(0)
+#    define STARPU_ASSERT(x)		do { if (STARPU_UNLIKELY(!(x))) { STARPU_DUMP_BACKTRACE(); STARPU_SIMGRID_ASSERT(x); assert(x); } } while (0)
+#    define STARPU_ASSERT_MSG(x, msg, ...)	do { if (STARPU_UNLIKELY(!(x))) { STARPU_DUMP_BACKTRACE(); fprintf(stderr, "\n[starpu][%s][assert failure] " msg "\n\n", __starpu_func__, ## __VA_ARGS__); STARPU_SIMGRID_ASSERT(x); assert(x); } } while(0)
 
 #  endif
 #  define STARPU_ASSERT_ACCESSIBLE(ptr)	do { \
@@ -321,10 +332,6 @@ STARPU_ATOMIC_SOMETHINGL(or, old | value)
 }
 #endif
 
-/* Include this only here so that <starpu_data_interfaces.h> can use the
- * macros above.  */
-#include <starpu_task.h>
-
 #ifdef __cplusplus
 extern "C"
 {
@@ -406,8 +413,6 @@ void starpu_execute_on_each_worker_ex(void (*func)(void *), void *arg, uint32_t
 
 void starpu_execute_on_specific_workers(void (*func)(void*), void *arg, unsigned num_workers, unsigned *workers, const char *name);
 
-int starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_handle, int asynchronous, void (*callback_func)(void*), void *callback_arg);
-
 double starpu_timing_now(void);
 
 #ifdef _WIN32

+ 24 - 2
include/starpu_worker.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2013, 2016  Université de Bordeaux
+ * Copyright (C) 2009-2013, 2016-2017  Université de Bordeaux
  * Copyright (C) 2010-2014, 2017  CNRS
  * Copyright (C) 2016, 2017  INRIA
  * Copyright (C) 2016  Uppsala University
@@ -23,6 +23,7 @@
 #include <stdlib.h>
 #include <starpu_config.h>
 #include <starpu_thread.h>
+#include <starpu_task.h>
 
 #ifdef __cplusplus
 extern "C"
@@ -127,7 +128,7 @@ struct starpu_tree* starpu_workers_get_tree(void);
 
 unsigned starpu_worker_get_sched_ctx_list(int worker, unsigned **sched_ctx);
 
-unsigned starpu_worker_is_blocked(int workerid);
+unsigned starpu_worker_is_blocked_in_parallel(int workerid);
 
 unsigned starpu_worker_is_slave_somewhere(int workerid);
 
@@ -140,6 +141,27 @@ int starpu_worker_get_devids(enum starpu_worker_archtype type, int *devids, int
 int starpu_worker_get_stream_workerids(unsigned devid, int *workerids, enum starpu_worker_archtype type);
 
 unsigned starpu_worker_get_sched_ctx_id_stream(unsigned stream_workerid);
+
+int starpu_worker_sched_op_pending(void);
+
+void starpu_worker_relax_on(void);
+
+void starpu_worker_relax_off(void);
+
+int starpu_worker_get_relax_state(void);
+
+void starpu_worker_lock(int workerid);
+
+int starpu_worker_trylock(int workerid);
+
+void starpu_worker_unlock(int workerid);
+
+void starpu_worker_lock_self(void);
+
+void starpu_worker_unlock_self(void);
+
+int starpu_wake_worker_relax(int workerid);
+
 #ifdef __cplusplus
 }
 #endif

+ 109 - 0
mpi/dev/starpu_mpi_comm_check.sh

@@ -0,0 +1,109 @@
+#!/bin/bash
+
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2017 CNRS
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+# Script to check MPI communications are done properly
+# The application should be launched with STARPU_MPI_COMM=1
+# e.g
+#    $ export STARPU_MPI_COMM=1
+#    $ mpirun --output-filename starpu_mpi.log appli parameters
+# and then the script can be launched with the output files
+#    $ starpu_mpi_comm_check.sh starpu_mpi.log.*
+
+if test -z "$1"
+then
+    echo Syntax error: parameter missing
+    exit 1
+fi
+
+# Get the nodes identifiers
+nodes=$(for f in $*
+	do
+	    grep starpu_mpi $f | grep '\[' | awk '{print $1}'| sed 's/\[\(.*\)\]\[starpu_mpi\]/\1/' | grep "^[[:digit:]]*$"
+	done |sort|uniq
+     )
+echo nodes $nodes
+
+DIR=/tmp
+
+# for each node, extract send and receive communications
+for node in $nodes
+do
+    for f in $*
+    do
+	grep starpu_mpi $f |grep "\[$node"
+    done > $DIR/starpu_mpi_node$node.log
+    grep -- "-->" $DIR/starpu_mpi_node$node.log > $DIR/starpu_mpi_node${node}_send.log
+    grep -- "<--" $DIR/starpu_mpi_node$node.log > $DIR/starpu_mpi_node${node}_recv.log
+done
+
+# count the number of traced lines
+#for node in $nodes
+#do
+#    wc -l $DIR/starpu_mpi_node${node}_recv.log
+#    lines=$(grep :42:42 $DIR/starpu_mpi_node${node}_recv.log | wc -l)
+#    lines2=$(( lines + lines ))
+#    echo $lines2
+#    lines3=$(( lines2 + lines ))
+#    echo $lines3
+#done
+
+# for each pair of nodes, check tags are sent and received in the same order
+for src in $nodes
+do
+    for dst in $nodes
+    do
+	if test $src != $dst
+	then
+	    grep ":$dst:42:" $DIR/starpu_mpi_node${src}_send.log| awk -F':' '{print $6}' > $DIR/node${src}_send_to_${dst}.log
+	    grep ":$src:42:" $DIR/starpu_mpi_node${dst}_recv.log|awk -F ':' '{print $6}'> $DIR/node${dst}_recv_from_${src}.log
+ 	    diff --side-by-side  --suppress-common-lines $DIR/node${src}_send_to_${dst}.log $DIR/node${dst}_recv_from_${src}.log  > $DIR/check_$$
+	    if test -s $DIR/check_$$
+	    then
+		echo $src $dst
+		less $DIR/check_$$
+	    fi
+	fi
+    done
+done
+
+# check each envelope reception is followed by the appropriate data reception
+# first line: MPI_Recv of the envelope
+# second line: display envelope information
+# third line: MPI_Recv of the data
+for node in $nodes
+do
+    echo processing $DIR/starpu_mpi_node${node}_recv.log
+    (
+	while read line
+	do
+	    read line2
+	    read line3
+	    #echo processing
+	    tag2=$(echo $line2 | awk -F ':' '{print $6}')
+	    tag3=$(echo $line3 | awk -F ':' '{print $6}')
+	    if test "$tag2" != "$tag3"
+	    then
+		echo erreur
+		echo $tag2 $tag3
+		echo $line
+		echo $line2
+		echo $line3
+	    fi
+	done
+    ) < $DIR/starpu_mpi_node${node}_recv.log
+done
+

+ 26 - 3
mpi/examples/Makefile.am

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009-2013, 2015-2016  Université de Bordeaux
+# Copyright (C) 2009-2013, 2015-2017  Université de Bordeaux
 # Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
 # Copyright (C) 2016  Inria
 #
@@ -17,6 +17,15 @@
 
 include $(top_srcdir)/starpu.mk
 
+if STARPU_SIMGRID
+STARPU_PERF_MODEL_DIR=$(abs_top_srcdir)/tools/perfmodels/sampling
+STARPU_HOSTNAME=mirage
+MALLOC_PERTURB_=0
+export STARPU_PERF_MODEL_DIR
+export STARPU_HOSTNAME
+export MALLOC_PERTURB_
+endif
+
 CC=$(MPICC)
 CCLD=$(MPICC)
 FC=$(MPIFORT)
@@ -26,17 +35,23 @@ if STARPU_HAVE_WINDOWS
 LOADER_BIN		=
 else
 loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
+if !STARPU_SIMGRID
 LOADER			=	loader
 LOADER_BIN		=	$(abs_top_builddir)/mpi/examples/$(LOADER)
+endif
 loader_SOURCES		=	../../tests/loader.c
 endif
 
+if STARPU_SIMGRID
+MPI			=	$(abs_top_builddir)/tools/starpu_smpirun -np 4 -platform $(abs_top_srcdir)/tools/perfmodels/cluster.xml -hostfile $(abs_top_srcdir)/tools/perfmodels/hostfile
+else
 # we always test on 4 processes, the execution time is not that bigger
 if STARPU_QUICK_CHECK
 MPI			=	$(MPIEXEC) $(MPIEXEC_ARGS) -np 4
 else
 MPI			=	$(MPIEXEC) $(MPIEXEC_ARGS) -np 4
 endif
+endif
 
 if STARPU_HAVE_AM111
 TESTS_ENVIRONMENT	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
@@ -45,11 +60,9 @@ else
 TESTS_ENVIRONMENT 	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
 endif
 
-if !STARPU_SIMGRID
 if STARPU_MPI_CHECK
 TESTS			=	$(starpu_mpi_EXAMPLES)
 endif
-endif
 
 check_PROGRAMS = $(LOADER) $(starpu_mpi_EXAMPLES)
 starpu_mpi_EXAMPLES =
@@ -248,11 +261,13 @@ matrix_decomposition_mpi_cholesky_distributed_LDADD =	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	$(STARPU_BLAS_LDFLAGS) -lm
 
+if !STARPU_SIMGRID
 starpu_mpi_EXAMPLES +=				\
 	matrix_decomposition/mpi_cholesky			\
 	matrix_decomposition/mpi_cholesky_distributed
 endif
 endif
+endif
 
 ########################
 # MPI Matrix mult example #
@@ -269,9 +284,11 @@ matrix_mult_mm_LDADD =			\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	-lm
 
+if !STARPU_SIMGRID
 starpu_mpi_EXAMPLES +=				\
 	matrix_mult/mm
 endif
+endif
 
 ##########################################
 # Native Fortran MPI Matrix mult example #
@@ -303,12 +320,14 @@ native_fortran_nf_basic_ring_LDADD =					\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
 	-lm
 
+if !STARPU_SIMGRID
 starpu_mpi_EXAMPLES +=				\
 	native_fortran/nf_mm			\
 	native_fortran/nf_basic_ring
 endif
 endif
 endif
+endif
 
 ###################
 # complex example #
@@ -344,9 +363,11 @@ user_datatype_user_datatype_SOURCES =		\
 user_datatype_user_datatype_LDADD =		\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 
+if !STARPU_SIMGRID
 starpu_mpi_EXAMPLES	+=			\
 	user_datatype/user_datatype
 endif
+endif
 
 ###################
 # comm example #
@@ -362,10 +383,12 @@ comm_comm_LDADD =		\
 comm_mix_comm_LDADD =		\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 
+if !STARPU_SIMGRID
 starpu_mpi_EXAMPLES	+=			\
 	comm/comm				\
 	comm/mix_comm
 endif
+endif
 
 if STARPU_HAVE_MPIFORT
 if BUILD_EXAMPLES

+ 17 - 1
mpi/examples/complex/mpi_complex.c

@@ -26,11 +26,27 @@ void display_foo_codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 	FPRINTF(stderr, "foo = %d\n", *foo);
 }
 
+/* Dumb performance model for simgrid */
+static double display_cost_function(struct starpu_task *task, unsigned nimpl)
+{
+	(void) task;
+	(void) nimpl;
+	return 0.000001;
+}
+
+static struct starpu_perfmodel display_model =
+{
+	.type = STARPU_COMMON,
+	.cost_function = display_cost_function,
+	.symbol = "display"
+};
+
 struct starpu_codelet foo_display =
 {
 	.cpu_funcs = {display_foo_codelet},
 	.nbuffers = 1,
-	.modes = {STARPU_R}
+	.modes = {STARPU_R},
+	.model = &display_model
 };
 
 int main(int argc, char **argv)

+ 2 - 2
mpi/examples/matrix_decomposition/mpi_decomposition_params.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2015-2016  Université de Bordeaux
+ * Copyright (C) 2009, 2010, 2015-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -27,7 +27,7 @@ unsigned nblocks = 2;
 unsigned nbigblocks = 2;
 #elif !defined(STARPU_LONG_CHECK)
 unsigned size = 4*320;
-unsigned nblocks = 16;
+unsigned nblocks = 4;
 unsigned nbigblocks = 2;
 #else
 unsigned size = 16*320;

+ 17 - 1
mpi/examples/stencil/stencil5.c

@@ -37,11 +37,27 @@ void stencil5_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 //	fprintf(stdout, "VALUES: %2.2f %2.2f %2.2f %2.2f %2.2f\n", *xy, *xm1y, *xp1y, *xym1, *xyp1);
 }
 
+/* Dumb performance model for simgrid */
+static double stencil5_cost_function(struct starpu_task *task, unsigned nimpl)
+{
+	(void) task;
+	(void) nimpl;
+	return 0.000001;
+}
+
+static struct starpu_perfmodel stencil5_model =
+{
+	.type = STARPU_COMMON,
+	.cost_function = stencil5_cost_function,
+	.symbol = "stencil5"
+};
+
 struct starpu_codelet stencil5_cl =
 {
 	.cpu_funcs = {stencil5_cpu},
 	.nbuffers = 5,
-	.modes = {STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R}
+	.modes = {STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R},
+	.model = &stencil5_model
 };
 
 #ifdef STARPU_QUICK_CHECK

+ 27 - 17
mpi/src/starpu_mpi.c

@@ -94,11 +94,10 @@ static int posted_requests = 0, newer_requests, barrier_running = 0;
 #pragma weak smpi_simulated_main_
 extern int smpi_simulated_main_(int argc, char *argv[]);
 
-#ifdef HAVE_SMPI_PROCESS_SET_USER_DATA
+#pragma weak smpi_process_set_user_data
 #if !HAVE_DECL_SMPI_PROCESS_SET_USER_DATA
 extern void smpi_process_set_user_data(void *);
 #endif
-#endif
 
 static void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
 {
@@ -437,13 +436,13 @@ static void _starpu_mpi_isend_data_func(struct _starpu_mpi_req *req)
 
 	if (req->sync == 0)
 	{
-		_STARPU_MPI_COMM_TO_DEBUG(req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_DATA, req->node_tag.data_tag, req->node_tag.comm);
+		_STARPU_MPI_COMM_TO_DEBUG(req, req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_DATA, req->node_tag.data_tag, req->node_tag.comm);
 		req->ret = MPI_Isend(req->ptr, req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_DATA, req->node_tag.comm, &req->data_request);
 		STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Isend returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
 	}
 	else
 	{
-		_STARPU_MPI_COMM_TO_DEBUG(req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_SYNC_DATA, req->node_tag.data_tag, req->node_tag.comm);
+		_STARPU_MPI_COMM_TO_DEBUG(req, req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_SYNC_DATA, req->node_tag.data_tag, req->node_tag.comm);
 		req->ret = MPI_Issend(req->ptr, req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_SYNC_DATA, req->node_tag.comm, &req->data_request);
 		STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Issend returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
 	}
@@ -483,7 +482,7 @@ static void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
 		MPI_Type_size(req->datatype, &size);
 		req->envelope->size = (starpu_ssize_t)req->count * size;
 		_STARPU_MPI_DEBUG(20, "Post MPI isend count (%ld) datatype_size %ld request to %d\n",req->count,starpu_data_get_size(req->data_handle), req->node_tag.rank);
-		_STARPU_MPI_COMM_TO_DEBUG(sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.comm);
+		_STARPU_MPI_COMM_TO_DEBUG(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, req->envelope->data_tag, req->node_tag.comm);
 		MPI_Isend(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.comm, &req->size_req);
 	}
 	else
@@ -498,7 +497,7 @@ static void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
  			// We already know the size of the data, let's send it to overlap with the packing of the data
 			_STARPU_MPI_DEBUG(20, "Sending size %ld (%ld %s) to node %d (first call to pack)\n", req->envelope->size, sizeof(req->count), "MPI_BYTE", req->node_tag.rank);
 			req->count = req->envelope->size;
-			_STARPU_MPI_COMM_TO_DEBUG(sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.comm);
+			_STARPU_MPI_COMM_TO_DEBUG(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, req->envelope->data_tag, req->node_tag.comm);
 			ret = MPI_Isend(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.comm, &req->size_req);
 			STARPU_MPI_ASSERT_MSG(ret == MPI_SUCCESS, "when sending size, MPI_Isend returning %s", _starpu_mpi_get_mpi_error_code(ret));
  		}
@@ -509,7 +508,7 @@ static void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
  		{
  			// We know the size now, let's send it
 			_STARPU_MPI_DEBUG(20, "Sending size %ld (%ld %s) to node %d (second call to pack)\n", req->envelope->size, sizeof(req->count), "MPI_BYTE", req->node_tag.rank);
-			_STARPU_MPI_COMM_TO_DEBUG(sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.comm);
+			_STARPU_MPI_COMM_TO_DEBUG(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, req->envelope->data_tag, req->node_tag.comm);
 			ret = MPI_Isend(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.comm, &req->size_req);
 			STARPU_MPI_ASSERT_MSG(ret == MPI_SUCCESS, "when sending size, MPI_Isend returning %s", _starpu_mpi_get_mpi_error_code(ret));
  		}
@@ -634,7 +633,7 @@ static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 		_envelope->mode = _STARPU_MPI_ENVELOPE_SYNC_READY;
 		_envelope->data_tag = req->node_tag.data_tag;
 		_STARPU_MPI_DEBUG(20, "Telling node %d it can send the data and waiting for the data back ...\n", req->node_tag.rank);
-		_STARPU_MPI_COMM_TO_DEBUG(sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.comm);
+		_STARPU_MPI_COMM_TO_DEBUG(_envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, _envelope->data_tag, req->node_tag.comm);
 		req->ret = MPI_Send(_envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.comm);
 		STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Send returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
 		free(_envelope);
@@ -643,12 +642,12 @@ static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 
 	if (req->sync)
 	{
-		_STARPU_MPI_COMM_FROM_DEBUG(req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_SYNC_DATA, req->node_tag.data_tag, req->node_tag.comm);
+		_STARPU_MPI_COMM_FROM_DEBUG(req, req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_SYNC_DATA, req->node_tag.data_tag, req->node_tag.comm);
 		req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_SYNC_DATA, req->node_tag.comm, &req->data_request);
 	}
 	else
 	{
-		_STARPU_MPI_COMM_FROM_DEBUG(req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_DATA, req->node_tag.data_tag, req->node_tag.comm);
+		_STARPU_MPI_COMM_FROM_DEBUG(req, req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_DATA, req->node_tag.data_tag, req->node_tag.comm);
 		req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_DATA, req->node_tag.comm, &req->data_request);
 #ifdef STARPU_SIMGRID
 		_starpu_mpi_simgrid_wait_req(&req->data_request, &req->status_store, &req->queue, &req->done);
@@ -1176,6 +1175,7 @@ static void _starpu_mpi_test_detached_requests(void)
 #ifdef STARPU_SIMGRID
 		req->ret = _starpu_mpi_simgrid_mpi_test(&req->done, &flag);
 #else
+		STARPU_MPI_ASSERT_MSG(req->data_request != MPI_REQUEST_NULL, "Cannot test completion of the request MPI_REQUEST_NULL");
 		req->ret = MPI_Test(&req->data_request, &flag, MPI_STATUS_IGNORE);
 #endif
 
@@ -1287,21 +1287,28 @@ static void _starpu_mpi_receive_early_data(struct _starpu_mpi_envelope *envelope
 	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 
 	// We wait until the request is pushed in the
-	// ready_request list, that ensures that the next loop
-	// will call _starpu_mpi_handle_ready_request
-	// on the request and post the corresponding mpi_irecv,
-	// otherwise, it may lead to read data as envelop
+	// ready_request list
 	STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&(early_data_handle->req->posted_mutex));
 	while (!(early_data_handle->req->posted))
 		STARPU_PTHREAD_COND_WAIT(&(early_data_handle->req->posted_cond), &(early_data_handle->req->posted_mutex));
 	STARPU_PTHREAD_MUTEX_UNLOCK(&(early_data_handle->req->posted_mutex));
 
+#ifdef STARPU_DEVEL
+#warning check if req_ready is still necessary
+#endif
 	STARPU_PTHREAD_MUTEX_LOCK(&early_data_handle->req_mutex);
 	early_data_handle->req_ready = 1;
 	STARPU_PTHREAD_COND_BROADCAST(&early_data_handle->req_cond);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&early_data_handle->req_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
+
+	// Handle the request immediately to make sure the mpi_irecv is
+	// posted before receiving another envelope
+	_starpu_mpi_req_list_erase(ready_requests, early_data_handle->req);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
+	_starpu_mpi_handle_ready_request(early_data_handle->req);
+	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
 }
 
 static void *_starpu_mpi_progress_thread_func(void *arg)
@@ -1326,12 +1333,14 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		argv_cpy[i] = strdup((*(argc_argv->argv))[i]);
 	MSG_process_create_with_arguments("main", smpi_simulated_main_, NULL, _starpu_simgrid_get_host_by_name("MAIN"), *(argc_argv->argc), argv_cpy);
 	/* And set TSD for us */
-#ifdef HAVE_SMPI_PROCESS_SET_USER_DATA
 	void **tsd;
 	_STARPU_CALLOC(tsd, MAX_TSD + 1, sizeof(void*));
+	if (!smpi_process_set_user_data)
+	{
+		_STARPU_ERROR("Your version of simgrid does not provide smpi_process_set_user_data, we can not continue without it\n");
+	}
 	smpi_process_set_user_data(tsd);
 #endif
-#endif
 
 #ifdef STARPU_USE_FXT
 	_starpu_fxt_wait_initialisation();
@@ -1440,6 +1449,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
 			if (flag)
 			{
+				_STARPU_MPI_COMM_FROM_DEBUG(envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, envelope_status.MPI_SOURCE, _STARPU_MPI_TAG_ENVELOPE, envelope->data_tag, envelope_comm);
 				_STARPU_MPI_DEBUG(4, "Envelope received with mode %d\n", envelope->mode);
 				if (envelope->mode == _STARPU_MPI_ENVELOPE_SYNC_READY)
 				{
@@ -1632,7 +1642,7 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
         detached_requests = _starpu_mpi_req_list_new();
 
         STARPU_PTHREAD_MUTEX_INIT(&mutex_posted_requests, NULL);
-        _starpu_mpi_comm = starpu_getenv("STARPU_MPI_COMM") != NULL;
+        _starpu_mpi_comm_debug = starpu_getenv("STARPU_MPI_COMM") != NULL;
 
 #ifdef STARPU_SIMGRID
 	STARPU_PTHREAD_MUTEX_INIT(&wait_counter_mutex, NULL);

+ 37 - 56
mpi/src/starpu_mpi_collective.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016  CNRS
+ * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -39,25 +39,23 @@ void _callback_collective(void *arg)
 	}
 }
 
-int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
+static
+int _callback_set(int rank, starpu_data_handle_t *data_handles, int count, int root, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg, void (**callback_func)(void *), struct _callback_arg **callback_arg)
 {
-	int rank;
-	int x;
-	struct _callback_arg *callback_arg = NULL;
-	void (*callback_func)(void *) = NULL;
 	void (*callback)(void *);
 
-	starpu_mpi_comm_rank(comm, &rank);
-
 	callback = (rank == root) ? scallback : rcallback;
-	if (callback)
+	if (*callback)
 	{
-		callback_func = _callback_collective;
-		_STARPU_MPI_MALLOC(callback_arg, sizeof(struct _callback_arg));
-		callback_arg->count = 0;
-		callback_arg->nb = 0;
-		callback_arg->callback = (rank == root) ? scallback : rcallback;
-		callback_arg->arg = (rank == root) ? sarg : rarg;
+		int x;
+
+		*callback_func = _callback_collective;
+
+		_STARPU_MPI_MALLOC(*callback_arg, sizeof(struct _callback_arg));
+		(*callback_arg)->count = 0;
+		(*callback_arg)->nb = 0;
+		(*callback_arg)->callback = (rank == root) ? scallback : rcallback;
+		(*callback_arg)->arg = (rank == root) ? sarg : rarg;
 
 		for(x = 0; x < count ; x++)
 		{
@@ -68,22 +66,38 @@ int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, i
 				STARPU_ASSERT_MSG(data_tag >= 0, "Invalid tag for data handle");
 				if ((rank == root) && (owner != root))
 				{
-					callback_arg->count ++;
+					(*callback_arg)->count ++;
 				}
 				if ((rank != root) && (owner == rank))
 				{
-					callback_arg->count ++;
+					(*callback_arg)->count ++;
 				}
 			}
 		}
 
-		if (!callback_arg->count)
+		if (!(*callback_arg)->count)
 		{
-			free(callback_arg);
-			return 0;
+			free(*callback_arg);
+			return 1;
 		}
 	}
 
+	return 0;
+}
+
+int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg)
+{
+	int rank;
+	int x;
+	struct _callback_arg *callback_arg = NULL;
+	void (*callback_func)(void *) = NULL;
+
+	starpu_mpi_comm_rank(comm, &rank);
+
+	x = _callback_set(rank, data_handles, count, root, scallback, sarg, rcallback, rarg, &callback_func, &callback_arg);
+	if (x == 1)
+		return 0;
+
 	for(x = 0; x < count ; x++)
 	{
 		if (data_handles[x])
@@ -112,45 +126,12 @@ int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, in
 	int x;
 	struct _callback_arg *callback_arg = NULL;
 	void (*callback_func)(void *) = NULL;
-	void (*callback)(void *);
 
 	starpu_mpi_comm_rank(comm, &rank);
 
-	callback = (rank == root) ? scallback : rcallback;
-	if (callback)
-	{
-		callback_func = _callback_collective;
-
-		_STARPU_MPI_MALLOC(callback_arg, sizeof(struct _callback_arg));
-		callback_arg->count = 0;
-		callback_arg->nb = 0;
-		callback_arg->callback = callback;
-		callback_arg->arg = (rank == root) ? sarg : rarg;
-
-		for(x = 0; x < count ; x++)
-		{
-			if (data_handles[x])
-			{
-				int owner = starpu_mpi_data_get_rank(data_handles[x]);
-				int data_tag = starpu_mpi_data_get_tag(data_handles[x]);
-				STARPU_ASSERT_MSG(data_tag >= 0, "Invalid tag for data handle");
-				if ((rank == root) && (owner != root))
-				{
-					callback_arg->count ++;
-				}
-				if ((rank != root) && (owner == rank))
-				{
-					callback_arg->count ++;
-				}
-			}
-		}
-
-		if (!callback_arg->count)
-		{
-			free(callback_arg);
-			return 0;
-		}
-	}
+	x = _callback_set(rank, data_handles, count, root, scallback, sarg, rcallback, rarg, &callback_func, &callback_arg);
+	if (x == 1)
+		return 0;
 
 	for(x = 0; x < count ; x++)
 	{

+ 1 - 1
mpi/src/starpu_mpi_comm.c

@@ -137,7 +137,7 @@ void _starpu_mpi_comm_post_recv()
 		if (_comm->posted == 0)
 		{
 			_STARPU_MPI_DEBUG(3, "Posting a receive to get a data envelop on comm %d %ld\n", i, (long int)_comm->comm);
-			_STARPU_MPI_COMM_FROM_DEBUG(sizeof(struct _starpu_mpi_envelope), MPI_BYTE, MPI_ANY_SOURCE, _STARPU_MPI_TAG_ENVELOPE, _STARPU_MPI_TAG_ENVELOPE, _comm->comm);
+			_STARPU_MPI_COMM_FROM_DEBUG(_comm->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, MPI_ANY_SOURCE, _STARPU_MPI_TAG_ENVELOPE, _STARPU_MPI_TAG_ENVELOPE, _comm->comm);
 			MPI_Irecv(_comm->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, MPI_ANY_SOURCE, _STARPU_MPI_TAG_ENVELOPE, _comm->comm, &_comm->request);
 #ifdef STARPU_SIMGRID
 			_starpu_mpi_simgrid_wait_req(&_comm->request, &_comm->status, &_comm->queue, &_comm->done);

+ 23 - 3
mpi/src/starpu_mpi_datatype.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2011, 2015  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -41,6 +41,26 @@ void _starpu_mpi_datatype_shutdown(void)
 }
 
 /*
+ * 	Bcsr
+ */
+
+static void handle_to_datatype_bcsr(starpu_data_handle_t data_handle, MPI_Datatype *datatype)
+{
+	int ret;
+
+	uint32_t r = starpu_bcsr_get_r(data_handle);
+	uint32_t c = starpu_bcsr_get_c(data_handle);
+	uint32_t nnz = starpu_bcsr_get_nnz(data_handle);
+	size_t elemsize = starpu_bcsr_get_elemsize(data_handle);
+
+	ret = MPI_Type_contiguous(r*c*nnz*elemsize, MPI_BYTE, datatype);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_contiguous failed");
+
+	ret = MPI_Type_commit(datatype);
+	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Type_commit failed");
+}
+
+/*
  * 	Matrix
  */
 
@@ -149,7 +169,7 @@ static starpu_mpi_datatype_allocate_func_t handle_to_datatype_funcs[STARPU_MAX_I
 	[STARPU_BLOCK_INTERFACE_ID]	= handle_to_datatype_block,
 	[STARPU_VECTOR_INTERFACE_ID]	= handle_to_datatype_vector,
 	[STARPU_CSR_INTERFACE_ID]	= NULL,
-	[STARPU_BCSR_INTERFACE_ID]	= NULL,
+	[STARPU_BCSR_INTERFACE_ID]	= handle_to_datatype_bcsr,
 	[STARPU_VARIABLE_INTERFACE_ID]	= handle_to_datatype_variable,
 	[STARPU_VOID_INTERFACE_ID]	= handle_to_datatype_void,
 	[STARPU_MULTIFORMAT_INTERFACE_ID] = NULL,
@@ -237,7 +257,7 @@ static starpu_mpi_datatype_free_func_t handle_free_datatype_funcs[STARPU_MAX_INT
 	[STARPU_BLOCK_INTERFACE_ID]	= _starpu_mpi_handle_free_complex_datatype,
 	[STARPU_VECTOR_INTERFACE_ID]	= _starpu_mpi_handle_free_simple_datatype,
 	[STARPU_CSR_INTERFACE_ID]	= NULL,
-	[STARPU_BCSR_INTERFACE_ID]	= NULL,
+	[STARPU_BCSR_INTERFACE_ID]	= _starpu_mpi_handle_free_simple_datatype,
 	[STARPU_VARIABLE_INTERFACE_ID]	= _starpu_mpi_handle_free_simple_datatype,
 	[STARPU_VOID_INTERFACE_ID]      = _starpu_mpi_handle_free_simple_datatype,
 	[STARPU_MULTIFORMAT_INTERFACE_ID] = NULL,

+ 1 - 1
mpi/src/starpu_mpi_init.c

@@ -48,7 +48,7 @@ static void _starpu_mpi_print_thread_level_support(int thread_level, char *msg)
 	{
 		case MPI_THREAD_SERIALIZED:
 		{
-			_STARPU_DISP("MPI%s MPI_THREAD_SERIALIZED; Multiple threads may make MPI calls, but only one at a time.\n", msg);
+			_STARPU_DEBUG("MPI%s MPI_THREAD_SERIALIZED; Multiple threads may make MPI calls, but only one at a time.\n", msg);
 			break;
 		}
 		case MPI_THREAD_FUNNELED:

+ 2 - 2
mpi/src/starpu_mpi_private.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012, 2014-2016  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,7 +21,7 @@ int _starpu_debug_rank=-1;
 int _starpu_debug_level_min=0;
 int _starpu_debug_level_max=0;
 int _starpu_mpi_tag = 42;
-int _starpu_mpi_comm;
+int _starpu_mpi_comm_debug;
 
 void _starpu_mpi_set_debug_level_min(int level)
 {

+ 19 - 19
mpi/src/starpu_mpi_private.h

@@ -49,7 +49,7 @@ void _starpu_mpi_simgrid_wait_req(MPI_Request *request, 	MPI_Status *status, sta
 
 extern int _starpu_debug_rank;
 char *_starpu_mpi_get_mpi_error_code(int code);
-extern int _starpu_mpi_comm;
+extern int _starpu_mpi_comm_debug;
 
 #ifdef STARPU_VERBOSE
 extern int _starpu_debug_level_min;
@@ -94,24 +94,24 @@ int _starpu_debug_rank;
 #define _STARPU_MPI_REALLOC(ptr, size) do { ptr = realloc(ptr, size); STARPU_MPI_ASSERT_MSG(ptr != NULL, "Cannot reallocate %ld bytes\n", (long) size); } while (0)
 
 #ifdef STARPU_VERBOSE
-#  define _STARPU_MPI_COMM_DEBUG(count, datatype, node, tag, utag, comm, way) \
-	do \
-	{ \
-	     	if (_starpu_mpi_comm)	\
-	     	{ \
-     			int __size; \
-			char _comm_name[128]; \
-			int _comm_name_len; \
-			int _rank; \
+#  define _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, node, tag, utag, comm, way) \
+	do								\
+	{							\
+	     	if (_starpu_mpi_comm_debug)			\
+		{					\
+     			int __size;			\
+			char _comm_name[128];		\
+			int _comm_name_len;		\
+			int _rank;			    \
 			starpu_mpi_comm_rank(comm, &_rank); \
-			MPI_Type_size(datatype, &__size); \
+			MPI_Type_size(datatype, &__size);		\
 			MPI_Comm_get_name(comm, _comm_name, &_comm_name_len); \
-			fprintf(stderr, "[%d][starpu_mpi] %s %d:%d(%d):%s %12s %ld     [%s:%d]\n", _rank, way, node, tag, utag, _comm_name, " ", count*__size, __starpu_func__ , __LINE__); \
-			fflush(stderr); \
-		} \
+			fprintf(stderr, "[%d][starpu_mpi] :%d:%s:%d:%d:%d:%s:%p:%ld:%d:%s:%d\n", _rank, _rank, way, node, tag, utag, _comm_name, ptr, count, __size, __starpu_func__ , __LINE__); \
+			fflush(stderr);					\
+		}							\
 	} while(0);
-#  define _STARPU_MPI_COMM_TO_DEBUG(count, datatype, dest, tag, utag, comm) 		_STARPU_MPI_COMM_DEBUG(count, datatype, dest, tag, utag, comm, "-->")
-#  define _STARPU_MPI_COMM_FROM_DEBUG(count, datatype, source, tag, utag, comm) 	_STARPU_MPI_COMM_DEBUG(count, datatype, source, tag, utag, comm, "<--")
+#  define _STARPU_MPI_COMM_TO_DEBUG(ptr, count, datatype, dest, tag, utag, comm) 	    _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, dest, tag, utag, comm, "-->")
+#  define _STARPU_MPI_COMM_FROM_DEBUG(ptr, count, datatype, source, tag, utag, comm)  _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, source, tag, utag, comm, "<--")
 #  define _STARPU_MPI_DEBUG(level, fmt, ...) \
 	do \
 	{								\
@@ -123,9 +123,9 @@ int _starpu_debug_rank;
 		}			\
 	} while(0);
 #else
-#  define _STARPU_MPI_COMM_DEBUG(count, datatype, node, tag, utag, comm, way)		do { } while(0)
-#  define _STARPU_MPI_COMM_TO_DEBUG(count, datatype, dest, tag, comm, utag)		do { } while(0)
-#  define _STARPU_MPI_COMM_FROM_DEBUG(count, datatype, source, tag, comm, utag)	do { } while(0)
+#  define _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, node, tag, utag, comm, way)  do { } while(0)
+#  define _STARPU_MPI_COMM_TO_DEBUG(ptr, count, datatype, dest, tag, utag, comm)     do { } while(0)
+#  define _STARPU_MPI_COMM_FROM_DEBUG(ptr, count, datatype, source, tag, utag, comm) do { } while(0)
 #  define _STARPU_MPI_DEBUG(level, fmt, ...)		do { } while(0)
 #endif
 

+ 7 - 1
mpi/src/starpu_mpi_task_insert.c

@@ -54,7 +54,7 @@ int starpu_mpi_pre_submit_hook_unregister()
 
 int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int *do_execute, int *inconsistent_execute, int *xrank)
 {
-	if (mode & STARPU_W)
+	if (mode & STARPU_W || mode & STARPU_REDUX)
 	{
 		if (!data)
 		{
@@ -386,6 +386,12 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
                 {
                         (void)va_arg(varg_list_copy, void *);
 		}
+		else if (arg_type==STARPU_EXECUTE_WHERE)
+		{
+			// the flag is decoded and set later when
+			// calling function _starpu_task_insert_create()
+			(void)va_arg(varg_list_copy, unsigned long long);
+		}
 		else if (arg_type==STARPU_EXECUTE_ON_WORKER)
 		{
 			// the flag is decoded and set later when

+ 5 - 0
mpi/src/starpu_mpi_task_insert_fortran.c

@@ -241,6 +241,11 @@ int _fstarpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_no
 			arg_i++;
 			/* void* */
 		}
+		else if (arg_type==STARPU_EXECUTE_WHERE)
+		{
+			arg_i++;
+			/* int* */
+		}
 		else if (arg_type==STARPU_EXECUTE_ON_WORKER)
 		{
 			arg_i++;

+ 52 - 29
mpi/tests/Makefile.am

@@ -1,6 +1,6 @@
 # StarPU --- Runtime system for heterogeneous multicore architectures.
 #
-# Copyright (C) 2009-2012, 2015-2016  Université de Bordeaux
+# Copyright (C) 2009-2012, 2015-2017  Université de Bordeaux
 # Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -16,6 +16,15 @@
 
 include $(top_srcdir)/starpu.mk
 
+if STARPU_SIMGRID
+STARPU_PERF_MODEL_DIR=$(abs_top_srcdir)/tools/perfmodels/sampling
+STARPU_HOSTNAME=mirage
+MALLOC_PERTURB_=0
+export STARPU_PERF_MODEL_DIR
+export STARPU_HOSTNAME
+export MALLOC_PERTURB_
+endif
+
 CC=$(MPICC)
 CCLD=$(MPICC)
 
@@ -23,17 +32,23 @@ if STARPU_HAVE_WINDOWS
 LOADER_BIN		=
 else
 loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
+if !STARPU_SIMGRID
 LOADER			=	loader
 LOADER_BIN		=	$(abs_top_builddir)/mpi/tests/$(LOADER)
+endif
 loader_SOURCES		=	../../tests/loader.c
 endif
 
+if STARPU_SIMGRID
+MPI			=	$(abs_top_builddir)/tools/starpu_smpirun -np 4 -platform $(abs_top_srcdir)/tools/perfmodels/cluster.xml -hostfile $(abs_top_srcdir)/tools/perfmodels/hostfile
+else
 # we always test on 4 processes, the execution time is not that much bigger
 if STARPU_QUICK_CHECK
 MPI			=	$(MPIEXEC) $(MPIEXEC_ARGS) -np 4
 else
 MPI			=	$(MPIEXEC) $(MPIEXEC_ARGS) -np 4
 endif
+endif
 
 if STARPU_HAVE_AM111
 TESTS_ENVIRONMENT	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
@@ -42,11 +57,9 @@ else
 TESTS_ENVIRONMENT 	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
 endif
 
-if !STARPU_SIMGRID
 if STARPU_MPI_CHECK
 TESTS			=	$(starpu_mpi_TESTS)
 endif
-endif
 
 check_PROGRAMS = $(LOADER) $(starpu_mpi_TESTS)
 
@@ -87,8 +100,36 @@ AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LDFLAGS) $(ST
 ########################
 
 if BUILD_TESTS
-starpu_mpi_TESTS =				\
-	datatypes				\
+
+starpu_mpi_TESTS =				
+
+starpu_mpi_TESTS +=				\
+	cache					\
+	cache_disable				\
+	callback				\
+	early_request				\
+	insert_task				\
+	insert_task_block			\
+	insert_task_dyn_handles			\
+	insert_task_node_choice			\
+	insert_task_owner			\
+	insert_task_owner2			\
+	insert_task_owner_data			\
+	matrix					\
+	matrix2					\
+	mpi_detached_tag			\
+	mpi_irecv_detached			\
+	mpi_isend_detached			\
+	mpi_reduction				\
+	mpi_scatter_gather			\
+	policy_register				\
+	policy_register_many			\
+	policy_selection			\
+	policy_selection2			\
+	ring_async_implicit
+
+if !STARPU_SIMGRID
+starpu_mpi_TESTS +=				\
 	pingpong				\
 	mpi_test				\
 	mpi_isend				\
@@ -96,50 +137,32 @@ starpu_mpi_TESTS =				\
 	mpi_earlyrecv2				\
 	mpi_earlyrecv2_sync			\
 	mpi_irecv				\
-	mpi_isend_detached			\
-	mpi_irecv_detached			\
-	mpi_detached_tag			\
 	mpi_redux				\
 	ring					\
 	ring_sync				\
 	ring_sync_detached			\
 	ring_async				\
-	ring_async_implicit			\
 	block_interface				\
 	block_interface_pinned			\
-	cache					\
-	cache_disable				\
-	callback				\
-	matrix					\
 	matrix2					\
-	insert_task				\
 	insert_task_compute			\
 	insert_task_sent_cache			\
 	insert_task_recv_cache			\
-	insert_task_block			\
-	insert_task_owner			\
-	insert_task_owner2			\
-	insert_task_owner_data			\
-	insert_task_node_choice			\
 	insert_task_count			\
-	insert_task_dyn_handles			\
 	multiple_send				\
-	mpi_scatter_gather			\
-	mpi_reduction				\
 	user_defined_datatype			\
 	tags_checking				\
 	sync					\
 	gather					\
 	gather2					\
-	policy_register				\
-	policy_register_many			\
+	load_balancer
+
+# Expected to fail
+starpu_mpi_TESTS +=				\
 	policy_register_toomany			\
 	policy_unregister			\
-	policy_selection			\
-	policy_selection2			\
-	early_request				\
-	starpu_redefine				\
-	load_balancer
+	starpu_redefine
+endif
 
 noinst_PROGRAMS =				\
 	datatypes				\

+ 9 - 6
mpi/tests/block_interface.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2014-2015  Université de Bordeaux
+ * Copyright (C) 2009, 2010, 2014-2015, 2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2014, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -27,16 +27,18 @@
 int main(int argc, char **argv)
 {
 	int ret, rank, size;
+	int mpi_init;
 
-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
-	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
 
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL, 0);
+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
+
 	if (size < 2)
 	{
 		if (rank == 0)
@@ -142,7 +144,8 @@ int main(int argc, char **argv)
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 
-	MPI_Finalize();
+	if (!mpi_init)
+		MPI_Finalize();
 
 	return 0;
 }

+ 11 - 7
mpi/tests/block_interface_pinned.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2014-2015  Université de Bordeaux
+ * Copyright (C) 2009, 2010, 2014-2015, 2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -27,16 +27,18 @@
 int main(int argc, char **argv)
 {
 	int ret, rank, size;
+	int mpi_init;
 
-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
-	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
 
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL, 0);
+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
+
 	if (size < 2)
 	{
 		if (rank == 0)
@@ -44,7 +46,8 @@ int main(int argc, char **argv)
 
 		starpu_mpi_shutdown();
 		starpu_shutdown();
-		MPI_Finalize();
+		if (!mpi_init)
+			MPI_Finalize();
 		return STARPU_TEST_SKIPPED;
 	}
 
@@ -146,7 +149,8 @@ int main(int argc, char **argv)
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 
-	MPI_Finalize();
+	if (!mpi_init)
+		MPI_Finalize();
 
 	return 0;
 }

+ 107 - 6
mpi/tests/datatypes.c

@@ -152,6 +152,46 @@ void check_block(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, i
 	starpu_data_release(handle_r);
 }
 
+void check_bcsr(starpu_data_handle_t handle_s, starpu_data_handle_t handle_r, int *error)
+{
+	STARPU_ASSERT(starpu_bcsr_get_elemsize(handle_s) == starpu_bcsr_get_elemsize(handle_r));
+	STARPU_ASSERT(starpu_bcsr_get_nnz(handle_s) == starpu_bcsr_get_nnz(handle_r));
+	STARPU_ASSERT(starpu_bcsr_get_nrow(handle_s) == starpu_bcsr_get_nrow(handle_r));
+	STARPU_ASSERT(starpu_bcsr_get_firstentry(handle_s) == starpu_bcsr_get_firstentry(handle_r));
+	STARPU_ASSERT(starpu_bcsr_get_r(handle_s) == starpu_bcsr_get_r(handle_r));
+	STARPU_ASSERT(starpu_bcsr_get_c(handle_s) == starpu_bcsr_get_c(handle_r));
+	//	STARPU_ASSERT(starpu_bcsr_get_local_colind(handle_s) == starpu_bcsr_get_local_colind(handle_r));
+	//	STARPU_ASSERT(starpu_bcsr_get_local_rowptr(handle_s) == starpu_bcsr_get_local_rowptr(handle_r));
+
+	starpu_data_acquire(handle_s, STARPU_R);
+	starpu_data_acquire(handle_r, STARPU_R);
+
+	int *bcsr_s = (int *)starpu_bcsr_get_local_nzval(handle_s);
+	int *bcsr_r = (int *)starpu_bcsr_get_local_nzval(handle_r);
+
+	int r = starpu_bcsr_get_r(handle_s);
+	int c = starpu_bcsr_get_c(handle_s);
+	int nnz = starpu_bcsr_get_nnz(handle_s);
+
+	int x;
+
+	for(x=0 ; x<r*c*nnz ; x++)
+	{
+		if (bcsr_s[x] == bcsr_r[x])
+		{
+			FPRINTF_MPI(stderr, "Success with bcsr[%d] value: %d == %d\n", x, bcsr_s[x], bcsr_r[x]);
+		}
+		else
+		{
+			*error = 1;
+			FPRINTF_MPI(stderr, "Error with bcsr[%d] value: %d != %d\n", x, bcsr_s[x], bcsr_r[x]);
+		}
+	}
+
+	starpu_data_release(handle_s);
+	starpu_data_release(handle_r);
+}
+
 void send_recv_and_check(int rank, int node, starpu_data_handle_t handle_s, int tag_s, starpu_data_handle_t handle_r, int tag_r, int *error, check_func func)
 {
 	int ret;
@@ -164,6 +204,7 @@ void send_recv_and_check(int rank, int node, starpu_data_handle_t handle_s, int
 		ret = starpu_mpi_recv(handle_r, node, tag_r, MPI_COMM_WORLD, &status);
 		STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_recv");
 
+		assert(func);
 		func(handle_s, handle_r, error);
 	}
 	else if (rank == 1)
@@ -328,20 +369,77 @@ void exchange_block(int rank, int *error)
 	}
 }
 
+void exchange_bcsr(int rank, int *error)
+{
+	/*
+	 * We use the following matrix:
+	 *
+	 *   +----------------+
+	 *   |  0   1   0   0 |
+	 *   |  2   3   0   0 |
+	 *   |  4   5   8   9 |
+	 *   |  6   7  10  11 |
+	 *   +----------------+
+	 *
+	 * nzval  = [0, 1, 2, 3] ++ [4, 5, 6, 7] ++ [8, 9, 10, 11]
+	 * colind = [0, 0, 1]
+	 * rowptr = [0, 1 ]
+	 * r = c = 2
+	 */
+
+	/* Size of the blocks */
+#define BCSR_R 2
+#define BCSR_C 2
+#define BCSR_NROW 2
+#define BCSR_NNZ_BLOCKS 3     /* out of 4 */
+#define BCSR_NZVAL_SIZE (BCSR_R*BCSR_C*BCSR_NNZ_BLOCKS)
+
+	uint32_t colind[BCSR_NNZ_BLOCKS] = {0, 0, 1};
+	uint32_t rowptr[BCSR_NROW] = {0, 1};
+
+	if (rank == 0)
+	{
+		starpu_data_handle_t bcsr_handle[2];
+		int nzval[BCSR_NZVAL_SIZE]  =
+		{
+			0, 1, 2, 3,    /* First block  */
+			4, 5, 6, 7,    /* Second block */
+			8, 9, 10, 11   /* Third block  */
+		};
+
+		starpu_bcsr_data_register(&bcsr_handle[0], STARPU_MAIN_RAM, BCSR_NNZ_BLOCKS, BCSR_NROW, (uintptr_t) nzval, colind, rowptr, 0, BCSR_R, BCSR_C, sizeof(nzval[0]));
+		starpu_bcsr_data_register(&bcsr_handle[1], -1, BCSR_NNZ_BLOCKS, BCSR_NROW, (uintptr_t) NULL, colind, rowptr, 0, BCSR_R, BCSR_C, sizeof(nzval[0]));
+
+		send_recv_and_check(rank, 1, bcsr_handle[0], 0x73, bcsr_handle[1], 0x8337, error, check_bcsr);
+
+		starpu_data_unregister(bcsr_handle[0]);
+		starpu_data_unregister(bcsr_handle[1]);
+	}
+	else if (rank == 1)
+	{
+		starpu_data_handle_t bcsr_handle;
+		starpu_bcsr_data_register(&bcsr_handle, -1, BCSR_NNZ_BLOCKS, BCSR_NROW, (uintptr_t) NULL, colind, rowptr, 0, BCSR_R, BCSR_C, sizeof(int));
+		send_recv_and_check(rank, 0, bcsr_handle, 0x73, NULL, 0x8337, NULL, NULL);
+		starpu_data_unregister(bcsr_handle);
+	}
+}
+
 int main(int argc, char **argv)
 {
 	int ret, rank, size;
 	int error=0;
+	int mpi_init;
 
-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
-	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
 
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL, 0);
+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
+
 	if (size < 2)
 	{
 		if (rank == 0)
@@ -349,7 +447,8 @@ int main(int argc, char **argv)
 
 		starpu_mpi_shutdown();
 		starpu_shutdown();
-		MPI_Finalize();
+		if (!mpi_init)
+			MPI_Finalize();
 		return STARPU_TEST_SKIPPED;
 	}
 
@@ -358,11 +457,13 @@ int main(int argc, char **argv)
 	exchange_vector(rank, &error);
 	exchange_matrix(rank, &error);
 	exchange_block(rank, &error);
+	exchange_bcsr(rank, &error);
 
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 
-	MPI_Finalize();
+	if (!mpi_init)
+		MPI_Finalize();
 
 	return rank == 0 ? error : 0;
 }

+ 10 - 6
mpi/tests/early_request.c

@@ -191,23 +191,26 @@ int main(int argc, char * argv[])
 	/* Init */
 	int ret;
 	int mpi_rank, mpi_size;
+	int mpi_init;
 
-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &mpi_rank);
-	starpu_mpi_comm_size(MPI_COMM_WORLD, &mpi_size);
+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
 
 	ret = starpu_init(NULL);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL, 0);
+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
 
+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &mpi_rank);
+	starpu_mpi_comm_size(MPI_COMM_WORLD, &mpi_size);
+
 	if (starpu_cpu_worker_get_count() == 0)
 	{
 		if (mpi_rank == 0)
 			FPRINTF(stderr, "We need at least 1 CPU worker.\n");
 		starpu_mpi_shutdown();
 		starpu_shutdown();
-		MPI_Finalize();
+		if (!mpi_init)
+			MPI_Finalize();
 		return STARPU_TEST_SKIPPED;
 	}
 
@@ -246,7 +249,8 @@ int main(int argc, char * argv[])
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 
-	MPI_Finalize();
+	if (!mpi_init)
+		MPI_Finalize();
 	FPRINTF(stderr, "No assert until end\n");
 	return 0;
 }

+ 0 - 0
mpi/tests/gather.c


Some files were not shown because too many files changed in this diff