8 年之前 · 0f9830e940
--- a/AUTHORS
+++ b/AUTHORS
@@ -6,10 +6,12 @@ Berenger Bramas <berenger.bramas@inria.fr>
 
				 Alfredo Buttari <alfredo.buttari@enseeiht.fr>
			
 
				 Adrien Cassagne <adrien.cassagne@inria.fr>
			
 
				 Jérôme Clet-Ortega <jerome.clet-ortega@labri.fr>
			
 
				+Terry Cojean <terry.cojean@inria.fr>
			
 
				 Nicolas Collin <nicolas.collin@inria.fr>
			
 
				 Ludovic Courtès <ludovic.courtes@inria.fr>
			
 
				 Yann Courtois <yann.courtois33@gmail.com>
			
 
				 Jean-Marie Couteyen <jm.couteyen@gmail.com>
			
 
				+Lionel Eyraud-Dubois <lionel.eyraud-dubois@inria.fr>
			
 
				 Nathalie Furmento <nathalie.furmento@labri.fr>
			
 
				 David Gómez <david_gomez1380@yahoo.com.mx>
			
 
				 Sylvain Henry <sylvain.henry@inria.fr>
			
@@ -22,9 +24,11 @@ Brice Mortier <brice.mortier@etu.u-bordeaux1.fr>
 
				 Stojce Nakov <stojce.nakov@inria.fr>
			
 
				 Joris Pablo <joris.pablo@orange.fr>
			
 
				 Damien Pasqualinotto <dam.pasqualinotto@wanadoo.fr>
			
 
				+Samuel Pitoiset <samuel.pitoiset@inria.fr>
			
 
				 Nguyen Quôc-Dinh <nguyen.quocdinh@gmail.com>
			
 
				 Cyril Roelandt <cyril.roelandt@inria.fr>
			
 
				 Anthony Roy <theanthony33@gmail.com>
			
 
				+Chiheb Sakka <chiheb.sakka@inria.fr>
			
 
				 Corentin Salingue <corentin.salingue@gmail.com>
			
 
				 Marc Sergent <marc.sergent@inria.fr>
			
 
				 Anthony Simonet <anthony.simonet@etu.u-bordeaux.fr>
			
@@ -33,4 +37,5 @@ Ludovic Stordeur <ludovic.stordeur@inria.fr>
 
				 Guillaume Sylvand <guillaume.sylvand@airbus.com>
			
 
				 François Tessier <francois.tessier@inria.fr>
			
 
				 Samuel Thibault <samuel.thibault@labri.fr>
			
 
				+Leo Villeveygoux <leo.villeveygoux@inria.fr>
			
 
				 Pierre-André Wacrenier <wacrenier@labri.fr>
			
--- a/ChangeLog
+++ b/ChangeLog
@@ -43,12 +43,17 @@ Small features:
 
				   * New function starpu_worker_display_names to display the names of
			
 
				     all the workers of a specified type.
			
 
				   * Arbiters now support concurrent read access.
			
 
				+  * Add a field starpu_task::where similar to starpu_codelet::where
			
 
				+    which allows to restrict where to execute a task. Also add
			
 
				+    STARPU_TASK_WHERE to be used when calling starpu_task_insert().
			
 
				 
			
 
				 Changes:
			
 
				   * Vastly improve simgrid simulation time.
			
 
				 
			
 
				 Small changes:
			
 
				   * Use asynchronous transfers for task data fetches with were not prefetched.
			
 
				+  * Allow to call starpu_sched_ctx_set_policy_data on the main
			
 
				+    scheduler context
			
 
				 
			
 
				 StarPU 1.2.2 (svn revision xxx)
			
 
				 ==============================================
			
@@ -57,6 +62,9 @@ New features:
 
				   * Add starpu_data_acquire_try and starpu_data_acquire_on_node_try.
			
 
				   * Add NVCC_CC environment variable.
			
 
				   * Add -no-foo options to starpu_fxt_tool to make traces lighter
			
 
				+  * Add starpu_cusparse_init/shutdown/get_local_handle for proper CUDA
			
 
				+    overlapping with cusparse.
			
 
				+
			
 
				 
			
 
				 Small changes:
			
 
				   * Output generated through STARPU_MPI_COMM has been modified to
			
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,6 +1,6 @@
 
				 # StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				 #
			
 
				-# Copyright (C) 2009-2016  Université de Bordeaux
			
 
				+# Copyright (C) 2009-2017  Université de Bordeaux
			
 
				 # Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
			
 
				 # Copyright (C) 2014  INRIA
			
 
				 # Copyright (C) 2016  Inria
			
@@ -99,6 +99,7 @@ versinclude_HEADERS = 				\
 
				 	include/starpu_disk.h			\
			
 
				 	include/starpu_cublas.h			\
			
 
				 	include/starpu_cublas_v2.h		\
			
 
				+	include/starpu_cusparse.h		\
			
 
				 	include/starpu_driver.h			\
			
 
				 	include/starpu_stdlib.h			\
			
 
				 	include/starpu_thread.h			\
			
--- a/configure.ac
+++ b/configure.ac
@@ -3,7 +3,7 @@
 
				 # Copyright (C) 2009-2017  Université de Bordeaux
			
 
				 # Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
			
 
				 # Copyright (C) 2011  Télécom-SudParis
			
 
				-# Copyright (C) 2011, 2012, 2014-2016  INRIA
			
 
				+# Copyright (C) 2011, 2012, 2014-2017  INRIA
			
 
				 #
			
 
				 # StarPU is free software; you can redistribute it and/or modify
			
 
				 # it under the terms of the GNU Lesser General Public License as published by
			
@@ -475,30 +475,33 @@ if test x$enable_mpi_check = xno ; then
 
				 fi
			
 
				 
			
 
				 
			
 
				-# Check if mpiexec is available
			
 
				-AC_ARG_WITH(mpiexec, [AS_HELP_STRING([--with-mpiexec[=<path to mpiexec>]],
			
 
				-            [Path of mpiexec])],
			
 
				-    [
			
 
				-        if test x$withval = xyes; then
			
 
				-            AC_MSG_ERROR(--with-mpiexec must be given a pathname)
			
 
				-        else
			
 
				-            mpiexec_path=$withval
			
 
				-        fi
			
 
				-    ],
			
 
				-    [
			
 
				-        # nothing was specified: look in the path
			
 
				-        AC_PATH_PROG(mpiexec_path, mpiexec, [no], [$(dirname $mpicc_path):$PATH])
			
 
				-    ])
			
 
				-
			
 
				-AC_MSG_CHECKING(whether mpiexec is available)
			
 
				-AC_MSG_RESULT($mpiexec_path)
			
 
				-
			
 
				-# We test if MPIEXEC exists
			
 
				-if test ! -x $mpiexec_path; then
			
 
				-    #MPIEXEC does not exists or is not executable
			
 
				-    AC_MSG_RESULT(The mpiexec script is not valid)
			
 
				-        running_mpi_check=no
			
 
				-        mpiexec_path=""
			
 
				+if test x$enable_simgrid = xno ; then
			
 
				+    # Check if mpiexec is available
			
 
				+    AC_ARG_WITH(mpiexec, [AS_HELP_STRING([--with-mpiexec[=<path to mpiexec>]],
			
 
				+                [Path of mpiexec])],
			
 
				+        [
			
 
				+            if test x$withval = xyes; then
			
 
				+                AC_MSG_ERROR(--with-mpiexec must be given a pathname)
			
 
				+            else
			
 
				+                mpiexec_path=$withval
			
 
				+            fi
			
 
				+        ],
			
 
				+        [
			
 
				+            # nothing was specified: look in the path
			
 
				+            AC_PATH_PROG(mpiexec_path, mpiexec, [no], [$(dirname $mpicc_path):$PATH])
			
 
				+        ])
			
 
				+    
			
 
				+    AC_MSG_CHECKING(whether mpiexec is available)
			
 
				+    AC_MSG_RESULT($mpiexec_path)
			
 
				+    
			
 
				+    # We test if MPIEXEC exists
			
 
				+    if test ! -x $mpiexec_path; then
			
 
				+        #MPIEXEC does not exists or is not executable
			
 
				+        AC_MSG_RESULT(The mpiexec script is not valid)
			
 
				+            running_mpi_check=no
			
 
				+            mpiexec_path=""
			
 
				+    fi
			
 
				+    AC_SUBST(MPIEXEC,$mpiexec_path)
			
 
				 fi
			
 
				 
			
 
				 AM_CONDITIONAL(STARPU_MPI_CHECK, test x$running_mpi_check = xyes)
			
@@ -508,7 +511,6 @@ fi
 
				 if test x$use_mpi = xyes ; then
			
 
				     AC_MSG_CHECKING(whether MPI tests should be run)
			
 
				     AC_MSG_RESULT($running_mpi_check)
			
 
				-    AC_SUBST(MPIEXEC,$mpiexec_path)
			
 
				 fi
			
 
				 
			
 
				 #We can only build StarPU MPI Library if User wants it and MPI is available
			
@@ -1144,6 +1146,9 @@ if test x$enable_cuda = xyes; then
 
				 	fi
			
 
				 
			
 
				 	AC_CHECK_HEADERS([cuda_gl_interop.h])
			
 
				+
			
 
				+	AC_CHECK_LIB([cusparse], [cusparseCreate])
			
 
				+	AC_CHECK_DECLS([cusparseSetStream], [], [], [[#include <cusparse.h>]])
			
 
				 fi
			
 
				 
			
 
				 dnl Hey dude, are you around?
			
@@ -2616,6 +2621,16 @@ if test "x$enable_socl" = "xyes" -a "$have_valid_opencl" = "no" ; then
 
				     AC_MSG_ERROR([SOCL cannot be enabled without OpenCL])
			
 
				 fi
			
 
				 
			
 
				+# MPI Master Slave and SOCL are not compatible
			
 
				+if test "x$use_mpi_master_slave" = "xyes" ; then
			
 
				+   if test "x$enable_socl" = "xyes" ; then
			
 
				+      AC_MSG_ERROR([MPI Master-Slave and SOCL can not be used at the same time !])
			
 
				+   fi
			
 
				+   if test "x$enable_socl" = "xmaybe" ; then
			
 
				+     enable_socl=no 
			
 
				+   fi
			
 
				+fi
			
 
				+
			
 
				 # now we enable SOCL if and only if a proper setup is available
			
 
				 if test "x$enable_socl" = "xyes" -o "x$enable_socl" = "xmaybe" ; then
			
 
				    build_socl=$have_valid_opencl
			
--- a/doc/doxygen/chapters/210_check_list_performance.doxy
+++ b/doc/doxygen/chapters/210_check_list_performance.doxy
@@ -65,9 +65,13 @@ Calling starpu_cublas_init() makes StarPU already do appropriate calls for the
 
				 CUBLAS library. Some libraries like Magma may however change the current stream of CUBLAS v1,
			
 
				 one then has to call <c>cublasSetKernelStream(starpu_cuda_get_local_stream())</c> at
			
 
				 the beginning of the codelet to make sure that CUBLAS is really using the proper
			
 
				-stream. When using CUBLAS v2, starpu_cublas_local_handle() can be called to queue CUBLAS
			
 
				+stream. When using CUBLAS v2, starpu_cublas_get_local_handle() can be called to queue CUBLAS
			
 
				 kernels with the proper configuration.
			
 
				 
			
 
				+Similarly, calling starpu_cusparse_init() makes StarPU create CUSPARSE handles
			
 
				+on each CUDA device, starpu_cusparse_get_local_handle() can then be used to
			
 
				+queue CUSPARSE kernels with the proper configuration.
			
 
				+
			
 
				 If the kernel can be made to only use this local stream or other self-allocated
			
 
				 streams, i.e. the whole kernel submission can be made asynchronous, then
			
 
				 one should enable asynchronous execution of the kernel.  That means setting
			
@@ -78,6 +82,21 @@ able to submit and complete data transfers while kernels are executing, instead
 
				 kernel submission. The kernel just has to make sure that StarPU can use the
			
 
				 local stream to synchronize with the kernel startup and completion.
			
 
				 
			
 
				+If the kernel uses its own non-default stream, one can synchronize that stream
			
 
				+with the StarPU-provided stream this way:
			
 
				+
			
 
				+\code{.c}
			
 
				+cudaEvent_t event;
			
 
				+call_kernel_with_its_own_stream()
			
 
				+cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
			
 
				+cudaEventRecord(event, get_kernel_stream());
			
 
				+cudaStreamWaitEvent(starpu_cuda_get_local_stream(), event, 0);
			
 
				+cudaEventDestroy(event);
			
 
				+\endcode
			
 
				+
			
 
				+That code makes the StarPU-provided stream wait for a new event, which will be
			
 
				+triggered by the completion of the kernel.
			
 
				+
			
 
				 Using the flag ::STARPU_CUDA_ASYNC also permits to enable concurrent kernel
			
 
				 execution, on cards which support it (Kepler and later, notably). This is
			
 
				 enabled by setting the environment variable \ref STARPU_NWORKER_PER_CUDA to the
			
--- a/doc/doxygen/chapters/350_modularized_scheduler.doxy
+++ b/doc/doxygen/chapters/350_modularized_scheduler.doxy
@@ -229,7 +229,7 @@ static void initialize_eager_prefetching_center_policy(unsigned sched_ctx_id)
 
				     /* Each Worker Component has a Flow-control Fifo Component as
			
 
				      * father */
			
 
				     struct starpu_sched_component * worker_component =
			
 
				-	  starpu_sched_component_worker_get(i);
			
 
				+	  starpu_sched_component_worker_new(i);
			
 
				     struct starpu_sched_component * fifo_component =
			
 
				 	  starpu_sched_component_fifo_create(&fifo_data);
			
 
				     fifo_component->add_child
			
--- a/doc/doxygen/chapters/501_environment_variables.doxy
+++ b/doc/doxygen/chapters/501_environment_variables.doxy
@@ -831,7 +831,7 @@ that have a limited amount of memory.
 
				 \anchor STARPU_LIMIT_CPU_MEM
			
 
				 \addindex __env__STARPU_LIMIT_CPU_MEM
			
 
				 This variable specifies the maximum number of megabytes that should be
			
 
				-available to the application on each CPU device. Setting it enables allocation
			
 
				+available to the application in the main CPU memory. Setting it enables allocation
			
 
				 cache in main memory
			
 
				 </dd>
			
 
				 
			
--- a/doc/doxygen/chapters/api/codelet_and_tasks.doxy
+++ b/doc/doxygen/chapters/api/codelet_and_tasks.doxy
@@ -56,33 +56,33 @@ essentially used for synchronization tasks.
 
				 
			
 
				 \def STARPU_CPU
			
 
				 \ingroup API_Codelet_And_Tasks
			
 
				-This macro is used when setting the field starpu_codelet::where
			
 
				-to specify the codelet may be executed on a CPU processing unit.
			
 
				+This macro is used when setting the field starpu_codelet::where (or starpu_task::where)
			
 
				+to specify the codelet (or the task) may be executed on a CPU processing unit.
			
 
				 
			
 
				 \def STARPU_CUDA
			
 
				 \ingroup API_Codelet_And_Tasks
			
 
				-This macro is used when setting the field starpu_codelet::where
			
 
				-to specify the codelet may be executed on a CUDA processing unit.
			
 
				+This macro is used when setting the field starpu_codelet::where (or starpu_task::where)
			
 
				+to specify the codelet (or the task) may be executed on a CUDA processing unit.
			
 
				 
			
 
				 \def STARPU_OPENCL
			
 
				 \ingroup API_Codelet_And_Tasks
			
 
				-This macro is used when setting the field starpu_codelet::where to
			
 
				-specify the codelet may be executed on a OpenCL processing unit.
			
 
				+This macro is used when setting the field starpu_codelet::where (or starpu_task::where) to
			
 
				+specify the codelet (or the task) may be executed on a OpenCL processing unit.
			
 
				 
			
 
				 \def STARPU_MIC
			
 
				 \ingroup API_Codelet_And_Tasks
			
 
				-This macro is used when setting the field starpu_codelet::where to
			
 
				-specify the codelet may be executed on a MIC processing unit.
			
 
				+This macro is used when setting the field starpu_codelet::where (or starpu_task::where) to
			
 
				+specify the codelet (or the task) may be executed on a MIC processing unit.
			
 
				 
			
 
				 \def STARPU_MPI_MS
			
 
				 \ingroup API_Codelet_And_Tasks
			
 
				-This macro is used when setting the field starpu_codelet::where to
			
 
				-specify the codelet may be executed on a MPI Slave processing unit.
			
 
				+This macro is used when setting the field starpu_codelet::where (or starpu_task::where) to
			
 
				+specify the codelet (or the task) may be executed on a MPI Slave processing unit.
			
 
				 
			
 
				 \def STARPU_SCC
			
 
				 \ingroup API_Codelet_And_Tasks
			
 
				-This macro is used when setting the field starpu_codelet::where to
			
 
				-specify the codelet may be executed on an SCC processing unit.
			
 
				+This macro is used when setting the field starpu_codelet::where (or starpu_task::where) to
			
 
				+specify the codelet (or the task) may be executed on an SCC processing unit.
			
 
				 
			
 
				 \def STARPU_MAIN_RAM
			
 
				 \ingroup API_Codelet_And_Tasks
			
@@ -443,6 +443,10 @@ the configuration of a task allocated with starpu_task_create().
 
				     the task. The access modes are now defined in the field
			
 
				     starpu_codelet::modes.
			
 
				 
			
 
				+\var uint32_t starpu_task::where
			
 
				+    When set, specifies where the task is allowed to be executed.
			
 
				+    When unset, it takes the value of starpu_codelet::where.
			
 
				+
			
 
				 \var int starpu_task::nbuffers
			
 
				     Specifies the number of buffers. This is only used when
			
 
				     starpu_codelet::nbuffers is \ref STARPU_VARIABLE_NBUFFERS.
			
--- a/doc/doxygen/chapters/api/cuda_extensions.doxy
+++ b/doc/doxygen/chapters/api/cuda_extensions.doxy
@@ -95,4 +95,21 @@ Report a cublas error.
 
				 Calls starpu_cublas_report_error(), passing the current
			
 
				 function, file and line position.
			
 
				 
			
 
				+\fn void starpu_cusparse_init(void)
			
 
				+\ingroup API_CUDA_Extensions
			
 
				+Calling starpu_cusparse_init() will initialize CUSPARSE on every CUDA device
			
 
				+controlled by StarPU. This call blocks until CUSPARSE has been properly
			
 
				+initialized on every device.
			
 
				+
			
 
				+\fn cusparseHandle_t starpu_cusparse_get_local_handle(void)
			
 
				+\ingroup API_CUDA_Extensions
			
 
				+This function returns the CUSPARSE handle to be used to queue CUSPARSE
			
 
				+kernels. It is properly initialized and configured for multistream by
			
 
				+starpu_cusparse_init().
			
 
				+
			
 
				+\fn void starpu_cusparse_shutdown(void)
			
 
				+\ingroup API_CUDA_Extensions
			
 
				+This function synchronously deinitializes the CUSPARSE library on
			
 
				+every CUDA device.
			
 
				+
			
 
				 */
			
--- a/doc/doxygen/chapters/api/scheduling_contexts.doxy
+++ b/doc/doxygen/chapters/api/scheduling_contexts.doxy
@@ -132,13 +132,13 @@ Create a context indicating an approximate interval of resources
 
				 Execute the callback whenever the last task of the context finished executing, it is called with the parameters \p sched_ctx and any other parameter needed
			
 
				 by the application (packed in \p args)
			
 
				 
			
 
				-\fn void starpu_sched_ctx_add_workers(int *workerids_ctx, int nworkers_ctx, unsigned sched_ctx_id)
			
 
				+\fn void starpu_sched_ctx_add_workers(int *workerids_ctx, unsigned nworkers_ctx, unsigned sched_ctx_id)
			
 
				 \ingroup API_Scheduling_Contexts
			
 
				 Add dynamically the workers in \p workerids_ctx to the
			
 
				 context \p sched_ctx_id. The last argument cannot be greater than
			
 
				 \ref STARPU_NMAX_SCHED_CTXS.
			
 
				 
			
 
				-\fn void starpu_sched_ctx_remove_workers(int *workerids_ctx, int nworkers_ctx, unsigned sched_ctx_id)
			
 
				+\fn void starpu_sched_ctx_remove_workers(int *workerids_ctx, unsigned nworkers_ctx, unsigned sched_ctx_id)
			
 
				 \ingroup API_Scheduling_Contexts
			
 
				 Remove the workers in \p workerids_ctx from the context
			
 
				 \p sched_ctx_id. The last argument cannot be greater than
			
--- a/doc/doxygen/chapters/api/scheduling_policy.doxy
+++ b/doc/doxygen/chapters/api/scheduling_policy.doxy
@@ -124,12 +124,6 @@ this should be called by push functions to wake the potential workers that are
 
				 supposed to pick up the tasks which just have been pushed, otherwise they may
			
 
				 remain sleeping.
			
 
				 
			
 
				-\fn void starpu_worker_get_job_id(struct starpu_task *task)
			
 
				-\ingroup API_Scheduling_Policy
			
 
				-Return the job id of the given task, i.e. a number that uniquely identifies this
			
 
				-task for the local MPI node, and can be found in the various offline execution
			
 
				-traces reports.
			
 
				-
			
 
				 \fn int starpu_sched_set_min_priority(int min_prio)
			
 
				 \ingroup API_Scheduling_Policy
			
 
				 TODO: check if this is correct
			
--- a/doc/doxygen/chapters/api/task_lists.doxy
+++ b/doc/doxygen/chapters/api/task_lists.doxy
@@ -28,15 +28,15 @@ Push \p task at the front of \p list
 
				 \ingroup API_Task_Lists
			
 
				 Push \p task at the back of \p list
			
 
				 
			
 
				-\fn struct starpu_task *starpu_task_list_front(struct starpu_task_list *list)
			
 
				+\fn struct starpu_task *starpu_task_list_front(const struct starpu_task_list *list)
			
 
				 \ingroup API_Task_Lists
			
 
				 Get the front of \p list (without removing it)
			
 
				 
			
 
				-\fn struct starpu_task *starpu_task_list_back(struct starpu_task_list *list)
			
 
				+\fn struct starpu_task *starpu_task_list_back(const struct starpu_task_list *list)
			
 
				 \ingroup API_Task_Lists
			
 
				 Get the back of \p list (without removing it)
			
 
				 
			
 
				-\fn int starpu_task_list_empty(struct starpu_task_list *list)
			
 
				+\fn int starpu_task_list_empty(const struct starpu_task_list *list)
			
 
				 \ingroup API_Task_Lists
			
 
				 Test if \p list is empty
			
 
				 
			
@@ -52,19 +52,19 @@ Remove the element at the front of \p list
 
				 \ingroup API_Task_Lists
			
 
				 Remove the element at the back of \p list
			
 
				 
			
 
				-\fn struct starpu_task *starpu_task_list_begin(struct starpu_task_list *list)
			
 
				+\fn struct starpu_task *starpu_task_list_begin(const struct starpu_task_list *list)
			
 
				 \ingroup API_Task_Lists
			
 
				 Get the first task of \p list.
			
 
				 
			
 
				-\fn struct starpu_task *starpu_task_list_end(struct starpu_task_list *list)
			
 
				+\fn struct starpu_task *starpu_task_list_end(const struct starpu_task_list *list)
			
 
				 \ingroup API_Task_Lists
			
 
				 Get the end of \p list.
			
 
				 
			
 
				-\fn struct starpu_task *starpu_task_list_next(struct starpu_task *task)
			
 
				+\fn struct starpu_task *starpu_task_list_next(const struct starpu_task *task)
			
 
				 \ingroup API_Task_Lists
			
 
				 Get the next task of \p list. This is not erase-safe.
			
 
				 
			
 
				-\fn int starpu_task_list_ismember(struct starpu_task_list *list, struct starpu_task *look)
			
 
				+\fn int starpu_task_list_ismember(const struct starpu_task_list *list, const struct starpu_task *look)
			
 
				 \ingroup API_Task_Lists
			
 
				 Test whether the given task \p look is contained in the \p list.
			
 
				 
			
--- a/doc/doxygen/chapters/api/workers.doxy
+++ b/doc/doxygen/chapters/api/workers.doxy
@@ -264,4 +264,70 @@ whose StarPU identifier is \p node.
 
				 \ingroup API_Workers_Properties
			
 
				 Return worker \p type as a string.
			
 
				 
			
 
				+\fn int starpu_worker_sched_op_pending(void)
			
 
				+\ingroup API_Workers_Properties
			
 
				+Return \c !0 if current worker has a scheduling operation in progress,
			
 
				+and \c 0 otherwise.
			
 
				+
			
 
				+\fn void starpu_worker_relax_on(void)
			
 
				+\ingroup API_Workers_Properties
			
 
				+Allow other threads and workers to temporarily observe the current
			
 
				+worker state, even though it is performing a scheduling operation.
			
 
				+Must be called by a worker before performing a potentially blocking
			
 
				+call such as acquiring a mutex other than its own sched_mutex. This
			
 
				+function increases \c state_relax_refcnt from the current worker. No
			
 
				+more than <c>UINT_MAX-1</c> nested relax_on calls should performed on
			
 
				+the same worker. This function is automatically called by \ref
			
 
				+starpu_worker_lock to relax the caller worker state while attempting
			
 
				+to lock the targer worker.
			
 
				+
			
 
				+\fn void starpu_worker_relax_off(void)
			
 
				+\ingroup API_Workers_Properties
			
 
				+Must be called after a potentially blocking call is complete, to
			
 
				+restore the relax state in place before the corresponding relax_on.
			
 
				+Decreases \c state_relax_refcnt. Calls to \ref starpu_worker_relax_on
			
 
				+and \c starpu_worker_relax_off must be well parenthesized. This
			
 
				+function is automatically called by \ref starpu_worker_unlock after the 
			
 
				+target worker has been unlocked.
			
 
				+
			
 
				+\fn int starpu_worker_get_relax_state(void)
			
 
				+\ingroup API_Workers_Properties
			
 
				+Returns \c !0 if the current worker \c state_relax_refcnt!=0 and \c 0
			
 
				+otherwise.
			
 
				+
			
 
				+\fn void starpu_worker_lock(int workerid)
			
 
				+\ingroup API_Workers_Properties
			
 
				+Acquire the sched mutex of \p workerid. If the caller is a worker,
			
 
				+distinct from \p workerid, the caller worker automatically enter relax
			
 
				+state while acquiring the target worker lock.
			
 
				+
			
 
				+\fn int starpu_worker_trylock(int workerid)
			
 
				+\ingroup API_Workers_Properties
			
 
				+Attempt to acquire the sched mutex of \p workerid. Returns \c 0 if
			
 
				+successful, \c !0 if \p workerid sched mutex is held or the
			
 
				+corresponding worker is not in relaxed stated.
			
 
				+If the caller is a worker, distinct from \p workerid, the caller
			
 
				+worker automatically enter relax state if successfully acquiring the target
			
 
				+worker lock.
			
 
				+
			
 
				+\fn void starpu_worker_unlock(int workerid)
			
 
				+\ingroup API_Workers_Properties
			
 
				+Release the previously acquired sched mutex of \p workerid. Restore
			
 
				+the relaxed state of the caller worker if needed.
			
 
				+
			
 
				+\fn void starpu_worker_lock_self(void)
			
 
				+\ingroup API_Workers_Properties
			
 
				+Acquire the current worker sched mutex.
			
 
				+
			
 
				+\fn void starpu_worker_unlock_self(void)
			
 
				+\ingroup API_Workers_Properties
			
 
				+Release the current worker sched mutex.
			
 
				+
			
 
				+\fn int starpu_wake_worker_relax(int workerid)
			
 
				+\ingroup API_Workers_Properties
			
 
				+Wake up \p workerid while temporarily entering the current worker relaxed state
			
 
				+if needed during the waiting process. Returns 1 if \p workerid has been woken
			
 
				+up or its state_keep_awake flag has been set to 1, and 0 otherwise (if \p
			
 
				+workerid was not in the STATE_SLEEPING or in the STATE_SCHEDULING).
			
 
				+
			
 
				 */
			
--- a/examples/heat/heat.sh
+++ b/examples/heat/heat.sh
@@ -3,6 +3,7 @@
 
				 # StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				 #
			
 
				 # Copyright (C) 2017  Université de Bordeaux
			
 
				+# Copyright (C) 2017  Inria
			
 
				 #
			
 
				 # StarPU is free software; you can redistribute it and/or modify
			
 
				 # it under the terms of the GNU Lesser General Public License as published by
			
--- a/examples/interface/complex_codelet.h
+++ b/examples/interface/complex_codelet.h
@@ -22,6 +22,21 @@
 
				 
			
 
				 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
			
 
				 
			
 
				+/* Dumb performance model for simgrid */
			
 
				+static double complex_cost_function(struct starpu_task *task, unsigned nimpl)
			
 
				+{
			
 
				+	(void) task;
			
 
				+	(void) nimpl;
			
 
				+	return 0.000001;
			
 
				+}
			
 
				+
			
 
				+static struct starpu_perfmodel complex_model =
			
 
				+{
			
 
				+	.type = STARPU_COMMON,
			
 
				+	.cost_function = complex_cost_function,
			
 
				+	.symbol = "complex"
			
 
				+};
			
 
				+
			
 
				 void compare_complex_codelet(void *descr[], void *_args)
			
 
				 {
			
 
				 	int nx1 = STARPU_COMPLEX_GET_NX(descr[0]);
			
@@ -57,7 +72,8 @@ struct starpu_codelet cl_compare =
 
				 	/* .cpu_funcs_name = {"compare_complex_codelet"}, */
			
 
				 	.nbuffers = 2,
			
 
				 	.modes = {STARPU_R, STARPU_R},
			
 
				-	.name = "cl_compare"
			
 
				+	.name = "cl_compare",
			
 
				+	.model = &complex_model
			
 
				 };
			
 
				 
			
 
				 void display_complex_codelet(void *descr[], void *_args)
			
@@ -83,7 +99,8 @@ struct starpu_codelet cl_display =
 
				 	.cpu_funcs_name = {"display_complex_codelet"},
			
 
				 	.nbuffers = 1,
			
 
				 	.modes = {STARPU_R},
			
 
				-	.name = "cl_display"
			
 
				+	.name = "cl_display",
			
 
				+	.model = &complex_model
			
 
				 };
			
 
				 
			
 
				 #endif /* __COMPLEX_CODELET_H */
			
--- a/examples/mandelbrot/mandelbrot.c
+++ b/examples/mandelbrot/mandelbrot.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2010, 2011, 2014-2015, 2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
			
 
				+ * Copyright (C) 2017  Inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -497,7 +498,9 @@ int main(int argc, char **argv)
 
				 	conf.ncuda = 0;
			
 
				 
			
 
				 	if (use_spmd_p)
			
 
				+	{
			
 
				 		conf.sched_policy_name = "peager";
			
 
				+	}
			
 
				 
			
 
				 	ret = starpu_init(&conf);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
--- a/examples/sched_ctx/dummy_sched_with_ctx.c
+++ b/examples/sched_ctx/dummy_sched_with_ctx.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2016  Université de Bordeaux
			
 
				+ * Copyright (C) 2010-2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010-2013, 2016  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -118,8 +118,14 @@ static struct starpu_task *pop_task_dummy(unsigned sched_ctx_id)
 
				 	 * the calling worker. So we just take the head of the list and give it
			
 
				 	 * to the worker. */
			
 
				 	struct dummy_sched_data *data = (struct dummy_sched_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
			
 
				+#ifdef STARPU_NON_BLOCKING_DRIVERS
			
 
				+	if (starpu_task_list_empty(&data->sched_list))
			
 
				+		return NULL;
			
 
				+#endif
			
 
				 	STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
			
 
				-	struct starpu_task *task = starpu_task_list_pop_back(&data->sched_list);
			
 
				+	struct starpu_task *task = NULL;
			
 
				+	if (!starpu_task_list_empty(&data->sched_list))
			
 
				+		task = starpu_task_list_pop_back(&data->sched_list);
			
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
			
 
				 	return task;
			
 
				 }
			
--- a/examples/sched_ctx/nested_sched_ctxs.c
+++ b/examples/sched_ctx/nested_sched_ctxs.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2010-2015  Université de Bordeaux
			
 
				  * Copyright (C) 2010-2014, 2016, 2017  CNRS
			
 
				+ * Copyright (C) 2017  Inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -53,7 +54,10 @@ static void sched_ctx_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
 
				 	unsigned sched_ctx = (uintptr_t)arg;
			
 
				 	int t = parallel_code(sched_ctx);
			
 
				 	if (sched_ctx > 0 && sched_ctx < 3)
			
 
				-		tasks_executed[sched_ctx-1] += t;
			
 
				+	{
			
 
				+		STARPU_ATOMIC_ADD(&tasks_executed[sched_ctx-1], t);
			
 
				+	}
			
 
				+
			
 
				 	//printf("w %d executed %d it \n", w, n);
			
 
				 }
			
 
				 
			
--- a/examples/sched_ctx/parallel_code.c
+++ b/examples/sched_ctx/parallel_code.c
@@ -16,6 +16,7 @@
 
				  */
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#ifdef STARPU_USE_CPU
			
 
				 #include <omp.h>
			
 
				 
			
 
				 #ifdef STARPU_QUICK_CHECK
			
@@ -71,16 +72,10 @@ int main(int argc, char **argv)
 
				 	int nprocs1;
			
 
				 	int *procs1;
			
 
				 
			
 
				-#ifdef STARPU_USE_CPU
			
 
				 	unsigned ncpus =  starpu_cpu_worker_get_count();
			
 
				 	procs1 = (int*)malloc(ncpus*sizeof(int));
			
 
				 	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, ncpus);
			
 
				 	nprocs1 = ncpus;
			
 
				-#else
			
 
				-	nprocs1 = 1;
			
 
				-	procs1 = (int*)malloc(nprocs1*sizeof(int));
			
 
				-	procs1[0] = 0;
			
 
				-#endif
			
 
				 
			
 
				 	unsigned sched_ctx1 = starpu_sched_ctx_create(procs1, nprocs1, "ctx1", STARPU_SCHED_CTX_POLICY_NAME, "dmda", 0);
			
 
				 
			
@@ -100,3 +95,10 @@ int main(int argc, char **argv)
 
				 
			
 
				 	return 0;
			
 
				 }
			
 
				+#else /* STARPU_USE_CPU */
			
 
				+int main(int argc, char **argv)
			
 
				+{
			
 
				+	/* starpu_sched_ctx_exec_parallel_code() requires a CPU worker has parallel region master */
			
 
				+	return 77; /* STARPU_TEST_SKIPPED */
			
 
				+}
			
 
				+#endif /* STARPU_USE_CPU */
			
--- a/examples/sched_ctx/parallel_tasks_reuse_handle.c
+++ b/examples/sched_ctx/parallel_tasks_reuse_handle.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.

			
 
				  *

			
 
				- * Copyright (C) 2015 INRIA

			
 
				+ * Copyright (C) 2015, 2017 INRIA

			
 
				  * Copyright (C) 2015, 2016 CNRS

			
 
				  *

			
 
				  * StarPU is free software; you can redistribute it and/or modify

			
@@ -17,6 +17,7 @@
 
				 

			
 
				 #include <starpu.h>

			
 
				 #include <omp.h>

			
 
				+#include <pthread.h>

			
 
				 

			
 
				 #ifdef STARPU_QUICK_CHECK

			
 
				 #define NTASKS 64

			
@@ -28,6 +29,8 @@
 
				 #define LOOPS  10

			
 
				 #endif

			
 
				 

			
 
				+#define N_NESTED_CTXS 2

			
 
				+

			
 
				 struct context

			
 
				 {

			
 
				 	int ncpus;

			
@@ -38,6 +41,7 @@ struct context
 
				 /* Helper for the task that will initiate everything */

			
 
				 void parallel_task_prologue_init_once_and_for_all(void * sched_ctx_)

			
 
				 {

			
 
				+	fprintf(stderr, "%p: %s -->\n", (void*)pthread_self(), __func__);

			
 
				 	int sched_ctx = *(int *)sched_ctx_;

			
 
				 	int *cpuids = NULL;

			
 
				 	int ncpuids = 0;

			
@@ -50,6 +54,7 @@ void parallel_task_prologue_init_once_and_for_all(void * sched_ctx_)
 
				 

			
 
				 	omp_set_num_threads(ncpuids);

			
 
				 	free(cpuids);

			
 
				+	fprintf(stderr, "%p: %s <--\n", (void*)pthread_self(), __func__);

			
 
				 	return;

			
 
				 }

			
 
				 

			
@@ -101,25 +106,24 @@ void parallel_task_init()
 
				 						  0);

			
 
				 

			
 
				 	/* Initialize nested contexts */

			
 
				-	/* WARNING : the number of contexts must be a divisor of the number of available cpus*/

			
 
				-

			
 
				-	contexts = malloc(sizeof(struct context)*2);

			
 
				-	int cpus_per_context = main_context.ncpus/2;

			
 
				+	contexts = malloc(sizeof(struct context)*N_NESTED_CTXS);

			
 
				+	int cpus_per_context = main_context.ncpus/N_NESTED_CTXS;

			
 
				 	int i;

			
 
				-	for(i = 0; i < 2; i++)

			
 
				+	for(i = 0; i < N_NESTED_CTXS; i++)

			
 
				 	{

			
 
				-		fprintf(stderr, "ncpus %d for context %d \n",cpus_per_context, i);

			
 
				 		contexts[i].ncpus = cpus_per_context;

			
 
				+		if (i == N_NESTED_CTXS-1)

			
 
				+			contexts[i].ncpus += main_context.ncpus%N_NESTED_CTXS;

			
 
				 		contexts[i].cpus = main_context.cpus+i*cpus_per_context;

			
 
				 	}

			
 
				 

			
 
				-	for(i = 0; i < 2; i++)

			
 
				+	for(i = 0; i < N_NESTED_CTXS; i++)

			
 
				 		contexts[i].id = starpu_sched_ctx_create(contexts[i].cpus,

			
 
				 							 contexts[i].ncpus,"nested_ctx",

			
 
				 							 STARPU_SCHED_CTX_NESTED,main_context.id,

			
 
				 							 0);

			
 
				 

			
 
				-	for (i = 0; i < 2; i++)

			
 
				+	for (i = 0; i < N_NESTED_CTXS; i++)

			
 
				 	{

			
 
				 		parallel_task_init_one_context(&contexts[i].id);

			
 
				 	}

			
@@ -131,7 +135,7 @@ void parallel_task_init()
 
				 void parallel_task_deinit()

			
 
				 {

			
 
				 	int i;

			
 
				-	for (i=0; i<2;i++)

			
 
				+	for (i=0; i<N_NESTED_CTXS;i++)

			
 
				 		starpu_sched_ctx_delete(contexts[i].id);

			
 
				 	free(contexts);

			
 
				 	free(main_context.cpus);

			
@@ -174,7 +178,7 @@ int main(int argc, char **argv)
 
				 		return 77;

			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

			
 
				 

			
 
				-	if (starpu_cpu_worker_get_count() < 2)

			
 
				+	if (starpu_cpu_worker_get_count() < N_NESTED_CTXS)

			
 
				 	{

			
 
				 		starpu_shutdown();

			
 
				 		return 77;

			
--- a/examples/sched_ctx/sched_ctx.c
+++ b/examples/sched_ctx/sched_ctx.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2010-2014  Université de Bordeaux
			
 
				  * Copyright (C) 2010-2014, 2016  CNRS
			
 
				+ * Copyright (C) 2017  Inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -25,17 +26,25 @@
 
				 #endif
			
 
				 
			
 
				 int tasks_executed = 0;
			
 
				-starpu_pthread_mutex_t mut;
			
 
				+int ctx1_tasks_executed = 0;
			
 
				+int ctx2_tasks_executed = 0;
			
 
				 
			
 
				 static void sched_ctx_cpu_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				-	STARPU_PTHREAD_MUTEX_LOCK(&mut);
			
 
				-	tasks_executed++;
			
 
				-	STARPU_PTHREAD_MUTEX_UNLOCK(&mut);
			
 
				+	(void)STARPU_ATOMIC_ADD(&tasks_executed,1);
			
 
				+	(void)STARPU_ATOMIC_ADD(&ctx1_tasks_executed,1);
			
 
				 }
			
 
				 
			
 
				-static void sched_ctx_cuda_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg STARPU_ATTRIBUTE_UNUSED)
			
 
				+static void sched_ctx2_cpu_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				+	(void)STARPU_ATOMIC_ADD(&tasks_executed,1);
			
 
				+	(void)STARPU_ATOMIC_ADD(&ctx2_tasks_executed,1);
			
 
				+}
			
 
				+
			
 
				+static void sched_ctx2_cuda_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg STARPU_ATTRIBUTE_UNUSED)
			
 
				+{
			
 
				+	(void)STARPU_ATOMIC_ADD(&tasks_executed,1);
			
 
				+	(void)STARPU_ATOMIC_ADD(&ctx2_tasks_executed,1);
			
 
				 }
			
 
				 
			
 
				 static struct starpu_codelet sched_ctx_codelet1 =
			
@@ -48,8 +57,8 @@ static struct starpu_codelet sched_ctx_codelet1 =
 
				 
			
 
				 static struct starpu_codelet sched_ctx_codelet2 =
			
 
				 {
			
 
				-	.cpu_funcs = {sched_ctx_cpu_func},
			
 
				-	.cuda_funcs = {sched_ctx_cuda_func},
			
 
				+	.cpu_funcs = {sched_ctx2_cpu_func},
			
 
				+	.cuda_funcs = {sched_ctx2_cuda_func},
			
 
				 	.model = NULL,
			
 
				 	.nbuffers = 0,
			
 
				 	.name = "sched_ctx"
			
@@ -71,8 +80,6 @@ int main(int argc, char **argv)
 
				 		return 77;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				-	STARPU_PTHREAD_MUTEX_INIT(&mut, NULL);
			
 
				-
			
 
				 #ifdef STARPU_USE_CPU
			
 
				 	nprocs1 = starpu_cpu_worker_get_count();
			
 
				 	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, procs1, nprocs1);
			
@@ -155,7 +162,9 @@ int main(int argc, char **argv)
 
				 	starpu_sched_ctx_add_workers(procs1, nprocs1, sched_ctx2);
			
 
				 	starpu_sched_ctx_delete(sched_ctx1);
			
 
				 	starpu_sched_ctx_delete(sched_ctx2);
			
 
				-	printf("tasks executed %d out of %d\n", tasks_executed, ntasks/2);
			
 
				+	printf("tasks executed %d out of %d\n", tasks_executed, ntasks+1);
			
 
				+	printf("tasks executed on ctx1: %d\n", ctx1_tasks_executed);
			
 
				+	printf("tasks executed on ctx2: %d\n", ctx2_tasks_executed);
			
 
				 
			
 
				 enodev:
			
 
				 	starpu_shutdown();
			
--- a/examples/scheduler/dummy_sched.c
+++ b/examples/scheduler/dummy_sched.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2016  Université de Bordeaux
			
 
				+ * Copyright (C) 2010-2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010-2013, 2016  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -122,7 +122,9 @@ static struct starpu_task *pop_task_dummy(unsigned sched_ctx_id)
 
				 		return NULL;
			
 
				 #endif
			
 
				 	STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
			
 
				-	struct starpu_task *task = starpu_task_list_pop_back(&data->sched_list);
			
 
				+	struct starpu_task *task = NULL;
			
 
				+	if (!starpu_task_list_empty(&data->sched_list))
			
 
				+		task = starpu_task_list_pop_back(&data->sched_list);
			
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK(&data->policy_mutex);
			
 
				 	return task;
			
 
				 }
			
--- a/include/fstarpu_mod.f90
+++ b/include/fstarpu_mod.f90
@@ -1,6 +1,6 @@
 
				 ! StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				 !
			
 
				-! Copyright (C) 2016  Inria
			
 
				+! Copyright (C) 2016-2017  Inria
			
 
				 !
			
 
				 ! StarPU is free software; you can redistribute it and/or modify
			
 
				 ! it under the terms of the GNU Lesser General Public License as published by
			
@@ -44,6 +44,7 @@ module fstarpu_mod
 
				         type(c_ptr), bind(C) :: FSTARPU_EXECUTE_ON_DATA
			
 
				         type(c_ptr), bind(C) :: FSTARPU_EXECUTE_ON_WORKER
			
 
				         type(c_ptr), bind(C) :: FSTARPU_WORKER_ORDER
			
 
				+        type(c_ptr), bind(C) :: FSTARPU_EXECUTE_WHERE
			
 
				         type(c_ptr), bind(C) :: FSTARPU_HYPERVISOR_TAG
			
 
				         type(c_ptr), bind(C) :: FSTARPU_POSSIBLY_PARALLEL
			
 
				         type(c_ptr), bind(C) :: FSTARPU_FLOPS
			
@@ -2022,26 +2023,6 @@ module fstarpu_mod
 
				                         integer(c_int), value, intent(in) :: sched_ctx_id
			
 
				                 end subroutine fstarpu_sched_ctx_list_task_counters_reset_all
			
 
				 
			
 
				-                ! void starpu_sched_ctx_set_priority(int *workers, int nworkers, unsigned sched_ctx_id, unsigned priority);
			
 
				-                subroutine fstarpu_sched_ctx_set_priority (workers, nworkers,  sched_ctx_id, priority) &
			
 
				-                                bind(c,name="starpu_sched_ctx_set_priority")
			
 
				-                        use iso_c_binding, only: c_int
			
 
				-                        integer(c_int), intent(in) :: workers(*)
			
 
				-                        integer(c_int), value, intent(in) :: nworkers
			
 
				-                        integer(c_int), value, intent(in) :: sched_ctx_id
			
 
				-                        integer(c_int), value, intent(in) :: priority
			
 
				-                end subroutine fstarpu_sched_ctx_set_priority
			
 
				-
			
 
				-                ! void starpu_sched_ctx_set_priority_on_level(int* workers_to_add, unsigned nworkers_to_add, unsigned sched_ctx, unsigned priority);
			
 
				-                subroutine fstarpu_sched_ctx_set_priority_on_level ( workers_to_add, nworkers_to_add, sched_ctx, priority) &
			
 
				-                                bind(c,name="starpu_sched_ctx_set_priority_on_level")
			
 
				-                        use iso_c_binding, only: c_int
			
 
				-                        integer(c_int), intent(in) :: workers_to_add(*)
			
 
				-                        integer(c_int), value, intent(in) :: nworkers_to_add
			
 
				-                        integer(c_int), value, intent(in) :: sched_ctx
			
 
				-                        integer(c_int), value, intent(in) :: priority
			
 
				-                end subroutine fstarpu_sched_ctx_set_priority_on_level
			
 
				-
			
 
				                 ! unsigned starpu_sched_ctx_get_priority(int worker, unsigned sched_ctx_id);
			
 
				                 function fstarpu_sched_ctx_get_priority (worker, sched_ctx_id) &
			
 
				                                 bind(c,name="starpu_sched_ctx_get_priority")
			
@@ -2280,6 +2261,7 @@ module fstarpu_mod
 
				                         FSTARPU_EXECUTE_ON_DATA = fstarpu_get_constant(C_CHAR_"FSTARPU_EXECUTE_ON_DATA"//C_NULL_CHAR)
			
 
				                         FSTARPU_EXECUTE_ON_WORKER       = fstarpu_get_constant(C_CHAR_"FSTARPU_EXECUTE_ON_WORKER"//C_NULL_CHAR)
			
 
				                         FSTARPU_WORKER_ORDER    = fstarpu_get_constant(C_CHAR_"FSTARPU_WORKER_ORDER"//C_NULL_CHAR)
			
 
				+                        FSTARPU_EXECUTE_WHERE       = fstarpu_get_constant(C_CHAR_"FSTARPU_EXECUTE_WHERE"//C_NULL_CHAR)
			
 
				                         FSTARPU_HYPERVISOR_TAG  = fstarpu_get_constant(C_CHAR_"FSTARPU_HYPERVISOR_TAG"//C_NULL_CHAR)
			
 
				                         FSTARPU_POSSIBLY_PARALLEL       = fstarpu_get_constant(C_CHAR_"FSTARPU_POSSIBLY_PARALLEL"//C_NULL_CHAR)
			
 
				                         FSTARPU_FLOPS   = fstarpu_get_constant(C_CHAR_"FSTARPU_FLOPS"//C_NULL_CHAR)
			
--- a/include/starpu.h
+++ b/include/starpu.h
@@ -62,6 +62,7 @@ typedef UINT_PTR uintptr_t;
 
				 #include <starpu_rand.h>
			
 
				 #include <starpu_cuda.h>
			
 
				 #include <starpu_cublas.h>
			
 
				+#include <starpu_cusparse.h>
			
 
				 #include <starpu_bound.h>
			
 
				 #include <starpu_hash.h>
			
 
				 #include <starpu_profiling.h>
			
--- a/include/starpu_cublas_v2.h
+++ b/include/starpu_cublas_v2.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2012  Université de Bordeaux
			
 
				+ * Copyright (C) 2010-2012, 2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -18,6 +18,8 @@
 
				 #ifndef __STARPU_CUBLAS_V2_H__
			
 
				 #define __STARPU_CUBLAS_V2_H__
			
 
				 
			
 
				+#if defined STARPU_USE_CUDA && !defined STARPU_DONT_INCLUDE_CUDA_HEADERS
			
 
				+
			
 
				 #include <cublas_v2.h>
			
 
				 
			
 
				 #ifdef __cplusplus
			
@@ -31,4 +33,6 @@ cublasHandle_t starpu_cublas_get_local_handle(void);
 
				 }
			
 
				 #endif
			
 
				 
			
 
				+#endif
			
 
				+
			
 
				 #endif /* __STARPU_CUBLAS_V2_H__ */
			
--- a/include/starpu_cusparse.h
+++ b/include/starpu_cusparse.h
@@ -0,0 +1,38 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2010-2012, 2017  Université de Bordeaux
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013  CNRS
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#ifndef __STARPU_CUSPARSE_H__
			
 
				+#define __STARPU_CUSPARSE_H__
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+extern "C"
			
 
				+{
			
 
				+#endif
			
 
				+
			
 
				+void starpu_cusparse_init(void);
			
 
				+void starpu_cusparse_shutdown(void);
			
 
				+
			
 
				+#if defined STARPU_USE_CUDA && !defined STARPU_DONT_INCLUDE_CUDA_HEADERS
			
 
				+#include <cusparse.h>
			
 
				+cusparseHandle_t starpu_cusparse_get_local_handle(void);
			
 
				+#endif
			
 
				+
			
 
				+#ifdef __cplusplus
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+#endif /* __STARPU_CUSPARSE_H__ */
			
--- a/include/starpu_sched_component.h
+++ b/include/starpu_sched_component.h
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2013  Simon Archipoff
			
 
				  * Copyright (C) 2014  CNRS
			
 
				+ * Copyright (C) 2017  Inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -92,6 +93,9 @@ int starpu_sched_tree_push_task(struct starpu_task *task);
 
				 int starpu_sched_component_push_task(struct starpu_sched_component *from, struct starpu_sched_component *to, struct starpu_task *task);
			
 
				 struct starpu_task *starpu_sched_tree_pop_task(unsigned sched_ctx);
			
 
				 struct starpu_task *starpu_sched_component_pull_task(struct starpu_sched_component *from, struct starpu_sched_component *to);
			
 
				+struct starpu_task* starpu_sched_component_pump_downstream(struct starpu_sched_component *component, int* success);
			
 
				+void starpu_sched_component_send_can_push_to_parents(struct starpu_sched_component * component);
			
 
				+
			
 
				 void starpu_sched_tree_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers);
			
 
				 void starpu_sched_tree_remove_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers);
			
 
				 
			
@@ -107,6 +111,7 @@ void starpu_sched_component_prefetch_on_node(struct starpu_sched_component *comp
 
				 void starpu_sched_component_connect(struct starpu_sched_component *parent, struct starpu_sched_component *child);
			
 
				 
			
 
				 struct starpu_sched_component *starpu_sched_component_worker_get(unsigned sched_ctx, int workerid);
			
 
				+struct starpu_sched_component *starpu_sched_component_worker_new(unsigned sched_ctx, int workerid);
			
 
				 int starpu_sched_component_worker_get_workerid(struct starpu_sched_component *worker_component);
			
 
				 int starpu_sched_component_is_worker(struct starpu_sched_component *component);
			
 
				 int starpu_sched_component_is_simple_worker(struct starpu_sched_component *component);
			
@@ -196,6 +201,20 @@ struct starpu_sched_component_specs
 
				 struct starpu_sched_tree *starpu_sched_component_make_scheduler(unsigned sched_ctx_id, struct starpu_sched_component_specs s);
			
 
				 #endif /* STARPU_HAVE_HWLOC */
			
 
				 
			
 
				+#define STARPU_COMPONENT_MUTEX_LOCK(m) \
			
 
				+do \
			
 
				+{ \
			
 
				+	const int _relaxed_state = _starpu_worker_get_relax_state(); \
			
 
				+	if (!_relaxed_state) \
			
 
				+		_starpu_worker_relax_on(); \
			
 
				+	STARPU_PTHREAD_MUTEX_LOCK((m)); \
			
 
				+	if (!_relaxed_state) \
			
 
				+		_starpu_worker_relax_off(); \
			
 
				+} \
			
 
				+while(0)
			
 
				+
			
 
				+#define STARPU_COMPONENT_MUTEX_UNLOCK(m) STARPU_PTHREAD_MUTEX_UNLOCK((m))
			
 
				+
			
 
				 #ifdef __cplusplus
			
 
				 }
			
 
				 #endif
			
--- a/include/starpu_sched_ctx.h
+++ b/include/starpu_sched_ctx.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010 - 2012  INRIA
			
 
				+ * Copyright (C) 2010 - 2012, 2017  INRIA
			
 
				  * Copyright (C) 2016  Uppsala University
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -43,9 +43,9 @@ unsigned starpu_sched_ctx_create_inside_interval(const char *policy_name, const
 
				 
			
 
				 void starpu_sched_ctx_register_close_callback(unsigned sched_ctx_id, void (*close_callback)(unsigned sched_ctx_id, void* args), void *args);
			
 
				 
			
 
				-void starpu_sched_ctx_add_workers(int *workerids_ctx, int nworkers_ctx, unsigned sched_ctx_id);
			
 
				+void starpu_sched_ctx_add_workers(int *workerids_ctx, unsigned nworkers_ctx, unsigned sched_ctx_id);
			
 
				 
			
 
				-void starpu_sched_ctx_remove_workers(int *workerids_ctx, int nworkers_ctx, unsigned sched_ctx_id);
			
 
				+void starpu_sched_ctx_remove_workers(int *workerids_ctx, unsigned nworkers_ctx, unsigned sched_ctx_id);
			
 
				 
			
 
				 void starpu_sched_ctx_display_workers(unsigned sched_ctx_id, FILE *f);
			
 
				 
			
@@ -133,16 +133,14 @@ void starpu_sched_ctx_list_task_counters_decrement(unsigned sched_ctx_id, int wo
 
				 
			
 
				 void starpu_sched_ctx_list_task_counters_reset(unsigned sched_ctx_id, int workerid);
			
 
				 
			
 
				-void starpu_sched_ctx_list_task_counters_increment_all(struct starpu_task *task, unsigned sched_ctx_id);
			
 
				+void starpu_sched_ctx_list_task_counters_increment_all_ctx_locked(struct starpu_task *task, unsigned sched_ctx_id);
			
 
				 
			
 
				-void starpu_sched_ctx_list_task_counters_decrement_all(struct starpu_task *task, unsigned sched_ctx_id);
			
 
				+void starpu_sched_ctx_list_task_counters_decrement_all_ctx_locked(struct starpu_task *task, unsigned sched_ctx_id);
			
 
				 
			
 
				 void starpu_sched_ctx_list_task_counters_reset_all(struct starpu_task *task, unsigned sched_ctx_id);
			
 
				 
			
 
				 void starpu_sched_ctx_set_priority(int *workers, int nworkers, unsigned sched_ctx_id, unsigned priority);
			
 
				 
			
 
				-void starpu_sched_ctx_set_priority_on_level(int* workers_to_add, unsigned nworkers_to_add, unsigned sched_ctx, unsigned priority);
			
 
				-
			
 
				 unsigned starpu_sched_ctx_get_priority(int worker, unsigned sched_ctx_id);
			
 
				 
			
 
				 void starpu_sched_ctx_get_available_cpuids(unsigned sched_ctx_id, int **cpuids, int *ncpuids);
			
@@ -160,9 +158,9 @@ unsigned starpu_sched_ctx_worker_is_master_for_child_ctx(int workerid, unsigned
 
				 /* If not, returns STARPU_NMAX_SCHED_CTXS. */
			
 
				 unsigned starpu_sched_ctx_master_get_context(int masterid);
			
 
				 
			
 
				-void starpu_sched_ctx_revert_task_counters(unsigned sched_ctx_id, double flops);
			
 
				+void starpu_sched_ctx_revert_task_counters_ctx_locked(unsigned sched_ctx_id, double flops);
			
 
				 
			
 
				-void starpu_sched_ctx_move_task_to_ctx(struct starpu_task *task, unsigned sched_ctx, unsigned manage_mutex, unsigned with_repush);
			
 
				+void starpu_sched_ctx_move_task_to_ctx_locked(struct starpu_task *task, unsigned sched_ctx, unsigned with_repush);
			
 
				 
			
 
				 int starpu_sched_ctx_get_worker_rank(unsigned sched_ctx_id);
			
 
				 
			
--- a/include/starpu_scheduler.h
+++ b/include/starpu_scheduler.h
@@ -3,6 +3,7 @@
 
				  * Copyright (C) 2010-2017  Université de Bordeaux
			
 
				  * Copyright (C) 2011  Télécom-SudParis
			
 
				  * Copyright (C) 2016  Uppsala University
			
 
				+ * Copyright (C) 2017  Inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -60,13 +61,10 @@ void starpu_worker_get_sched_condition(int workerid, starpu_pthread_mutex_t **sc
 
				 unsigned long starpu_task_get_job_id(struct starpu_task *task);
			
 
				 
			
 
				 /* This function must be called to wake up a worker that is sleeping on the cond. 
			
 
				- * It returns 0 whenever the worker is not in a sleeping state */
			
 
				-int starpu_wake_worker(int workerid);
			
 
				-int starpu_wakeup_worker(int workerid, starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex);
			
 
				+ * It returns 0 whenever the worker is not in a sleeping state or has the state_keep_awake flag on */
			
 
				+int starpu_wake_worker_no_relax(int workerid);
			
 
				 /* This is a version of starpu_wake_worker which assumes that the sched mutex is locked */
			
 
				 int starpu_wake_worker_locked(int workerid);
			
 
				-/* This is a version of starpu_wakeup_worker which assumes that the sched mutex is locked */
			
 
				-int starpu_wakeup_worker_locked(int workerid, starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex);
			
 
				 
			
 
				 int starpu_worker_can_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
			
 
				 int starpu_worker_can_execute_task_impl(unsigned workerid, struct starpu_task *task, unsigned *impl_mask);
			
--- a/include/starpu_task.h
+++ b/include/starpu_task.h
@@ -136,6 +136,7 @@ struct starpu_task
 
				 	const char *name;
			
 
				 
			
 
				 	struct starpu_codelet *cl;
			
 
				+	int32_t where;
			
 
				 
			
 
				 	int nbuffers;
			
 
				 
			
--- a/include/starpu_task_list.h
+++ b/include/starpu_task_list.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2012, 2016  Université de Bordeaux
			
 
				+ * Copyright (C) 2010-2012, 2016-2017  Université de Bordeaux
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -25,148 +25,56 @@ extern "C"
 
				 {
			
 
				 #endif
			
 
				 
			
 
				+	/* NOTE: this needs to have at least the same size as lists in src/common/list.h */
			
 
				+#ifdef BUILDING_STARPU
			
 
				+#define STARPU_TASK_LIST_INLINE extern inline
			
 
				+#else
			
 
				 struct starpu_task_list
			
 
				 {
			
 
				 	struct starpu_task *head;
			
 
				 	struct starpu_task *tail;
			
 
				 };
			
 
				+#define STARPU_TASK_LIST_INLINE extern
			
 
				+#endif
			
 
				 
			
 
				-static STARPU_INLINE
			
 
				-void starpu_task_list_init(struct starpu_task_list *list)
			
 
				-{
			
 
				-	list->head = NULL;
			
 
				-	list->tail = NULL;
			
 
				-}
			
 
				-
			
 
				-static STARPU_INLINE
			
 
				-void starpu_task_list_push_front(struct starpu_task_list *list, struct starpu_task *task)
			
 
				-{
			
 
				-	if (list->tail == NULL)
			
 
				-	{
			
 
				-		list->tail = task;
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		list->head->prev = task;
			
 
				-	}
			
 
				-
			
 
				-	task->prev = NULL;
			
 
				-	task->next = list->head;
			
 
				-	list->head = task;
			
 
				-}
			
 
				-
			
 
				-static STARPU_INLINE
			
 
				-void starpu_task_list_push_back(struct starpu_task_list *list, struct starpu_task *task)
			
 
				-{
			
 
				-	if (list->head == NULL)
			
 
				-	{
			
 
				-		list->head = task;
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		list->tail->next = task;
			
 
				-	}
			
 
				-
			
 
				-	task->next = NULL;
			
 
				-	task->prev = list->tail;
			
 
				-	list->tail = task;
			
 
				-}
			
 
				-
			
 
				-static STARPU_INLINE
			
 
				-struct starpu_task *starpu_task_list_front(struct starpu_task_list *list)
			
 
				-{
			
 
				-	return list->head;
			
 
				-}
			
 
				-
			
 
				-static STARPU_INLINE
			
 
				-struct starpu_task *starpu_task_list_back(struct starpu_task_list *list)
			
 
				-{
			
 
				-	return list->tail;
			
 
				-}
			
 
				-
			
 
				-static STARPU_INLINE
			
 
				-int starpu_task_list_empty(struct starpu_task_list *list)
			
 
				-{
			
 
				-	return (list->head == NULL);
			
 
				-}
			
 
				+STARPU_TASK_LIST_INLINE
			
 
				+void starpu_task_list_init(struct starpu_task_list *list);
			
 
				 
			
 
				-static STARPU_INLINE
			
 
				-void starpu_task_list_erase(struct starpu_task_list *list, struct starpu_task *task)
			
 
				-{
			
 
				-	struct starpu_task *p = task->prev;
			
 
				-
			
 
				-	if (p)
			
 
				-	{
			
 
				-		p->next = task->next;
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		list->head = task->next;
			
 
				-	}
			
 
				-
			
 
				-	if (task->next)
			
 
				-	{
			
 
				-		task->next->prev = p;
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		list->tail = p;
			
 
				-	}
			
 
				-
			
 
				-	task->prev = NULL;
			
 
				-	task->next = NULL;
			
 
				-}
			
 
				+STARPU_TASK_LIST_INLINE
			
 
				+void starpu_task_list_push_front(struct starpu_task_list *list, struct starpu_task *task);
			
 
				 
			
 
				-static STARPU_INLINE
			
 
				-struct starpu_task *starpu_task_list_pop_front(struct starpu_task_list *list)
			
 
				-{
			
 
				-	struct starpu_task *task = list->head;
			
 
				+STARPU_TASK_LIST_INLINE
			
 
				+void starpu_task_list_push_back(struct starpu_task_list *list, struct starpu_task *task);
			
 
				 
			
 
				-	if (task)
			
 
				-		starpu_task_list_erase(list, task);
			
 
				+STARPU_TASK_LIST_INLINE
			
 
				+struct starpu_task *starpu_task_list_front(const struct starpu_task_list *list);
			
 
				 
			
 
				-	return task;
			
 
				-}
			
 
				+STARPU_TASK_LIST_INLINE
			
 
				+struct starpu_task *starpu_task_list_back(const struct starpu_task_list *list);
			
 
				 
			
 
				-static STARPU_INLINE
			
 
				-struct starpu_task *starpu_task_list_pop_back(struct starpu_task_list *list)
			
 
				-{
			
 
				-	struct starpu_task *task = list->tail;
			
 
				+STARPU_TASK_LIST_INLINE
			
 
				+int starpu_task_list_empty(const struct starpu_task_list *list);
			
 
				 
			
 
				-	if (task)
			
 
				-		starpu_task_list_erase(list, task);
			
 
				+STARPU_TASK_LIST_INLINE
			
 
				+void starpu_task_list_erase(struct starpu_task_list *list, struct starpu_task *task);
			
 
				 
			
 
				-	return task;
			
 
				-}
			
 
				+STARPU_TASK_LIST_INLINE
			
 
				+struct starpu_task *starpu_task_list_pop_front(struct starpu_task_list *list);
			
 
				 
			
 
				-static STARPU_INLINE
			
 
				-struct starpu_task *starpu_task_list_begin(struct starpu_task_list *list)
			
 
				-{
			
 
				-	return list->head;
			
 
				-}
			
 
				+STARPU_TASK_LIST_INLINE
			
 
				+struct starpu_task *starpu_task_list_pop_back(struct starpu_task_list *list);
			
 
				 
			
 
				-static STARPU_INLINE
			
 
				-struct starpu_task *starpu_task_list_end(struct starpu_task_list *list STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-	return NULL;
			
 
				-}
			
 
				+STARPU_TASK_LIST_INLINE
			
 
				+struct starpu_task *starpu_task_list_begin(const struct starpu_task_list *list);
			
 
				 
			
 
				-static STARPU_INLINE
			
 
				-struct starpu_task *starpu_task_list_next(struct starpu_task *task)
			
 
				-{
			
 
				-	return task->next;
			
 
				-}
			
 
				+STARPU_TASK_LIST_INLINE
			
 
				+struct starpu_task *starpu_task_list_end(const struct starpu_task_list *list STARPU_ATTRIBUTE_UNUSED);
			
 
				 
			
 
				-static STARPU_INLINE
			
 
				-int starpu_task_list_ismember(struct starpu_task_list *list, struct starpu_task *look)
			
 
				-{
			
 
				-	struct starpu_task *task;
			
 
				+STARPU_TASK_LIST_INLINE
			
 
				+struct starpu_task *starpu_task_list_next(const struct starpu_task *task);
			
 
				 
			
 
				-	for (task  = list->head; task != NULL; task  = task->next)
			
 
				-		if (task == look)
			
 
				-			return 1;
			
 
				-	return 0;
			
 
				-}
			
 
				+STARPU_TASK_LIST_INLINE
			
 
				+int starpu_task_list_ismember(const struct starpu_task_list *list, const struct starpu_task *look);
			
 
				 
			
 
				 #ifdef __cplusplus
			
 
				 }
			
--- a/include/starpu_task_util.h
+++ b/include/starpu_task_util.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010-2015  Université de Bordeaux
			
 
				- * Copyright (C) 2010-2014, 2016  CNRS
			
 
				+ * Copyright (C) 2010-2014, 2016, 2017  CNRS
			
 
				  * Copyright (C) 2014       INRIA
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -34,7 +34,7 @@ void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t
 
				 
			
 
				 /* NOTE: when adding a value here, please make sure to update both
			
 
				  * src/util/starpu_task_insert_utils.c (in two places) and
			
 
				- * mpi/src/starpu_mpi_task_insert.c */
			
 
				+ * mpi/src/starpu_mpi_task_insert.c and mpi/src/starpu_mpi_task_insert_fortran.c */
			
 
				 #define STARPU_MODE_SHIFT	17
			
 
				 #define STARPU_VALUE		 (1<<STARPU_MODE_SHIFT)
			
 
				 #define STARPU_CALLBACK		 (2<<STARPU_MODE_SHIFT)
			
@@ -54,13 +54,14 @@ void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t
 
				 #define STARPU_PROLOGUE_CALLBACK_POP   (16<<STARPU_MODE_SHIFT)
			
 
				 #define STARPU_PROLOGUE_CALLBACK_POP_ARG (17<<STARPU_MODE_SHIFT)
			
 
				 #define STARPU_EXECUTE_ON_WORKER (18<<STARPU_MODE_SHIFT)
			
 
				-#define STARPU_TAG_ONLY          (19<<STARPU_MODE_SHIFT)
			
 
				-#define STARPU_POSSIBLY_PARALLEL    (20<<STARPU_MODE_SHIFT)
			
 
				-#define STARPU_WORKER_ORDER      (21<<STARPU_MODE_SHIFT)
			
 
				-#define STARPU_NODE_SELECTION_POLICY (22<<STARPU_MODE_SHIFT)
			
 
				-#define STARPU_NAME		 (23<<STARPU_MODE_SHIFT)
			
 
				-#define STARPU_CL_ARGS		(24<<STARPU_MODE_SHIFT)
			
 
				-#define STARPU_SHIFTED_MODE_MAX (25<<STARPU_MODE_SHIFT)
			
 
				+#define STARPU_EXECUTE_WHERE     (19<<STARPU_MODE_SHIFT)
			
 
				+#define STARPU_TAG_ONLY          (20<<STARPU_MODE_SHIFT)
			
 
				+#define STARPU_POSSIBLY_PARALLEL    (21<<STARPU_MODE_SHIFT)
			
 
				+#define STARPU_WORKER_ORDER      (22<<STARPU_MODE_SHIFT)
			
 
				+#define STARPU_NODE_SELECTION_POLICY (23<<STARPU_MODE_SHIFT)
			
 
				+#define STARPU_NAME		 (24<<STARPU_MODE_SHIFT)
			
 
				+#define STARPU_CL_ARGS		(25<<STARPU_MODE_SHIFT)
			
 
				+#define STARPU_SHIFTED_MODE_MAX (26<<STARPU_MODE_SHIFT)
			
 
				 
			
 
				 struct starpu_task *starpu_task_build(struct starpu_codelet *cl, ...);
			
 
				 int starpu_task_insert(struct starpu_codelet *cl, ...);
			
--- a/include/starpu_worker.h
+++ b/include/starpu_worker.h
@@ -127,7 +127,7 @@ struct starpu_tree* starpu_workers_get_tree(void);
 
				 
			
 
				 unsigned starpu_worker_get_sched_ctx_list(int worker, unsigned **sched_ctx);
			
 
				 
			
 
				-unsigned starpu_worker_is_blocked(int workerid);
			
 
				+unsigned starpu_worker_is_blocked_in_parallel(int workerid);
			
 
				 
			
 
				 unsigned starpu_worker_is_slave_somewhere(int workerid);
			
 
				 
			
@@ -140,6 +140,27 @@ int starpu_worker_get_devids(enum starpu_worker_archtype type, int *devids, int
 
				 int starpu_worker_get_stream_workerids(unsigned devid, int *workerids, enum starpu_worker_archtype type);
			
 
				 
			
 
				 unsigned starpu_worker_get_sched_ctx_id_stream(unsigned stream_workerid);
			
 
				+
			
 
				+int starpu_worker_sched_op_pending(void);
			
 
				+
			
 
				+void starpu_worker_relax_on(void);
			
 
				+
			
 
				+void starpu_worker_relax_off(void);
			
 
				+
			
 
				+int starpu_worker_get_relax_state(void);
			
 
				+
			
 
				+void starpu_worker_lock(int workerid);
			
 
				+
			
 
				+int starpu_worker_trylock(int workerid);
			
 
				+
			
 
				+void starpu_worker_unlock(int workerid);
			
 
				+
			
 
				+void starpu_worker_lock_self(void);
			
 
				+
			
 
				+void starpu_worker_unlock_self(void);
			
 
				+
			
 
				+int starpu_wake_worker_relax(int workerid);
			
 
				+
			
 
				 #ifdef __cplusplus
			
 
				 }
			
 
				 #endif
			
--- a/mpi/examples/Makefile.am
+++ b/mpi/examples/Makefile.am
@@ -1,6 +1,6 @@
 
				 # StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				 #
			
 
				-# Copyright (C) 2009-2013, 2015-2016  Université de Bordeaux
			
 
				+# Copyright (C) 2009-2013, 2015-2017  Université de Bordeaux
			
 
				 # Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
			
 
				 # Copyright (C) 2016  Inria
			
 
				 #
			
@@ -17,6 +17,15 @@
 
				 
			
 
				 include $(top_srcdir)/starpu.mk
			
 
				 
			
 
				+if STARPU_SIMGRID
			
 
				+STARPU_PERF_MODEL_DIR=$(abs_top_srcdir)/tools/perfmodels/sampling
			
 
				+STARPU_HOSTNAME=mirage
			
 
				+MALLOC_PERTURB_=0
			
 
				+export STARPU_PERF_MODEL_DIR
			
 
				+export STARPU_HOSTNAME
			
 
				+export MALLOC_PERTURB_
			
 
				+endif
			
 
				+
			
 
				 CC=$(MPICC)
			
 
				 CCLD=$(MPICC)
			
 
				 FC=$(MPIFORT)
			
@@ -26,17 +35,23 @@ if STARPU_HAVE_WINDOWS
 
				 LOADER_BIN		=
			
 
				 else
			
 
				 loader_CPPFLAGS 	= 	$(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
			
 
				+if !STARPU_SIMGRID
			
 
				 LOADER			=	loader
			
 
				 LOADER_BIN		=	$(abs_top_builddir)/mpi/examples/$(LOADER)
			
 
				+endif
			
 
				 loader_SOURCES		=	../../tests/loader.c
			
 
				 endif
			
 
				 
			
 
				+if STARPU_SIMGRID
			
 
				+MPI			=	$(abs_top_builddir)/tools/starpu_smpirun -np 4 -platform $(abs_top_srcdir)/tools/perfmodels/cluster.xml -hostfile $(abs_top_srcdir)/tools/perfmodels/hostfile
			
 
				+else
			
 
				 # we always test on 4 processes, the execution time is not that bigger
			
 
				 if STARPU_QUICK_CHECK
			
 
				 MPI			=	$(MPIEXEC) $(MPIEXEC_ARGS) -np 4
			
 
				 else
			
 
				 MPI			=	$(MPIEXEC) $(MPIEXEC_ARGS) -np 4
			
 
				 endif
			
 
				+endif
			
 
				 
			
 
				 if STARPU_HAVE_AM111
			
 
				 TESTS_ENVIRONMENT	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
			
@@ -45,11 +60,9 @@ else
 
				 TESTS_ENVIRONMENT 	=	STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)" $(MPI) $(LOADER_BIN)
			
 
				 endif
			
 
				 
			
 
				-if !STARPU_SIMGRID
			
 
				 if STARPU_MPI_CHECK
			
 
				 TESTS			=	$(starpu_mpi_EXAMPLES)
			
 
				 endif
			
 
				-endif
			
 
				 
			
 
				 check_PROGRAMS = $(LOADER) $(starpu_mpi_EXAMPLES)
			
 
				 starpu_mpi_EXAMPLES =
			
@@ -248,11 +261,13 @@ matrix_decomposition_mpi_cholesky_distributed_LDADD =	\
 
				 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
			
 
				 	$(STARPU_BLAS_LDFLAGS) -lm
			
 
				 
			
 
				+if !STARPU_SIMGRID
			
 
				 starpu_mpi_EXAMPLES +=				\
			
 
				 	matrix_decomposition/mpi_cholesky			\
			
 
				 	matrix_decomposition/mpi_cholesky_distributed
			
 
				 endif
			
 
				 endif
			
 
				+endif
			
 
				 
			
 
				 ########################
			
 
				 # MPI Matrix mult example #
			
@@ -269,9 +284,11 @@ matrix_mult_mm_LDADD =			\
 
				 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
			
 
				 	-lm
			
 
				 
			
 
				+if !STARPU_SIMGRID
			
 
				 starpu_mpi_EXAMPLES +=				\
			
 
				 	matrix_mult/mm
			
 
				 endif
			
 
				+endif
			
 
				 
			
 
				 ##########################################
			
 
				 # Native Fortran MPI Matrix mult example #
			
@@ -303,12 +320,14 @@ native_fortran_nf_basic_ring_LDADD =					\
 
				 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
			
 
				 	-lm
			
 
				 
			
 
				+if !STARPU_SIMGRID
			
 
				 starpu_mpi_EXAMPLES +=				\
			
 
				 	native_fortran/nf_mm			\
			
 
				 	native_fortran/nf_basic_ring
			
 
				 endif
			
 
				 endif
			
 
				 endif
			
 
				+endif
			
 
				 
			
 
				 ###################
			
 
				 # complex example #
			
@@ -344,9 +363,11 @@ user_datatype_user_datatype_SOURCES =		\
 
				 user_datatype_user_datatype_LDADD =		\
			
 
				 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
			
 
				 
			
 
				+if !STARPU_SIMGRID
			
 
				 starpu_mpi_EXAMPLES	+=			\
			
 
				 	user_datatype/user_datatype
			
 
				 endif
			
 
				+endif
			
 
				 
			
 
				 ###################
			
 
				 # comm example #
			
@@ -362,10 +383,12 @@ comm_comm_LDADD =		\
 
				 comm_mix_comm_LDADD =		\
			
 
				 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
			
 
				 
			
 
				+if !STARPU_SIMGRID
			
 
				 starpu_mpi_EXAMPLES	+=			\
			
 
				 	comm/comm				\
			
 
				 	comm/mix_comm
			
 
				 endif
			
 
				+endif
			
 
				 
			
 
				 if STARPU_HAVE_MPIFORT
			
 
				 if BUILD_EXAMPLES
			
--- a/mpi/examples/complex/mpi_complex.c
+++ b/mpi/examples/complex/mpi_complex.c
@@ -26,11 +26,27 @@ void display_foo_codelet(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 
				 	FPRINTF(stderr, "foo = %d\n", *foo);
			
 
				 }
			
 
				 
			
 
				+/* Dumb performance model for simgrid */
			
 
				+static double display_cost_function(struct starpu_task *task, unsigned nimpl)
			
 
				+{
			
 
				+	(void) task;
			
 
				+	(void) nimpl;
			
 
				+	return 0.000001;
			
 
				+}
			
 
				+
			
 
				+static struct starpu_perfmodel display_model =
			
 
				+{
			
 
				+	.type = STARPU_COMMON,
			
 
				+	.cost_function = display_cost_function,
			
 
				+	.symbol = "display"
			
 
				+};
			
 
				+
			
 
				 struct starpu_codelet foo_display =
			
 
				 {
			
 
				 	.cpu_funcs = {display_foo_codelet},
			
 
				 	.nbuffers = 1,
			
 
				-	.modes = {STARPU_R}
			
 
				+	.modes = {STARPU_R},
			
 
				+	.model = &display_model
			
 
				 };
			
 
				 
			
 
				 int main(int argc, char **argv)
			
--- a/mpi/examples/stencil/stencil5.c
+++ b/mpi/examples/stencil/stencil5.c
@@ -37,11 +37,27 @@ void stencil5_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 
				 //	fprintf(stdout, "VALUES: %2.2f %2.2f %2.2f %2.2f %2.2f\n", *xy, *xm1y, *xp1y, *xym1, *xyp1);
			
 
				 }
			
 
				 
			
 
				+/* Dumb performance model for simgrid */
			
 
				+static double stencil5_cost_function(struct starpu_task *task, unsigned nimpl)
			
 
				+{
			
 
				+	(void) task;
			
 
				+	(void) nimpl;
			
 
				+	return 0.000001;
			
 
				+}
			
 
				+
			
 
				+static struct starpu_perfmodel stencil5_model =
			
 
				+{
			
 
				+	.type = STARPU_COMMON,
			
 
				+	.cost_function = stencil5_cost_function,
			
 
				+	.symbol = "stencil5"
			
 
				+};
			
 
				+
			
 
				 struct starpu_codelet stencil5_cl =
			
 
				 {
			
 
				 	.cpu_funcs = {stencil5_cpu},
			
 
				 	.nbuffers = 5,
			
 
				-	.modes = {STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R}
			
 
				+	.modes = {STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R},
			
 
				+	.model = &stencil5_model
			
 
				 };
			
 
				 
			
 
				 #ifdef STARPU_QUICK_CHECK
			
--- a/mpi/src/starpu_mpi.c
+++ b/mpi/src/starpu_mpi.c
@@ -94,11 +94,10 @@ static int posted_requests = 0, newer_requests, barrier_running = 0;
 
				 #pragma weak smpi_simulated_main_
			
 
				 extern int smpi_simulated_main_(int argc, char *argv[]);
			
 
				 
			
 
				-#ifdef HAVE_SMPI_PROCESS_SET_USER_DATA
			
 
				+#pragma weak smpi_process_set_user_data
			
 
				 #if !HAVE_DECL_SMPI_PROCESS_SET_USER_DATA
			
 
				 extern void smpi_process_set_user_data(void *);
			
 
				 #endif
			
 
				-#endif
			
 
				 
			
 
				 static void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
			
 
				 {
			
@@ -1334,12 +1333,15 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 
				 		argv_cpy[i] = strdup((*(argc_argv->argv))[i]);
			
 
				 	MSG_process_create_with_arguments("main", smpi_simulated_main_, NULL, _starpu_simgrid_get_host_by_name("MAIN"), *(argc_argv->argc), argv_cpy);
			
 
				 	/* And set TSD for us */
			
 
				-#ifdef HAVE_SMPI_PROCESS_SET_USER_DATA
			
 
				 	void **tsd;
			
 
				 	_STARPU_CALLOC(tsd, MAX_TSD + 1, sizeof(void*));
			
 
				+	if (!smpi_process_set_user_data)
			
 
				+	{
			
 
				+		fprintf(stderr,"Your version of simgrid does not provide smpi_process_set_user_data, we can not continue without it\n");
			
 
				+		exit(1);
			
 
				+	}
			
 
				 	smpi_process_set_user_data(tsd);
			
 
				 #endif
			
 
				-#endif
			
 
				 
			
 
				 #ifdef STARPU_USE_FXT
			
 
				 	_starpu_fxt_wait_initialisation();
			
--- a/mpi/src/starpu_mpi_task_insert.c
+++ b/mpi/src/starpu_mpi_task_insert.c
@@ -386,6 +386,12 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
 
				                 {
			
 
				                         (void)va_arg(varg_list_copy, void *);
			
 
				 		}
			
 
				+		else if (arg_type==STARPU_EXECUTE_WHERE)
			
 
				+		{
			
 
				+			// the flag is decoded and set later when
			
 
				+			// calling function _starpu_task_insert_create()
			
 
				+			(void)va_arg(varg_list_copy, uint32_t);
			
 
				+		}
			
 
				 		else if (arg_type==STARPU_EXECUTE_ON_WORKER)
			
 
				 		{
			
 
				 			// the flag is decoded and set later when
			
--- a/mpi/src/starpu_mpi_task_insert_fortran.c
+++ b/mpi/src/starpu_mpi_task_insert_fortran.c
@@ -241,6 +241,11 @@ int _fstarpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_no
 
				 			arg_i++;
			
 
				 			/* void* */
			
 
				 		}
			
 
				+		else if (arg_type==STARPU_EXECUTE_WHERE)
			
 
				+		{
			
 
				+			arg_i++;
			
 
				+			/* int* */
			
 
				+		}
			
 
				 		else if (arg_type==STARPU_EXECUTE_ON_WORKER)
			
 
				 		{
			
 
				 			arg_i++;
			
--- a/mpi/tests/Makefile.am
+++ b/mpi/tests/Makefile.am
@@ -107,17 +107,29 @@ starpu_mpi_TESTS +=				\
 
				 	cache					\
			
 
				 	cache_disable				\
			
 
				 	callback				\
			
 
				+	early_request				\
			
 
				 	insert_task				\
			
 
				 	insert_task_block			\
			
 
				+	insert_task_dyn_handles			\
			
 
				+	insert_task_node_choice			\
			
 
				 	insert_task_owner			\
			
 
				 	insert_task_owner2			\
			
 
				 	insert_task_owner_data			\
			
 
				-	insert_task_node_choice			\
			
 
				-	matrix
			
 
				+	matrix					\
			
 
				+	matrix2					\
			
 
				+	mpi_detached_tag			\
			
 
				+	mpi_irecv_detached			\
			
 
				+	mpi_isend_detached			\
			
 
				+	mpi_reduction				\
			
 
				+	mpi_scatter_gather			\
			
 
				+	policy_register				\
			
 
				+	policy_register_many			\
			
 
				+	policy_selection			\
			
 
				+	policy_selection2			\
			
 
				+	ring_async_implicit
			
 
				 
			
 
				 if !STARPU_SIMGRID
			
 
				 starpu_mpi_TESTS +=				\
			
 
				-	datatypes				\
			
 
				 	pingpong				\
			
 
				 	mpi_test				\
			
 
				 	mpi_isend				\
			
@@ -125,15 +137,11 @@ starpu_mpi_TESTS +=				\
 
				 	mpi_earlyrecv2				\
			
 
				 	mpi_earlyrecv2_sync			\
			
 
				 	mpi_irecv				\
			
 
				-	mpi_isend_detached			\
			
 
				-	mpi_irecv_detached			\
			
 
				-	mpi_detached_tag			\
			
 
				 	mpi_redux				\
			
 
				 	ring					\
			
 
				 	ring_sync				\
			
 
				 	ring_sync_detached			\
			
 
				 	ring_async				\
			
 
				-	ring_async_implicit			\
			
 
				 	block_interface				\
			
 
				 	block_interface_pinned			\
			
 
				 	matrix2					\
			
@@ -141,24 +149,19 @@ starpu_mpi_TESTS +=				\
 
				 	insert_task_sent_cache			\
			
 
				 	insert_task_recv_cache			\
			
 
				 	insert_task_count			\
			
 
				-	insert_task_dyn_handles			\
			
 
				 	multiple_send				\
			
 
				-	mpi_scatter_gather			\
			
 
				-	mpi_reduction				\
			
 
				 	user_defined_datatype			\
			
 
				 	tags_checking				\
			
 
				 	sync					\
			
 
				 	gather					\
			
 
				 	gather2					\
			
 
				-	policy_register				\
			
 
				-	policy_register_many			\
			
 
				+	load_balancer
			
 
				+
			
 
				+# Expected to fail
			
 
				+starpu_mpi_TESTS +=				\
			
 
				 	policy_register_toomany			\
			
 
				 	policy_unregister			\
			
 
				-	policy_selection			\
			
 
				-	policy_selection2			\
			
 
				-	early_request				\
			
 
				-	starpu_redefine				\
			
 
				-	load_balancer
			
 
				+	starpu_redefine
			
 
				 endif
			
 
				 
			
 
				 noinst_PROGRAMS =				\
			
--- a/mpi/tests/block_interface.c
+++ b/mpi/tests/block_interface.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010, 2014-2015  Université de Bordeaux
			
 
				+ * Copyright (C) 2009, 2010, 2014-2015, 2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2014, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -27,16 +27,18 @@
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				 	int ret, rank, size;
			
 
				+	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				 	if (size < 2)
			
 
				 	{
			
 
				 		if (rank == 0)
			
@@ -142,7 +144,8 @@ int main(int argc, char **argv)
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				-	MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/mpi/tests/block_interface_pinned.c
+++ b/mpi/tests/block_interface_pinned.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010, 2014-2015  Université de Bordeaux
			
 
				+ * Copyright (C) 2009, 2010, 2014-2015, 2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -27,16 +27,18 @@
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				 	int ret, rank, size;
			
 
				+	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				 	if (size < 2)
			
 
				 	{
			
 
				 		if (rank == 0)
			
@@ -44,7 +46,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 		starpu_mpi_shutdown();
			
 
				 		starpu_shutdown();
			
 
				-		MPI_Finalize();
			
 
				+		if (!mpi_init)
			
 
				+			MPI_Finalize();
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -146,7 +149,8 @@ int main(int argc, char **argv)
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				-	MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/mpi/tests/datatypes.c
+++ b/mpi/tests/datatypes.c
@@ -332,16 +332,18 @@ int main(int argc, char **argv)
 
				 {
			
 
				 	int ret, rank, size;
			
 
				 	int error=0;
			
 
				+	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				 	if (size < 2)
			
 
				 	{
			
 
				 		if (rank == 0)
			
@@ -349,7 +351,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 		starpu_mpi_shutdown();
			
 
				 		starpu_shutdown();
			
 
				-		MPI_Finalize();
			
 
				+		if (!mpi_init)
			
 
				+			MPI_Finalize();
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -362,7 +365,8 @@ int main(int argc, char **argv)
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				-	MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 
			
 
				 	return rank == 0 ? error : 0;
			
 
				 }
			
--- a/mpi/tests/early_request.c
+++ b/mpi/tests/early_request.c
@@ -191,23 +191,26 @@ int main(int argc, char * argv[])
 
				 	/* Init */
			
 
				 	int ret;
			
 
				 	int mpi_rank, mpi_size;
			
 
				+	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &mpi_rank);
			
 
				-	starpu_mpi_comm_size(MPI_COMM_WORLD, &mpi_size);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &mpi_rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &mpi_size);
			
 
				+
			
 
				 	if (starpu_cpu_worker_get_count() == 0)
			
 
				 	{
			
 
				 		if (mpi_rank == 0)
			
 
				 			FPRINTF(stderr, "We need at least 1 CPU worker.\n");
			
 
				 		starpu_mpi_shutdown();
			
 
				 		starpu_shutdown();
			
 
				-		MPI_Finalize();
			
 
				+		if (!mpi_init)
			
 
				+			MPI_Finalize();
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -246,7 +249,8 @@ int main(int argc, char * argv[])
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				-	MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 	FPRINTF(stderr, "No assert until end\n");
			
 
				 	return 0;
			
 
				 }
			
--- a/mpi/tests/gather.c
+++ b/mpi/tests/gather.c
@@ -22,22 +22,25 @@ int main(int argc, char **argv)
 
				 	int ret, rank, size;
			
 
				 	starpu_data_handle_t handle;
			
 
				 	int var;
			
 
				+	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				 	if (size<3)
			
 
				 	{
			
 
				 		FPRINTF(stderr, "We need more than 2 processes.\n");
			
 
				 		starpu_mpi_shutdown();
			
 
				 		starpu_shutdown();
			
 
				-		MPI_Finalize();
			
 
				+		if (!mpi_init)
			
 
				+			MPI_Finalize();
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -69,7 +72,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				-	MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/mpi/tests/gather2.c
+++ b/mpi/tests/gather2.c
@@ -20,22 +20,25 @@
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				 	int ret, rank, size;
			
 
				+	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				 	if (size<3)
			
 
				 	{
			
 
				 		FPRINTF(stderr, "We need more than 2 processes.\n");
			
 
				 		starpu_mpi_shutdown();
			
 
				 		starpu_shutdown();
			
 
				-		MPI_Finalize();
			
 
				+		if (!mpi_init)
			
 
				+			MPI_Finalize();
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -91,7 +94,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				-	MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/mpi/tests/helper.h
+++ b/mpi/tests/helper.h
@@ -16,16 +16,17 @@
 
				 
			
 
				 #include <errno.h>
			
 
				 #include <starpu_mpi.h>
			
 
				+#include <starpu_config.h>
			
 
				 #include "../../tests/helper.h"
			
 
				 
			
 
				 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
			
 
				 #define FPRINTF_MPI(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) { \
			
 
				 			int _disp_rank; starpu_mpi_comm_rank(MPI_COMM_WORLD, &_disp_rank); \
			
 
				 			fprintf(ofile, "[%d][starpu_mpi][%s] " fmt , _disp_rank, __starpu_func__ ,## __VA_ARGS__); \
			
 
				-			fflush(ofile); }} while(0);
			
 
				+			fflush(ofile); }} while(0)
			
 
				 
			
 
				-#define MPI_INIT_THREAD(argc, argv, required) do {	    \
			
 
				-		int thread_support;					\
			
 
				+#define MPI_INIT_THREAD_real(argc, argv, required) do {	\
			
 
				+		int thread_support;				\
			
 
				 		if (MPI_Init_thread(argc, argv, required, &thread_support) != MPI_SUCCESS) \
			
 
				 		{						\
			
 
				 			fprintf(stderr,"MPI_Init_thread failed\n");	\
			
@@ -34,5 +35,13 @@
 
				 		if (thread_support == MPI_THREAD_FUNNELED)		\
			
 
				 			fprintf(stderr,"Warning: MPI only has funneled thread support, not serialized, hoping this will work\n"); \
			
 
				 		if (thread_support < MPI_THREAD_FUNNELED)		\
			
 
				-			fprintf(stderr,"Warning: MPI does not have thread support!\n"); } while(0);
			
 
				+			fprintf(stderr,"Warning: MPI does not have thread support!\n"); } while(0)
			
 
				+
			
 
				+#ifdef STARPU_SIMGRID
			
 
				+#define MPI_INIT_THREAD(argc, argv, required, init) do { *(init) = 1 ; } while(0)
			
 
				+#else
			
 
				+#define MPI_INIT_THREAD(argc, argv, required, init) do {	\
			
 
				+		*(init) = 0;                                    \
			
 
				+		MPI_INIT_THREAD_real(argc, argv, required); } while(0)
			
 
				+#endif
			
 
				 
			
--- a/mpi/tests/insert_task_compute.c
+++ b/mpi/tests/insert_task_compute.c
@@ -226,7 +226,7 @@ int main(int argc, char **argv)
 
				 	int after_node[2][4] = {{220, 20, 11, 22}, {220, 20, 11, 22}};
			
 
				 	int node, insert_task, data_array;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				+	MPI_INIT_THREAD_real(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				 
			
 
				 	global_ret = 0;
			
--- a/mpi/tests/insert_task_count.c
+++ b/mpi/tests/insert_task_count.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010, 2014-2016  Université de Bordeaux
			
 
				+ * Copyright (C) 2009, 2010, 2014-2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -63,16 +63,18 @@ int main(int argc, char **argv)
 
				 	int ret, rank, size;
			
 
				 	int token = 0;
			
 
				 	starpu_data_handle_t token_handle;
			
 
				+	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				 	if (size < 2 || (starpu_cpu_worker_get_count() + starpu_cuda_worker_get_count() == 0))
			
 
				 	{
			
 
				 		if (rank == 0)
			
@@ -84,7 +86,8 @@ int main(int argc, char **argv)
 
				 		}
			
 
				 		starpu_mpi_shutdown();
			
 
				 		starpu_shutdown();
			
 
				-		MPI_Finalize();
			
 
				+		if (!mpi_init)
			
 
				+			MPI_Finalize();
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -120,7 +123,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 	FPRINTF_MPI(stderr, "Final value for token %d\n", token);
			
 
				 
			
 
				-	MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 
			
 
				 #ifndef STARPU_SIMGRID
			
 
				 	if (rank == 1)
			
--- a/mpi/tests/insert_task_dyn_handles.c
+++ b/mpi/tests/insert_task_dyn_handles.c
@@ -73,15 +73,17 @@ int main(int argc, char **argv)
 
				         starpu_data_handle_t *data_handles;
			
 
				         starpu_data_handle_t factor_handle;
			
 
				 	struct starpu_data_descr *descrs;
			
 
				+	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+
			
 
				 	if (starpu_cpu_worker_get_count() == 0)
			
 
				 	{
			
 
				 		if (rank == 0)
			
@@ -142,6 +144,7 @@ enodev:
 
				 	}
			
 
				 	else if (rank == 0)
			
 
				 	{
			
 
				+#ifndef STARPU_SIMGRID
			
 
				 		for(i=0 ; i<STARPU_NMAXBUFS-1 ; i++)
			
 
				 		{
			
 
				 			if (x[i] != nloops * FFACTOR * 2)
			
@@ -162,6 +165,7 @@ enodev:
 
				 		{
			
 
				 			FPRINTF_MPI(stderr, "[end of loop] all values are correct\n");
			
 
				 		}
			
 
				+#endif
			
 
				 		free(x);
			
 
				 	}
			
 
				 	else
			
@@ -173,6 +177,7 @@ enodev:
 
				 
			
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				-	MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 	return ret;
			
 
				 }
			
--- a/mpi/tests/insert_task_recv_cache.c
+++ b/mpi/tests/insert_task_recv_cache.c
@@ -137,7 +137,7 @@ int main(int argc, char **argv)
 
				 	size_t *comm_amount_with_cache;
			
 
				 	size_t *comm_amount_without_cache;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				+	MPI_INIT_THREAD_real(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				 	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				 
			
--- a/mpi/tests/insert_task_sent_cache.c
+++ b/mpi/tests/insert_task_sent_cache.c
@@ -143,7 +143,7 @@ int main(int argc, char **argv)
 
				 	size_t *comm_amount_with_cache;
			
 
				 	size_t *comm_amount_without_cache;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				+	MPI_INIT_THREAD_real(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				 	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				 
			
--- a/mpi/tests/load_balancer.c
+++ b/mpi/tests/load_balancer.c
@@ -46,14 +46,15 @@ int main(int argc, char **argv)
 
				 {
			
 
				 	int ret;
			
 
				 	struct starpu_mpi_lb_conf itf;
			
 
				+	int mpi_init;
			
 
				 
			
 
				 	itf.get_neighbors = get_neighbors;
			
 
				 	itf.get_data_unit_to_migrate = get_data_unit_to_migrate;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				 	unsetenv("STARPU_MPI_LB");
			
@@ -65,7 +66,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				-	MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/mpi/tests/matrix2.c
+++ b/mpi/tests/matrix2.c
@@ -58,16 +58,18 @@ int main(int argc, char **argv)
 
				 	unsigned X[N];
			
 
				 	starpu_data_handle_t data_A[N];
			
 
				 	starpu_data_handle_t data_X[N];
			
 
				+	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				 	if ((size < 3) || (starpu_cpu_worker_get_count() == 0))
			
 
				 	{
			
 
				 		if (rank == 0)
			
@@ -79,7 +81,8 @@ int main(int argc, char **argv)
 
				 		}
			
 
				 		starpu_mpi_shutdown();
			
 
				 		starpu_shutdown();
			
 
				-		MPI_Finalize();
			
 
				+		if (!mpi_init)
			
 
				+			MPI_Finalize();
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -154,6 +157,7 @@ int main(int argc, char **argv)
 
				 	}
			
 
				 #endif
			
 
				 
			
 
				-	MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 	return 0;
			
 
				 }
			
--- a/mpi/tests/mpi_detached_tag.c
+++ b/mpi/tests/mpi_detached_tag.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010, 2014-2016  Université de Bordeaux
			
 
				+ * Copyright (C) 2010, 2014-2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -33,16 +33,18 @@ starpu_data_handle_t tab_handle;
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				 	int ret, rank, size;
			
 
				+	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				 	if (size%2 != 0)
			
 
				 	{
			
 
				 		if (rank == 0)
			
@@ -50,7 +52,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 		starpu_mpi_shutdown();
			
 
				 		starpu_shutdown();
			
 
				-		MPI_Finalize();
			
 
				+		if (!mpi_init)
			
 
				+			MPI_Finalize();
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -84,7 +87,8 @@ int main(int argc, char **argv)
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				-	MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/mpi/tests/mpi_earlyrecv.c
+++ b/mpi/tests/mpi_earlyrecv.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010, 2014-2015  Université de Bordeaux
			
 
				+ * Copyright (C) 2009, 2010, 2014-2015, 2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -25,22 +25,25 @@ int main(int argc, char **argv)
 
				 	starpu_data_handle_t tab_handle[4];
			
 
				 	int values[4];
			
 
				 	starpu_mpi_req request[2] = {NULL, NULL};
			
 
				+	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				 	if (size%2 != 0)
			
 
				 	{
			
 
				 		FPRINTF_MPI(stderr, "We need a even number of processes.\n");
			
 
				 		starpu_mpi_shutdown();
			
 
				 		starpu_shutdown();
			
 
				-		MPI_Finalize();
			
 
				+		if (!mpi_init)
			
 
				+			MPI_Finalize();
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -123,7 +126,8 @@ int main(int argc, char **argv)
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				-	MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 
			
 
				 	return ret;
			
 
				 }
			
--- a/mpi/tests/mpi_earlyrecv2.c
+++ b/mpi/tests/mpi_earlyrecv2.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010, 2014-2015  Université de Bordeaux
			
 
				+ * Copyright (C) 2009, 2010, 2014-2015, 2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -207,22 +207,25 @@ int main(int argc, char **argv)
 
				 {
			
 
				 	int ret=0, global_ret=0;
			
 
				 	int rank, size;
			
 
				+	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				 	if (size%2 != 0)
			
 
				 	{
			
 
				 		FPRINTF(stderr, "We need a even number of processes.\n");
			
 
				 		starpu_mpi_shutdown();
			
 
				 		starpu_shutdown();
			
 
				-		MPI_Finalize();
			
 
				+		if (!mpi_init)
			
 
				+			MPI_Finalize();
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -247,7 +250,8 @@ int main(int argc, char **argv)
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				-	MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 
			
 
				 	return global_ret;
			
 
				 }
			
--- a/mpi/tests/mpi_earlyrecv2_sync.c
+++ b/mpi/tests/mpi_earlyrecv2_sync.c
@@ -211,7 +211,7 @@ int main(int argc, char **argv)
 
				 	int ret=0, global_ret=0;
			
 
				 	int rank, size;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				+	MPI_INIT_THREAD_real(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				 	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				 
			
--- a/mpi/tests/mpi_irecv.c
+++ b/mpi/tests/mpi_irecv.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010, 2014-2015  Université de Bordeaux
			
 
				+ * Copyright (C) 2009, 2010, 2014-2015, 2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -31,16 +31,18 @@ starpu_data_handle_t tab_handle;
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				 	int ret, rank, size;
			
 
				+	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				 	if (size%2 != 0)
			
 
				 	{
			
 
				 		if (rank == 0)
			
@@ -48,7 +50,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 		starpu_mpi_shutdown();
			
 
				 		starpu_shutdown();
			
 
				-		MPI_Finalize();
			
 
				+		if (!mpi_init)
			
 
				+			MPI_Finalize();
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -81,7 +84,8 @@ int main(int argc, char **argv)
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				-	MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/mpi/tests/mpi_irecv_detached.c
+++ b/mpi/tests/mpi_irecv_detached.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010, 2012, 2014-2016  Université de Bordeaux
			
 
				+ * Copyright (C) 2010, 2012, 2014-2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -48,16 +48,18 @@ void callback(void *arg STARPU_ATTRIBUTE_UNUSED)
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				 	int ret, rank, size;
			
 
				+	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				 	if (size%2 != 0)
			
 
				 	{
			
 
				 		if (rank == 0)
			
@@ -65,7 +67,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 		starpu_mpi_shutdown();
			
 
				 		starpu_shutdown();
			
 
				-		MPI_Finalize();
			
 
				+		if (!mpi_init)
			
 
				+			MPI_Finalize();
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -101,7 +104,8 @@ int main(int argc, char **argv)
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				-	MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/mpi/tests/mpi_isend.c
+++ b/mpi/tests/mpi_isend.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010, 2014-2015  Université de Bordeaux
			
 
				+ * Copyright (C) 2009, 2010, 2014-2015, 2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -31,16 +31,18 @@ starpu_data_handle_t tab_handle;
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				 	int ret, rank, size;
			
 
				+	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				 	if (size%2 != 0)
			
 
				 	{
			
 
				 		if (rank == 0)
			
@@ -48,7 +50,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 		starpu_mpi_shutdown();
			
 
				 		starpu_shutdown();
			
 
				-		MPI_Finalize();
			
 
				+		if (!mpi_init)
			
 
				+			MPI_Finalize();
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -82,7 +85,8 @@ int main(int argc, char **argv)
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				-	MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/mpi/tests/mpi_isend_detached.c
+++ b/mpi/tests/mpi_isend_detached.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010, 2012, 2014-2016  Université de Bordeaux
			
 
				+ * Copyright (C) 2010, 2012, 2014-2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -47,16 +47,18 @@ int main(int argc, char **argv)
 
				 	int ret, rank, size;
			
 
				 	float *tab;
			
 
				 	starpu_data_handle_t tab_handle;
			
 
				+	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				 	if (size%2 != 0)
			
 
				 	{
			
 
				 		if (rank == 0)
			
@@ -64,7 +66,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 		starpu_mpi_shutdown();
			
 
				 		starpu_shutdown();
			
 
				-		MPI_Finalize();
			
 
				+		if (!mpi_init)
			
 
				+			MPI_Finalize();
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -106,7 +109,8 @@ int main(int argc, char **argv)
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				-	MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/mpi/tests/mpi_redux.c
+++ b/mpi/tests/mpi_redux.c
@@ -36,18 +36,20 @@ int main(int argc, char **argv)
 
				 	int ret, rank, size, sum;
			
 
				 	int value=0;
			
 
				 	starpu_data_handle_t *handles;
			
 
				+	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				-
			
 
				-	sum = ((size-1) * (size) / 2);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				+	sum = ((size-1) * (size) / 2);
			
 
				+
			
 
				 	if (rank == 0)
			
 
				 	{
			
 
				 		int src;
			
@@ -99,7 +101,8 @@ int main(int argc, char **argv)
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				-	MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 
			
 
				 	STARPU_ASSERT_MSG(sum == value, "Sum of first %d integers is %d, not %d\n", size-1, sum, value);
			
 
				 
			
--- a/mpi/tests/mpi_test.c
+++ b/mpi/tests/mpi_test.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010, 2014-2015  Université de Bordeaux
			
 
				+ * Copyright (C) 2010, 2014-2015, 2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -31,16 +31,18 @@ int main(int argc, char **argv)
 
				 	int ret, rank, size;
			
 
				 	float *tab;
			
 
				 	starpu_data_handle_t tab_handle;
			
 
				+	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				 	if (size%2 != 0)
			
 
				 	{
			
 
				 		if (rank == 0)
			
@@ -48,7 +50,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 		starpu_mpi_shutdown();
			
 
				 		starpu_shutdown();
			
 
				-		MPI_Finalize();
			
 
				+		if (!mpi_init)
			
 
				+			MPI_Finalize();
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -88,7 +91,8 @@ int main(int argc, char **argv)
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				-	MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/mpi/tests/pingpong.c
+++ b/mpi/tests/pingpong.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010, 2014-2015  Université de Bordeaux
			
 
				+ * Copyright (C) 2009, 2010, 2014-2015, 2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -32,16 +32,18 @@ starpu_data_handle_t tab_handle;
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				 	int ret, rank, size;
			
 
				+	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				 	if (size%2 != 0)
			
 
				 	{
			
 
				 		if (rank == 0)
			
@@ -49,7 +51,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 		starpu_mpi_shutdown();
			
 
				 		starpu_shutdown();
			
 
				-		MPI_Finalize();
			
 
				+		if (!mpi_init)
			
 
				+			MPI_Finalize();
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -81,7 +84,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				-	MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/mpi/tests/policy_register.c
+++ b/mpi/tests/policy_register.c
@@ -69,16 +69,18 @@ int main(int argc, char **argv)
 
				 	int policy;
			
 
				 	struct starpu_task *task;
			
 
				 	starpu_data_handle_t handles[2];
			
 
				+	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				 	if (size < 2)
			
 
				 	{
			
 
				 		if (rank == 0)
			
@@ -86,7 +88,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 		starpu_mpi_shutdown();
			
 
				 		starpu_shutdown();
			
 
				-		MPI_Finalize();
			
 
				+		if (!mpi_init)
			
 
				+			MPI_Finalize();
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -140,7 +143,8 @@ int main(int argc, char **argv)
 
				 	starpu_data_unregister(handles[1]);
			
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				-	MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/mpi/tests/policy_selection.c
+++ b/mpi/tests/policy_selection.c
@@ -56,16 +56,19 @@ int main(int argc, char **argv)
 
				 	int policy = 12;
			
 
				 	struct starpu_task *task;
			
 
				 	starpu_data_handle_t handles[3];
			
 
				+	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				+	(void)mpi_init;
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				 	if (size < 3)
			
 
				 	{
			
 
				 		if (rank == 0)
			
@@ -73,7 +76,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 		starpu_mpi_shutdown();
			
 
				 		starpu_shutdown();
			
 
				-		MPI_Finalize();
			
 
				+		if (!mpi_init)
			
 
				+			MPI_Finalize();
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -181,7 +185,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				-	MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/mpi/tests/policy_selection2.c
+++ b/mpi/tests/policy_selection2.c
@@ -54,16 +54,19 @@ int main(int argc, char **argv)
 
				 	int rank, size;
			
 
				 	int data[3];
			
 
				 	starpu_data_handle_t handles[3];
			
 
				+	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				+	(void)mpi_init;
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				 	if ((size < 3) || (starpu_cpu_worker_get_count() == 0))
			
 
				 	{
			
 
				 		if (rank == 0)
			
@@ -75,7 +78,8 @@ int main(int argc, char **argv)
 
				 		}
			
 
				 		starpu_mpi_shutdown();
			
 
				 		starpu_shutdown();
			
 
				-		MPI_Finalize();
			
 
				+		if (!mpi_init)
			
 
				+			MPI_Finalize();
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -97,11 +101,13 @@ int main(int argc, char **argv)
 
				 	for(i=0 ; i<2 ; i++) starpu_data_acquire(handles[i], STARPU_R);
			
 
				 	FPRINTF_MPI(stderr, "data[%d,%d,%d] = %d,%d,%d\n", 0, 1, 2, data[0], data[1], data[2]);
			
 
				 	for(i=0 ; i<2 ; i++) starpu_data_release(handles[i]);
			
 
				+#ifndef STARPU_SIMGRID
			
 
				 	if (rank == 2)
			
 
				 	{
			
 
				 		STARPU_ASSERT_MSG(data[0] == 2*data[2] && data[1] == 2*data[2], "Computation incorrect. data[%d] (%d) != 2*data[%d] (%d) && data[%d] (%d) != 2*data[%d] (%d)\n",
			
 
				 				  0, data[0], 2, data[2], 1, data[1], 2, data[2]);
			
 
				 	}
			
 
				+#endif
			
 
				 
			
 
				 	for(i=0 ; i<2 ; i++) starpu_data_acquire(handles[i], STARPU_W);
			
 
				 	for(i=0 ; i<2 ; i++) data[i] = 12;
			
@@ -115,17 +121,20 @@ int main(int argc, char **argv)
 
				 	for(i=0 ; i<2 ; i++) starpu_data_acquire(handles[i], STARPU_R);
			
 
				 	FPRINTF_MPI(stderr, "data[%d,%d,%d] = %d,%d,%d\n", 0, 1, 2, data[0], data[1], data[2]);
			
 
				 	for(i=0 ; i<2 ; i++) starpu_data_release(handles[i]);
			
 
				+#ifndef STARPU_SIMGRID
			
 
				 	if (rank == 1)
			
 
				 	{
			
 
				 		STARPU_ASSERT_MSG(data[0] == 2*data[2] && data[1] == 2*data[2], "Computation incorrect. data[%d] (%d) != 2*data[%d] (%d) && data[%d] (%d) != 2*data[%d] (%d)\n",
			
 
				 				  0, data[0], 2, data[2], 1, data[1], 2, data[2]);
			
 
				 	}
			
 
				+#endif
			
 
				 
			
 
				 	for(i=0 ; i<3 ; i++) starpu_data_unregister(handles[i]);
			
 
				 
			
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				-	MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/mpi/tests/ring.c
+++ b/mpi/tests/ring.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010, 2014-2016  Université de Bordeaux
			
 
				+ * Copyright (C) 2009, 2010, 2014-2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -76,16 +76,18 @@ void increment_token(void)
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				 	int ret, rank, size;
			
 
				+	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				 	if (size < 2 || (starpu_cpu_worker_get_count() + starpu_cuda_worker_get_count() == 0))
			
 
				 	{
			
 
				 		if (rank == 0)
			
@@ -97,7 +99,8 @@ int main(int argc, char **argv)
 
				 		}
			
 
				 		starpu_mpi_shutdown();
			
 
				 		starpu_shutdown();
			
 
				-		MPI_Finalize();
			
 
				+		if (!mpi_init)
			
 
				+			MPI_Finalize();
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -142,7 +145,8 @@ int main(int argc, char **argv)
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				-	MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 
			
 
				 #ifndef STARPU_SIMGRID
			
 
				 	if (rank == last_rank)
			
--- a/mpi/tests/ring_async.c
+++ b/mpi/tests/ring_async.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010, 2014-2016  Université de Bordeaux
			
 
				+ * Copyright (C) 2009, 2010, 2014-2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -76,16 +76,18 @@ void increment_token(void)
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				 	int ret, rank, size;
			
 
				+	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				 	if (size < 2 || (starpu_cpu_worker_get_count() + starpu_cuda_worker_get_count() == 0))
			
 
				 	{
			
 
				 		if (rank == 0)
			
@@ -97,7 +99,8 @@ int main(int argc, char **argv)
 
				 		}
			
 
				 		starpu_mpi_shutdown();
			
 
				 		starpu_shutdown();
			
 
				-		MPI_Finalize();
			
 
				+		if (!mpi_init)
			
 
				+			MPI_Finalize();
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -147,7 +150,8 @@ int main(int argc, char **argv)
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				-	MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 
			
 
				 #ifndef STARPU_SIMGRID
			
 
				 	if (rank == last_rank)
			
--- a/mpi/tests/ring_sync.c
+++ b/mpi/tests/ring_sync.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010, 2014-2016  Université de Bordeaux
			
 
				+ * Copyright (C) 2009, 2010, 2014-2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -76,16 +76,18 @@ void increment_token(void)
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				 	int ret, rank, size;
			
 
				+	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				 	if (size < 2 || (starpu_cpu_worker_get_count() + starpu_cuda_worker_get_count() == 0))
			
 
				 	{
			
 
				 		if (rank == 0)
			
@@ -97,7 +99,8 @@ int main(int argc, char **argv)
 
				 		}
			
 
				 		starpu_mpi_shutdown();
			
 
				 		starpu_shutdown();
			
 
				-		MPI_Finalize();
			
 
				+		if (!mpi_init)
			
 
				+			MPI_Finalize();
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -145,7 +148,8 @@ int main(int argc, char **argv)
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				-	MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 
			
 
				 #ifndef STARPU_SIMGRID
			
 
				 	if (rank == last_rank)
			
--- a/mpi/tests/ring_sync_detached.c
+++ b/mpi/tests/ring_sync_detached.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010, 2014-2016  Université de Bordeaux
			
 
				+ * Copyright (C) 2009, 2010, 2014-2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -88,16 +88,18 @@ int main(int argc, char **argv)
 
				 	int ret, rank, size;
			
 
				 	int token = 42;
			
 
				 	starpu_data_handle_t token_handle;
			
 
				+	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				-	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				+	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+
			
 
				 	if (size < 2 || (starpu_cpu_worker_get_count() + starpu_cuda_worker_get_count() == 0))
			
 
				 	{
			
 
				 		if (rank == 0)
			
@@ -109,7 +111,8 @@ int main(int argc, char **argv)
 
				 		}
			
 
				 		starpu_mpi_shutdown();
			
 
				 		starpu_shutdown();
			
 
				-		MPI_Finalize();
			
 
				+		if (!mpi_init)
			
 
				+			MPI_Finalize();
			
 
				 		return STARPU_TEST_SKIPPED;
			
 
				 	}
			
 
				 
			
@@ -161,7 +164,8 @@ int main(int argc, char **argv)
 
				 	starpu_shutdown();
			
 
				 
			
 
				 	FPRINTF_MPI(stderr, "Final value for token %d\n", token);
			
 
				-	MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 
			
 
				 #ifndef STARPU_SIMGRID
			
 
				 	if (rank == last_rank)
			
--- a/mpi/tests/starpu_redefine.c
+++ b/mpi/tests/starpu_redefine.c
@@ -21,14 +21,15 @@ int main(int argc, char **argv)
 
				 {
			
 
				 	int ret;
			
 
				 	starpu_data_handle_t handle;
			
 
				+	int mpi_init;
			
 
				 
			
 
				 	disable_coredump();
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				 
			
 
				 	ret = starpu_init(NULL);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	ret = starpu_mpi_init(NULL, NULL, mpi_init);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				 
			
 
				 	starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)&ret, 1, sizeof(int));
			
@@ -37,7 +38,8 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				-	MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/mpi/tests/sync.c
+++ b/mpi/tests/sync.c
@@ -23,15 +23,17 @@ int main(int argc, char **argv)
 
				 	int rank, other_rank;
			
 
				 	int ret;
			
 
				 	starpu_data_handle_t data[2];
			
 
				+	int mpi_init;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				-        starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-        starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				+	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
			
 
				+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+        MPI_Comm_size(MPI_COMM_WORLD, &size);
			
 
				 
			
 
				         if (size % 2)
			
 
				         {
			
 
				 		FPRINTF(stderr, "We need a even number of processes.\n");
			
 
				-                MPI_Finalize();
			
 
				+		if (!mpi_init)
			
 
				+			MPI_Finalize();
			
 
				                 return STARPU_TEST_SKIPPED;
			
 
				         }
			
 
				 
			
@@ -92,6 +94,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				-        MPI_Finalize();
			
 
				+	if (!mpi_init)
			
 
				+		MPI_Finalize();
			
 
				 	return 0;
			
 
				 }
			
--- a/mpi/tests/tags_checking.c
+++ b/mpi/tests/tags_checking.c
@@ -124,7 +124,7 @@ int main(int argc, char **argv)
 
				 	int ret=0;
			
 
				 	int sdetached, rdetached;
			
 
				 
			
 
				-	MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				+	MPI_INIT_THREAD_real(&argc, &argv, MPI_THREAD_SERIALIZED);
			
 
				         starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
			
 
				         starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
			
 
				 
			
--- a/sc_hypervisor/src/sc_hypervisor.c
+++ b/sc_hypervisor/src/sc_hypervisor.c
@@ -403,7 +403,6 @@ void sc_hypervisor_unregister_ctx(unsigned sched_ctx)
 
				 	if(npus)
			
 
				 	{
			
 
				 		starpu_sched_ctx_set_priority(pus, npus, father, 1);
			
 
				-		starpu_sched_ctx_set_priority_on_level(pus, npus, father, 1);
			
 
				 		free(pus);
			
 
				 	}
			
 
				 
			
--- a/socl/examples/matmul/matmul.c
+++ b/socl/examples/matmul/matmul.c
@@ -1,6 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.

			
 
				  *

			
 
				  * Copyright (C) 2010,2011, 2015, 2017 University of Bordeaux

			
 
				+ * Copyright (C) 2017 Inria

			
 
				  *

			
 
				  * StarPU is free software; you can redistribute it and/or modify

			
 
				  * it under the terms of the GNU Lesser General Public License as published by

			
@@ -13,6 +14,14 @@
 
				  *

			
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.

			
 
				  */

			
 
				+#ifndef STARPU_NON_BLOCKING_DRIVERS

			
 
				+int main(int argc, const char** argv) {

			
 
				+	(void) argv;

			
 
				+	(void) argv;

			
 
				+	/* testcase does not seem to support blocking drivers */

			
 
				+	return 77;

			
 
				+}

			
 
				+#else

			
 
				 

			
 
				 #ifdef __APPLE_CC__

			
 
				 #include <OpenCL/opencl.h>

			
@@ -512,3 +521,4 @@ void computeReference(TYPE* C, const TYPE* A, const TYPE* B, unsigned int hA, un
 
				 			C[i * wB + j] = (TYPE)sum;

			
 
				 		}

			
 
				 }

			
 
				+#endif /* STARPU_NON_BLOCKING_DRIVERS */

			
--- a/socl/src/cl_createkernel.c
+++ b/socl/src/cl_createkernel.c
@@ -29,7 +29,7 @@ static void soclCreateKernel_task(void *data) {
 
				       return;
			
 
				    }
			
 
				 
			
 
				-   DEBUG_MSG("[Device %d] Creating kernel...\n", starpu_worker_get_id_check());
			
 
				+   DEBUG_MSG("[Device %u] Creating kernel...\n", starpu_worker_get_id_check());
			
 
				    k->cl_kernels[range] = clCreateKernel(k->program->cl_programs[range], k->kernel_name, &err);
			
 
				    if (err != CL_SUCCESS) {
			
 
				       k->errcodes[range] = err;
			
--- a/socl/src/gc.c
+++ b/socl/src/gc.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010-2012 University of Bordeaux
			
 
				- * Copyright (C) 2012, 2014 CNRS
			
 
				+ * Copyright (C) 2012, 2014, 2017 CNRS
			
 
				  * Copyright (C) 2012 Vincent Danjean <Vincent.Danjean@ens-lyon.org>
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -108,7 +108,7 @@ void gc_stop(void) {
 
				 
			
 
				 int gc_entity_release_ex(entity e, const char * DEBUG_PARAM(caller)) {
			
 
				 
			
 
				-  DEBUG_MSG("[%s] Decrementing refcount of %s %p to ", caller, e->name, e);
			
 
				+  DEBUG_MSG("[%s] Decrementing refcount of %s %p to ", caller, e->name, (void *)e);
			
 
				 
			
 
				   /* Decrement reference count */
			
 
				   int refs = __sync_sub_and_fetch(&e->refs, 1);
			
@@ -120,7 +120,7 @@ int gc_entity_release_ex(entity e, const char * DEBUG_PARAM(caller)) {
 
				   if (refs != 0)
			
 
				     return 0;
			
 
				 
			
 
				-  DEBUG_MSG("[%s] Releasing %s %p\n", caller, e->name, e);
			
 
				+  DEBUG_MSG("[%s] Releasing %s %p\n", caller, e->name, (void *)e);
			
 
				 
			
 
				   GC_LOCK;
			
 
				 
			
@@ -209,7 +209,7 @@ void gc_print_remaining_entities(void) {
 
				 
			
 
				    entity e = entities;
			
 
				    while (e != NULL) {
			
 
				-      DEBUG_MSG("  - %s %p\n", e->name, e);
			
 
				+      DEBUG_MSG("  - %s %p\n", e->name, (void *)e);
			
 
				       e = e->next;
			
 
				    }
			
 
				 
			
--- a/socl/src/task.c
+++ b/socl/src/task.c
@@ -105,7 +105,7 @@ cl_int task_submit_ex(starpu_task task, cl_command cmd) {
 
				   gc_entity_release(ev);
			
 
				 
			
 
				   /* Submit task */
			
 
				-  int ret = (task->cl != NULL && task->cl->where == STARPU_OPENCL ?
			
 
				+  int ret = (task->cl != NULL && task->where == STARPU_OPENCL ?
			
 
				         starpu_task_submit_to_ctx(task, cmd->event->cq->context->sched_ctx) :
			
 
				         starpu_task_submit(task));
			
 
				 
			
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -168,6 +168,7 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 		\
 
				 	common/thread.c						\
			
 
				 	common/rbtree.c						\
			
 
				 	common/graph.c						\
			
 
				+	common/inlines.c					\
			
 
				 	core/jobs.c						\
			
 
				 	core/task.c						\
			
 
				 	core/task_bundle.c					\
			
@@ -317,6 +318,7 @@ endif
 
				 endif
			
 
				 
			
 
				 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/starpu_cublas.c
			
 
				+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/starpu_cusparse.c
			
 
				 
			
 
				 if STARPU_USE_OPENCL
			
 
				 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/opencl/driver_opencl.c
			
--- a/src/common/graph.c
+++ b/src/common/graph.c
@@ -1,6 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2016-2017  Université de Bordeaux
			
 
				+ * Copyright (C) 2017  Inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -28,6 +29,7 @@
 
				 #include <starpu.h>
			
 
				 #include <core/jobs.h>
			
 
				 #include <common/graph.h>
			
 
				+#include <core/workers.h>
			
 
				 
			
 
				 /* Protects the whole task graph except the dropped list */
			
 
				 static starpu_pthread_rwlock_t graph_lock;
			
@@ -60,7 +62,9 @@ void _starpu_graph_init(void)
 
				 /* LockWR the graph lock */
			
 
				 void _starpu_graph_wrlock(void)
			
 
				 {
			
 
				+	_starpu_worker_relax_on();
			
 
				 	STARPU_PTHREAD_RWLOCK_WRLOCK(&graph_lock);
			
 
				+	_starpu_worker_relax_off();
			
 
				 }
			
 
				 
			
 
				 void _starpu_graph_drop_node(struct _starpu_graph_node *node);
			
@@ -94,14 +98,18 @@ void _starpu_graph_drop_dropped_nodes(void)
 
				 /* UnlockWR the graph lock */
			
 
				 void _starpu_graph_wrunlock(void)
			
 
				 {
			
 
				+	_starpu_worker_relax_on();
			
 
				 	STARPU_PTHREAD_MUTEX_LOCK(&dropped_lock);
			
 
				+	_starpu_worker_relax_off();
			
 
				 	_starpu_graph_drop_dropped_nodes();
			
 
				 }
			
 
				 
			
 
				 /* LockRD the graph lock */
			
 
				 void _starpu_graph_rdlock(void)
			
 
				 {
			
 
				+	_starpu_worker_relax_on();
			
 
				 	STARPU_PTHREAD_RWLOCK_RDLOCK(&graph_lock);
			
 
				+	_starpu_worker_relax_off();
			
 
				 }
			
 
				 
			
 
				 /* UnlockRD the graph lock */
			
@@ -247,12 +255,16 @@ void _starpu_graph_drop_job(struct _starpu_job *job)
 
				 	if (!node)
			
 
				 		return;
			
 
				 
			
 
				+	_starpu_worker_relax_on();
			
 
				 	STARPU_PTHREAD_MUTEX_LOCK(&node->mutex);
			
 
				+	_starpu_worker_relax_off();
			
 
				 	/* Will not be able to use the job any more */
			
 
				 	node->job = NULL;
			
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK(&node->mutex);
			
 
				 
			
 
				+	_starpu_worker_relax_on();
			
 
				 	STARPU_PTHREAD_MUTEX_LOCK(&dropped_lock);
			
 
				+	_starpu_worker_relax_off();
			
 
				 	/* Queue for removal when lock becomes available */
			
 
				 	_starpu_graph_node_multilist_push_back_dropped(&dropped, node);
			
 
				 	if (STARPU_PTHREAD_RWLOCK_TRYWRLOCK(&graph_lock) == 0)
			
--- a/src/common/inlines.c
+++ b/src/common/inlines.c
@@ -0,0 +1,22 @@
 
				+/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+ *
			
 
				+ * Copyright (C) 2017  Université de Bordeaux
			
 
				+ *
			
 
				+ * StarPU is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * StarPU is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+/* This includes the inline definitions in a .c file so that they can also be
			
 
				+ * referenced from outside */
			
 
				+
			
 
				+#define LIST_INLINE
			
 
				+#define PRIO_LIST_INLINE
			
 
				+#include <core/task.h>
			
--- a/src/common/list.h
+++ b/src/common/list.h
@@ -126,6 +126,9 @@
 
				  */
			
 
				 
			
 
				 
			
 
				+#ifndef LIST_INLINE
			
 
				+#define LIST_INLINE static inline
			
 
				+#endif
			
 
				 
			
 
				 /**@hideinitializer
			
 
				  * Generates a new type for list of elements */
			
@@ -146,75 +149,76 @@
 
				  * The effective type declaration for lists */
			
 
				 #define LIST_CREATE_TYPE_NOSTRUCT(ENAME, _prev, _next) \
			
 
				   /** @internal */ \
			
 
				+ /* NOTE: this must not be greater than the struct defined in include/starpu_task_list.h */ \
			
 
				   struct ENAME##_list \
			
 
				   { \
			
 
				     struct ENAME *_head; /**< @internal head of the list */ \
			
 
				     struct ENAME *_tail; /**< @internal tail of the list */ \
			
 
				   }; \
			
 
				-  /** @internal */static inline struct ENAME *ENAME##_new(void) \
			
 
				+  /** @internal */LIST_INLINE struct ENAME *ENAME##_new(void) \
			
 
				     { struct ENAME *e; _STARPU_MALLOC(e, sizeof(struct ENAME)); \
			
 
				       e->_next = NULL; e->_prev = NULL; return e; } \
			
 
				-  /** @internal */static inline void ENAME##_delete(struct ENAME *e) \
			
 
				+  /** @internal */LIST_INLINE void ENAME##_delete(struct ENAME *e) \
			
 
				     { free(e); } \
			
 
				-  /** @internal */static inline void ENAME##_list_push_front(struct ENAME##_list *l, struct ENAME *e) \
			
 
				+  /** @internal */LIST_INLINE void ENAME##_list_push_front(struct ENAME##_list *l, struct ENAME *e) \
			
 
				     { if(l->_tail == NULL) l->_tail = e; else l->_head->_prev = e; \
			
 
				       e->_prev = NULL; e->_next = l->_head; l->_head = e; } \
			
 
				-  /** @internal */static inline void ENAME##_list_push_back(struct ENAME##_list *l, struct ENAME *e) \
			
 
				+  /** @internal */LIST_INLINE void ENAME##_list_push_back(struct ENAME##_list *l, struct ENAME *e) \
			
 
				     { if(l->_head == NULL) l->_head = e; else l->_tail->_next = e; \
			
 
				       e->_next = NULL; e->_prev = l->_tail; l->_tail = e; } \
			
 
				-  /** @internal */static inline void ENAME##_list_insert_before(struct ENAME##_list *l, struct ENAME *e, struct ENAME *o) \
			
 
				+  /** @internal */LIST_INLINE void ENAME##_list_insert_before(struct ENAME##_list *l, struct ENAME *e, struct ENAME *o) \
			
 
				     { struct ENAME *p = o->_prev; if (p) { p->_next = e; e->_prev = p; } else { l->_head = e; e->_prev = NULL; } \
			
 
				       e->_next = o; o->_prev = e; } \
			
 
				-  /** @internal */static inline void ENAME##_list_insert_after(struct ENAME##_list *l, struct ENAME *e, struct ENAME *o) \
			
 
				+  /** @internal */LIST_INLINE void ENAME##_list_insert_after(struct ENAME##_list *l, struct ENAME *e, struct ENAME *o) \
			
 
				     { struct ENAME *n = o->_next; if (n) { n->_prev = e; e->_next = n; } else { l->_tail = e; e->_next = NULL; } \
			
 
				       e->_prev = o; o->_next = e; } \
			
 
				-  /** @internal */static inline void ENAME##_list_push_list_front(struct ENAME##_list *l1, struct ENAME##_list *l2) \
			
 
				+  /** @internal */LIST_INLINE void ENAME##_list_push_list_front(struct ENAME##_list *l1, struct ENAME##_list *l2) \
			
 
				     { if (l2->_head == NULL) { l2->_head = l1->_head; l2->_tail = l1->_tail; } \
			
 
				       else if (l1->_head != NULL) { l1->_tail->_next = l2->_head; l2->_head->_prev = l1->_tail; l2->_head = l1->_head; } } \
			
 
				-  /** @internal */static inline void ENAME##_list_push_list_back(struct ENAME##_list *l1, struct ENAME##_list *l2) \
			
 
				+  /** @internal */LIST_INLINE void ENAME##_list_push_list_back(struct ENAME##_list *l1, struct ENAME##_list *l2) \
			
 
				     { if(l1->_head == NULL) { l1->_head = l2->_head; l1->_tail = l2->_tail; } \
			
 
				       else if (l2->_head != NULL) { l1->_tail->_next = l2->_head; l2->_head->_prev = l1->_tail; l1->_tail = l2->_tail; } } \
			
 
				-  /** @internal */static inline struct ENAME *ENAME##_list_front(const struct ENAME##_list *l) \
			
 
				+  /** @internal */LIST_INLINE struct ENAME *ENAME##_list_front(const struct ENAME##_list *l) \
			
 
				     { return l->_head; } \
			
 
				-  /** @internal */static inline struct ENAME *ENAME##_list_back(const struct ENAME##_list *l) \
			
 
				+  /** @internal */LIST_INLINE struct ENAME *ENAME##_list_back(const struct ENAME##_list *l) \
			
 
				     { return l->_tail; } \
			
 
				-  /** @internal */static inline void ENAME##_list_init(struct ENAME##_list *l) \
			
 
				+  /** @internal */LIST_INLINE void ENAME##_list_init(struct ENAME##_list *l) \
			
 
				     { l->_head=NULL; l->_tail=l->_head; } \
			
 
				-  /** @internal */static inline struct ENAME##_list *ENAME##_list_new(void) \
			
 
				+  /** @internal */LIST_INLINE struct ENAME##_list *ENAME##_list_new(void) \
			
 
				     { struct ENAME##_list *l; _STARPU_MALLOC(l, sizeof(struct ENAME##_list)); \
			
 
				       ENAME##_list_init(l); return l; } \
			
 
				-  /** @internal */static inline int ENAME##_list_empty(const struct ENAME##_list *l) \
			
 
				+  /** @internal */LIST_INLINE int ENAME##_list_empty(const struct ENAME##_list *l) \
			
 
				     { return (l->_head == NULL); } \
			
 
				-  /** @internal */static inline void ENAME##_list_delete(struct ENAME##_list *l) \
			
 
				+  /** @internal */LIST_INLINE void ENAME##_list_delete(struct ENAME##_list *l) \
			
 
				     { free(l); } \
			
 
				-  /** @internal */static inline void ENAME##_list_erase(struct ENAME##_list *l, struct ENAME *c) \
			
 
				+  /** @internal */LIST_INLINE void ENAME##_list_erase(struct ENAME##_list *l, struct ENAME *c) \
			
 
				     { struct ENAME *p = c->_prev; if(p) p->_next = c->_next; else l->_head = c->_next; \
			
 
				       if(c->_next) c->_next->_prev = p; else l->_tail = p; } \
			
 
				-  /** @internal */static inline struct ENAME *ENAME##_list_pop_front(struct ENAME##_list *l) \
			
 
				+  /** @internal */LIST_INLINE struct ENAME *ENAME##_list_pop_front(struct ENAME##_list *l) \
			
 
				     { struct ENAME *e = ENAME##_list_front(l); \
			
 
				       ENAME##_list_erase(l, e); return e; } \
			
 
				-  /** @internal */static inline struct ENAME *ENAME##_list_pop_back(struct ENAME##_list *l) \
			
 
				+  /** @internal */LIST_INLINE struct ENAME *ENAME##_list_pop_back(struct ENAME##_list *l) \
			
 
				     { struct ENAME *e = ENAME##_list_back(l); \
			
 
				       ENAME##_list_erase(l, e); return e; } \
			
 
				-  /** @internal */static inline struct ENAME *ENAME##_list_begin(const struct ENAME##_list *l) \
			
 
				+  /** @internal */LIST_INLINE struct ENAME *ENAME##_list_begin(const struct ENAME##_list *l) \
			
 
				     { return l->_head; } \
			
 
				-  /** @internal */static inline struct ENAME *ENAME##_list_end(const struct ENAME##_list *l STARPU_ATTRIBUTE_UNUSED) \
			
 
				+  /** @internal */LIST_INLINE struct ENAME *ENAME##_list_end(const struct ENAME##_list *l STARPU_ATTRIBUTE_UNUSED) \
			
 
				     { return NULL; } \
			
 
				-  /** @internal */static inline struct ENAME *ENAME##_list_next(const struct ENAME *i) \
			
 
				+  /** @internal */LIST_INLINE struct ENAME *ENAME##_list_next(const struct ENAME *i) \
			
 
				     { return i->_next; } \
			
 
				-  /** @internal */static inline struct ENAME *ENAME##_list_last(const struct ENAME##_list *l) \
			
 
				+  /** @internal */LIST_INLINE struct ENAME *ENAME##_list_last(const struct ENAME##_list *l) \
			
 
				     { return l->_tail; } \
			
 
				-  /** @internal */static inline struct ENAME *ENAME##_list_alpha(const struct ENAME##_list *l STARPU_ATTRIBUTE_UNUSED) \
			
 
				+  /** @internal */LIST_INLINE struct ENAME *ENAME##_list_alpha(const struct ENAME##_list *l STARPU_ATTRIBUTE_UNUSED) \
			
 
				     { return NULL; } \
			
 
				-  /** @internal */static inline struct ENAME *ENAME##_list_prev(const struct ENAME *i) \
			
 
				+  /** @internal */LIST_INLINE struct ENAME *ENAME##_list_prev(const struct ENAME *i) \
			
 
				     { return i->_prev; } \
			
 
				-  /** @internal */static inline int ENAME##_list_ismember(const struct ENAME##_list *l, const struct ENAME *e) \
			
 
				+  /** @internal */LIST_INLINE int ENAME##_list_ismember(const struct ENAME##_list *l, const struct ENAME *e) \
			
 
				     { struct ENAME *i=l->_head; while(i!=NULL){ if (i == e) return 1; i=i->_next; } return 0; } \
			
 
				-  /** @internal */static inline int ENAME##_list_member(const struct ENAME##_list *l, const struct ENAME *e) \
			
 
				+  /** @internal */LIST_INLINE int ENAME##_list_member(const struct ENAME##_list *l, const struct ENAME *e) \
			
 
				     { struct ENAME *i=l->_head; int k=0; while(i!=NULL){if (i == e) return k; k++; i=i->_next; } return -1; } \
			
 
				-  /** @internal */static inline int ENAME##_list_size(const struct ENAME##_list *l) \
			
 
				+  /** @internal */LIST_INLINE int ENAME##_list_size(const struct ENAME##_list *l) \
			
 
				     { struct ENAME *i=l->_head; int k=0; while(i!=NULL){k++;i=i->_next;} return k; } \
			
 
				-  /** @internal */static inline int ENAME##_list_check(const struct ENAME##_list *l) \
			
 
				+  /** @internal */LIST_INLINE int ENAME##_list_check(const struct ENAME##_list *l) \
			
 
				     { struct ENAME *i=l->_head; while(i) \
			
 
				     { if ((i->_next == NULL) && i != l->_tail) return 0; \
			
 
				       if (i->_next == i) return 0; \
			
@@ -247,18 +251,18 @@ struct ENAME##_multilist_##MEMBER { \
 
				 /* Create the inlines */
			
 
				 #define MULTILIST_CREATE_INLINES(TYPE, ENAME, MEMBER) \
			
 
				 /* Cast from list element to real type.  */ \
			
 
				-static inline TYPE *ENAME##_of_multilist_##MEMBER(struct ENAME##_multilist_##MEMBER *elt) { \
			
 
				+LIST_INLINE TYPE *ENAME##_of_multilist_##MEMBER(struct ENAME##_multilist_##MEMBER *elt) { \
			
 
				 	return ((TYPE *) ((uintptr_t) (elt) - ((uintptr_t) (&((TYPE *) 0)->MEMBER)))); \
			
 
				 } \
			
 
				 \
			
 
				 /* Initialize a list head.  */ \
			
 
				-static inline void ENAME##_multilist_init_##MEMBER(struct ENAME##_multilist_##MEMBER *head) { \
			
 
				+LIST_INLINE void ENAME##_multilist_init_##MEMBER(struct ENAME##_multilist_##MEMBER *head) { \
			
 
				 	head->next = head; \
			
 
				 	head->prev = head; \
			
 
				 } \
			
 
				 \
			
 
				 /* Push element to head of a list.  */ \
			
 
				-static inline void ENAME##_multilist_push_front_##MEMBER(struct ENAME##_multilist_##MEMBER *head, TYPE *e) { \
			
 
				+LIST_INLINE void ENAME##_multilist_push_front_##MEMBER(struct ENAME##_multilist_##MEMBER *head, TYPE *e) { \
			
 
				 	STARPU_ASSERT_MULTILIST(e->MEMBER.prev == NULL); \
			
 
				 	STARPU_ASSERT_MULTILIST(e->MEMBER.next == NULL); \
			
 
				 	e->MEMBER.next = head->next; \
			
@@ -268,7 +272,7 @@ static inline void ENAME##_multilist_push_front_##MEMBER(struct ENAME##_multilis
 
				 } \
			
 
				 \
			
 
				 /* Push element to tail of a list.  */ \
			
 
				-static inline void ENAME##_multilist_push_back_##MEMBER(struct ENAME##_multilist_##MEMBER *head, TYPE *e) { \
			
 
				+LIST_INLINE void ENAME##_multilist_push_back_##MEMBER(struct ENAME##_multilist_##MEMBER *head, TYPE *e) { \
			
 
				 	STARPU_ASSERT_MULTILIST(e->MEMBER.prev == NULL); \
			
 
				 	STARPU_ASSERT_MULTILIST(e->MEMBER.next == NULL); \
			
 
				 	e->MEMBER.prev = head->prev; \
			
@@ -278,7 +282,7 @@ static inline void ENAME##_multilist_push_back_##MEMBER(struct ENAME##_multilist
 
				 } \
			
 
				 \
			
 
				 /* Erase element from a list.  */ \
			
 
				-static inline void ENAME##_multilist_erase_##MEMBER(struct ENAME##_multilist_##MEMBER *head STARPU_ATTRIBUTE_UNUSED, TYPE *e) { \
			
 
				+LIST_INLINE void ENAME##_multilist_erase_##MEMBER(struct ENAME##_multilist_##MEMBER *head STARPU_ATTRIBUTE_UNUSED, TYPE *e) { \
			
 
				 	STARPU_ASSERT_MULTILIST(e->MEMBER.next->prev == &e->MEMBER); \
			
 
				 	e->MEMBER.next->prev = e->MEMBER.prev; \
			
 
				 	STARPU_ASSERT_MULTILIST(e->MEMBER.prev->next == &e->MEMBER); \
			
@@ -288,30 +292,30 @@ static inline void ENAME##_multilist_erase_##MEMBER(struct ENAME##_multilist_##M
 
				 } \
			
 
				 \
			
 
				 /* Test whether the element was queued on the list.  */ \
			
 
				-static inline int ENAME##_multilist_queued_##MEMBER(TYPE *e) { \
			
 
				+LIST_INLINE int ENAME##_multilist_queued_##MEMBER(TYPE *e) { \
			
 
				 	return ((e)->MEMBER.next != NULL); \
			
 
				 } \
			
 
				 \
			
 
				 /* Test whether the list is empty.  */ \
			
 
				-static inline int ENAME##_multilist_empty_##MEMBER(struct ENAME##_multilist_##MEMBER *head) { \
			
 
				+LIST_INLINE int ENAME##_multilist_empty_##MEMBER(struct ENAME##_multilist_##MEMBER *head) { \
			
 
				 	return head->next == head; \
			
 
				 } \
			
 
				 \
			
 
				 /* Return the first element of the list.  */ \
			
 
				-static inline TYPE *ENAME##_multilist_begin_##MEMBER(struct ENAME##_multilist_##MEMBER *head) { \
			
 
				+LIST_INLINE TYPE *ENAME##_multilist_begin_##MEMBER(struct ENAME##_multilist_##MEMBER *head) { \
			
 
				 	return ENAME##_of_multilist_##MEMBER(head->next); \
			
 
				 } \
			
 
				 /* Return the value to be tested at the end of the list.  */ \
			
 
				-static inline TYPE *ENAME##_multilist_end_##MEMBER(struct ENAME##_multilist_##MEMBER *head) { \
			
 
				+LIST_INLINE TYPE *ENAME##_multilist_end_##MEMBER(struct ENAME##_multilist_##MEMBER *head) { \
			
 
				 	return ENAME##_of_multilist_##MEMBER(head); \
			
 
				 } \
			
 
				 /* Return the next element of the list.  */ \
			
 
				-static inline TYPE *ENAME##_multilist_next_##MEMBER(TYPE *e) { \
			
 
				+LIST_INLINE TYPE *ENAME##_multilist_next_##MEMBER(TYPE *e) { \
			
 
				 	return ENAME##_of_multilist_##MEMBER(e->MEMBER.next); \
			
 
				 } \
			
 
				 \
			
 
				  /* Move a list from its head to another head.  */ \
			
 
				-static inline void ENAME##_multilist_move_##MEMBER(struct ENAME##_multilist_##MEMBER *head, struct ENAME##_multilist_##MEMBER *newhead) { \
			
 
				+LIST_INLINE void ENAME##_multilist_move_##MEMBER(struct ENAME##_multilist_##MEMBER *head, struct ENAME##_multilist_##MEMBER *newhead) { \
			
 
				 	if (ENAME##_multilist_empty_##MEMBER(head)) \
			
 
				 		ENAME##_multilist_init_##MEMBER(newhead); \
			
 
				 	else { \
			
--- a/src/common/prio_list.h
+++ b/src/common/prio_list.h
@@ -37,7 +37,7 @@
 
				  *   * Test that the priority list is empty
			
 
				  * void FOO_prio_list_empty(struct FOO_prio_list*)
			
 
				  *   * Erase element from the priority list
			
 
				- * void FOO_prio_list_empty(struct FOO_prio_list*, struct FOO*)
			
 
				+ * void FOO_prio_list_erase(struct FOO_prio_list*, struct FOO*)
			
 
				  *   * Return and erase the first element of the priority list
			
 
				  * void FOO_prio_list_pop_front(struct FOO_prio_list*)
			
 
				  *   * Catenate second priority list at ends of the first priority list
			
@@ -66,6 +66,10 @@
 
				 
			
 
				 #include <common/rbtree.h>
			
 
				 
			
 
				+#ifndef PRIO_LIST_INLINE
			
 
				+#define PRIO_LIST_INLINE static inline
			
 
				+#endif
			
 
				+
			
 
				 #define PRIO_LIST_TYPE(ENAME, PRIOFIELD) \
			
 
				 	PRIO_LIST_CREATE_TYPE(ENAME, PRIOFIELD)
			
 
				 
			
@@ -83,22 +87,22 @@
 
				 		int prio; \
			
 
				 		struct ENAME##_list list; \
			
 
				 	}; \
			
 
				-	static inline struct ENAME##_prio_list_stage *ENAME##_node_to_list_stage(struct starpu_rbtree_node *node) \
			
 
				+	PRIO_LIST_INLINE struct ENAME##_prio_list_stage *ENAME##_node_to_list_stage(struct starpu_rbtree_node *node) \
			
 
				 	{ \
			
 
				 		/* This assumes node is first member of stage */ \
			
 
				 		return (struct ENAME##_prio_list_stage *) node; \
			
 
				 	} \
			
 
				-	static inline const struct ENAME##_prio_list_stage *ENAME##_node_to_list_stage_const(const struct starpu_rbtree_node *node) \
			
 
				+	PRIO_LIST_INLINE const struct ENAME##_prio_list_stage *ENAME##_node_to_list_stage_const(const struct starpu_rbtree_node *node) \
			
 
				 	{ \
			
 
				 		/* This assumes node is first member of stage */ \
			
 
				 		return (struct ENAME##_prio_list_stage *) node; \
			
 
				 	} \
			
 
				-	static inline void ENAME##_prio_list_init(struct ENAME##_prio_list *priolist) \
			
 
				+	PRIO_LIST_INLINE void ENAME##_prio_list_init(struct ENAME##_prio_list *priolist) \
			
 
				 	{ \
			
 
				 		starpu_rbtree_init(&priolist->tree); \
			
 
				 		priolist->empty = 1; \
			
 
				 	} \
			
 
				-	static inline void ENAME##_prio_list_deinit(struct ENAME##_prio_list *priolist) \
			
 
				+	PRIO_LIST_INLINE void ENAME##_prio_list_deinit(struct ENAME##_prio_list *priolist) \
			
 
				 	{ \
			
 
				 		if (starpu_rbtree_empty(&priolist->tree)) \
			
 
				 			return; \
			
@@ -109,13 +113,13 @@
 
				 		starpu_rbtree_remove(&priolist->tree, root); \
			
 
				 		free(stage); \
			
 
				 	} \
			
 
				-	static inline int ENAME##_prio_list_cmp_fn(int prio, const struct starpu_rbtree_node *node) \
			
 
				+	PRIO_LIST_INLINE int ENAME##_prio_list_cmp_fn(int prio, const struct starpu_rbtree_node *node) \
			
 
				 	{ \
			
 
				 		/* Sort by decreasing order */ \
			
 
				 		const struct ENAME##_prio_list_stage *e2 = ENAME##_node_to_list_stage_const(node); \
			
 
				 		return (e2->prio - prio); \
			
 
				 	} \
			
 
				-	static inline struct ENAME##_prio_list_stage *ENAME##_prio_list_add(struct ENAME##_prio_list *priolist, int prio) \
			
 
				+	PRIO_LIST_INLINE struct ENAME##_prio_list_stage *ENAME##_prio_list_add(struct ENAME##_prio_list *priolist, int prio) \
			
 
				 	{ \
			
 
				 		unsigned long slot; \
			
 
				 		struct starpu_rbtree_node *node; \
			
@@ -132,25 +136,25 @@
 
				 		} \
			
 
				 		return stage; \
			
 
				 	} \
			
 
				-	static inline void ENAME##_prio_list_push_back(struct ENAME##_prio_list *priolist, struct ENAME *e) \
			
 
				+	PRIO_LIST_INLINE void ENAME##_prio_list_push_back(struct ENAME##_prio_list *priolist, struct ENAME *e) \
			
 
				 	{ \
			
 
				 		struct ENAME##_prio_list_stage *stage = ENAME##_prio_list_add(priolist, e->PRIOFIELD); \
			
 
				 		ENAME##_list_push_back(&stage->list, e); \
			
 
				 		priolist->empty = 0; \
			
 
				 	} \
			
 
				-	static inline void ENAME##_prio_list_push_front(struct ENAME##_prio_list *priolist, struct ENAME *e) \
			
 
				+	PRIO_LIST_INLINE void ENAME##_prio_list_push_front(struct ENAME##_prio_list *priolist, struct ENAME *e) \
			
 
				 	{ \
			
 
				 		struct ENAME##_prio_list_stage *stage = ENAME##_prio_list_add(priolist, e->PRIOFIELD); \
			
 
				 		ENAME##_list_push_front(&stage->list, e); \
			
 
				 		priolist->empty = 0; \
			
 
				 	} \
			
 
				-	static inline int ENAME##_prio_list_empty(const struct ENAME##_prio_list *priolist) \
			
 
				+	PRIO_LIST_INLINE int ENAME##_prio_list_empty(const struct ENAME##_prio_list *priolist) \
			
 
				 	{ \
			
 
				 		return priolist->empty; \
			
 
				 	} \
			
 
				 	/* Version of list_empty which does not use the cached empty flag,
			
 
				 	 * typically used to compute the value of the flag */ \
			
 
				-	static inline int ENAME##_prio_list_empty_slow(const struct ENAME##_prio_list *priolist) \
			
 
				+	PRIO_LIST_INLINE int ENAME##_prio_list_empty_slow(const struct ENAME##_prio_list *priolist) \
			
 
				 	{ \
			
 
				 		if (starpu_rbtree_empty(&priolist->tree)) \
			
 
				 			return 1; \
			
@@ -161,7 +165,7 @@
 
				 			return 1; \
			
 
				 		return 0; \
			
 
				 	} \
			
 
				-	static inline void ENAME##_prio_list_erase(struct ENAME##_prio_list *priolist, struct ENAME *e) \
			
 
				+	PRIO_LIST_INLINE void ENAME##_prio_list_erase(struct ENAME##_prio_list *priolist, struct ENAME *e) \
			
 
				 	{ \
			
 
				 		struct starpu_rbtree_node *node = starpu_rbtree_lookup(&priolist->tree, e->PRIOFIELD, ENAME##_prio_list_cmp_fn); \
			
 
				 		struct ENAME##_prio_list_stage *stage = ENAME##_node_to_list_stage(node); \
			
@@ -176,7 +180,7 @@
 
				 			priolist->empty = ENAME##_prio_list_empty_slow(priolist); \
			
 
				 		} \
			
 
				 	} \
			
 
				-	static inline int ENAME##_prio_list_get_next_nonempty_stage(struct ENAME##_prio_list *priolist, struct starpu_rbtree_node *node, struct starpu_rbtree_node **pnode, struct ENAME##_prio_list_stage **pstage) \
			
 
				+	PRIO_LIST_INLINE int ENAME##_prio_list_get_next_nonempty_stage(struct ENAME##_prio_list *priolist, struct starpu_rbtree_node *node, struct starpu_rbtree_node **pnode, struct ENAME##_prio_list_stage **pstage) \
			
 
				 	{ \
			
 
				 		struct ENAME##_prio_list_stage *stage; \
			
 
				 		while(1) { \
			
@@ -201,12 +205,12 @@
 
				 		*pstage = stage; \
			
 
				 		return 1; \
			
 
				 	} \
			
 
				-	static inline int ENAME##_prio_list_get_first_nonempty_stage(struct ENAME##_prio_list *priolist, struct starpu_rbtree_node **pnode, struct ENAME##_prio_list_stage **pstage) \
			
 
				+	PRIO_LIST_INLINE int ENAME##_prio_list_get_first_nonempty_stage(struct ENAME##_prio_list *priolist, struct starpu_rbtree_node **pnode, struct ENAME##_prio_list_stage **pstage) \
			
 
				 	{ \
			
 
				 		struct starpu_rbtree_node *node = starpu_rbtree_first(&priolist->tree); \
			
 
				 		return ENAME##_prio_list_get_next_nonempty_stage(priolist, node, pnode, pstage); \
			
 
				 	} \
			
 
				-	static inline struct ENAME *ENAME##_prio_list_pop_front(struct ENAME##_prio_list *priolist) \
			
 
				+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_pop_front(struct ENAME##_prio_list *priolist) \
			
 
				 	{ \
			
 
				 		struct starpu_rbtree_node *node; \
			
 
				 		struct ENAME##_prio_list_stage *stage; \
			
@@ -225,7 +229,7 @@
 
				 		} \
			
 
				 		return ret; \
			
 
				 	} \
			
 
				-	static inline int ENAME##_prio_list_get_prev_nonempty_stage(struct ENAME##_prio_list *priolist, struct starpu_rbtree_node *node, struct starpu_rbtree_node **pnode, struct ENAME##_prio_list_stage **pstage) \
			
 
				+	PRIO_LIST_INLINE int ENAME##_prio_list_get_prev_nonempty_stage(struct ENAME##_prio_list *priolist, struct starpu_rbtree_node *node, struct starpu_rbtree_node **pnode, struct ENAME##_prio_list_stage **pstage) \
			
 
				 	{ \
			
 
				 		struct ENAME##_prio_list_stage *stage; \
			
 
				 		while(1) { \
			
@@ -250,12 +254,12 @@
 
				 		*pstage = stage; \
			
 
				 		return 1; \
			
 
				 	} \
			
 
				-	static inline int ENAME##_prio_list_get_last_nonempty_stage(struct ENAME##_prio_list *priolist, struct starpu_rbtree_node **pnode, struct ENAME##_prio_list_stage **pstage) \
			
 
				+	PRIO_LIST_INLINE int ENAME##_prio_list_get_last_nonempty_stage(struct ENAME##_prio_list *priolist, struct starpu_rbtree_node **pnode, struct ENAME##_prio_list_stage **pstage) \
			
 
				 	{ \
			
 
				 		struct starpu_rbtree_node *node = starpu_rbtree_last(&priolist->tree); \
			
 
				 		return ENAME##_prio_list_get_prev_nonempty_stage(priolist, node, pnode, pstage); \
			
 
				 	} \
			
 
				-	static inline struct ENAME *ENAME##_prio_list_pop_back(struct ENAME##_prio_list *priolist) \
			
 
				+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_pop_back(struct ENAME##_prio_list *priolist) \
			
 
				 	{ \
			
 
				 		struct starpu_rbtree_node *node; \
			
 
				 		struct ENAME##_prio_list_stage *stage; \
			
@@ -274,7 +278,7 @@
 
				 		} \
			
 
				 		return ret; \
			
 
				 	} \
			
 
				-	static inline void ENAME##_prio_list_push_prio_list_back(struct ENAME##_prio_list *priolist, struct ENAME##_prio_list *priolist_toadd) \
			
 
				+	PRIO_LIST_INLINE void ENAME##_prio_list_push_prio_list_back(struct ENAME##_prio_list *priolist, struct ENAME##_prio_list *priolist_toadd) \
			
 
				 	{ \
			
 
				 		struct starpu_rbtree_node *node_toadd, *tmp; \
			
 
				 		starpu_rbtree_for_each_remove(&priolist_toadd->tree, node_toadd, tmp) { \
			
@@ -306,7 +310,7 @@
 
				 			} \
			
 
				 		} \
			
 
				 	} \
			
 
				-	static inline int ENAME##_prio_list_ismember(const struct ENAME##_prio_list *priolist, const struct ENAME *e) \
			
 
				+	PRIO_LIST_INLINE int ENAME##_prio_list_ismember(const struct ENAME##_prio_list *priolist, const struct ENAME *e) \
			
 
				 	{ \
			
 
				 		struct starpu_rbtree_node *node = starpu_rbtree_lookup(&priolist->tree, e->PRIOFIELD, ENAME##_prio_list_cmp_fn); \
			
 
				 		if (node) { \
			
@@ -315,7 +319,7 @@
 
				 		} \
			
 
				 		return 0; \
			
 
				 	} \
			
 
				-	static inline struct ENAME *ENAME##_prio_list_begin(struct ENAME##_prio_list *priolist) \
			
 
				+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_begin(struct ENAME##_prio_list *priolist) \
			
 
				 	{ \
			
 
				 		struct starpu_rbtree_node *node; \
			
 
				 		struct ENAME##_prio_list_stage *stage; \
			
@@ -323,9 +327,9 @@
 
				 			return NULL; \
			
 
				 		return ENAME##_list_begin(&stage->list); \
			
 
				 	} \
			
 
				-	static inline struct ENAME *ENAME##_prio_list_end(struct ENAME##_prio_list *priolist STARPU_ATTRIBUTE_UNUSED) \
			
 
				+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_end(struct ENAME##_prio_list *priolist STARPU_ATTRIBUTE_UNUSED) \
			
 
				 	{ return NULL; } \
			
 
				-	static inline struct ENAME *ENAME##_prio_list_next(struct ENAME##_prio_list *priolist, const struct ENAME *i) \
			
 
				+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_next(struct ENAME##_prio_list *priolist, const struct ENAME *i) \
			
 
				 	{ \
			
 
				 		struct ENAME *next = ENAME##_list_next(i); \
			
 
				 		if (next != ENAME##_list_end(NULL)) \
			
@@ -337,7 +341,7 @@
 
				 			return NULL; \
			
 
				 		return ENAME##_list_begin(&stage->list); \
			
 
				 	} \
			
 
				-	static inline struct ENAME *ENAME##_prio_list_last(struct ENAME##_prio_list *priolist) \
			
 
				+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_last(struct ENAME##_prio_list *priolist) \
			
 
				 	{ \
			
 
				 		struct starpu_rbtree_node *node; \
			
 
				 		struct ENAME##_prio_list_stage *stage; \
			
@@ -345,9 +349,9 @@
 
				 			return NULL; \
			
 
				 		return ENAME##_list_last(&stage->list); \
			
 
				 	} \
			
 
				-	static inline struct ENAME *ENAME##_prio_list_alpha(struct ENAME##_prio_list *priolist STARPU_ATTRIBUTE_UNUSED) \
			
 
				+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_alpha(struct ENAME##_prio_list *priolist STARPU_ATTRIBUTE_UNUSED) \
			
 
				 	{ return NULL; } \
			
 
				-	static inline struct ENAME *ENAME##_prio_list_prev(struct ENAME##_prio_list *priolist, const struct ENAME *i) \
			
 
				+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_prev(struct ENAME##_prio_list *priolist, const struct ENAME *i) \
			
 
				 	{ \
			
 
				 		struct ENAME *next = ENAME##_list_prev(i); \
			
 
				 		if (next != ENAME##_list_alpha(NULL)) \
			
@@ -365,11 +369,11 @@
 
				 /* gdbinit can't recurse in a tree. Use a mere list in debugging mode.  */
			
 
				 #define PRIO_LIST_CREATE_TYPE(ENAME, PRIOFIELD) \
			
 
				 	struct ENAME##_prio_list { struct ENAME##_list list; }; \
			
 
				-	static inline void ENAME##_prio_list_init(struct ENAME##_prio_list *priolist) \
			
 
				+	PRIO_LIST_INLINE void ENAME##_prio_list_init(struct ENAME##_prio_list *priolist) \
			
 
				 	{ ENAME##_list_init(&(priolist)->list); } \
			
 
				-	static inline void ENAME##_prio_list_deinit(struct ENAME##_prio_list *priolist) \
			
 
				+	PRIO_LIST_INLINE void ENAME##_prio_list_deinit(struct ENAME##_prio_list *priolist) \
			
 
				 	{ (void) (priolist); /* ENAME##_list_deinit(&(priolist)->list); */ } \
			
 
				-	static inline void ENAME##_prio_list_push_back(struct ENAME##_prio_list *priolist, struct ENAME *e) \
			
 
				+	PRIO_LIST_INLINE void ENAME##_prio_list_push_back(struct ENAME##_prio_list *priolist, struct ENAME *e) \
			
 
				 	{ \
			
 
				 		struct ENAME *cur; \
			
 
				 		for (cur  = ENAME##_list_begin(&(priolist)->list); \
			
@@ -382,7 +386,7 @@
 
				 		else \
			
 
				 			ENAME##_list_insert_before(&(priolist)->list, (e), cur); \
			
 
				 	} \
			
 
				-	static inline void ENAME##_prio_list_push_front(struct ENAME##_prio_list *priolist, struct ENAME *e) \
			
 
				+	PRIO_LIST_INLINE void ENAME##_prio_list_push_front(struct ENAME##_prio_list *priolist, struct ENAME *e) \
			
 
				 	{ \
			
 
				 		struct ENAME *cur; \
			
 
				 		for (cur  = ENAME##_list_begin(&(priolist)->list); \
			
@@ -395,29 +399,29 @@
 
				 		else \
			
 
				 			ENAME##_list_insert_before(&(priolist)->list, (e), cur); \
			
 
				 	} \
			
 
				-	static inline int ENAME##_prio_list_empty(const struct ENAME##_prio_list *priolist) \
			
 
				+	PRIO_LIST_INLINE int ENAME##_prio_list_empty(const struct ENAME##_prio_list *priolist) \
			
 
				 	{ return ENAME##_list_empty(&(priolist)->list); } \
			
 
				-	static inline void ENAME##_prio_list_erase(struct ENAME##_prio_list *priolist, struct ENAME *e) \
			
 
				+	PRIO_LIST_INLINE void ENAME##_prio_list_erase(struct ENAME##_prio_list *priolist, struct ENAME *e) \
			
 
				 	{ ENAME##_list_erase(&(priolist)->list, (e)); } \
			
 
				-	static inline struct ENAME *ENAME##_prio_list_pop_front(struct ENAME##_prio_list *priolist) \
			
 
				+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_pop_front(struct ENAME##_prio_list *priolist) \
			
 
				 	{ return ENAME##_list_pop_front(&(priolist)->list); } \
			
 
				-	static inline struct ENAME *ENAME##_prio_list_pop_back(struct ENAME##_prio_list *priolist) \
			
 
				+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_pop_back(struct ENAME##_prio_list *priolist) \
			
 
				 	{ return ENAME##_list_pop_back(&(priolist)->list); } \
			
 
				-	static inline void ENAME##_prio_list_push_prio_list_back(struct ENAME##_prio_list *priolist, struct ENAME##_prio_list *priolist_toadd) \
			
 
				+	PRIO_LIST_INLINE void ENAME##_prio_list_push_prio_list_back(struct ENAME##_prio_list *priolist, struct ENAME##_prio_list *priolist_toadd) \
			
 
				 	{ ENAME##_list_push_list_back(&(priolist)->list, &(priolist_toadd)->list); } \
			
 
				-	static inline int ENAME##_prio_list_ismember(const struct ENAME##_prio_list *priolist, const struct ENAME *e) \
			
 
				+	PRIO_LIST_INLINE int ENAME##_prio_list_ismember(const struct ENAME##_prio_list *priolist, const struct ENAME *e) \
			
 
				 	{ return ENAME##_list_ismember(&(priolist)->list, (e)); } \
			
 
				-	static inline struct ENAME *ENAME##_prio_list_begin(struct ENAME##_prio_list *priolist) \
			
 
				+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_begin(struct ENAME##_prio_list *priolist) \
			
 
				 	{ return ENAME##_list_begin(&(priolist)->list); } \
			
 
				-	static inline struct ENAME *ENAME##_prio_list_end(struct ENAME##_prio_list *priolist) \
			
 
				+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_end(struct ENAME##_prio_list *priolist) \
			
 
				 	{ return ENAME##_list_end(&(priolist)->list); } \
			
 
				-	static inline struct ENAME *ENAME##_prio_list_next(struct ENAME##_prio_list *priolist STARPU_ATTRIBUTE_UNUSED, const struct ENAME *i) \
			
 
				+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_next(struct ENAME##_prio_list *priolist STARPU_ATTRIBUTE_UNUSED, const struct ENAME *i) \
			
 
				 	{ return ENAME##_list_next(i); } \
			
 
				-	static inline struct ENAME *ENAME##_prio_list_last(struct ENAME##_prio_list *priolist) \
			
 
				+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_last(struct ENAME##_prio_list *priolist) \
			
 
				 	{ return ENAME##_list_last(&(priolist)->list); } \
			
 
				-	static inline struct ENAME *ENAME##_prio_list_alpha(struct ENAME##_prio_list *priolist) \
			
 
				+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_alpha(struct ENAME##_prio_list *priolist) \
			
 
				 	{ return ENAME##_list_alpha(&(priolist)->list); } \
			
 
				-	static inline struct ENAME *ENAME##_prio_list_prev(struct ENAME##_prio_list *priolist STARPU_ATTRIBUTE_UNUSED, const struct ENAME *i) \
			
 
				+	PRIO_LIST_INLINE struct ENAME *ENAME##_prio_list_prev(struct ENAME##_prio_list *priolist STARPU_ATTRIBUTE_UNUSED, const struct ENAME *i) \
			
 
				 	{ return ENAME##_list_prev(i); } \
			
 
				 
			
 
				 #endif
			
--- a/src/common/thread.c
+++ b/src/common/thread.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2010, 2012-2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
			
 
				+ * Copyright (C) 2017  Inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -234,11 +235,21 @@ int starpu_pthread_key_delete(starpu_pthread_key_t key)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+/* We need it only when using smpi */
			
 
				+#pragma weak smpi_process_get_user_data
			
 
				+#if !HAVE_DECL_SMPI_PROCESS_SET_USER_DATA
			
 
				+extern void *smpi_process_get_user_data();
			
 
				+#endif
			
 
				+
			
 
				 int starpu_pthread_setspecific(starpu_pthread_key_t key, const void *pointer)
			
 
				 {
			
 
				 	void **array;
			
 
				-#ifdef STARPU_SIMGRID_HAVE_SIMIX_PROCESS_GET_CODE
			
 
				-	if ((SIMIX_process_get_code() == _starpu_mpi_simgrid_init) || (!strcmp(SIMIX_process_self_get_name(),"wait for mpi transfer")))
			
 
				+#ifdef HAVE_SMPI_PROCESS_SET_USER_DATA
			
 
				+	const char *process_name = SIMIX_process_self_get_name();
			
 
				+	char *end;
			
 
				+	/* Test whether it is an MPI rank */
			
 
				+	strtol(process_name, &end, 10);
			
 
				+	if (!*end || !strcmp(process_name, "wait for mpi transfer"))
			
 
				 		/* Special-case the SMPI process */
			
 
				 		array = smpi_process_get_user_data();
			
 
				 	else
			
@@ -251,9 +262,13 @@ int starpu_pthread_setspecific(starpu_pthread_key_t key, const void *pointer)
 
				 void* starpu_pthread_getspecific(starpu_pthread_key_t key)
			
 
				 {
			
 
				 	void **array;
			
 
				-#ifdef STARPU_SIMGRID_HAVE_SIMIX_PROCESS_GET_CODE
			
 
				-	if ((SIMIX_process_get_code() == _starpu_mpi_simgrid_init) || (!strcmp(SIMIX_process_self_get_name(),"wait for mpi transfer")))
			
 
				-		/* Special-case the SMPI process */
			
 
				+#ifdef HAVE_SMPI_PROCESS_SET_USER_DATA
			
 
				+	const char *process_name = SIMIX_process_self_get_name();
			
 
				+	char *end;
			
 
				+	/* Test whether it is an MPI rank */
			
 
				+	strtol(process_name, &end, 10);
			
 
				+	if (!*end || !strcmp(process_name, "wait for mpi transfer"))
			
 
				+		/* Special-case the SMPI processes */
			
 
				 		array = smpi_process_get_user_data();
			
 
				 	else
			
 
				 #endif
			
@@ -720,34 +735,17 @@ int starpu_pthread_barrier_wait(starpu_pthread_barrier_t *barrier)
 
				  * macros of course) which record when the mutex is held or not */
			
 
				 int starpu_pthread_mutex_lock_sched(starpu_pthread_mutex_t *mutex)
			
 
				 {
			
 
				-	int p_ret = starpu_pthread_mutex_lock(mutex);
			
 
				-	int workerid = starpu_worker_get_id();
			
 
				-	if(workerid != -1 && _starpu_worker_mutex_is_sched_mutex(workerid, mutex))
			
 
				-		_starpu_worker_set_flag_sched_mutex_locked(workerid, 1);
			
 
				-	return p_ret;
			
 
				+	return starpu_pthread_mutex_lock(mutex);
			
 
				 }
			
 
				 
			
 
				 int starpu_pthread_mutex_unlock_sched(starpu_pthread_mutex_t *mutex)
			
 
				 {
			
 
				-	int workerid = starpu_worker_get_id();
			
 
				-	if(workerid != -1 && _starpu_worker_mutex_is_sched_mutex(workerid, mutex))
			
 
				-		_starpu_worker_set_flag_sched_mutex_locked(workerid, 0);
			
 
				-
			
 
				 	return starpu_pthread_mutex_unlock(mutex);
			
 
				 }
			
 
				 
			
 
				 int starpu_pthread_mutex_trylock_sched(starpu_pthread_mutex_t *mutex)
			
 
				 {
			
 
				-	int ret = starpu_pthread_mutex_trylock(mutex);
			
 
				-
			
 
				-	if (!ret)
			
 
				-	{
			
 
				-		int workerid = starpu_worker_get_id();
			
 
				-		if(workerid != -1 && _starpu_worker_mutex_is_sched_mutex(workerid, mutex))
			
 
				-			_starpu_worker_set_flag_sched_mutex_locked(workerid, 1);
			
 
				-	}
			
 
				-
			
 
				-	return ret;
			
 
				+	return starpu_pthread_mutex_trylock(mutex);
			
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_DEBUG
			
--- a/src/common/utils.c
+++ b/src/common/utils.c
@@ -464,6 +464,10 @@ char *_starpu_get_home_path(void)
 
				 		static int warn;
			
 
				 		path = starpu_getenv("TMPDIR");
			
 
				 		if (!path)
			
 
				+			path = starpu_getenv("TEMP");
			
 
				+		if (!path)
			
 
				+			path = starpu_getenv("TMP");
			
 
				+		if (!path)
			
 
				 			path = "/tmp";
			
 
				 		if (!warn)
			
 
				 		{
			
--- a/src/core/combined_workers.c
+++ b/src/core/combined_workers.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2010-2015  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2014, 2016  CNRS
			
 
				+ * Copyright (C) 2017  Inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -160,7 +161,7 @@ int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[])
 
				 #endif
			
 
				 	}
			
 
				 
			
 
				-	starpu_sched_ctx_add_workers(&combined_worker_id, 1, STARPU_GLOBAL_SCHED_CTX);
			
 
				+	starpu_sched_ctx_add_combined_workers(&combined_worker_id, 1, STARPU_GLOBAL_SCHED_CTX);
			
 
				 
			
 
				 	return new_workerid;
			
 
				 }
			
--- a/src/core/dependencies/cg.c
+++ b/src/core/dependencies/cg.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010-2012, 2014-2017  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016, 2017  CNRS
			
 
				  * Copyright (C) 2012 INRIA
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -126,8 +126,7 @@ int _starpu_list_task_scheduled_successors_in_cg_list(struct _starpu_cg_list *su
 
				 		if (n < ndeps)
			
 
				 		{
			
 
				 			struct starpu_task *task = cg->succ.job->task;
			
 
				-			if (task->cl == NULL || task->cl->where == STARPU_NOWHERE
			
 
				-					|| task->execute_on_a_specific_worker)
			
 
				+			if (task->cl == NULL || task->where == STARPU_NOWHERE || task->execute_on_a_specific_worker)
			
 
				 				/* will not be scheduled */
			
 
				 				continue;
			
 
				 			task_array[n] = task;
			
--- a/src/core/errorcheck.h
+++ b/src/core/errorcheck.h
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009, 2010, 2014  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011  CNRS
			
 
				+ * Copyright (C) 2017  Inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -37,9 +38,7 @@ enum _starpu_worker_status
 
				 	/* while executing the scheduler code */
			
 
				 	STATUS_SCHEDULING,
			
 
				 	/* while sleeping because there is nothing to do */
			
 
				-	STATUS_SLEEPING,
			
 
				-	/* while a sleeping worker is about to wake up (to avoid waking twice for the same worker) */
			
 
				-	STATUS_WAKING_UP
			
 
				+	STATUS_SLEEPING
			
 
				 };
			
 
				 
			
 
				 struct _starpu_worker;
			
--- a/src/core/jobs.c
+++ b/src/core/jobs.c
@@ -1,9 +1,9 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009-2017  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
			
 
				  * Copyright (C) 2011  Télécom-SudParis
			
 
				- * Copyright (C) 2011, 2014, 2016  INRIA
			
 
				+ * Copyright (C) 2011, 2014, 2016-2017  INRIA
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -272,12 +272,6 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 
				 		0
			
 
				 #endif
			
 
				 		;
			
 
				-	/* Read cl fields before releasing dependencies, for the case of a
			
 
				-	 * switch_cl which is freed by data_unregister happening as soon as
			
 
				-	 * the dependencies are released.
			
 
				-	 */
			
 
				-	unsigned nowhere = !task->cl || task->cl->where == STARPU_NOWHERE;
			
 
				-
			
 
				 #ifdef STARPU_DEBUG
			
 
				 	STARPU_PTHREAD_MUTEX_LOCK(&all_jobs_list_mutex);
			
 
				 	_starpu_job_multilist_erase_all_submitted(&all_jobs_list, j);
			
@@ -345,6 +339,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 
				 	 * scheduler to process it : the task structure doesn't contain any valuable
			
 
				 	 * data as it's not linked to an actual worker */
			
 
				 	/* control task should not execute post_exec_hook */
			
 
				+	unsigned nowhere = !task->cl || task->cl->where == STARPU_NOWHERE || task->where == STARPU_NOWHERE;
			
 
				 	if(j->task_size == 1 && !nowhere && !j->internal
			
 
				 #ifdef STARPU_OPENMP
			
 
				 	/* If this is a continuation, we do not execute the post_exec_hook. The
			
@@ -711,10 +706,10 @@ int _starpu_push_local_task(struct _starpu_worker *worker, struct starpu_task *t
 
				 {
			
 
				 	/* Check that the worker is able to execute the task ! */
			
 
				 	STARPU_ASSERT(task && task->cl);
			
 
				-	if (STARPU_UNLIKELY(!(worker->worker_mask & task->cl->where)))
			
 
				+	if (STARPU_UNLIKELY(!(worker->worker_mask & task->where)))
			
 
				 		return -ENODEV;
			
 
				 
			
 
				-	STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
			
 
				+	_starpu_worker_lock(worker->workerid);
			
 
				 
			
 
				 	if (task->execute_on_a_specific_worker && task->workerorder)
			
 
				 	{
			
@@ -758,7 +753,7 @@ int _starpu_push_local_task(struct _starpu_worker *worker, struct starpu_task *t
 
				 
			
 
				 	starpu_wake_worker_locked(worker->workerid);
			
 
				 	starpu_push_task_end(task);
			
 
				-	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
			
 
				+	_starpu_worker_unlock(worker->workerid);
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
--- a/src/core/jobs.h
+++ b/src/core/jobs.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009-2016  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2013, 2014, 2015  CNRS
			
 
				+ * Copyright (C) 2010, 2011, 2013, 2014, 2015, 2017  CNRS
			
 
				  * Copyright (C) 2011  Télécom-SudParis
			
 
				  * Copyright (C) 2014  INRIA
			
 
				  *
			
@@ -51,11 +51,11 @@ struct _starpu_worker;
 
				 /* codelet function */
			
 
				 typedef void (*_starpu_cl_func_t)(void **, void *);
			
 
				 
			
 
				-#define _STARPU_CPU_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_CPU)
			
 
				-#define _STARPU_CUDA_MAY_PERFORM(j)      ((j)->task->cl->where & STARPU_CUDA)
			
 
				-#define _STARPU_OPENCL_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_OPENCL)
			
 
				-#define _STARPU_MIC_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_MIC)
			
 
				-#define _STARPU_SCC_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_SCC)
			
 
				+#define _STARPU_CPU_MAY_PERFORM(j)	((j)->task->where & STARPU_CPU)
			
 
				+#define _STARPU_CUDA_MAY_PERFORM(j)      ((j)->task->where & STARPU_CUDA)
			
 
				+#define _STARPU_OPENCL_MAY_PERFORM(j)	((j)->task->where & STARPU_OPENCL)
			
 
				+#define _STARPU_MIC_MAY_PERFORM(j)	((j)->task->where & STARPU_MIC)
			
 
				+#define _STARPU_SCC_MAY_PERFORM(j)	((j)->task->where & STARPU_SCC)
			
 
				 
			
 
				 struct _starpu_data_descr
			
 
				 {
			
--- a/src/core/perfmodel/multiple_regression.c
+++ b/src/core/perfmodel/multiple_regression.c
@@ -2,7 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009, 2010, 2011, 2015-2016  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2016  CNRS
			
 
				- * Copyright (C) 2016  Inria
			
 
				+ * Copyright (C) 2016-2017  Inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -265,7 +265,6 @@ int _starpu_multiple_regression(struct starpu_perfmodel_history_list *ptr, doubl
 
				 
			
 
				 	/* Computing number of rows */
			
 
				 	long n=find_long_list_size(ptr);
			
 
				-	STARPU_ASSERT(n);
			
 
				 
			
 
				         /* Reading old calibrations if necessary */
			
 
				 	FILE *f=NULL;
			
--- a/src/core/perfmodel/perfmodel.c
+++ b/src/core/perfmodel/perfmodel.c
@@ -181,6 +181,7 @@ void _starpu_init_and_load_perfmodel(struct starpu_perfmodel *model)
 
				 
			
 
				 static double starpu_model_expected_perf(struct starpu_task *task, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch,  unsigned nimpl)
			
 
				 {
			
 
				+	double exp_perf = 0.0;
			
 
				 	if (model)
			
 
				 	{
			
 
				 		_starpu_init_and_load_perfmodel(model);
			
@@ -190,24 +191,36 @@ static double starpu_model_expected_perf(struct starpu_task *task, struct starpu
 
				 		switch (model->type)
			
 
				 		{
			
 
				 			case STARPU_PER_ARCH:
			
 
				-				return per_arch_task_expected_perf(model, arch, task, nimpl);
			
 
				+				exp_perf = per_arch_task_expected_perf(model, arch, task, nimpl);
			
 
				+				STARPU_ASSERT_MSG(isnan(exp_perf)||exp_perf>=0,"exp_perf=%lf\n",exp_perf);
			
 
				+				break;
			
 
				 			case STARPU_COMMON:
			
 
				-				return common_task_expected_perf(model, arch, task, nimpl);
			
 
				+				exp_perf = common_task_expected_perf(model, arch, task, nimpl);
			
 
				+				STARPU_ASSERT_MSG(isnan(exp_perf)||exp_perf>=0,"exp_perf=%lf\n",exp_perf);
			
 
				+				break;
			
 
				 			case STARPU_HISTORY_BASED:
			
 
				-				return _starpu_history_based_job_expected_perf(model, arch, j, nimpl);
			
 
				+				exp_perf = _starpu_history_based_job_expected_perf(model, arch, j, nimpl);
			
 
				+				STARPU_ASSERT_MSG(isnan(exp_perf)||exp_perf>=0,"exp_perf=%lf\n",exp_perf);
			
 
				+				break;
			
 
				 			case STARPU_REGRESSION_BASED:
			
 
				-				return _starpu_regression_based_job_expected_perf(model, arch, j, nimpl);
			
 
				+				exp_perf = _starpu_regression_based_job_expected_perf(model, arch, j, nimpl);
			
 
				+				STARPU_ASSERT_MSG(isnan(exp_perf)||exp_perf>=0,"exp_perf=%lf\n",exp_perf);
			
 
				+				break;
			
 
				 			case STARPU_NL_REGRESSION_BASED:
			
 
				-				return _starpu_non_linear_regression_based_job_expected_perf(model, arch, j,nimpl);
			
 
				+				exp_perf = _starpu_non_linear_regression_based_job_expected_perf(model, arch, j,nimpl);
			
 
				+				STARPU_ASSERT_MSG(isnan(exp_perf)||exp_perf>=0,"exp_perf=%lf\n",exp_perf);
			
 
				+				break;
			
 
				 			case STARPU_MULTIPLE_REGRESSION_BASED:
			
 
				-				return _starpu_multiple_regression_based_job_expected_perf(model, arch, j, nimpl);
			
 
				+				exp_perf = _starpu_multiple_regression_based_job_expected_perf(model, arch, j, nimpl);
			
 
				+				STARPU_ASSERT_MSG(isnan(exp_perf)||exp_perf>=0,"exp_perf=%lf\n",exp_perf);
			
 
				+				break;
			
 
				 			default:
			
 
				 				STARPU_ABORT();
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				 	/* no model was found */
			
 
				-	return 0.0;
			
 
				+	return exp_perf;
			
 
				 }
			
 
				 
			
 
				 double starpu_task_expected_length(struct starpu_task *task, struct starpu_perfmodel_arch* arch, unsigned nimpl)
			
--- a/src/core/perfmodel/perfmodel_history.c
+++ b/src/core/perfmodel/perfmodel_history.c
@@ -242,6 +242,64 @@ static void insert_history_entry(struct starpu_perfmodel_history_entry *entry, s
 
				 }
			
 
				 
			
 
				 #ifndef STARPU_SIMGRID
			
 
				+static void check_reg_model(struct starpu_perfmodel *model, int comb, int impl)
			
 
				+{
			
 
				+	struct starpu_perfmodel_per_arch *per_arch_model = &model->state->per_arch[comb][impl];
			
 
				+	struct starpu_perfmodel_regression_model *reg_model = &per_arch_model->regression;
			
 
				+
			
 
				+	/*
			
 
				+	 * Linear Regression model
			
 
				+	 */
			
 
				+
			
 
				+	/* Unless we have enough measurements, we put NaN in the file to indicate the model is invalid */
			
 
				+	double alpha = nan(""), beta = nan("");
			
 
				+	if (model->type == STARPU_REGRESSION_BASED || model->type == STARPU_NL_REGRESSION_BASED)
			
 
				+	{
			
 
				+		if (reg_model->nsample > 1)
			
 
				+		{
			
 
				+			alpha = reg_model->alpha;
			
 
				+			beta = reg_model->beta;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	/* TODO: check:
			
 
				+	 * reg_model->sumlnx
			
 
				+	 * reg_model->sumlnx2
			
 
				+	 * reg_model->sumlny
			
 
				+	 * reg_model->sumlnxlny
			
 
				+	 * alpha
			
 
				+	 * beta
			
 
				+	 * reg_model->minx
			
 
				+	 * reg_model->maxx
			
 
				+	 */
			
 
				+	(void)alpha;
			
 
				+	(void)beta;
			
 
				+
			
 
				+	/*
			
 
				+	 * Non-Linear Regression model
			
 
				+	 */
			
 
				+
			
 
				+	double a = nan(""), b = nan(""), c = nan("");
			
 
				+
			
 
				+	if (model->type == STARPU_NL_REGRESSION_BASED)
			
 
				+		_starpu_regression_non_linear_power(per_arch_model->list, &a, &b, &c);
			
 
				+
			
 
				+	/* TODO: check:
			
 
				+	 * a
			
 
				+	 * b
			
 
				+	 * c
			
 
				+	 */
			
 
				+
			
 
				+	/*
			
 
				+	 * Multiple Regression Model
			
 
				+	 */
			
 
				+
			
 
				+	if (model->type == STARPU_MULTIPLE_REGRESSION_BASED)
			
 
				+	{
			
 
				+		/* TODO: check: */
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				 static void dump_reg_model(FILE *f, struct starpu_perfmodel *model, int comb, int impl)
			
 
				 {
			
 
				 	struct starpu_perfmodel_per_arch *per_arch_model;
			
@@ -416,6 +474,15 @@ static void scan_reg_model(FILE *f, const char *path, struct starpu_perfmodel_re
 
				 
			
 
				 
			
 
				 #ifndef STARPU_SIMGRID
			
 
				+static void check_history_entry(struct starpu_perfmodel_history_entry *entry)
			
 
				+{
			
 
				+	STARPU_ASSERT_MSG(entry->deviation >= 0, "entry=%p, entry->deviation=%lf\n", entry, entry->deviation);
			
 
				+	STARPU_ASSERT_MSG(entry->sum >= 0, "entry=%p, entry->sum=%lf\n", entry, entry->sum);
			
 
				+	STARPU_ASSERT_MSG(entry->sum2 >= 0, "entry=%p, entry->sum2=%lf\n", entry, entry->sum2);
			
 
				+	STARPU_ASSERT_MSG(entry->mean >= 0, "entry=%p, entry->mean=%lf\n", entry, entry->mean);
			
 
				+	STARPU_ASSERT_MSG(isnan(entry->flops)||entry->flops >= 0, "entry=%p, entry->flops=%lf\n", entry, entry->flops);
			
 
				+	STARPU_ASSERT_MSG(entry->duration >= 0, "entry=%p, entry->duration=%lf\n", entry, entry->duration);
			
 
				+}
			
 
				 static void dump_history_entry(FILE *f, struct starpu_perfmodel_history_entry *entry)
			
 
				 {
			
 
				 	fprintf(f, "%08x\t%-15lu\t%-15e\t%-15e\t%-15e\t%-15e\t%-15e\t%u\n", entry->footprint, (unsigned long) entry->size, entry->flops, entry->mean, entry->deviation, entry->sum, entry->sum2, entry->nsample);
			
@@ -458,6 +525,11 @@ static void scan_history_entry(FILE *f, const char *path, struct starpu_perfmode
 
				 
			
 
				 	if (entry)
			
 
				 	{
			
 
				+		STARPU_ASSERT_MSG(flops >=0, "Negative flops %lf in performance model file %s", flops, path);
			
 
				+		STARPU_ASSERT_MSG(mean >=0, "Negative mean %lf in performance model file %s", mean, path);
			
 
				+		STARPU_ASSERT_MSG(deviation >=0, "Negative deviation %lf in performance model file %s", deviation, path);
			
 
				+		STARPU_ASSERT_MSG(sum >=0, "Negative sum %lf in performance model file %s", sum, path);
			
 
				+		STARPU_ASSERT_MSG(sum2 >=0, "Negative sum2 %lf in performance model file %s", sum2, path);
			
 
				 		entry->footprint = footprint;
			
 
				 		entry->size = size;
			
 
				 		entry->flops = flops;
			
@@ -487,7 +559,7 @@ static void parse_per_arch_model_file(FILE *f, const char *path, struct starpu_p
 
				 		struct starpu_perfmodel_history_entry *entry = NULL;
			
 
				 		if (scan_history)
			
 
				 		{
			
 
				-			_STARPU_MALLOC(entry, sizeof(struct starpu_perfmodel_history_entry));
			
 
				+			_STARPU_CALLOC(entry, 1, sizeof(struct starpu_perfmodel_history_entry));
			
 
				 
			
 
				 			/* Tell  helgrind that we do not care about
			
 
				 			 * racing access to the sampling, we only want a
			
@@ -660,6 +732,43 @@ static int parse_model_file(FILE *f, const char *path, struct starpu_perfmodel *
 
				 }
			
 
				 
			
 
				 #ifndef STARPU_SIMGRID
			
 
				+static void check_per_arch_model(struct starpu_perfmodel *model, int comb, unsigned impl)
			
 
				+{
			
 
				+	struct starpu_perfmodel_per_arch *per_arch_model;
			
 
				+
			
 
				+	per_arch_model = &model->state->per_arch[comb][impl];
			
 
				+	/* count the number of elements in the lists */
			
 
				+	struct starpu_perfmodel_history_list *ptr = NULL;
			
 
				+	unsigned nentries = 0;
			
 
				+
			
 
				+	if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
			
 
				+	{
			
 
				+		/* Dump the list of all entries in the history */
			
 
				+		ptr = per_arch_model->list;
			
 
				+		while(ptr)
			
 
				+		{
			
 
				+			nentries++;
			
 
				+			ptr = ptr->next;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	/* header */
			
 
				+	char archname[32];
			
 
				+	starpu_perfmodel_get_arch_name(arch_combs[comb], archname,  32, impl);
			
 
				+	STARPU_ASSERT(strlen(archname)>0);
			
 
				+	check_reg_model(model, comb, impl);
			
 
				+
			
 
				+	/* Dump the history into the model file in case it is necessary */
			
 
				+	if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
			
 
				+	{
			
 
				+		ptr = per_arch_model->list;
			
 
				+		while (ptr)
			
 
				+		{
			
 
				+			check_history_entry(ptr->entry);
			
 
				+			ptr = ptr->next;
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				 static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, int comb, unsigned impl)
			
 
				 {
			
 
				 	struct starpu_perfmodel_per_arch *per_arch_model;
			
@@ -704,6 +813,39 @@ static void dump_per_arch_model_file(FILE *f, struct starpu_perfmodel *model, in
 
				 	fprintf(f, "\n");
			
 
				 }
			
 
				 
			
 
				+static void check_model(struct starpu_perfmodel *model)
			
 
				+{
			
 
				+	int ncombs = model->state->ncombs;
			
 
				+	STARPU_ASSERT(ncombs >= 0);
			
 
				+
			
 
				+	int i, impl, dev;
			
 
				+	for(i = 0; i < ncombs; i++)
			
 
				+	{
			
 
				+		int comb = model->state->combs[i];
			
 
				+		STARPU_ASSERT(comb >= 0);
			
 
				+
			
 
				+		int ndevices = arch_combs[comb]->ndevices;
			
 
				+		STARPU_ASSERT(ndevices >= 1);
			
 
				+
			
 
				+		for(dev = 0; dev < ndevices; dev++)
			
 
				+		{
			
 
				+			STARPU_ASSERT(arch_combs[comb]->devices[dev].type >= 0);
			
 
				+			STARPU_ASSERT(arch_combs[comb]->devices[dev].type <= 5);
			
 
				+
			
 
				+			STARPU_ASSERT(arch_combs[comb]->devices[dev].devid >= 0);
			
 
				+
			
 
				+			STARPU_ASSERT(arch_combs[comb]->devices[dev].ncores >= 0);
			
 
				+		}
			
 
				+
			
 
				+		int nimpls = model->state->nimpls[comb];
			
 
				+		STARPU_ASSERT(nimpls >= 1);
			
 
				+		for (impl = 0; impl < nimpls; impl++)
			
 
				+		{
			
 
				+			check_per_arch_model(model, comb, impl);
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				 static void dump_model_file(FILE *f, struct starpu_perfmodel *model)
			
 
				 {
			
 
				 	fprintf(f, "##################\n");
			
@@ -873,6 +1015,7 @@ static void save_history_based_model(struct starpu_perfmodel *model)
 
				 	STARPU_ASSERT_MSG(f, "Could not save performance model %s\n", path);
			
 
				 
			
 
				 	locked = _starpu_fwrlock(f) == 0;
			
 
				+	check_model(model);
			
 
				 	_starpu_fftruncate(f, 0);
			
 
				 	dump_model_file(f, model);
			
 
				 	if (locked)
			
@@ -1423,6 +1566,7 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, s
 
				 	history = per_arch_model->history;
			
 
				 	HASH_FIND_UINT32_T(history, &key, elt);
			
 
				 	entry = (elt == NULL) ? NULL : elt->history_entry;
			
 
				+	STARPU_ASSERT_MSG(!entry || entry->mean >= 0, "entry=%p, entry->mean=%lf\n", entry, entry?entry->mean:NAN);
			
 
				 	STARPU_PTHREAD_RWLOCK_UNLOCK(&model->state->model_rwlock);
			
 
				 
			
 
				 	/* Here helgrind would shout that this is unprotected access.
			
@@ -1430,10 +1574,13 @@ double _starpu_history_based_job_expected_perf(struct starpu_perfmodel *model, s
 
				 	 * a good-enough estimation */
			
 
				 
			
 
				 	if (entry && entry->nsample >= _starpu_calibration_minimum)
			
 
				+	{
			
 
				+		STARPU_ASSERT_MSG(entry->mean >= 0, "entry->mean=%lf\n", entry->mean);
			
 
				 		/* TODO: report differently if we've scheduled really enough
			
 
				 		 * of that task and the scheduler should perhaps put it aside */
			
 
				 		/* Calibrated enough */
			
 
				 		exp = entry->mean;
			
 
				+	}
			
 
				 
			
 
				 docal:
			
 
				 	STARPU_HG_DISABLE_CHECKING(model->benchmarking);
			
@@ -1447,6 +1594,7 @@ docal:
 
				 		model->benchmarking = 1;
			
 
				 	}
			
 
				 
			
 
				+	STARPU_ASSERT_MSG(isnan(exp)||exp >= 0, "exp=%lf\n", exp);
			
 
				 	return exp;
			
 
				 }
			
 
				 
			
@@ -1470,6 +1618,7 @@ int _starpu_perfmodel_create_comb_if_needed(struct starpu_perfmodel_arch* arch)
 
				 
			
 
				 void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfmodel *model, struct starpu_perfmodel_arch* arch, unsigned cpuid STARPU_ATTRIBUTE_UNUSED, double measured, unsigned impl)
			
 
				 {
			
 
				+	STARPU_ASSERT_MSG(measured >= 0, "measured=%lf\n", measured);
			
 
				 	if (model)
			
 
				 	{
			
 
				 		int c;
			
@@ -1526,7 +1675,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
				 			if (!entry)
			
 
				 			{
			
 
				 				/* this is the first entry with such a footprint */
			
 
				-				_STARPU_MALLOC(entry, sizeof(struct starpu_perfmodel_history_entry));
			
 
				+				_STARPU_CALLOC(entry, 1, sizeof(struct starpu_perfmodel_history_entry));
			
 
				 
			
 
				 				/* Tell  helgrind that we do not care about
			
 
				 				 * racing access to the sampling, we only want a
			
@@ -1585,7 +1734,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
				 
			
 
				 					unsigned n = entry->nsample;
			
 
				 					entry->mean = entry->sum / n;
			
 
				-					entry->deviation = sqrt((entry->sum2 - (entry->sum*entry->sum)/n)/n);
			
 
				+					entry->deviation = sqrt((fabs(entry->sum2 - (entry->sum*entry->sum))/n)/n);
			
 
				 				}
			
 
				 
			
 
				 				if (j->task->flops != 0.)
			
@@ -1645,6 +1794,7 @@ void _starpu_update_perfmodel_history(struct _starpu_job *j, struct starpu_perfm
 
				 			_STARPU_MALLOC(entry->parameters, model->nparameters*sizeof(double));
			
 
				 			model->parameters(j->task, entry->parameters);
			
 
				 			entry->tag = j->task->tag_id;
			
 
				+			STARPU_ASSERT(measured >= 0);
			
 
				 			entry->duration = measured;
			
 
				 
			
 
				 			struct starpu_perfmodel_history_list *link;
			
--- a/src/core/perfmodel/perfmodel_print.c
+++ b/src/core/perfmodel/perfmodel_print.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2011, 2013-2016  Université de Bordeaux
			
 
				- * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016  CNRS
			
 
				+ * Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
			
 
				  * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -39,7 +39,7 @@ void _starpu_perfmodel_print_history_based(struct starpu_perfmodel_per_arch *per
 
				 			if (!parameter)
			
 
				 			{
			
 
				 				/* There isn't a parameter that is explicitely requested, so we display all parameters */
			
 
				-				printf("%08x\t%-15lu\t%-15e\t%-15e\t%-15e\t%u\n", entry->footprint,
			
 
				+				fprintf(output, "%08x\t%-15lu\t%-15e\t%-15e\t%-15e\t%u\n", entry->footprint,
			
 
				 					(unsigned long) entry->size, entry->flops, entry->mean, entry->deviation, entry->nsample);
			
 
				 			}
			
 
				 			else
			
@@ -47,12 +47,12 @@ void _starpu_perfmodel_print_history_based(struct starpu_perfmodel_per_arch *per
 
				 				/* only display the parameter that was specifically requested */
			
 
				 				if (strcmp(parameter, "mean") == 0)
			
 
				 				{
			
 
				-					printf("%-15e\n", entry->mean);
			
 
				+					fprintf(output, "%-15e\n", entry->mean);
			
 
				 				}
			
 
				 
			
 
				 				if (strcmp(parameter, "stddev") == 0)
			
 
				 				{
			
 
				-					printf("%-15e\n", entry->deviation);
			
 
				+					fprintf(output, "%-15e\n", entry->deviation);
			
 
				 					return;
			
 
				 				}
			
 
				 			}
			
@@ -113,7 +113,7 @@ void starpu_perfmodel_print(struct starpu_perfmodel *model, struct starpu_perfmo
 
				 #if 0
			
 
				 		char debugname[1024];
			
 
				 		starpu_perfmodel_debugfilepath(model, arch, debugname, 1024, nimpl);
			
 
				-		printf("\t debug file path : %s\n", debugname);
			
 
				+		_STARPU_MSG("\t debug file path : %s\n", debugname);
			
 
				 #endif
			
 
				 	}
			
 
				 	else
			
@@ -121,31 +121,31 @@ void starpu_perfmodel_print(struct starpu_perfmodel *model, struct starpu_perfmo
 
				 		/* only display the parameter that was specifically requested */
			
 
				 		if (strcmp(parameter, "a") == 0)
			
 
				 		{
			
 
				-			printf("%e\n", arch_model->regression.a);
			
 
				+			fprintf(output, "%e\n", arch_model->regression.a);
			
 
				 			return;
			
 
				 		}
			
 
				 
			
 
				 		if (strcmp(parameter, "b") == 0)
			
 
				 		{
			
 
				-			printf("%e\n", arch_model->regression.b);
			
 
				+			fprintf(output, "%e\n", arch_model->regression.b);
			
 
				 			return;
			
 
				 		}
			
 
				 
			
 
				 		if (strcmp(parameter, "c") == 0)
			
 
				 		{
			
 
				-			printf("%e\n", arch_model->regression.c);
			
 
				+			fprintf(output, "%e\n", arch_model->regression.c);
			
 
				 			return;
			
 
				 		}
			
 
				 
			
 
				 		if (strcmp(parameter, "alpha") == 0)
			
 
				 		{
			
 
				-			printf("%e\n", arch_model->regression.alpha);
			
 
				+			fprintf(output, "%e\n", arch_model->regression.alpha);
			
 
				 			return;
			
 
				 		}
			
 
				 
			
 
				 		if (strcmp(parameter, "beta") == 0)
			
 
				 		{
			
 
				-			printf("%e\n", arch_model->regression.beta);
			
 
				+			fprintf(output, "%e\n", arch_model->regression.beta);
			
 
				 			return;
			
 
				 		}
			
 
				 
			
@@ -153,7 +153,7 @@ void starpu_perfmodel_print(struct starpu_perfmodel *model, struct starpu_perfmo
 
				 		{
			
 
				 			char debugname[256];
			
 
				 			starpu_perfmodel_debugfilepath(model, arch, debugname, 256, nimpl);
			
 
				-			printf("%s\n", debugname);
			
 
				+			fprintf(output, "%s\n", debugname);
			
 
				 			return;
			
 
				 		}
			
 
				 
			
@@ -165,8 +165,7 @@ void starpu_perfmodel_print(struct starpu_perfmodel *model, struct starpu_perfmo
 
				 
			
 
				 		/* TODO display if it's valid ? */
			
 
				 
			
 
				-		fprintf(output, "Unknown parameter requested, aborting.\n");
			
 
				-		exit(-1);
			
 
				+		_STARPU_ERROR("Unknown parameter requested, aborting.\n");
			
 
				 	}
			
 
				 }
			
 
				 
			
@@ -209,8 +208,7 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 
				 			/* For combined CPU workers */
			
 
				 			if ((k < 1) || (k > STARPU_MAXCPUS))
			
 
				 			{
			
 
				-				fprintf(output, "Invalid CPU size\n");
			
 
				-				exit(-1);
			
 
				+				_STARPU_ERROR("Invalid CPU size\n");
			
 
				 			}
			
 
				 
			
 
				 			int implid;
			
@@ -282,7 +280,7 @@ int starpu_perfmodel_print_all(struct starpu_perfmodel *model, char *arch, char
 
				 			return 0;
			
 
				 		}
			
 
				 
			
 
				-		fprintf(output, "Unknown architecture requested\n");
			
 
				+		_STARPU_MSG("Unknown architecture requested\n");
			
 
				 		return -1;
			
 
				 	}
			
 
				 	return 0;
			
--- a/src/core/sched_ctx.c
+++ b/src/core/sched_ctx.c
--- a/src/core/sched_ctx.h
+++ b/src/core/sched_ctx.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2011, 2013  INRIA
			
 
				+ * Copyright (C) 2011, 2013, 2017  INRIA
			
 
				  * Copyright (C) 2016  Uppsala University
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -74,27 +74,12 @@ struct _starpu_sched_ctx
 
				 	long iterations[2];
			
 
				 	int iteration_level;
			
 
				 
			
 
				-	/* cond to block push when there are no workers in the ctx */
			
 
				-	starpu_pthread_cond_t no_workers_cond;
			
 
				-
			
 
				-	/* mutex to block push when there are no workers in the ctx */
			
 
				-	starpu_pthread_mutex_t no_workers_mutex;
			
 
				-
			
 
				 	/*ready tasks that couldn't be pushed because the ctx has no workers*/
			
 
				 	struct starpu_task_list empty_ctx_tasks;
			
 
				 
			
 
				-	/* mutext protecting empty_ctx_tasks list */
			
 
				-	starpu_pthread_mutex_t empty_ctx_mutex;
			
 
				-
			
 
				 	/*ready tasks that couldn't be pushed because the the window of tasks was already full*/
			
 
				 	struct starpu_task_list waiting_tasks;
			
 
				 
			
 
				-	/* mutext protecting waiting_tasks list */
			
 
				-	starpu_pthread_mutex_t waiting_tasks_mutex;
			
 
				-
			
 
				-	/* mutext protecting write to all worker's sched_ctx_list structure for this sched_ctx */
			
 
				-	starpu_pthread_mutex_t sched_ctx_list_mutex;
			
 
				-
			
 
				 	/* min CPUs to execute*/
			
 
				 	int min_ncpus;
			
 
				 
			
@@ -143,27 +128,6 @@ struct _starpu_sched_ctx
 
				 	   if not master is -1 */
			
 
				 	int main_master;
			
 
				 
			
 
				-	/* conditions variables used when parallel sections are executed in contexts */
			
 
				-	starpu_pthread_cond_t parallel_sect_cond[STARPU_NMAXWORKERS];
			
 
				-	starpu_pthread_mutex_t parallel_sect_mutex[STARPU_NMAXWORKERS];
			
 
				-	starpu_pthread_cond_t parallel_sect_cond_busy[STARPU_NMAXWORKERS];
			
 
				-	int busy[STARPU_NMAXWORKERS];
			
 
				-
			
 
				-	/* boolean indicating that workers should block in order to allow
			
 
				-	   parallel sections to be executed on their allocated resources */
			
 
				-	unsigned parallel_sect[STARPU_NMAXWORKERS];
			
 
				-
			
 
				-	/* semaphore that block appl thread until starpu threads are
			
 
				-	   all blocked and ready to exec the parallel code */
			
 
				-	starpu_sem_t fall_asleep_sem[STARPU_NMAXWORKERS];
			
 
				-
			
 
				-	/* semaphore that block appl thread until starpu threads are 
			
 
				-	   all woke up and ready continue appl */
			
 
				-	starpu_sem_t wake_up_sem[STARPU_NMAXWORKERS];
			
 
				-
			
 
				-	/* bool indicating if the workers is sleeping in this ctx */
			
 
				-	unsigned sleeping[STARPU_NMAXWORKERS];
			
 
				-
			
 
				 	/* ctx nesting the current ctx */
			
 
				 	unsigned nesting_sched_ctx;
			
 
				 
			
@@ -191,8 +155,21 @@ struct _starpu_sched_ctx
 
				 	int sms_end_idx;
			
 
				 
			
 
				 	int stream_worker;
			
 
				+
			
 
				+	starpu_pthread_rwlock_t rwlock;
			
 
				+	starpu_pthread_t lock_write_owner;
			
 
				 };
			
 
				 
			
 
				+/* per-worker list of deferred ctx_change ops */
			
 
				+LIST_TYPE(_starpu_ctx_change,
			
 
				+	int sched_ctx_id;
			
 
				+	int op;
			
 
				+	int nworkers_to_notify;
			
 
				+	int *workerids_to_notify;
			
 
				+	int nworkers_to_change;
			
 
				+	int *workerids_to_change;
			
 
				+);
			
 
				+
			
 
				 struct _starpu_machine_config;
			
 
				 
			
 
				 /* init sched_ctx_id of all contextes*/
			
@@ -242,19 +219,10 @@ void _starpu_worker_gets_out_of_ctx(unsigned sched_ctx_id, struct _starpu_worker
 
				 /* Check if the worker belongs to another sched_ctx */
			
 
				 unsigned _starpu_worker_belongs_to_a_sched_ctx(int workerid, unsigned sched_ctx_id);
			
 
				 
			
 
				-/* mutex synchronising several simultaneous modifications of a context */
			
 
				-starpu_pthread_rwlock_t* _starpu_sched_ctx_get_changing_ctx_mutex(unsigned sched_ctx_id);
			
 
				-
			
 
				 /* indicates wheather this worker should go to sleep or not 
			
 
				    (if it is the last one awake in a context he should better keep awake) */
			
 
				 unsigned _starpu_sched_ctx_last_worker_awake(struct _starpu_worker *worker);
			
 
				 
			
 
				-/* let the appl know that the worker blocked to execute parallel code */
			
 
				-void _starpu_sched_ctx_signal_worker_blocked(unsigned sched_ctx_id, int workerid);
			
 
				-
			
 
				-/* let the appl know that the worker woke up */
			
 
				-void _starpu_sched_ctx_signal_worker_woke_up(unsigned sched_ctx_id, int workerid);
			
 
				-
			
 
				 /* If starpu_sched_ctx_set_context() has been called, returns the context
			
 
				  * id set by its last call, or the id of the initial context */
			
 
				 unsigned _starpu_sched_ctx_get_current_context();
			
@@ -273,10 +241,62 @@ void _starpu_sched_ctx_post_exec_task_cb(int workerid, struct starpu_task *task,
 
				 
			
 
				 #endif //STARPU_USE_SC_HYPERVISOR
			
 
				 
			
 
				+void starpu_sched_ctx_add_combined_workers(int *combined_workers_to_add, unsigned n_combined_workers_to_add, unsigned sched_ctx_id);
			
 
				+
			
 
				 /* if the worker is the master of a parallel context, and the job is meant to be executed on this parallel context, return a pointer to the context */
			
 
				 struct _starpu_sched_ctx *__starpu_sched_ctx_get_sched_ctx_for_worker_and_job(struct _starpu_worker *worker, struct _starpu_job *j);
			
 
				 
			
 
				 #define _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(w,j) \
			
 
				 	(_starpu_get_nsched_ctxs() <= 1 ? _starpu_get_sched_ctx_struct(0) : __starpu_sched_ctx_get_sched_ctx_for_worker_and_job((w),(j)))
			
 
				 
			
 
				+static inline struct _starpu_sched_ctx *_starpu_get_sched_ctx_struct(unsigned id);
			
 
				+
			
 
				+static inline int _starpu_sched_ctx_check_write_locked(unsigned sched_ctx_id)
			
 
				+{
			
 
				+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
			
 
				+	return starpu_pthread_equal(sched_ctx->lock_write_owner, starpu_pthread_self());
			
 
				+}
			
 
				+#define STARPU_SCHED_CTX_CHECK_LOCK(sched_ctx_id) STARPU_ASSERT(_starpu_sched_ctx_check_write_locked((sched_ctx_id)))
			
 
				+
			
 
				+static inline void _starpu_sched_ctx_lock_write(unsigned sched_ctx_id)
			
 
				+{
			
 
				+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
			
 
				+	STARPU_HG_DISABLE_CHECKING(sched_ctx->lock_write_owner);
			
 
				+	STARPU_ASSERT(!starpu_pthread_equal(sched_ctx->lock_write_owner, starpu_pthread_self()));
			
 
				+	STARPU_HG_ENABLE_CHECKING(sched_ctx->lock_write_owner);
			
 
				+	STARPU_PTHREAD_RWLOCK_WRLOCK(&sched_ctx->rwlock);
			
 
				+	sched_ctx->lock_write_owner = starpu_pthread_self();
			
 
				+}
			
 
				+
			
 
				+static inline void _starpu_sched_ctx_unlock_write(unsigned sched_ctx_id)
			
 
				+{
			
 
				+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
			
 
				+	STARPU_HG_DISABLE_CHECKING(sched_ctx->lock_write_owner);
			
 
				+	STARPU_ASSERT(starpu_pthread_equal(sched_ctx->lock_write_owner, starpu_pthread_self()));
			
 
				+	memset(&sched_ctx->lock_write_owner, 0, sizeof(sched_ctx->lock_write_owner));
			
 
				+	STARPU_HG_ENABLE_CHECKING(sched_ctx->lock_write_owner);
			
 
				+	STARPU_PTHREAD_RWLOCK_UNLOCK(&sched_ctx->rwlock);
			
 
				+}
			
 
				+
			
 
				+static inline void _starpu_sched_ctx_lock_read(unsigned sched_ctx_id)
			
 
				+{
			
 
				+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
			
 
				+	STARPU_HG_DISABLE_CHECKING(sched_ctx->lock_write_owner);
			
 
				+	STARPU_ASSERT(!starpu_pthread_equal(sched_ctx->lock_write_owner, starpu_pthread_self()));
			
 
				+	STARPU_HG_ENABLE_CHECKING(sched_ctx->lock_write_owner);
			
 
				+	STARPU_PTHREAD_RWLOCK_RDLOCK(&sched_ctx->rwlock);
			
 
				+}
			
 
				+
			
 
				+static inline void _starpu_sched_ctx_unlock_read(unsigned sched_ctx_id)
			
 
				+{
			
 
				+	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
			
 
				+	STARPU_HG_DISABLE_CHECKING(sched_ctx->lock_write_owner);
			
 
				+	STARPU_ASSERT(!starpu_pthread_equal(sched_ctx->lock_write_owner, starpu_pthread_self()));
			
 
				+	STARPU_HG_ENABLE_CHECKING(sched_ctx->lock_write_owner);
			
 
				+	STARPU_PTHREAD_RWLOCK_UNLOCK(&sched_ctx->rwlock);
			
 
				+}
			
 
				+
			
 
				+/* Go through the list of deferred ctx changes of the current worker and apply
			
 
				+ * any ctx change operation found until the list is empty */
			
 
				+void _starpu_worker_apply_deferred_ctx_changes(void);
			
 
				 #endif // __SCHED_CONTEXT_H__
			
--- a/src/core/sched_ctx_list.c
+++ b/src/core/sched_ctx_list.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2011, 2013  INRIA
			
 
				+ * Copyright (C) 2011, 2013, 2017  INRIA
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -253,7 +253,8 @@ void _starpu_sched_ctx_list_remove_elt(struct _starpu_sched_ctx_list **list,
 
				 		if (parent->prev == NULL)
			
 
				 		{
			
 
				 			*list = parent->next;
			
 
				-			parent->next->prev = NULL;
			
 
				+			if (parent->next != NULL)
			
 
				+				parent->next->prev = NULL;
			
 
				 		}
			
 
				 		else
			
 
				 		{
			
--- a/src/core/sched_policy.c
+++ b/src/core/sched_policy.c