8 éve · 134f1eb906
--- a/ChangeLog
+++ b/ChangeLog
@@ -273,6 +273,7 @@ Small features:
 
				     allows to copy in a new buffer values which have not been unpacked by
			
 
				     the current call
			
 
				   * Add STARPU_CODELET_SIMGRID_EXECUTE flag.
			
 
				+  * Add STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT flag.
			
 
				   * Add STARPU_CL_ARGS flag to starpu_task_insert() and
			
 
				     starpu_mpi_task_insert() functions call
			
 
				 
			
--- a/configure.ac
+++ b/configure.ac
@@ -348,9 +348,10 @@ else
 
				     build_mpi_master_slave=no
			
 
				 fi
			
 
				 
			
 
				-#Warn users that they cannot use both at the same time
			
 
				+# StarPU-MPI and MPI Master-Slave cannot be used at the same time: disable StarPU-MPI
			
 
				 if test x$build_mpi_master_slave = xyes -a x$enable_mpi = xyes; then
			
 
				-    AC_MSG_WARN(StarPU-MPI and MPI Master-Slave cannot be used at the same time !)
			
 
				+    AC_MSG_WARN(StarPU-MPI and MPI Master-Slave cannot be used at the same time ! Disabling StarPU-MPI...)
			
 
				+	enable_mpi=no
			
 
				 fi
			
 
				 
			
 
				 if test x$build_mpi_master_slave = xyes; then
			
@@ -3117,6 +3118,12 @@ AC_CONFIG_COMMANDS([executable-scripts], [
 
				   test -e tests/microbenchs/parallel_independent_heterogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_heterogeneous_tasks.sh tests/microbenchs/
			
 
				   test -e tests/microbenchs/parallel_independent_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks_data.sh tests/microbenchs/
			
 
				   test -e tests/microbenchs/parallel_independent_homogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks.sh tests/microbenchs/
			
 
				+  mkdir -p tests/datawizard
			
 
				+  test -e tests/datawizard/locality.sh || ln -sf $ac_abs_top_srcdir/tests/datawizard/locality.sh tests/datawizard/
			
 
				+  mkdir -p examples/heat
			
 
				+  test -e examples/heat/heat.sh || ln -sf $ac_abs_top_srcdir/examples/heat/heat.sh examples/heat/
			
 
				+  mkdir -p examples/lu
			
 
				+  test -e examples/lu/lu.sh || ln -sf $ac_abs_top_srcdir/examples/lu/lu.sh examples/lu/
			
 
				 ])
			
 
				 
			
 
				 # Create links to ICD files in build/socl/vendors directory. SOCL will use this
			
--- a/doc/doxygen/chapters/320_scheduling.doxy
+++ b/doc/doxygen/chapters/320_scheduling.doxy
@@ -285,8 +285,9 @@ be used to get information about how well the execution proceeded, and thus the
 
				 overall quality of the execution.
			
 
				 
			
 
				 Precise debugging can also be performed by using the
			
 
				-\ref STARPU_TASK_BREAK_ON_SCHED, \ref STARPU_TASK_BREAK_ON_PUSH, and
			
 
				-\ref STARPU_TASK_BREAK_ON_POP environment variables. By setting the job_id of a task
			
 
				+\ref STARPU_TASK_BREAK_ON_PUSH, \ref STARPU_TASK_BREAK_ON_SCHED,
			
 
				+\ref STARPU_TASK_BREAK_ON_POP, and \ref STARPU_TASK_BREAK_ON_EXEC environment variables.
			
 
				+By setting the job_id of a task
			
 
				 in these environment variables, StarPU will raise <c>SIGTRAP</c> when the task is being
			
 
				 scheduled, pushed, or popped by the scheduler. That means that when one notices
			
 
				 that a task is being scheduled in a seemingly odd way, one can just reexecute
			
--- a/doc/doxygen/chapters/380_offline_performance_tools.doxy
+++ b/doc/doxygen/chapters/380_offline_performance_tools.doxy
@@ -129,7 +129,7 @@ collect the trace files from the MPI nodes, and
 
				 specify them all on the command <c>starpu_fxt_tool</c>, for instance:
			
 
				 
			
 
				 \verbatim
			
 
				-$ starpu_fxt_tool -i /tmp/prof_file_something1 -i /tmp/prof_file_something2
			
 
				+$ starpu_fxt_tool -i /tmp/prof_file_something*
			
 
				 \endverbatim
			
 
				 
			
 
				 By default, all tasks are displayed using a green color. To display tasks with
			
--- a/doc/doxygen/chapters/470_simgrid.doxy
+++ b/doc/doxygen/chapters/470_simgrid.doxy
@@ -9,8 +9,8 @@
 
				 /*! \page SimGridSupport SimGrid Support
			
 
				 
			
 
				 StarPU can use Simgrid in order to simulate execution on an arbitrary
			
 
				-platform. This was tested with simgrid 3.11, 3.12, 3.13, 3.14, and 3.14.159, other versions may have
			
 
				-compatibility issues.
			
 
				+platform. This was tested with simgrid from 3.11 to 3.15,
			
 
				+other versions may have compatibility issues.
			
 
				 
			
 
				 \section Preparing Preparing Your Application For Simulation
			
 
				 
			
@@ -36,7 +36,8 @@ To be able to run the application with e.g. CUDA simulation on a system which
 
				 does not have CUDA installed, one can fill the cuda_funcs with (void*)1, to
			
 
				 express that there is a CUDA implementation, even if one does not actually
			
 
				 provide it. StarPU will not actually run it in Simgrid mode anyway by default
			
 
				-(unless the ::STARPU_CODELET_SIMGRID_EXECUTE flag is set in the codelet)
			
 
				+(unless the ::STARPU_CODELET_SIMGRID_EXECUTE or ::STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT
			
 
				+flag is set in the codelet)
			
 
				 
			
 
				 \snippet simgrid.c To be included. You should update doxygen if you see this text.
			
 
				 
			
--- a/doc/doxygen/chapters/501_environment_variables.doxy
+++ b/doc/doxygen/chapters/501_environment_variables.doxy
@@ -642,9 +642,10 @@ especially regarding data transfers.
 
				 <dd>
			
 
				 \anchor STARPU_SIMGRID_SCHED_COST
			
 
				 \addindex __env__STARPU_SIMGRID_SCHED_COST
			
 
				-When set to 1 (which is the default), scheduling costs are taken into
			
 
				+When set to 1 (0 is the default), scheduling costs are taken into
			
 
				 account in simgrid mode. This provides more accurate simgrid predictions,
			
 
				-and allows studying scheduling overhead of the runtime system.
			
 
				+and allows studying scheduling overhead of the runtime system. However,
			
 
				+it also makes simulation non-deterministic.
			
 
				 </dd>
			
 
				 
			
 
				 </dl>
			
@@ -1021,6 +1022,15 @@ dog is reached, thus allowing to catch the situation in gdb, etc
 
				 (see \ref DetectionStuckConditions)
			
 
				 </dd>
			
 
				 
			
 
				+<dt>STARPU_TASK_BREAK_ON_PUSH</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_TASK_BREAK_ON_PUSH
			
 
				+\addindex __env__STARPU_TASK_BREAK_ON_PUSH
			
 
				+When this variable contains a job id, StarPU will raise SIGTRAP when the task
			
 
				+with that job id is being pushed to the scheduler, which will be nicely caught by debuggers
			
 
				+(see \ref DebuggingScheduling)
			
 
				+</dd>
			
 
				+
			
 
				 <dt>STARPU_TASK_BREAK_ON_SCHED</dt>
			
 
				 <dd>
			
 
				 \anchor STARPU_TASK_BREAK_ON_SCHED
			
@@ -1032,21 +1042,21 @@ This only works for schedulers which have such a scheduling point defined
 
				 (see \ref DebuggingScheduling)
			
 
				 </dd>
			
 
				 
			
 
				-<dt>STARPU_TASK_BREAK_ON_PUSH</dt>
			
 
				+<dt>STARPU_TASK_BREAK_ON_POP</dt>
			
 
				 <dd>
			
 
				-\anchor STARPU_TASK_BREAK_ON_PUSH
			
 
				-\addindex __env__STARPU_TASK_BREAK_ON_PUSH
			
 
				+\anchor STARPU_TASK_BREAK_ON_POP
			
 
				+\addindex __env__STARPU_TASK_BREAK_ON_POP
			
 
				 When this variable contains a job id, StarPU will raise SIGTRAP when the task
			
 
				-with that job id is being pushed to the scheduler, which will be nicely catched by debuggers
			
 
				+with that job id is being popped from the scheduler, which will be nicely caught by debuggers
			
 
				 (see \ref DebuggingScheduling)
			
 
				 </dd>
			
 
				 
			
 
				-<dt>STARPU_TASK_BREAK_ON_POP</dt>
			
 
				+<dt>STARPU_TASK_BREAK_ON_EXEC</dt>
			
 
				 <dd>
			
 
				-\anchor STARPU_TASK_BREAK_ON_POP
			
 
				-\addindex __env__STARPU_TASK_BREAK_ON_POP
			
 
				+\anchor STARPU_TASK_BREAK_ON_EXEC
			
 
				+\addindex __env__STARPU_TASK_BREAK_ON_EXEC
			
 
				 When this variable contains a job id, StarPU will raise SIGTRAP when the task
			
 
				-with that job id is being popped from the scheduler, which will be nicely catched by debuggers
			
 
				+with that job id is being executed, which will be nicely caught by debuggers
			
 
				 (see \ref DebuggingScheduling)
			
 
				 </dd>
			
 
				 
			
--- a/doc/doxygen/chapters/api/codelet_and_tasks.doxy
+++ b/doc/doxygen/chapters/api/codelet_and_tasks.doxy
@@ -135,6 +135,11 @@ Value to be set in starpu_codelet::opencl_flags to allow asynchronous OpenCL ker
 
				 \ingroup API_Codelet_And_Tasks
			
 
				 Value to be set in starpu_codelet::flags to execute the codelet functions even in simgrid mode.
			
 
				 
			
 
				+\def STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT
			
 
				+\ingroup API_Codelet_And_Tasks
			
 
				+Value to be set in starpu_codelet::flags to execute the codelet functions even in simgrid mode,
			
 
				+and later inject the measured timing inside the simulation.
			
 
				+
			
 
				 \typedef starpu_cpu_func_t
			
 
				 \ingroup API_Codelet_And_Tasks
			
 
				 CPU implementation of a codelet.
			
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -77,11 +77,13 @@ EXTRA_DIST = 					\
 
				 	scheduler/schedulers.sh				\
			
 
				 	scheduler/schedulers_context.sh			\
			
 
				 	fortran/Makefile				\
			
 
				-	sched_ctx/axpy_partition_gpu.h				\
			
 
				-	sched_ctx/axpy_partition_gpu.cu
			
 
				+	sched_ctx/axpy_partition_gpu.h			\
			
 
				+	sched_ctx/axpy_partition_gpu.cu			\
			
 
				+	heat/heat.sh					\
			
 
				+	lu/lu.sh
			
 
				 
			
 
				 
			
 
				-CLEANFILES = *.gcno *.gcda *.linkinfo *.mod starpu_idle_microsec.log
			
 
				+CLEANFILES = *.gcno *.gcda *.linkinfo *.mod starpu_idle_microsec.log *.mps *.dot *.pl
			
 
				 
			
 
				 if STARPU_USE_CUDA
			
 
				 
			
@@ -300,6 +302,13 @@ STARPU_EXAMPLES +=				\
 
				 	heat/heat				\
			
 
				 	cg/cg					\
			
 
				 	pipeline/pipeline
			
 
				+
			
 
				+if !STARPU_USE_MPI_MASTER_SLAVE
			
 
				+TESTS += \
			
 
				+	heat/heat.sh				\
			
 
				+	lu/lu.sh
			
 
				+
			
 
				+endif
			
 
				 endif
			
 
				 endif
			
 
				 
			
--- a/examples/heat/dw_sparse_cg.c
+++ b/examples/heat/dw_sparse_cg.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010, 2011, 2015  Université de Bordeaux
			
 
				+ * Copyright (C) 2009, 2010, 2011, 2015, 2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -25,11 +25,7 @@
 
				 
			
 
				 static struct starpu_task *create_task(starpu_tag_t id)
			
 
				 {
			
 
				-	struct starpu_codelet *cl = calloc(1,sizeof(struct starpu_codelet));
			
 
				-
			
 
				 	struct starpu_task *task = starpu_task_create();
			
 
				-		task->cl = cl;
			
 
				-		task->cl_arg = NULL;
			
 
				 		task->use_tag = 1;
			
 
				 		task->tag_id = id;
			
 
				 
			
@@ -131,6 +127,30 @@ void init_problem(void)
 
				  *	cg initialization phase
			
 
				  */
			
 
				 
			
 
				+static struct starpu_codelet cl1 = {
			
 
				+	.cpu_funcs = { cpu_codelet_func_1 },
			
 
				+	.cpu_funcs_name = { "cpu_codelet_func_1" },
			
 
				+	.nbuffers = 4,
			
 
				+	.modes = { STARPU_R, STARPU_R, STARPU_W, STARPU_R },
			
 
				+};
			
 
				+
			
 
				+static struct starpu_codelet cl2 = {
			
 
				+	.cpu_funcs = { cpu_codelet_func_2 },
			
 
				+	.cpu_funcs_name = { "cpu_codelet_func_2" },
			
 
				+	.nbuffers = 2,
			
 
				+	.modes = { STARPU_W, STARPU_R },
			
 
				+};
			
 
				+
			
 
				+static struct starpu_codelet cl3 = {
			
 
				+	.cpu_funcs = { cpu_codelet_func_3 },
			
 
				+	.cpu_funcs_name = { "cpu_codelet_func_3" },
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	.cuda_funcs = { cublas_codelet_func_3 },
			
 
				+#endif
			
 
				+	.nbuffers = 1,
			
 
				+	.modes = { STARPU_R },
			
 
				+};
			
 
				+
			
 
				 void init_cg(struct cg_problem *problem)
			
 
				 {
			
 
				 	int ret;
			
@@ -139,14 +159,7 @@ void init_cg(struct cg_problem *problem)
 
				 
			
 
				 	/* r = b  - A x */
			
 
				 	struct starpu_task *task1 = create_task(1UL);
			
 
				-	task1->cl->cpu_funcs[0] = cpu_codelet_func_1;
			
 
				-	task1->cl->cpu_funcs_name[0] = "cpu_codelet_func_1";
			
 
				-	task1->cl->nbuffers = 4;
			
 
				-	task1->cl->modes[0] = STARPU_R;
			
 
				-	task1->cl->modes[1] = STARPU_R;
			
 
				-	task1->cl->modes[2] = STARPU_W;
			
 
				-	task1->cl->modes[3] = STARPU_R;
			
 
				-
			
 
				+	task1->cl = &cl1;
			
 
				 	task1->handles[0] = problem->ds_matrixA;
			
 
				 	task1->handles[1] = problem->ds_vecx;
			
 
				 	task1->handles[2] = problem->ds_vecr;
			
@@ -154,12 +167,7 @@ void init_cg(struct cg_problem *problem)
 
				 
			
 
				 	/* d = r */
			
 
				 	struct starpu_task *task2 = create_task(2UL);
			
 
				-	task2->cl->cpu_funcs[0] = cpu_codelet_func_2;
			
 
				-	task2->cl->cpu_funcs_name[0] = "cpu_codelet_func_2";
			
 
				-	task2->cl->nbuffers = 2;
			
 
				-	task2->cl->modes[0] = STARPU_W;
			
 
				-	task2->cl->modes[1] = STARPU_R;
			
 
				-
			
 
				+	task2->cl = &cl2;
			
 
				 	task2->handles[0] = problem->ds_vecd;
			
 
				 	task2->handles[1] = problem->ds_vecr;
			
 
				 
			
@@ -167,15 +175,9 @@ void init_cg(struct cg_problem *problem)
 
				 
			
 
				 	/* delta_new = trans(r) r */
			
 
				 	struct starpu_task *task3 = create_task(3UL);
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	task3->cl->cuda_funcs[0] = cublas_codelet_func_3;
			
 
				-#endif
			
 
				-	task3->cl->cpu_funcs[0] = cpu_codelet_func_3;
			
 
				-	task3->cl->cpu_funcs_name[0] = "cpu_codelet_func_3";
			
 
				+	task3->cl = &cl3;
			
 
				 	task3->cl_arg = problem;
			
 
				 	task3->cl_arg_size = sizeof(*problem);
			
 
				-	task3->cl->nbuffers = 1;
			
 
				-	task3->cl->modes[0] = STARPU_R;
			
 
				 	task3->handles[0] = problem->ds_vecr;
			
 
				 
			
 
				 	task3->callback_func = iteration_cg;
			
@@ -186,6 +188,11 @@ void init_cg(struct cg_problem *problem)
 
				 
			
 
				 	/* launch the computation now */
			
 
				 	ret = starpu_task_submit(task1);
			
 
				+	if (STARPU_UNLIKELY(ret == -ENODEV))
			
 
				+	{
			
 
				+		FPRINTF(stderr, "No worker may execute this task\n");
			
 
				+		exit(0);
			
 
				+	}
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
 
				 	ret = starpu_task_submit(task2);
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
			
@@ -198,6 +205,66 @@ void init_cg(struct cg_problem *problem)
 
				  *		the codelet code launcher is its own callback !
			
 
				  */
			
 
				 
			
 
				+static struct starpu_codelet cl4 = {
			
 
				+	.cpu_funcs = { cpu_codelet_func_4 },
			
 
				+	.cpu_funcs_name = { "cpu_codelet_func_4" },
			
 
				+	.nbuffers = 3,
			
 
				+	.modes = { STARPU_R, STARPU_R, STARPU_W },
			
 
				+};
			
 
				+
			
 
				+static struct starpu_codelet cl5 = {
			
 
				+	.cpu_funcs = { cpu_codelet_func_5 },
			
 
				+	.cpu_funcs_name = { "cpu_codelet_func_5" },
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	.cuda_funcs = { cublas_codelet_func_5 },
			
 
				+#endif
			
 
				+	.nbuffers = 2,
			
 
				+	.modes = { STARPU_R, STARPU_R },
			
 
				+};
			
 
				+
			
 
				+static struct starpu_codelet cl6 = {
			
 
				+	.cpu_funcs = { cpu_codelet_func_6 },
			
 
				+	.cpu_funcs_name = { "cpu_codelet_func_6" },
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	.cuda_funcs = { cublas_codelet_func_6 },
			
 
				+	.cuda_flags = { STARPU_CUDA_ASYNC },
			
 
				+#endif
			
 
				+	.nbuffers = 2,
			
 
				+	.modes = { STARPU_RW, STARPU_R },
			
 
				+};
			
 
				+
			
 
				+static struct starpu_codelet cl7 = {
			
 
				+	.cpu_funcs = { cpu_codelet_func_7 },
			
 
				+	.cpu_funcs_name = { "cpu_codelet_func_7" },
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	.cuda_funcs = { cublas_codelet_func_7 },
			
 
				+	.cuda_flags = { STARPU_CUDA_ASYNC },
			
 
				+#endif
			
 
				+	.nbuffers = 2,
			
 
				+	.modes = { STARPU_RW, STARPU_R },
			
 
				+};
			
 
				+
			
 
				+static struct starpu_codelet cl8 = {
			
 
				+	.cpu_funcs = { cpu_codelet_func_8 },
			
 
				+	.cpu_funcs_name = { "cpu_codelet_func_8" },
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	.cuda_funcs = { cublas_codelet_func_8 },
			
 
				+#endif
			
 
				+	.nbuffers = 1,
			
 
				+	.modes = { STARPU_R },
			
 
				+};
			
 
				+
			
 
				+static struct starpu_codelet cl9 = {
			
 
				+	.cpu_funcs = { cpu_codelet_func_9 },
			
 
				+	.cpu_funcs_name = { "cpu_codelet_func_9" },
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	.cuda_funcs = { cublas_codelet_func_9 },
			
 
				+	.cuda_flags = { STARPU_CUDA_ASYNC },
			
 
				+#endif
			
 
				+	.nbuffers = 2,
			
 
				+	.modes = { STARPU_RW, STARPU_R },
			
 
				+};
			
 
				+
			
 
				 void launch_new_cg_iteration(struct cg_problem *problem)
			
 
				 {
			
 
				 	int ret;
			
@@ -208,30 +275,16 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
				 
			
 
				 	/* q = A d */
			
 
				 	struct starpu_task *task4 = create_task(maskiter | 4UL);
			
 
				-	task4->cl->cpu_funcs[0] = cpu_codelet_func_4;
			
 
				-	task4->cl->cpu_funcs_name[0] = "cpu_codelet_func_4";
			
 
				-	task4->cl->nbuffers = 3;
			
 
				-	task4->cl->modes[0] = STARPU_R;
			
 
				-	task4->cl->modes[1] = STARPU_R;
			
 
				-	task4->cl->modes[2] = STARPU_W;
			
 
				-
			
 
				+	task4->cl = &cl4;
			
 
				 	task4->handles[0] = problem->ds_matrixA;
			
 
				 	task4->handles[1] = problem->ds_vecd;
			
 
				 	task4->handles[2] = problem->ds_vecq;
			
 
				 
			
 
				 	/* alpha = delta_new / ( trans(d) q )*/
			
 
				 	struct starpu_task *task5 = create_task(maskiter | 5UL);
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	task5->cl->cuda_funcs[0] = cublas_codelet_func_5;
			
 
				-#endif
			
 
				-	task5->cl->cpu_funcs[0] = cpu_codelet_func_5;
			
 
				-	task5->cl->cpu_funcs_name[0] = "cpu_codelet_func_5";
			
 
				+	task5->cl = &cl5;
			
 
				 	task5->cl_arg = problem;
			
 
				 	task5->cl_arg_size = sizeof(*problem);
			
 
				-	task5->cl->nbuffers = 2;
			
 
				-	task5->cl->modes[0] = STARPU_R;
			
 
				-	task5->cl->modes[1] = STARPU_R;
			
 
				-
			
 
				 	task5->handles[0] = problem->ds_vecd;
			
 
				 	task5->handles[1] = problem->ds_vecq;
			
 
				 
			
@@ -239,18 +292,9 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
				 
			
 
				 	/* x = x + alpha d */
			
 
				 	struct starpu_task *task6 = create_task(maskiter | 6UL);
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	task6->cl->cuda_funcs[0] = cublas_codelet_func_6;
			
 
				-	task6->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
			
 
				-#endif
			
 
				-	task6->cl->cpu_funcs[0] = cpu_codelet_func_6;
			
 
				-	task6->cl->cpu_funcs_name[0] = "cpu_codelet_func_6";
			
 
				+	task6->cl = &cl6;
			
 
				 	task6->cl_arg = problem;
			
 
				 	task6->cl_arg_size = sizeof(*problem);
			
 
				-	task6->cl->nbuffers = 2;
			
 
				-	task6->cl->modes[0] = STARPU_RW;
			
 
				-	task6->cl->modes[1] = STARPU_R;
			
 
				-
			
 
				 	task6->handles[0] = problem->ds_vecx;
			
 
				 	task6->handles[1] = problem->ds_vecd;
			
 
				 
			
@@ -258,18 +302,9 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
				 
			
 
				 	/* r = r - alpha q */
			
 
				 	struct starpu_task *task7 = create_task(maskiter | 7UL);
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	task7->cl->cuda_funcs[0] = cublas_codelet_func_7;
			
 
				-	task7->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
			
 
				-#endif
			
 
				-	task7->cl->cpu_funcs[0] = cpu_codelet_func_7;
			
 
				-	task7->cl->cpu_funcs_name[0] = "cpu_codelet_func_7";
			
 
				+	task7->cl = &cl7;
			
 
				 	task7->cl_arg = problem;
			
 
				 	task7->cl_arg_size = sizeof(*problem);
			
 
				-	task7->cl->nbuffers = 2;
			
 
				-	task7->cl->modes[0] = STARPU_RW;
			
 
				-	task7->cl->modes[1] = STARPU_R;
			
 
				-
			
 
				 	task7->handles[0] = problem->ds_vecr;
			
 
				 	task7->handles[1] = problem->ds_vecq;
			
 
				 
			
@@ -277,33 +312,18 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
				 
			
 
				 	/* update delta_* and compute beta */
			
 
				 	struct starpu_task *task8 = create_task(maskiter | 8UL);
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	task8->cl->cuda_funcs[0] = cublas_codelet_func_8;
			
 
				-#endif
			
 
				-	task8->cl->cpu_funcs[0] = cpu_codelet_func_8;
			
 
				-	task8->cl->cpu_funcs_name[0] = "cpu_codelet_func_8";
			
 
				+	task8->cl = &cl8;
			
 
				 	task8->cl_arg = problem;
			
 
				 	task8->cl_arg_size = sizeof(*problem);
			
 
				-	task8->cl->nbuffers = 1;
			
 
				-	task8->cl->modes[0] = STARPU_R;
			
 
				 	task8->handles[0] = problem->ds_vecr;
			
 
				 
			
 
				 	starpu_tag_declare_deps((starpu_tag_t)(maskiter | 8UL), 1, (starpu_tag_t)(maskiter | 7UL));
			
 
				 
			
 
				 	/* d = r + beta d */
			
 
				 	struct starpu_task *task9 = create_task(maskiter | 9UL);
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	task9->cl->cuda_funcs[0] = cublas_codelet_func_9;
			
 
				-	task9->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
			
 
				-#endif
			
 
				-	task9->cl->cpu_funcs[0] = cpu_codelet_func_9;
			
 
				-	task9->cl->cpu_funcs_name[0] = "cpu_codelet_func_9";
			
 
				+	task9->cl = &cl9;
			
 
				 	task9->cl_arg = problem;
			
 
				 	task9->cl_arg_size = sizeof(*problem);
			
 
				-	task9->cl->nbuffers = 2;
			
 
				-	task9->cl->modes[0] = STARPU_RW;
			
 
				-	task9->cl->modes[1] = STARPU_R;
			
 
				-
			
 
				 	task9->handles[0] = problem->ds_vecd;
			
 
				 	task9->handles[1] = problem->ds_vecr;
			
 
				 
			
@@ -427,6 +447,10 @@ void conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz,
 
				 	starpu_data_unregister(ds_vecr);
			
 
				 	starpu_data_unregister(ds_vecd);
			
 
				 	starpu_data_unregister(ds_vecq);
			
 
				+
			
 
				+	free(ptr_vecr);
			
 
				+	free(ptr_vecd);
			
 
				+	free(ptr_vecq);
			
 
				 }
			
 
				 
			
 
				 
			
@@ -444,4 +468,6 @@ void do_conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz
 
				 	starpu_cublas_init();
			
 
				 
			
 
				 	conjugate_gradient(nzvalA, vecb, vecx, nnz, nrow, colind, rowptr);
			
 
				+
			
 
				+	starpu_shutdown();
			
 
				 }
			
--- a/examples/heat/heat.c
+++ b/examples/heat/heat.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010, 2012, 2015  Université de Bordeaux
			
 
				+ * Copyright (C) 2009, 2010, 2012, 2015, 2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2016  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -121,9 +121,9 @@ static void parse_args(int argc, char **argv)
 
				 			STARPU_ASSERT((nthick - 2)*(ntheta - 2) == size);
			
 
				 		}
			
 
				 
			
 
				-		if (strcmp(argv[i], "-h") == 0)
			
 
				+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-help") == 0)
			
 
				 		{
			
 
				-			printf("usage : %s [-v1|-v2|-v3] [-pin] [-nthick number] [-ntheta number] [-shape [0|1|2]] [-cg] [-size number] [-no-prio]\n", argv[0]);
			
 
				+			printf("usage : %s [-v1|-v2|-v3|-v4] [-pin] [-nthick number] [-ntheta number] [-shape [0|1|2]] [-cg] [-size number] [-no-prio]\n", argv[0]);
			
 
				 		}
			
 
				 	}
			
 
				 }
			
@@ -751,6 +751,10 @@ int main(int argc, char **argv)
 
				 			result[TRANSLATE(i)] = Bformer[TRANSLATE(i)];
			
 
				 		}
			
 
				 
			
 
				+		free(nzval);
			
 
				+		free(colind);
			
 
				+		free(rowptr);
			
 
				+		free(B);
			
 
				 	}
			
 
				 	else
			
 
				 	{
			
--- a/examples/heat/heat.sh
+++ b/examples/heat/heat.sh
@@ -0,0 +1,43 @@
 
				+#!/bin/bash
			
 
				+#
			
 
				+# StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+#
			
 
				+# Copyright (C) 2017  Université de Bordeaux
			
 
				+#
			
 
				+# StarPU is free software; you can redistribute it and/or modify
			
 
				+# it under the terms of the GNU Lesser General Public License as published by
			
 
				+# the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+# your option) any later version.
			
 
				+#
			
 
				+# StarPU is distributed in the hope that it will be useful, but
			
 
				+# WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+#
			
 
				+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+
			
 
				+# Test various heat options
			
 
				+
			
 
				+set -e
			
 
				+
			
 
				+PREFIX=$(dirname $0)
			
 
				+
			
 
				+$PREFIX/heat -shape 0
			
 
				+$PREFIX/heat -shape 1
			
 
				+# sometimes leads to a pivot being 0
			
 
				+#$PREFIX/heat -shape 2
			
 
				+
			
 
				+$PREFIX/heat -cg
			
 
				+
			
 
				+# TODO: FIXME
			
 
				+
			
 
				+# segfault
			
 
				+#$PREFIX/heat -v1
			
 
				+
			
 
				+# (actually the default...)
			
 
				+$PREFIX/heat -v2
			
 
				+
			
 
				+# hang
			
 
				+#$PREFIX/heat -v3
			
 
				+
			
 
				+# hang
			
 
				+#$PREFIX/heat -v4
			
--- a/examples/lu/lu.sh
+++ b/examples/lu/lu.sh
@@ -0,0 +1,34 @@
 
				+#!/bin/bash
			
 
				+#
			
 
				+# StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+#
			
 
				+# Copyright (C) 2017  Université de Bordeaux
			
 
				+#
			
 
				+# StarPU is free software; you can redistribute it and/or modify
			
 
				+# it under the terms of the GNU Lesser General Public License as published by
			
 
				+# the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+# your option) any later version.
			
 
				+#
			
 
				+# StarPU is distributed in the hope that it will be useful, but
			
 
				+# WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+#
			
 
				+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+
			
 
				+# Test various LU options
			
 
				+
			
 
				+set -e
			
 
				+
			
 
				+PREFIX=$(dirname $0)
			
 
				+
			
 
				+$PREFIX/lu_implicit_example_float -size $((960 * 4)) -nblocks 4 -piv
			
 
				+$PREFIX/lu_implicit_example_float -size $((960 * 4)) -nblocks 4 -no-stride
			
 
				+$PREFIX/lu_implicit_example_float -size $((960 * 4)) -nblocks 4 -bound
			
 
				+$PREFIX/lu_implicit_example_float -size $((960 * 2)) -nblocks 2 -bounddeps
			
 
				+$PREFIX/lu_implicit_example_float -size $((960 * 2)) -nblocks 2 -bound -bounddeps -bounddepsprio
			
 
				+
			
 
				+$PREFIX/lu_example_float -size $((960 * 4)) -nblocks 4 -piv
			
 
				+$PREFIX/lu_example_float -size $((960 * 4)) -nblocks 4 -no-stride
			
 
				+$PREFIX/lu_example_float -size $((960 * 4)) -nblocks 4 -bound
			
 
				+$PREFIX/lu_example_float -size $((960 * 2)) -nblocks 2 -bounddeps
			
 
				+$PREFIX/lu_example_float -size $((960 * 2)) -nblocks 2 -bound -bounddeps -bounddepsprio
			
--- a/examples/lu/lu_example.c
+++ b/examples/lu/lu_example.c
@@ -422,13 +422,15 @@ int main(int argc, char **argv)
 
				 		if (pivot)
			
 
				 		{
			
 
				 			pivot_saved_matrix(ipiv);
			
 
				-			free(ipiv);
			
 
				 		}
			
 
				 
			
 
				 		check_result();
			
 
				 	}
			
 
				 #endif
			
 
				 
			
 
				+	if (pivot)
			
 
				+		free(ipiv);
			
 
				+
			
 
				 	starpu_free_flags(A, (size_t)size*size*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
			
 
				 
			
 
				 	starpu_cublas_shutdown();
			
--- a/examples/lu/xlu_implicit_pivot.c
+++ b/examples/lu/xlu_implicit_pivot.c
@@ -232,6 +232,10 @@ starpu_data_handle_t get_block_with_striding(starpu_data_handle_t *dataAp,
 
				 
			
 
				 int STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks)
			
 
				 {
			
 
				+	if (starpu_mic_worker_get_count() || starpu_scc_worker_get_count() || starpu_mpi_ms_worker_get_count())
			
 
				+		/* These won't work with pivoting: we pass a pointer in cl_args */
			
 
				+		return -ENODEV;
			
 
				+
			
 
				 	starpu_data_handle_t dataA;
			
 
				 
			
 
				 	/* monitor and partition the A matrix into blocks :
			
--- a/examples/lu/xlu_pivot.c
+++ b/examples/lu/xlu_pivot.c
@@ -399,6 +399,7 @@ int STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size,
 
				 
			
 
				 	/* gather all the data */
			
 
				 	starpu_data_unpartition(dataA, STARPU_MAIN_RAM);
			
 
				+	starpu_data_unregister(dataA);
			
 
				 	free(piv_description);
			
 
				 
			
 
				 	return ret;
			
@@ -413,6 +414,10 @@ starpu_data_handle_t get_block_with_no_striding(starpu_data_handle_t *dataAp, un
 
				 
			
 
				 int STARPU_LU(lu_decomposition_pivot_no_stride)(TYPE **matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks)
			
 
				 {
			
 
				+	if (starpu_mic_worker_get_count() || starpu_scc_worker_get_count() || starpu_mpi_ms_worker_get_count())
			
 
				+		/* These won't work with pivoting: we pass a pointer in cl_args */
			
 
				+		return -ENODEV;
			
 
				+
			
 
				 	starpu_data_handle_t *dataAp = malloc(nblocks*nblocks*sizeof(starpu_data_handle_t));
			
 
				 
			
 
				 	/* monitor and partition the A matrix into blocks :
			
--- a/examples/mlr/mlr.c
+++ b/examples/mlr/mlr.c
@@ -50,7 +50,15 @@ static long sum;
 
				 static void cl_params(struct starpu_task *task, double *parameters)
			
 
				 {
			
 
				 	int m, n, k;
			
 
				-	starpu_codelet_unpack_args(task->cl_arg, &m, &n, &k);
			
 
				+	int* vector_mn;
			
 
				+	starpu_data_handle_t vector_mn_handle;
			
 
				+
			
 
				+	vector_mn = (int*)STARPU_VECTOR_GET_PTR(task->interfaces[0]);
			
 
				+	m = vector_mn[0];
			
 
				+	n = vector_mn[1];
			
 
				+
			
 
				+	starpu_codelet_unpack_args(task->cl_arg, &k);
			
 
				+
			
 
				 	parameters[0] = m;
			
 
				 	parameters[1] = n;
			
 
				 	parameters[2] = k;
			
@@ -61,10 +69,13 @@ void cpu_func(void *buffers[], void *cl_arg)
 
				 {
			
 
				 	long i;
			
 
				 	int m,n,k;
			
 
				-	starpu_codelet_unpack_args(cl_arg,
			
 
				-			     	  &m,
			
 
				-     			     	  &n,
			
 
				-     			     	  &k);
			
 
				+	int* vector_mn;
			
 
				+
			
 
				+	vector_mn = (int*)STARPU_VECTOR_GET_PTR(buffers[0]);
			
 
				+	m = vector_mn[0];
			
 
				+	n = vector_mn[1];
			
 
				+
			
 
				+	starpu_codelet_unpack_args(cl_arg, &k);
			
 
				 
			
 
				 	for(i=0; i < (long) (m*m*n); i++)
			
 
				 		sum+=i;
			
@@ -123,7 +134,8 @@ static struct starpu_codelet cl_init =
 
				 {
			
 
				 	.cpu_funcs = { cpu_func },
			
 
				 	.cpu_funcs_name = { "cpu_func" },
			
 
				-	.nbuffers = 0,
			
 
				+	.nbuffers = 1,
			
 
				+	.modes = {STARPU_R},
			
 
				 	.model = &cl_model_init,
			
 
				 };
			
 
				 
			
@@ -131,7 +143,8 @@ static struct starpu_codelet cl_final =
 
				 {
			
 
				 	.cpu_funcs = { cpu_func },
			
 
				 	.cpu_funcs_name = { "cpu_func" },
			
 
				-	.nbuffers = 0,
			
 
				+	.nbuffers = 1,
			
 
				+	.modes = {STARPU_R},
			
 
				 	.model = &cl_model_final,
			
 
				 };
			
 
				 
			
@@ -147,29 +160,42 @@ int main(int argc, char **argv)
 
				 
			
 
				 	sum=0;
			
 
				 	int m,n,k;
			
 
				+	int* vector_mn = malloc( 2 * sizeof(int) );
			
 
				+	starpu_data_handle_t vector_mn_handle;
			
 
				+
			
 
				+	starpu_vector_data_register( &vector_mn_handle,
			
 
				+				     STARPU_MAIN_RAM,
			
 
				+				     (uintptr_t)vector_mn, 2,
			
 
				+				     sizeof(int) );
			
 
				 
			
 
				-        /* Giving pseudo-random values to the M,N,K parameters and inserting tasks */
			
 
				-	for(i=0; i < 42; i++)
			
 
				+	/* Giving pseudo-random values to the M,N,K parameters and inserting tasks */
			
 
				+	for ( i = 0; i < 42; i++)
			
 
				 	{
			
 
				 		m = (int) ((rand() % 10)+1);
			
 
				 		n = (int) ((rand() % 10)+1);
			
 
				 		k = (int) ((rand() % 10)+1);
			
 
				 
			
 
				-		for(j=0; j < 42; j++)
			
 
				+		/* To illustrate the usage, M and N are stored in a data handle */
			
 
				+		starpu_data_acquire(vector_mn_handle, STARPU_W);
			
 
				+		vector_mn[0] = m;
			
 
				+		vector_mn[1] = n;
			
 
				+		starpu_data_release(vector_mn_handle);
			
 
				+
			
 
				+		for ( j = 0; j < 42; j++)
			
 
				 		{
			
 
				-			starpu_insert_task(&cl_init,
			
 
				-				   STARPU_VALUE, &m, sizeof(int),
			
 
				-				   STARPU_VALUE, &n, sizeof(int),
			
 
				-				   STARPU_VALUE, &k, sizeof(int),
			
 
				-				   0);
			
 
				-			starpu_insert_task(&cl_final,
			
 
				-				   STARPU_VALUE, &m, sizeof(int),
			
 
				-				   STARPU_VALUE, &n, sizeof(int),
			
 
				-				   STARPU_VALUE, &k, sizeof(int),
			
 
				-				   0);
			
 
				+			starpu_insert_task( &cl_init,
			
 
				+					    STARPU_R, vector_mn_handle,
			
 
				+					    STARPU_VALUE, &k, sizeof(int),
			
 
				+					    0 );
			
 
				+			starpu_insert_task( &cl_final,
			
 
				+					    STARPU_R, vector_mn_handle,
			
 
				+					    STARPU_VALUE, &k, sizeof(int),
			
 
				+					    0 );
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				+	starpu_data_unregister(vector_mn_handle);
			
 
				+	free(vector_mn);
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				 	return 0;
			
--- a/examples/sched_ctx/gpu_partition.c
+++ b/examples/sched_ctx/gpu_partition.c
@@ -105,7 +105,9 @@ int main(int argc, char **argv)
 
				 	int ncuda = 0;
			
 
				 	int gpu_devid = -1;
			
 
				 
			
 
				+#ifdef STARPU_DEVEL
			
 
				 #warning temporary fix: skip test as cuda computation fails
			
 
				+#endif
			
 
				  	return 77;
			
 
				 
			
 
				 #ifndef STARPU_HAVE_SETENV
			
@@ -172,8 +174,8 @@ int main(int argc, char **argv)
 
				 	int ncpus = starpu_cpu_worker_get_count();
			
 
				 	int workers[ncpus+nstreams];
			
 
				 	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, workers, ncpus);
			
 
				-	
			
 
				-	int sched_ctxs[nstreams];
			
 
				+
			
 
				+	unsigned sched_ctxs[nstreams];
			
 
				 	int nsms[nstreams];
			
 
				 	nsms[0] = 6;
			
 
				 	nsms[1] = 7;
			
@@ -185,7 +187,7 @@ int main(int argc, char **argv)
 
				 	}
			
 
				 	unsigned sched_ctx1 = starpu_sched_ctx_create(workers, ncpus+nstreams, "ctx1", STARPU_SCHED_CTX_SUB_CTXS, sched_ctxs, nstreams, STARPU_SCHED_CTX_POLICY_NAME, "dmdas", 0);
			
 
				 
			
 
				-	FPRINTF(stderr, "parent ctx %d\n", sched_ctx1);
			
 
				+	FPRINTF(stderr, "parent ctx %u\n", sched_ctx1);
			
 
				 	starpu_sched_ctx_set_context(&sched_ctx1);
			
 
				 
			
 
				 #endif
			
--- a/examples/stencil/stencil-blocks.c
+++ b/examples/stencil/stencil-blocks.c
@@ -297,11 +297,10 @@ void allocate_memory_on_node(int rank)
 
				 
			
 
				 		int node = block->mpi_node;
			
 
				 
			
 
				-		unsigned size_bz = block_sizes_z[bz];
			
 
				-
			
 
				 		/* Main blocks */
			
 
				 		if (node == rank)
			
 
				 		{
			
 
				+			unsigned size_bz = block_sizes_z[bz];
			
 
				 			allocate_block_on_node(&block->layers_handle[0], bz, &block->layers[0],
			
 
				 						(sizex + 2*K), (sizey + 2*K), (size_bz + 2*K));
			
 
				 #ifndef STARPU_SIMGRID
			
@@ -389,8 +388,8 @@ void check(int rank)
 
				 		/* Main blocks */
			
 
				 		if (node == rank)
			
 
				 		{
			
 
				-			unsigned size_bz = block_sizes_z[bz];
			
 
				 #ifdef LIFE
			
 
				+			unsigned size_bz = block_sizes_z[bz];
			
 
				 			unsigned x, y, z;
			
 
				 			unsigned sum = 0;
			
 
				 			for (x = 0; x < sizex; x++)
			
--- a/include/fstarpu_mod.f90
+++ b/include/fstarpu_mod.f90
@@ -82,6 +82,7 @@ module fstarpu_mod
 
				         type(c_ptr), bind(C) :: FSTARPU_SCC
			
 
				 
			
 
				         type(c_ptr), bind(C) :: FSTARPU_CODELET_SIMGRID_EXECUTE
			
 
				+        type(c_ptr), bind(C) :: FSTARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT
			
 
				         type(c_ptr), bind(C) :: FSTARPU_CUDA_ASYNC
			
 
				         type(c_ptr), bind(C) :: FSTARPU_OPENCL_ASYNC
			
 
				 
			
@@ -1580,7 +1581,7 @@ module fstarpu_mod
 
				                 end subroutine fstarpu_memchunk_tidy
			
 
				 
			
 
				                 ! == starpu_task_util.h ==
			
 
				-                ! struct starpu_data_handle *fstarpu_data_handle_array_alloc(int nb);
			
 
				+                ! starpu_data_handle_t *fstarpu_data_handle_array_alloc(int nb);
			
 
				                 function fstarpu_data_handle_array_alloc (nb) bind(C)
			
 
				                         use iso_c_binding, only: c_ptr, c_int
			
 
				                         type(c_ptr) :: fstarpu_data_handle_array_alloc
			
@@ -2331,7 +2332,9 @@ module fstarpu_mod
 
				                             fstarpu_get_constant(C_CHAR_"FSTARPU_SCC"//C_NULL_CHAR)
			
 
				 
			
 
				                         FSTARPU_CODELET_SIMGRID_EXECUTE = &
			
 
				-                            fstarpu_get_constant(C_CHAR_"FSTARPU_CODELET_SIMGRID_EXECUTE"//C_NULL_CHAR)
			
 
				+                             fstarpu_get_constant(C_CHAR_"FSTARPU_CODELET_SIMGRID_EXECUTE"//C_NULL_CHAR)
			
 
				+                        FSTARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT = &
			
 
				+                             fstarpu_get_constant(C_CHAR_"FSTARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT"//C_NULL_CHAR)
			
 
				                         FSTARPU_CUDA_ASYNC = &
			
 
				                             fstarpu_get_constant(C_CHAR_"FSTARPU_CUDA_ASYNC"//C_NULL_CHAR)
			
 
				                         FSTARPU_OPENCL_ASYNC = &
			
--- a/include/starpu_config.h.in
+++ b/include/starpu_config.h.in
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2009-2016  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016, 2017  CNRS
			
 
				  * Copyright (C) 2014  INRIA
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -152,5 +152,6 @@ typedef ssize_t starpu_ssize_t;
 
				 #undef STARPU_HAVE_DARWIN
			
 
				 
			
 
				 #undef STARPU_HAVE_CXX11
			
 
				+#undef STARPU_HAVE_STRERROR_R
			
 
				 
			
 
				 #endif
			
--- a/include/starpu_scheduler.h
+++ b/include/starpu_scheduler.h
@@ -62,8 +62,11 @@ unsigned long starpu_task_get_job_id(struct starpu_task *task);
 
				 /* This function must be called to wake up a worker that is sleeping on the cond. 
			
 
				  * It returns 0 whenever the worker is not in a sleeping state */
			
 
				 int starpu_wake_worker(int workerid);
			
 
				+int starpu_wakeup_worker(int workerid, starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex);
			
 
				 /* This is a version of starpu_wake_worker which assumes that the sched mutex is locked */
			
 
				 int starpu_wake_worker_locked(int workerid);
			
 
				+/* This is a version of starpu_wakeup_worker which assumes that the sched mutex is locked */
			
 
				+int starpu_wakeup_worker_locked(int workerid, starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex);
			
 
				 
			
 
				 int starpu_worker_can_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
			
 
				 int starpu_worker_can_execute_task_impl(unsigned workerid, struct starpu_task *task, unsigned *impl_mask);
			
--- a/include/starpu_task.h
+++ b/include/starpu_task.h
@@ -46,6 +46,7 @@ extern "C"
 
				 #define STARPU_MPI_MS	((1ULL)<<9)
			
 
				 
			
 
				 #define STARPU_CODELET_SIMGRID_EXECUTE	(1<<0)
			
 
				+#define STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT	(1<<1)
			
 
				 #define STARPU_CUDA_ASYNC	(1<<0)
			
 
				 #define STARPU_OPENCL_ASYNC	(1<<0)
			
 
				 
			
--- a/include/starpu_thread.h
+++ b/include/starpu_thread.h
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010, 2012-2016  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
			
 
				+ * Copyright (C) 2010, 2012-2017  Université de Bordeaux
			
 
				+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -33,6 +33,7 @@
 
				 #endif
			
 
				 #elif !defined(_MSC_VER) || defined(BUILDING_STARPU)
			
 
				 #include <pthread.h>
			
 
				+#include <semaphore.h>
			
 
				 #endif
			
 
				 #include <stdint.h>
			
 
				 
			
@@ -50,8 +51,9 @@ extern "C"
 
				 typedef msg_process_t starpu_pthread_t;
			
 
				 typedef int starpu_pthread_attr_t;
			
 
				 
			
 
				+int starpu_pthread_equal(starpu_pthread_t t1, starpu_pthread_t t2);
			
 
				+starpu_pthread_t starpu_pthread_self(void);
			
 
				 int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg, msg_host_t host);
			
 
				-#define starpu_pthread_setname(name)
			
 
				 int starpu_pthread_create(starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg);
			
 
				 int starpu_pthread_join(starpu_pthread_t thread, void **retval);
			
 
				 int starpu_pthread_exit(void *retval) STARPU_ATTRIBUTE_NORETURN;
			
@@ -64,8 +66,18 @@ int starpu_pthread_attr_setdetachstate(starpu_pthread_attr_t *attr, int detachst
 
				 typedef pthread_t starpu_pthread_t;
			
 
				 typedef pthread_attr_t starpu_pthread_attr_t;
			
 
				 
			
 
				+#define starpu_pthread_equal pthread_equal
			
 
				+#define starpu_pthread_self pthread_self
			
 
				 #define starpu_pthread_create pthread_create
			
 
				 #define starpu_pthread_create_on(name, thread, attr, routine, arg, where) starpu_pthread_create(thread, attr, routine, arg)
			
 
				+#define starpu_pthread_join pthread_join
			
 
				+#define starpu_pthread_exit pthread_exit
			
 
				+#define starpu_pthread_attr_init pthread_attr_init
			
 
				+#define starpu_pthread_attr_destroy pthread_attr_destroy
			
 
				+#define starpu_pthread_attr_setdetachstate pthread_attr_setdetachstate
			
 
				+
			
 
				+#endif /* STARPU_SIMGRID, _MSC_VER */
			
 
				+
			
 
				 #ifdef STARPU_HAVE_PTHREAD_SETNAME_NP
			
 
				 #ifdef STARPU_HAVE_DARWIN
			
 
				 #define starpu_pthread_setname(name) pthread_setname_np(name)
			
@@ -75,13 +87,6 @@ typedef pthread_attr_t starpu_pthread_attr_t;
 
				 #else
			
 
				 #define starpu_pthread_setname(name)
			
 
				 #endif
			
 
				-#define starpu_pthread_join pthread_join
			
 
				-#define starpu_pthread_exit pthread_exit
			
 
				-#define starpu_pthread_attr_init pthread_attr_init
			
 
				-#define starpu_pthread_attr_destroy pthread_attr_destroy
			
 
				-#define starpu_pthread_attr_setdetachstate pthread_attr_setdetachstate
			
 
				-
			
 
				-#endif /* STARPU_SIMGRID, _MSC_VER */
			
 
				 
			
 
				 /*
			
 
				  * Encapsulation of the pthread_mutex_* functions.
			
@@ -403,6 +408,32 @@ int starpu_pthread_wait_wait(starpu_pthread_wait_t *w);
 
				 int starpu_pthread_wait_destroy(starpu_pthread_wait_t *w);
			
 
				 #endif
			
 
				 
			
 
				+/*
			
 
				+ * Encapsulation of the semaphore functions.
			
 
				+ */
			
 
				+
			
 
				+#ifdef STARPU_SIMGRID
			
 
				+
			
 
				+typedef msg_sem_t starpu_sem_t;
			
 
				+int starpu_sem_destroy(starpu_sem_t *);
			
 
				+int starpu_sem_getvalue(starpu_sem_t *, int *);
			
 
				+int starpu_sem_init(starpu_sem_t *, int, unsigned);
			
 
				+int starpu_sem_post(starpu_sem_t *);
			
 
				+int starpu_sem_trywait(starpu_sem_t *);
			
 
				+int starpu_sem_wait(starpu_sem_t *);
			
 
				+
			
 
				+#elif !defined(_MSC_VER) || defined(BUILDING_STARPU) /* !STARPU_SIMGRID */
			
 
				+
			
 
				+typedef sem_t starpu_sem_t;
			
 
				+#define starpu_sem_destroy sem_destroy
			
 
				+#define starpu_sem_getvalue sem_getvalue
			
 
				+#define starpu_sem_init sem_init
			
 
				+#define starpu_sem_post sem_post
			
 
				+int starpu_sem_trywait(starpu_sem_t *);
			
 
				+int starpu_sem_wait(starpu_sem_t *);
			
 
				+
			
 
				+#endif
			
 
				+
			
 
				 #ifdef __cplusplus
			
 
				 }
			
 
				 #endif
			
--- a/mpi/dev/starpu_mpi_comm_check.sh
+++ b/mpi/dev/starpu_mpi_comm_check.sh
@@ -0,0 +1,109 @@
 
				+#!/bin/bash
			
 
				+
			
 
				+# StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+#
			
 
				+# Copyright (C) 2017 CNRS
			
 
				+#
			
 
				+# StarPU is free software; you can redistribute it and/or modify
			
 
				+# it under the terms of the GNU Lesser General Public License as published by
			
 
				+# the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+# your option) any later version.
			
 
				+#
			
 
				+# StarPU is distributed in the hope that it will be useful, but
			
 
				+# WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+#
			
 
				+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+
			
 
				+# Script to check that MPI communications are done properly
			
 
				+# The application should be launched with STARPU_MPI_COMM=1
			
 
				+# e.g.
			
 
				+#    $ export STARPU_MPI_COMM=1
			
 
				+#    $ mpirun --output-filename starpu_mpi.log appli parameters
			
 
				+# and then the script can be launched with the output files
			
 
				+#    $ starpu_mpi_comm_check.sh starpu_mpi.log.*
			
 
				+
			
 
				+if test -z "$1"
			
 
				+then
			
 
				+    echo Syntax error: parameter missing
			
 
				+    exit 1
			
 
				+fi
			
 
				+
			
 
				+# Get the nodes identifiers
			
 
				+nodes=$(for f in $*
			
 
				+	do
			
 
				+	    grep starpu_mpi $f | grep '\[' | awk '{print $1}'| sed 's/\[\(.*\)\]\[starpu_mpi\]/\1/' | grep "^[[:digit:]]*$"
			
 
				+	done |sort|uniq
			
 
				+     )
			
 
				+echo nodes $nodes
			
 
				+
			
 
				+DIR=/tmp
			
 
				+
			
 
				+# for each node, extract send and receive communications
			
 
				+for node in $nodes
			
 
				+do
			
 
				+    for f in $*
			
 
				+    do
			
 
				+	grep starpu_mpi $f |grep "\[$node"
			
 
				+    done > $DIR/starpu_mpi_node$node.log
			
 
				+    grep -- "-->" $DIR/starpu_mpi_node$node.log > $DIR/starpu_mpi_node${node}_send.log
			
 
				+    grep -- "<--" $DIR/starpu_mpi_node$node.log > $DIR/starpu_mpi_node${node}_recv.log
			
 
				+done
			
 
				+
			
 
				+# count the number of traced lines
			
 
				+#for node in $nodes
			
 
				+#do
			
 
				+#    wc -l $DIR/starpu_mpi_node${node}_recv.log
			
 
				+#    lines=$(grep :42:42 $DIR/starpu_mpi_node${node}_recv.log | wc -l)
			
 
				+#    lines2=$(( lines + lines ))
			
 
				+#    echo $lines2
			
 
				+#    lines3=$(( lines2 + lines ))
			
 
				+#    echo $lines3
			
 
				+#done
			
 
				+
			
 
				+# for each pair of nodes, check tags are sent and received in the same order
			
 
				+for src in $nodes
			
 
				+do
			
 
				+    for dst in $nodes
			
 
				+    do
			
 
				+	if test $src != $dst
			
 
				+	then
			
 
				+	    grep ":$dst:42:" $DIR/starpu_mpi_node${src}_send.log| awk -F':' '{print $6}' > $DIR/node${src}_send_to_${dst}.log
			
 
				+	    grep ":$src:42:" $DIR/starpu_mpi_node${dst}_recv.log|awk -F ':' '{print $6}'> $DIR/node${dst}_recv_from_${src}.log
			
 
				+ 	    diff --side-by-side  --suppress-common-lines $DIR/node${src}_send_to_${dst}.log $DIR/node${dst}_recv_from_${src}.log  > $DIR/check_$$
			
 
				+	    if test -s $DIR/check_$$
			
 
				+	    then
			
 
				+		echo $src $dst
			
 
				+		less $DIR/check_$$
			
 
				+	    fi
			
 
				+	fi
			
 
				+    done
			
 
				+done
			
 
				+
			
 
				+# check each envelope reception is followed by the appropriate data reception
			
 
				+# first line: MPI_Recv of the envelope
			
 
				+# second line: display envelope information
			
 
				+# third line: MPI_Recv of the data
			
 
				+for node in $nodes
			
 
				+do
			
 
				+    echo processing $DIR/starpu_mpi_node${node}_recv.log
			
 
				+    (
			
 
				+	while read line
			
 
				+	do
			
 
				+	    read line2
			
 
				+	    read line3
			
 
				+	    #echo processing
			
 
				+	    tag2=$(echo $line2 | awk -F ':' '{print $6}')
			
 
				+	    tag3=$(echo $line3 | awk -F ':' '{print $6}')
			
 
				+	    if test "$tag2" != "$tag3"
			
 
				+	    then
			
 
				+		echo erreur
			
 
				+		echo $tag2 $tag3
			
 
				+		echo $line
			
 
				+		echo $line2
			
 
				+		echo $line3
			
 
				+	    fi
			
 
				+	done
			
 
				+    ) < $DIR/starpu_mpi_node${node}_recv.log
			
 
				+done
			
 
				+
			
--- a/sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c
+++ b/sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c
@@ -27,7 +27,7 @@ static void _try_resizing(unsigned *sched_ctxs, int nsched_ctxs, int *workers, i
 
				 	/* for vite */
			
 
				 	int ns = sched_ctxs == NULL ? sc_hypervisor_get_nsched_ctxs() : nsched_ctxs;
			
 
				 #ifdef STARPU_SC_HYPERVISOR_DEBUG
			
 
				-	printf("resize_no = %u %d ctxs\n", resize_no, ns);
			
 
				+	printf("resize_no = %lu %d ctxs\n", resize_no, ns);
			
 
				 #endif
			
 
				 	if(ns <= 0) return;
			
 
				 
			
--- a/socl/src/cl_createkernel.c
+++ b/socl/src/cl_createkernel.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010,2011 University of Bordeaux
			
 
				- * Copyright (C) 2016  CNRS
			
 
				+ * Copyright (C) 2016, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -25,7 +25,7 @@ static void soclCreateKernel_task(void *data) {
 
				 
			
 
				    if (k->program->cl_programs[range] == NULL) {
			
 
				       k->errcodes[range] = CL_SUCCESS;
			
 
				-      DEBUG_MSG("[Device %d] Kernel creation skipped: program has not been built for this device.\n", starpu_worker_get_id_check());
			
 
				+      DEBUG_MSG("[Device %u] Kernel creation skipped: program has not been built for this device.\n", starpu_worker_get_id_check());
			
 
				       return;
			
 
				    }
			
 
				 
			
@@ -163,7 +163,7 @@ soclCreateKernel(cl_program    program,
 
				    }
			
 
				 
			
 
				    /* Create kernel on each device */
			
 
				-   DEBUG_MSG("[Kernel %d] Create %d kernels (name \"%s\")\n", k->id, socl_device_count, kernel_name);
			
 
				+   DEBUG_MSG("[Kernel %d] Create %u kernels (name \"%s\")\n", k->id, socl_device_count, kernel_name);
			
 
				    starpu_execute_on_each_worker_ex(soclCreateKernel_task, k, STARPU_OPENCL, "SOCL_CREATE_KERNEL");
			
 
				 
			
 
				    if (errcode_ret != NULL) {
			
--- a/socl/src/cl_createprogramwithsource.c
+++ b/socl/src/cl_createprogramwithsource.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010,2011 University of Bordeaux
			
 
				- * Copyright (C) 2016  CNRS
			
 
				+ * Copyright (C) 2016, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -141,7 +141,7 @@ soclCreateProgramWithSource(cl_context      context,
 
				       *errcode_ret = CL_SUCCESS;
			
 
				       for (i=0; i<socl_device_count; i++) {
			
 
				          if (data->errcodes[i] != CL_SUCCESS) {
			
 
				-            DEBUG_MSG("Worker [%d] failed\n", i);
			
 
				+            DEBUG_MSG("Worker [%u] failed\n", i);
			
 
				             DEBUG_CL("clCreateProgramWithSource", data->errcodes[i]);
			
 
				             *errcode_ret = data->errcodes[i];
			
 
				             break;
			
--- a/socl/src/cl_enqueuendrangekernel.c
+++ b/socl/src/cl_enqueuendrangekernel.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010,2011, 2016-2017 University of Bordeaux
			
 
				- * Copyright (C) 2016  CNRS
			
 
				+ * Copyright (C) 2016, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -71,13 +71,13 @@ void soclEnqueueNDRangeKernel_task(void *descr[], void *args) {
 
				    if (err != CL_SUCCESS) {
			
 
				 	   ERROR_MSG("Worker[%d] Unable to Enqueue kernel (error %d)\n", wid, err);
			
 
				 	   DEBUG_CL("clEnqueueNDRangeKernel", err);
			
 
				-	   DEBUG_MSG("Workdim %d, global_work_offset %p, global_work_size %p, local_work_size %p\n",
			
 
				+	   DEBUG_MSG("Workdim %u, global_work_offset %p, global_work_size %p, local_work_size %p\n",
			
 
				 			   cmd->work_dim, cmd->global_work_offset, cmd->global_work_size, cmd->local_work_size);
			
 
				-	   DEBUG_MSG("Global work size: %ld %ld %ld\n", cmd->global_work_size[0],
			
 
				-			   (cmd->work_dim > 1 ? cmd->global_work_size[1] : 1), (cmd->work_dim > 2 ? cmd->global_work_size[2] : 1)); 
			
 
				+	   DEBUG_MSG("Global work size: %ld %ld %ld\n", (long)cmd->global_work_size[0],
			
 
				+		     (long)(cmd->work_dim > 1 ? cmd->global_work_size[1] : 1), (long)(cmd->work_dim > 2 ? cmd->global_work_size[2] : 1)); 
			
 
				 	   if (cmd->local_work_size != NULL)
			
 
				-		   DEBUG_MSG("Local work size: %ld %ld %ld\n", cmd->local_work_size[0],
			
 
				-				   (cmd->work_dim > 1 ? cmd->local_work_size[1] : 1), (cmd->work_dim > 2 ? cmd->local_work_size[2] : 1)); 
			
 
				+		   DEBUG_MSG("Local work size: %ld %ld %ld\n", (long)cmd->local_work_size[0],
			
 
				+			     (long)(cmd->work_dim > 1 ? cmd->local_work_size[1] : 1), (long)(cmd->work_dim > 2 ? cmd->local_work_size[2] : 1)); 
			
 
				    }
			
 
				    else {
			
 
				       /* Waiting for kernel to terminate */
			
--- a/socl/src/cl_enqueuereadbuffer.c
+++ b/socl/src/cl_enqueuereadbuffer.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010,2011, 2014 University of Bordeaux
			
 
				- * Copyright (C) 2016  CNRS
			
 
				+ * Copyright (C) 2016, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -25,7 +25,7 @@ static void soclEnqueueReadBuffer_cpu_task(void *descr[], void *args) {
 
				   gc_entity_release(ev);
			
 
				 
			
 
				    char * ptr = (void*)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				-   DEBUG_MSG("[Buffer %d] Reading %ld bytes from %p to %p\n", cmd->buffer->id, cmd->cb, ptr+cmd->offset, cmd->ptr);
			
 
				+   DEBUG_MSG("[Buffer %d] Reading %ld bytes from %p to %p\n", cmd->buffer->id, (long)cmd->cb, ptr+cmd->offset, cmd->ptr);
			
 
				 
			
 
				    //This fix is for people who use USE_HOST_PTR and still use ReadBuffer to sync the buffer in host mem at host_ptr.
			
 
				    //They should use buffer mapping facilities instead.
			
@@ -44,7 +44,7 @@ static void soclEnqueueReadBuffer_opencl_task(void *descr[], void *args) {
 
				 
			
 
				    cl_mem mem = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				 
			
 
				-   DEBUG_MSG("[Buffer %d] Reading %ld bytes from offset %ld into %p\n", cmd->buffer->id, cmd->cb, cmd->offset, cmd->ptr);
			
 
				+   DEBUG_MSG("[Buffer %d] Reading %ld bytes from offset %ld into %p\n", cmd->buffer->id, (long)cmd->cb, (long)cmd->offset, cmd->ptr);
			
 
				 
			
 
				    int wid = starpu_worker_get_id_check();
			
 
				    cl_command_queue cq;
			
--- a/socl/src/cl_enqueuewritebuffer.c
+++ b/socl/src/cl_enqueuewritebuffer.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010,2011, 2014 University of Bordeaux
			
 
				- * Copyright (C) 2016  CNRS
			
 
				+ * Copyright (C) 2016, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -26,7 +26,7 @@ static void soclEnqueueWriteBuffer_cpu_task(void *descr[], void *args) {
 
				   gc_entity_release(ev);
			
 
				 
			
 
				    char * ptr = (void*)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				-   DEBUG_MSG("[Buffer %d] Writing %ld bytes from %p to %p\n", cmd->buffer->id, cmd->cb, cmd->ptr, ptr+cmd->offset);
			
 
				+   DEBUG_MSG("[Buffer %d] Writing %ld bytes from %p to %p\n", cmd->buffer->id, (long)cmd->cb, cmd->ptr, ptr+cmd->offset);
			
 
				 
			
 
				    //FIXME: Fix for people who use USE_HOST_PTR, modify data at host_ptr and use WriteBuffer to commit the change.
			
 
				    // StarPU may have erased host mem at host_ptr (for instance by retrieving current buffer data at host_ptr)
			
@@ -47,7 +47,7 @@ static void soclEnqueueWriteBuffer_opencl_task(void *descr[], void *args) {
 
				 
			
 
				    cl_mem mem = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				 
			
 
				-   DEBUG_MSG("[Buffer %d] Writing %ld bytes to offset %ld from %p\n", cmd->buffer->id, cmd->cb, cmd->offset, cmd->ptr);
			
 
				+   DEBUG_MSG("[Buffer %d] Writing %ld bytes to offset %ld from %p\n", cmd->buffer->id, (long)cmd->cb, (long)cmd->offset, cmd->ptr);
			
 
				 
			
 
				    int wid = starpu_worker_get_id_check();
			
 
				    cl_command_queue cq;
			
--- a/socl/src/cl_setkernelarg.c
+++ b/socl/src/cl_setkernelarg.c
@@ -68,7 +68,7 @@ soclSetKernelArg(cl_kernel  kernel,
 
				    kernel->arg_type[arg_index] = Null;
			
 
				    kernel->arg_size[arg_index] = arg_size;
			
 
				 
			
 
				-   DEBUG_MSG("[Kernel %d] Set argument %d: argsize %ld argvalue %p\n", kernel->id, arg_index, arg_size, arg_value);
			
 
				+   DEBUG_MSG("[Kernel %d] Set argument %d: argsize %ld argvalue %p\n", kernel->id, arg_index, (long)arg_size, arg_value);
			
 
				 
			
 
				    /* Argument is not Null */
			
 
				    if (arg_value != NULL) {
			
--- a/socl/src/task.c
+++ b/socl/src/task.c
@@ -77,7 +77,7 @@ void task_depends_on(starpu_task task, cl_uint num_events, cl_event *events) {
 
				     DEBUG_MSG("Task %p depends on events:", task);
			
 
				     for (i=0; i<num_events; i++) {
			
 
				        tags[i] = events[i]->id;
			
 
				-       DEBUG_MSG_NOHEAD(" %u", events[i]->id);
			
 
				+       DEBUG_MSG_NOHEAD(" %d", events[i]->id);
			
 
				     }
			
 
				     DEBUG_MSG_NOHEAD("\n");
			
 
				 
			
--- a/src/common/fxt.c
+++ b/src/common/fxt.c
@@ -72,7 +72,7 @@ long _starpu_gettid(void)
 
				 #elif defined(_WIN32) && !defined(__CYGWIN__)
			
 
				 	return (long) GetCurrentThreadId();
			
 
				 #else
			
 
				-	return (long) pthread_self();
			
 
				+	return (long) starpu_pthread_self();
			
 
				 #endif
			
 
				 #endif
			
 
				 }
			
--- a/src/common/prio_list.h
+++ b/src/common/prio_list.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2015-2016  Université de Bordeaux
			
 
				+ * Copyright (C) 2015-2017  Université de Bordeaux
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -252,9 +252,31 @@
 
				 	static inline void ENAME##_prio_list_deinit(struct ENAME##_prio_list *priolist) \
			
 
				 	{ (void) (priolist); /* ENAME##_list_deinit(&(priolist)->list); */ } \
			
 
				 	static inline void ENAME##_prio_list_push_back(struct ENAME##_prio_list *priolist, struct ENAME *e) \
			
 
				-	{ ENAME##_list_push_back(&(priolist)->list, (e)); } \
			
 
				+	{ \
			
 
				+		struct ENAME *cur; \
			
 
				+		for (cur  = ENAME##_list_begin(&(priolist)->list); \
			
 
				+		     cur != ENAME##_list_end(&(priolist)->list); \
			
 
				+		     cur  = ENAME##_list_next(cur)) \
			
 
				+			if ((e)->PRIOFIELD > cur->PRIOFIELD) \
			
 
				+				break; \
			
 
				+		if (cur == ENAME##_list_end(&(priolist)->list)) \
			
 
				+			ENAME##_list_push_back(&(priolist)->list, (e)); \
			
 
				+		else \
			
 
				+			ENAME##_list_insert_before(&(priolist)->list, (e), cur); \
			
 
				+	} \
			
 
				 	static inline void ENAME##_prio_list_push_front(struct ENAME##_prio_list *priolist, struct ENAME *e) \
			
 
				-	{ ENAME##_list_push_front(&(priolist)->list, (e)); } \
			
 
				+	{ \
			
 
				+		struct ENAME *cur; \
			
 
				+		for (cur  = ENAME##_list_begin(&(priolist)->list); \
			
 
				+		     cur != ENAME##_list_end(&(priolist)->list); \
			
 
				+		     cur  = ENAME##_list_next(cur)) \
			
 
				+			if ((e)->PRIOFIELD >= cur->PRIOFIELD) \
			
 
				+				break; \
			
 
				+		if (cur == ENAME##_list_end(&(priolist)->list)) \
			
 
				+			ENAME##_list_push_back(&(priolist)->list, (e)); \
			
 
				+		else \
			
 
				+			ENAME##_list_insert_before(&(priolist)->list, (e), cur); \
			
 
				+	} \
			
 
				 	static inline int ENAME##_prio_list_empty(const struct ENAME##_prio_list *priolist) \
			
 
				 	{ return ENAME##_list_empty(&(priolist)->list); } \
			
 
				 	static inline void ENAME##_prio_list_erase(struct ENAME##_prio_list *priolist, struct ENAME *e) \
			
--- a/src/common/starpu_spinlock.c
+++ b/src/common/starpu_spinlock.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2012-2014, 2016  Université de Bordeaux
			
 
				- * Copyright (C) 2010, 2011, 2013, 2014  CNRS
			
 
				+ * Copyright (C) 2010, 2011, 2013, 2014, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -25,10 +25,9 @@
 
				 int _starpu_spin_init(struct _starpu_spinlock *lock)
			
 
				 {
			
 
				 	starpu_pthread_mutexattr_t errcheck_attr;
			
 
				-//	memcpy(&lock->errcheck_lock, PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP, sizeof(PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP));
			
 
				 	int ret;
			
 
				 	ret = starpu_pthread_mutexattr_init(&errcheck_attr);
			
 
				-	STARPU_CHECK_RETURN_VALUE(ret, "pthread_mutexattr_init");
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_pthread_mutexattr_init");
			
 
				 
			
 
				 	ret = starpu_pthread_mutexattr_settype(&errcheck_attr, PTHREAD_MUTEX_ERRORCHECK);
			
 
				 	STARPU_ASSERT(!ret);
			
--- a/src/common/thread.c
+++ b/src/common/thread.c
@@ -19,6 +19,7 @@
 
				 #include <core/simgrid.h>
			
 
				 #include <core/workers.h>
			
 
				 
			
 
				+#include <errno.h>
			
 
				 #include <limits.h>
			
 
				 
			
 
				 #ifdef STARPU_SIMGRID
			
@@ -50,6 +51,16 @@ static int _starpu_futex_wake = FUTEX_WAKE;
 
				 
			
 
				 extern int _starpu_simgrid_thread_start(int argc, char *argv[]);
			
 
				 
			
 
				+int starpu_pthread_equal(starpu_pthread_t t1, starpu_pthread_t t2)
			
 
				+{
			
 
				+	return t1 == t2;
			
 
				+}
			
 
				+
			
 
				+starpu_pthread_t starpu_pthread_self(void)
			
 
				+{
			
 
				+	return MSG_process_self();
			
 
				+}
			
 
				+
			
 
				 int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_pthread_attr_t *attr STARPU_ATTRIBUTE_UNUSED, void *(*start_routine) (void *), void *arg, msg_host_t host)
			
 
				 {
			
 
				 	char **_args;
			
@@ -62,6 +73,9 @@ int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_
 
				 	void *tsd;
			
 
				 	_STARPU_CALLOC(tsd, MAX_TSD+1, sizeof(void*));
			
 
				 	*thread = MSG_process_create_with_arguments(name, _starpu_simgrid_thread_start, tsd, host, 2, _args);
			
 
				+#if SIMGRID_VERSION_MAJOR > 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR >= 15)
			
 
				+	MSG_process_ref(*thread);
			
 
				+#endif
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
@@ -74,6 +88,9 @@ int starpu_pthread_join(starpu_pthread_t thread STARPU_ATTRIBUTE_UNUSED, void **
 
				 {
			
 
				 #if SIMGRID_VERSION_MAJOR > 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR >= 14)
			
 
				 	MSG_process_join(thread, 1000000);
			
 
				+#if SIMGRID_VERSION_MAJOR > 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR >= 15)
			
 
				+	MSG_process_unref(thread);
			
 
				+#endif
			
 
				 #else
			
 
				 	MSG_process_sleep(1);
			
 
				 #endif
			
@@ -519,7 +536,7 @@ int starpu_pthread_queue_destroy(starpu_pthread_queue_t *q)
 
				 #endif /* STARPU_SIMGRID */
			
 
				 
			
 
				 #if (defined(STARPU_SIMGRID) && !defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT)) || (!defined(STARPU_SIMGRID) && !defined(STARPU_HAVE_PTHREAD_BARRIER))
			
 
				-int starpu_pthread_barrier_init(starpu_pthread_barrier_t *restrict barrier, const starpu_pthread_barrierattr_t *restrict attr, unsigned count)
			
 
				+int starpu_pthread_barrier_init(starpu_pthread_barrier_t *restrict barrier, const starpu_pthread_barrierattr_t *restrict attr STARPU_ATTRIBUTE_UNUSED, unsigned count)
			
 
				 {
			
 
				 	int ret = starpu_pthread_mutex_init(&barrier->mutex, NULL);
			
 
				 	if (!ret)
			
@@ -703,47 +720,34 @@ int starpu_pthread_barrier_wait(starpu_pthread_barrier_t *barrier)
 
				  * macros of course) which record when the mutex is held or not */
			
 
				 int starpu_pthread_mutex_lock_sched(starpu_pthread_mutex_t *mutex) /* Lock mutex and record it on the calling worker when it is that worker's sched mutex; returns the lock call's result. */
			
 
				 {
			
 
				-	const int workerid = starpu_worker_get_id();
			
 
				-	struct _starpu_worker * const worker = (workerid != -1)?_starpu_get_worker_struct(workerid):NULL;
			
 
				-	if(worker && mutex == &worker->sched_mutex)
			
 
				-	{
			
 
				-		STARPU_ASSERT(worker->sched_mutex_depth < UINT_MAX);
			
 
				-		worker->sched_mutex_depth++;
			
 
				-		if (worker->sched_mutex_depth > 1)
			
 
				-			return 0;
			
 
				-	}
			
 
				-
			
 
				-	return starpu_pthread_mutex_lock(mutex);
			
 
				+	int p_ret = starpu_pthread_mutex_lock(mutex); /* take the lock first, bookkeeping comes afterwards */
			
 
				+	int workerid = starpu_worker_get_id(); /* -1 is treated below as "caller is not a worker" */
			
 
				+	if(workerid != -1 && _starpu_worker_mutex_is_sched_mutex(workerid, mutex))
			
 
				+		_starpu_worker_set_flag_sched_mutex_locked(workerid, 1); /* NOTE(review): set even when p_ret is non-zero, unlike the trylock variant -- confirm intended */
			
 
				+	return p_ret;
			
 
				 }
			
 
				 
			
 
				 int starpu_pthread_mutex_unlock_sched(starpu_pthread_mutex_t *mutex) /* Clear the calling worker's "sched mutex locked" flag when applicable, then unlock mutex; returns the unlock call's result. */
			
 
				 {
			
 
				-	const int workerid = starpu_worker_get_id();
			
 
				-	struct _starpu_worker * const worker = (workerid != -1)?_starpu_get_worker_struct(workerid):NULL;
			
 
				-	if(worker && mutex == &worker->sched_mutex)
			
 
				-	{
			
 
				-		STARPU_ASSERT(worker->sched_mutex_depth > 0);
			
 
				-		worker->sched_mutex_depth--;
			
 
				-		if (worker->sched_mutex_depth > 0)
			
 
				-			return 0;
			
 
				-	}
			
 
				+	int workerid = starpu_worker_get_id(); /* -1 is treated below as "caller is not a worker" */
			
 
				+	if(workerid != -1 && _starpu_worker_mutex_is_sched_mutex(workerid, mutex))
			
 
				+		_starpu_worker_set_flag_sched_mutex_locked(workerid, 0); /* flag is cleared while the mutex is still held */
			
 
				 
			
 
				 	return starpu_pthread_mutex_unlock(mutex);
			
 
				 }
			
 
				 
			
 
				 int starpu_pthread_mutex_trylock_sched(starpu_pthread_mutex_t *mutex) /* Try to lock mutex; on success record it on the calling worker when it is that worker's sched mutex. Returns the trylock call's result. */
			
 
				 {
			
 
				-	const int workerid = starpu_worker_get_id();
			
 
				-	struct _starpu_worker * const worker = (workerid != -1)?_starpu_get_worker_struct(workerid):NULL;
			
 
				-	if(worker && mutex == &worker->sched_mutex)
			
 
				+	int ret = starpu_pthread_mutex_trylock(mutex);
			
 
				+
			
 
				+	if (!ret) /* only touch the flag when the lock was actually acquired */
			
 
				 	{
			
 
				-		STARPU_ASSERT(worker->sched_mutex_depth < UINT_MAX);
			
 
				-		worker->sched_mutex_depth++;
			
 
				-		if (worker->sched_mutex_depth > 1)
			
 
				-			return 0;
			
 
				+		int workerid = starpu_worker_get_id(); /* -1 is treated below as "caller is not a worker" */
			
 
				+		if(workerid != -1 && _starpu_worker_mutex_is_sched_mutex(workerid, mutex))
			
 
				+			_starpu_worker_set_flag_sched_mutex_locked(workerid, 1);
			
 
				 	}
			
 
				 
			
 
				-	return starpu_pthread_mutex_trylock(mutex);
			
 
				+	return ret;
			
 
				 }
			
 
				 
			
 
				 #ifdef STARPU_DEBUG
			
@@ -870,3 +874,72 @@ void _starpu_pthread_spin_do_unlock(starpu_pthread_spinlock_t *lock)
 
				 #endif
			
 
				 
			
 
				 #endif /* defined(STARPU_SIMGRID) || (defined(STARPU_LINUX_SYS) && defined(STARPU_HAVE_XCHG)) || !defined(STARPU_HAVE_PTHREAD_SPIN_LOCK) */
			
 
				+
			
 
				+#ifdef STARPU_SIMGRID
			
 
				+
			
 
				+int starpu_sem_destroy(starpu_sem_t *sem) /* Destroy the SimGrid semaphore stored in *sem; always returns 0. */
			
 
				+{
			
 
				+	MSG_sem_destroy(*sem); /* *sem itself is left untouched afterwards */
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+int starpu_sem_init(starpu_sem_t *sem, int pshared, unsigned value) /* Create a SimGrid semaphore with initial value `value` and store it in *sem; always returns 0. */
			
 
				+{
			
 
				+	STARPU_ASSERT_MSG(pshared == 0, "pshared semaphores not supported under simgrid"); /* only pshared == 0 is accepted */
			
 
				+	*sem = MSG_sem_init(value);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+int starpu_sem_post(starpu_sem_t *sem) /* Release the SimGrid semaphore stored in *sem; always returns 0. */
			
 
				+{
			
 
				+	MSG_sem_release(*sem);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+int starpu_sem_wait(starpu_sem_t *sem) /* Acquire the SimGrid semaphore stored in *sem; always returns 0. */
			
 
				+{
			
 
				+	MSG_sem_acquire(*sem);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+int starpu_sem_trywait(starpu_sem_t *sem) /* Acquire *sem only if MSG_sem_would_block() reports it would not block; returns 0 on acquisition, EAGAIN otherwise. */
			
 
				+{
			
 
				+	if (MSG_sem_would_block(*sem))
			
 
				+		return EAGAIN; /* EAGAIN is the return value itself here; errno is not set by this function */
			
 
				+	starpu_sem_wait(sem);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+int starpu_sem_getvalue(starpu_sem_t *sem, int *sval) /* Store in *sval the capacity MSG_sem_get_capacity() reports for *sem and return 0; aborts on SimGrid <= 3.13. */
			
 
				+{
			
 
				+#if SIMGRID_VERSION_MAJOR > 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR > 13)
			
 
				+	*sval = MSG_sem_get_capacity(*sem);
			
 
				+	return 0;
			
 
				+#else
			
 
				+	(void) sem; /* silence unused-parameter warnings in this branch */
			
 
				+	(void) sval;
			
 
				+	STARPU_ABORT_MSG("simgrid up to 3.13 did not have working MSG_sem_get_capacity"); /* no value can be reported with these versions */
			
 
				+#endif
			
 
				+}
			
 
				+
			
 
				+#elif !defined(_MSC_VER) || defined(BUILDING_STARPU) /* !STARPU_SIMGRID */
			
 
				+
			
 
				+int starpu_sem_wait(starpu_sem_t *sem) /* sem_wait() wrapper that retries when the call fails with errno == EINTR; returns the last sem_wait() result. */
			
 
				+{
			
 
				+	int ret;
			
 
				+	while((ret = sem_wait(sem)) == -1 && errno == EINTR) /* retry only on EINTR; any other failure is returned as -1 */
			
 
				+		;
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+int starpu_sem_trywait(starpu_sem_t *sem) /* sem_trywait() wrapper that retries when the call fails with errno == EINTR; returns the last sem_trywait() result. */
			
 
				+{
			
 
				+	int ret;
			
 
				+	while((ret = sem_trywait(sem)) == -1 && errno == EINTR) /* retry only on EINTR; other failures are returned as -1 with errno left as set by sem_trywait() */
			
 
				+		;
			
 
				+	
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+#endif
			
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -24,7 +24,6 @@
 
				 #include <string.h>
			
 
				 #include <stdlib.h>
			
 
				 #include <math.h>
			
 
				-#include <pthread.h>
			
 
				 #ifdef STARPU_HAVE_SCHED_YIELD
			
 
				 #include <sched.h>
			
 
				 #endif
			
@@ -97,9 +96,9 @@
 
				 #endif
			
 
				 
			
 
				 #ifdef STARPU_EXTRA_VERBOSE
			
 
				-#  define _STARPU_LOG_IN()             do { if (!_starpu_silent) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] -->\n", pthread_self(), __starpu_func__,__FILE__,  __LINE__); }} while(0)
			
 
				-#  define _STARPU_LOG_OUT()            do { if (!_starpu_silent) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] <--\n", pthread_self(), __starpu_func__, __FILE__,  __LINE__); }} while(0)
			
 
				-#  define _STARPU_LOG_OUT_TAG(outtag)  do { if (!_starpu_silent) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] <-- (%s)\n", pthread_self(), __starpu_func__, __FILE__, __LINE__, outtag); }} while(0)
			
 
				+#  define _STARPU_LOG_IN()             do { if (!_starpu_silent) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] -->\n", starpu_pthread_self(), __starpu_func__,__FILE__,  __LINE__); }} while(0)
			
 
				+#  define _STARPU_LOG_OUT()            do { if (!_starpu_silent) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] <--\n", starpu_pthread_self(), __starpu_func__, __FILE__,  __LINE__); }} while(0)
			
 
				+#  define _STARPU_LOG_OUT_TAG(outtag)  do { if (!_starpu_silent) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] <-- (%s)\n", starpu_pthread_self(), __starpu_func__, __FILE__, __LINE__, outtag); }} while(0)
			
 
				 #else
			
 
				 #  define _STARPU_LOG_IN()
			
 
				 #  define _STARPU_LOG_OUT()
			
--- a/src/core/jobs.c
+++ b/src/core/jobs.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009-2016  Université de Bordeaux
			
 
				+ * Copyright (C) 2009-2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
			
 
				  * Copyright (C) 2011  Télécom-SudParis
			
 
				  * Copyright (C) 2011, 2014, 2016  INRIA
			
@@ -88,7 +88,7 @@ struct _starpu_job* STARPU_ATTRIBUTE_MALLOC _starpu_job_create(struct starpu_tas
 
				 
			
 
				 #ifndef STARPU_USE_FXT
			
 
				 	if (_starpu_bound_recording || _starpu_top_status_get() ||
			
 
				-		_starpu_task_break_on_push != -1 || _starpu_task_break_on_pop != -1 || _starpu_task_break_on_sched != -1
			
 
				+		_starpu_task_break_on_push != -1 || _starpu_task_break_on_sched != -1 || _starpu_task_break_on_pop != -1 || _starpu_task_break_on_exec != -1
			
 
				 		|| STARPU_AYU_EVENT)
			
 
				 #endif
			
 
				 	{
			
--- a/src/core/sched_ctx.c
+++ b/src/core/sched_ctx.c
--- a/src/core/sched_ctx.h
+++ b/src/core/sched_ctx.h
@@ -73,12 +73,27 @@ struct _starpu_sched_ctx
 
				 	long iterations[2];
			
 
				 	int iteration_level;
			
 
				 
			
 
				+	/* cond to block push when there are no workers in the ctx */
			
 
				+	starpu_pthread_cond_t no_workers_cond;
			
 
				+
			
 
				+	/* mutex to block push when there are no workers in the ctx */
			
 
				+	starpu_pthread_mutex_t no_workers_mutex;
			
 
				+
			
 
				 	/*ready tasks that couldn't be pushed because the ctx has no workers*/
			
 
				 	struct starpu_task_list empty_ctx_tasks;
			
 
				 
			
 
				+	/* mutex protecting empty_ctx_tasks list */
			
 
				+	starpu_pthread_mutex_t empty_ctx_mutex;
			
 
				+
			
 
				 	/*ready tasks that couldn't be pushed because the window of tasks was already full*/
			
 
				 	struct starpu_task_list waiting_tasks;
			
 
				 
			
 
				+	/* mutex protecting waiting_tasks list */
			
 
				+	starpu_pthread_mutex_t waiting_tasks_mutex;
			
 
				+
			
 
				+	/* mutex protecting write to all worker's sched_ctx_list structure for this sched_ctx */
			
 
				+	starpu_pthread_mutex_t sched_ctx_list_mutex;
			
 
				+
			
 
				 	/* min CPUs to execute*/
			
 
				 	int min_ncpus;
			
 
				 
			
@@ -127,10 +142,27 @@ struct _starpu_sched_ctx
 
				 	   if not master is -1 */
			
 
				 	int main_master;
			
 
				 
			
 
				+	/* conditions variables used when parallel sections are executed in contexts */
			
 
				+	starpu_pthread_cond_t parallel_sect_cond[STARPU_NMAXWORKERS];
			
 
				+	starpu_pthread_mutex_t parallel_sect_mutex[STARPU_NMAXWORKERS];
			
 
				+	starpu_pthread_cond_t parallel_sect_cond_busy[STARPU_NMAXWORKERS];
			
 
				+	int busy[STARPU_NMAXWORKERS];
			
 
				+
			
 
				 	/* boolean indicating that workers should block in order to allow
			
 
				 	   parallel sections to be executed on their allocated resources */
			
 
				 	unsigned parallel_sect[STARPU_NMAXWORKERS];
			
 
				 
			
 
				+	/* semaphore that block appl thread until starpu threads are
			
 
				+	   all blocked and ready to exec the parallel code */
			
 
				+	starpu_sem_t fall_asleep_sem[STARPU_NMAXWORKERS];
			
 
				+
			
 
				+	/* semaphore that block appl thread until starpu threads are 
			
 
				+	   all woke up and ready continue appl */
			
 
				+	starpu_sem_t wake_up_sem[STARPU_NMAXWORKERS];
			
 
				+
			
 
				+	/* bool indicating if the workers is sleeping in this ctx */
			
 
				+	unsigned sleeping[STARPU_NMAXWORKERS];
			
 
				+
			
 
				 	/* ctx nesting the current ctx */
			
 
				 	unsigned nesting_sched_ctx;
			
 
				 
			
@@ -158,9 +190,6 @@ struct _starpu_sched_ctx
 
				 	int sms_end_idx;
			
 
				 
			
 
				 	int stream_worker;
			
 
				-
			
 
				-	starpu_pthread_rwlock_t rwlock;
			
 
				-	starpu_pthread_t lock_write_owner;
			
 
				 };
			
 
				 
			
 
				 struct _starpu_machine_config;
			
@@ -212,10 +241,19 @@ void _starpu_worker_gets_out_of_ctx(unsigned sched_ctx_id, struct _starpu_worker
 
				 /* Check if the worker belongs to another sched_ctx */
			
 
				 unsigned _starpu_worker_belongs_to_a_sched_ctx(int workerid, unsigned sched_ctx_id);
			
 
				 
			
 
				+/* mutex synchronising several simultaneous modifications of a context */
			
 
				+starpu_pthread_rwlock_t* _starpu_sched_ctx_get_changing_ctx_mutex(unsigned sched_ctx_id);
			
 
				+
			
 
				 /* indicates whether this worker should go to sleep or not 
			
 
				    (if it is the last one awake in a context he should better keep awake) */
			
 
				 unsigned _starpu_sched_ctx_last_worker_awake(struct _starpu_worker *worker);
			
 
				 
			
 
				+/* let the appl know that the worker blocked to execute parallel code */
			
 
				+void _starpu_sched_ctx_signal_worker_blocked(unsigned sched_ctx_id, int workerid);
			
 
				+
			
 
				+/* let the appl know that the worker woke up */
			
 
				+void _starpu_sched_ctx_signal_worker_woke_up(unsigned sched_ctx_id, int workerid);
			
 
				+
			
 
				 /* If starpu_sched_ctx_set_context() has been called, returns the context
			
 
				  * id set by its last call, or the id of the initial context */
			
 
				 unsigned _starpu_sched_ctx_get_current_context();
			
@@ -240,43 +278,4 @@ struct _starpu_sched_ctx *__starpu_sched_ctx_get_sched_ctx_for_worker_and_job(st
 
				 #define _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(w,j) \
			
 
				 	(_starpu_get_nsched_ctxs() <= 1 ? _starpu_get_sched_ctx_struct(0) : __starpu_sched_ctx_get_sched_ctx_for_worker_and_job((w),(j)))
			
 
				 
			
 
				-static inline struct _starpu_sched_ctx *_starpu_get_sched_ctx_struct(unsigned id);
			
 
				-
			
 
				-static inline int _starpu_sched_ctx_check_write_locked(unsigned sched_ctx_id)
			
 
				-{
			
 
				-	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
			
 
				-	return sched_ctx->lock_write_owner == pthread_self();
			
 
				-}
			
 
				-#define STARPU_SCHED_CTX_CHECK_LOCK(sched_ctx_id) STARPU_ASSERT(_starpu_sched_ctx_check_write_locked((sched_ctx_id)))
			
 
				-
			
 
				-static inline void _starpu_sched_ctx_lock_write(unsigned sched_ctx_id)
			
 
				-{
			
 
				-	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
			
 
				-	STARPU_ASSERT(sched_ctx->lock_write_owner != pthread_self());
			
 
				-	STARPU_PTHREAD_RWLOCK_WRLOCK(&sched_ctx->rwlock);
			
 
				-	sched_ctx->lock_write_owner = pthread_self();
			
 
				-}
			
 
				-
			
 
				-static inline void _starpu_sched_ctx_unlock_write(unsigned sched_ctx_id)
			
 
				-{
			
 
				-	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
			
 
				-	STARPU_ASSERT(sched_ctx->lock_write_owner == pthread_self());
			
 
				-	sched_ctx->lock_write_owner = 0;
			
 
				-	STARPU_PTHREAD_RWLOCK_UNLOCK(&sched_ctx->rwlock);
			
 
				-}
			
 
				-
			
 
				-static inline void _starpu_sched_ctx_lock_read(unsigned sched_ctx_id)
			
 
				-{
			
 
				-	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
			
 
				-	STARPU_ASSERT(sched_ctx->lock_write_owner != pthread_self());
			
 
				-	STARPU_PTHREAD_RWLOCK_RDLOCK(&sched_ctx->rwlock);
			
 
				-}
			
 
				-
			
 
				-static inline void _starpu_sched_ctx_unlock_read(unsigned sched_ctx_id)
			
 
				-{
			
 
				-	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
			
 
				-	STARPU_ASSERT(sched_ctx->lock_write_owner != pthread_self());
			
 
				-	STARPU_PTHREAD_RWLOCK_UNLOCK(&sched_ctx->rwlock);
			
 
				-}
			
 
				-
			
 
				 #endif // __SCHED_CONTEXT_H__
			
--- a/src/core/sched_policy.c
+++ b/src/core/sched_policy.c
@@ -31,15 +31,17 @@ static double idle[STARPU_NMAXWORKERS];
 
				 static double idle_start[STARPU_NMAXWORKERS];
			
 
				 
			
 
				 long _starpu_task_break_on_push = -1;
			
 
				-long _starpu_task_break_on_pop = -1;
			
 
				 long _starpu_task_break_on_sched = -1;
			
 
				+long _starpu_task_break_on_pop = -1;
			
 
				+long _starpu_task_break_on_exec = -1;
			
 
				 static const char *starpu_idle_file;
			
 
				 
			
 
				 void _starpu_sched_init(void) /* Load the scheduler debugging settings from the environment into the file-level variables. */
			
 
				 {
			
 
				 	_starpu_task_break_on_push = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_PUSH", -1); /* -1 is the default when the variable is unset */
			
 
				-	_starpu_task_break_on_pop = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_POP", -1);
			
 
				 	_starpu_task_break_on_sched = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_SCHED", -1);
			
 
				+	_starpu_task_break_on_pop = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_POP", -1);
			
 
				+	_starpu_task_break_on_exec = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_EXEC", -1);
			
 
				 	starpu_idle_file = starpu_getenv("STARPU_IDLE_FILE"); /* stored as returned by starpu_getenv() */
			
 
				 }
			
 
				 
			
@@ -431,9 +433,9 @@ int _starpu_repush_task(struct _starpu_job *j)
 
				 
			
 
				 		if(nworkers == 0)
			
 
				 		{
			
 
				-			_starpu_sched_ctx_lock_write(sched_ctx->id);
			
 
				+			STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->empty_ctx_mutex);
			
 
				 			starpu_task_list_push_front(&sched_ctx->empty_ctx_tasks, task);
			
 
				-			_starpu_sched_ctx_unlock_write(sched_ctx->id);
			
 
				+			STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->empty_ctx_mutex);
			
 
				 #ifdef STARPU_USE_SC_HYPERVISOR
			
 
				 			if(sched_ctx->id != 0 && sched_ctx->perf_counters != NULL
			
 
				 			   && sched_ctx->perf_counters->notify_empty_ctx)
			
@@ -497,9 +499,9 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 
				 
			
 
				 		if (nworkers == 0)
			
 
				 		{
			
 
				-			_starpu_sched_ctx_lock_write(sched_ctx->id);
			
 
				+			STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->empty_ctx_mutex);
			
 
				 			starpu_task_list_push_back(&sched_ctx->empty_ctx_tasks, task);
			
 
				-			_starpu_sched_ctx_unlock_write(sched_ctx->id);
			
 
				+			STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->empty_ctx_mutex);
			
 
				 #ifdef STARPU_USE_SC_HYPERVISOR
			
 
				 			if(sched_ctx->id != 0 && sched_ctx->perf_counters != NULL
			
 
				 			   && sched_ctx->perf_counters->notify_empty_ctx)
			
@@ -589,6 +591,8 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 
				 		{
			
 
				 			STARPU_ASSERT(sched_ctx->sched_policy->push_task);
			
 
				 			/* check out if there are any workers in the context */
			
 
				+			starpu_pthread_rwlock_t *changing_ctx_mutex = _starpu_sched_ctx_get_changing_ctx_mutex(sched_ctx->id);
			
 
				+			STARPU_PTHREAD_RWLOCK_RDLOCK(changing_ctx_mutex);
			
 
				 			nworkers = starpu_sched_ctx_get_nworkers(sched_ctx->id);
			
 
				 			if (nworkers == 0)
			
 
				 				ret = -1;
			
@@ -599,6 +603,7 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 
				 				ret = sched_ctx->sched_policy->push_task(task);
			
 
				 				_STARPU_SCHED_END;
			
 
				 			}
			
 
				+			STARPU_PTHREAD_RWLOCK_UNLOCK(changing_ctx_mutex);
			
 
				 		}
			
 
				 
			
 
				 		if(ret == -1)
			
--- a/src/core/sched_policy.h
+++ b/src/core/sched_policy.h
@@ -28,7 +28,7 @@
 
				 
			
 
				 #define _STARPU_SCHED_BEGIN \
			
 
				 	_STARPU_TRACE_WORKER_SCHEDULING_PUSH;	\
			
 
				-	_SIMGRID_TIMER_BEGIN
			
 
				+	_SIMGRID_TIMER_BEGIN(_starpu_simgrid_sched_cost())
			
 
				 #define _STARPU_SCHED_END \
			
 
				 	_SIMGRID_TIMER_END;			\
			
 
				 	_STARPU_TRACE_WORKER_SCHEDULING_POP
			
@@ -103,8 +103,9 @@ extern struct starpu_sched_policy _starpu_sched_modular_heft2_policy;
 
				 extern struct starpu_sched_policy _starpu_sched_graph_test_policy;
			
 
				 
			
 
				 extern long _starpu_task_break_on_push;
			
 
				-extern long _starpu_task_break_on_pop;
			
 
				 extern long _starpu_task_break_on_sched;
			
 
				+extern long _starpu_task_break_on_pop;
			
 
				+extern long _starpu_task_break_on_exec;
			
 
				 
			
 
				 #ifdef SIGTRAP
			
 
				 #define _STARPU_TASK_BREAK_ON(task, what) do { \
			
--- a/src/core/simgrid.h
+++ b/src/core/simgrid.h
@@ -69,7 +69,7 @@ starpu_pthread_queue_t _starpu_simgrid_task_queue[STARPU_NMAXWORKERS];
 
				 #define _starpu_simgrid_queue_malloc_cost() starpu_get_env_number_default("STARPU_SIMGRID_QUEUE_MALLOC_COST", 1)
			
 
				 #define _starpu_simgrid_task_submit_cost() starpu_get_env_number_default("STARPU_SIMGRID_TASK_SUBMIT_COST", 1)
			
 
				 #define _starpu_simgrid_fetching_input_cost() starpu_get_env_number_default("STARPU_SIMGRID_FETCHING_INPUT_COST", 1)
			
 
				-#define _starpu_simgrid_sched_cost() starpu_get_env_number_default("STARPU_SIMGRID_SCHED_COST", 1)
			
 
				+#define _starpu_simgrid_sched_cost() starpu_get_env_number_default("STARPU_SIMGRID_SCHED_COST", 0)
			
 
				 
			
 
				 /* Called at initialization to count how many GPUs are interfering with each
			
 
				  * bus */
			
@@ -78,10 +78,10 @@ void _starpu_simgrid_count_ngpus(void);
 
				 void _starpu_simgrid_xbt_thread_create(const char *name, void_f_pvoid_t code,
			
 
				 				       void *param);
			
 
				 
			
 
				-#define _SIMGRID_TIMER_BEGIN		\
			
 
				+#define _SIMGRID_TIMER_BEGIN(cond)			\
			
 
				 	{		\
			
 
				 		xbt_os_timer_t __timer = NULL;		\
			
 
				-		if (_starpu_simgrid_sched_cost()) {		\
			
 
				+		if (cond) {		\
			
 
				 		  __timer = xbt_os_timer_new();		\
			
 
				 		  xbt_os_threadtimer_start(__timer);	\
			
 
				 		}
			
@@ -94,7 +94,7 @@ void _starpu_simgrid_xbt_thread_create(const char *name, void_f_pvoid_t code,
 
				 	}
			
 
				 
			
 
				 #else // !STARPU_SIMGRID
			
 
				-#define _SIMGRID_TIMER_BEGIN {
			
 
				+#define _SIMGRID_TIMER_BEGIN(cond) {
			
 
				 #define _SIMGRID_TIMER_END }
			
 
				 #endif
			
 
				 
			
--- a/src/core/topology.c
+++ b/src/core/topology.c
@@ -1717,7 +1717,7 @@ _starpu_bind_thread_on_cpu (
 
				 	CPU_ZERO(&aff_mask);
			
 
				 	CPU_SET(cpuid, &aff_mask);
			
 
				 
			
 
				-	starpu_pthread_t self = pthread_self();
			
 
				+	starpu_pthread_t self = starpu_pthread_self();
			
 
				 
			
 
				 	ret = pthread_setaffinity_np(self, sizeof(aff_mask), &aff_mask);
			
 
				 	if (ret)
			
@@ -2186,8 +2186,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 
				 				_starpu_memory_node_add_nworkers(memory_node);
			
 
				 
			
 
				                                 _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
			
 
				-				if (memory_node != STARPU_MAIN_RAM)
			
 
				-					_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
			
 
				+				_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
			
 
				 				break;
			
 
				 #endif
			
 
				 
			
@@ -2226,8 +2225,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 
				 				_starpu_memory_node_add_nworkers(memory_node);
			
 
				 
			
 
				                                 _starpu_worker_drives_memory_node(workerarg, STARPU_MAIN_RAM);
			
 
				-				if (memory_node != STARPU_MAIN_RAM)
			
 
				-					_starpu_worker_drives_memory_node(workerarg, memory_node);
			
 
				+				_starpu_worker_drives_memory_node(workerarg, memory_node);
			
 
				 				break;
			
 
				 #endif
			
 
				 
			
@@ -2259,8 +2257,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 
				 				_starpu_memory_node_add_nworkers(memory_node);
			
 
				 
			
 
				                                 _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
			
 
				-				if (memory_node != STARPU_MAIN_RAM)
			
 
				-					_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
			
 
				+				_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
			
 
				 				break;
			
 
				 #endif /* STARPU_USE_MIC */
			
 
				 
			
@@ -2275,8 +2272,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 
				 				_starpu_memory_node_add_nworkers(memory_node);
			
 
				 
			
 
				                                 _starpu_worker_drives_memory_node(workerarg, STARPU_MAIN_RAM);
			
 
				-				if (memory_node != STARPU_MAIN_RAM)
			
 
				-					_starpu_worker_drives_memory_node(workerarg, memory_node);
			
 
				+				_starpu_worker_drives_memory_node(workerarg, memory_node);
			
 
				 			}
			
 
				 				break;
			
 
				 #endif /* STARPU_USE_SCC */
			
@@ -2298,8 +2294,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 
				 
			
 
				 				}
			
 
				                                 _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
			
 
				-				if (memory_node != STARPU_MAIN_RAM)
			
 
				-					_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
			
 
				+				_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
			
 
				 #ifndef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
			
 
				                                 /* MPI driver thread can manage all slave memories if we disable the MPI multiple thread */
			
 
				                                 unsigned findworker;
			
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -576,15 +576,10 @@ static void _starpu_worker_init(struct _starpu_worker *workerarg, struct _starpu
 
				 	workerarg->reverse_phase[0] = 0;
			
 
				 	workerarg->reverse_phase[1] = 0;
			
 
				 	workerarg->pop_ctx_priority = 1;
			
 
				-	workerarg->sched_mutex_depth = 0;
			
 
				+	workerarg->sched_mutex_locked = 0;
			
 
				+	workerarg->blocked = 0;
			
 
				 	workerarg->is_slave_somewhere = 0;
			
 
				 
			
 
				-	workerarg->state_sched_op_pending = 0;
			
 
				-	workerarg->state_changing_ctx_waiting = 0;
			
 
				-	workerarg->state_blocked = 0;
			
 
				-	workerarg->state_wait_ack__blocked = 0;
			
 
				-	workerarg->state_wait_handshake__blocked = 0;
			
 
				-
			
 
				 	/* cpu_set/hwloc_cpu_set initialized in topology.c */
			
 
				 }
			
 
				 
			
@@ -1417,14 +1412,12 @@ static void _starpu_terminate_workers(struct _starpu_machine_config *pconfig)
 
				 		struct _starpu_worker *worker = &pconfig->workers[workerid];
			
 
				 
			
 
				 		/* in case StarPU termination code is called from a callback,
			
 
				- 		 * we have to check if pthread_self() is the worker itself */
			
 
				+ 		 * we have to check if starpu_pthread_self() is the worker itself */
			
 
				 		if (set && set->nworkers > 0)
			
 
				 		{
			
 
				 			if (set->started)
			
 
				 			{
			
 
				-#ifndef STARPU_SIMGRID
			
 
				-				if (!pthread_equal(pthread_self(), set->worker_thread))
			
 
				-#endif
			
 
				+				if (!starpu_pthread_equal(starpu_pthread_self(), set->worker_thread))
			
 
				 					status = starpu_pthread_join(set->worker_thread, NULL);
			
 
				 				if (status)
			
 
				 				{
			
@@ -1440,9 +1433,7 @@ static void _starpu_terminate_workers(struct _starpu_machine_config *pconfig)
 
				 			if (!worker->run_by_starpu)
			
 
				 				goto out;
			
 
				 
			
 
				-#ifndef STARPU_SIMGRID
			
 
				-			if (!pthread_equal(pthread_self(), worker->worker_thread))
			
 
				-#endif
			
 
				+			if (!starpu_pthread_equal(starpu_pthread_self(), worker->worker_thread))
			
 
				 				status = starpu_pthread_join(worker->worker_thread, NULL);
			
 
				 			if (status)
			
 
				 			{
			
@@ -1697,7 +1688,7 @@ unsigned starpu_worker_get_count(void)
 
				 
			
 
				 unsigned starpu_worker_is_blocked(int workerid) /* Return the `blocked` field of worker `workerid`. */
			
 
				 {
			
 
				-	return (unsigned)_starpu_config.workers[workerid].state_blocked;
			
 
				+	return _starpu_config.workers[workerid].blocked; /* workerid is used as an index without any range check here */
			
 
				 }
			
 
				 
			
 
				 unsigned starpu_worker_is_slave_somewhere(int workerid)
			
@@ -2057,7 +2048,7 @@ void starpu_worker_get_sched_condition(int workerid, starpu_pthread_mutex_t **sc
 
				 	*sched_mutex = &_starpu_config.workers[workerid].sched_mutex;
			
 
				 }
			
 
				 
			
 
				-static int starpu_wakeup_worker_locked(int workerid, starpu_pthread_cond_t *sched_cond, starpu_pthread_mutex_t *mutex STARPU_ATTRIBUTE_UNUSED)
			
 
				+int starpu_wakeup_worker_locked(int workerid, starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 #ifdef STARPU_SIMGRID
			
 
				 	starpu_pthread_queue_broadcast(&_starpu_simgrid_task_queue[workerid]);
			
@@ -2065,19 +2056,17 @@ static int starpu_wakeup_worker_locked(int workerid, starpu_pthread_cond_t *sche
 
				 	if (_starpu_config.workers[workerid].status == STATUS_SLEEPING)
			
 
				 	{
			
 
				 		_starpu_config.workers[workerid].status = STATUS_WAKING_UP;
			
 
				-		/* cond_broadcast is required over cond_signal since
			
 
				-		 * the condition is share for multiple purpose */
			
 
				-		STARPU_PTHREAD_COND_BROADCAST(sched_cond);
			
 
				+		STARPU_PTHREAD_COND_SIGNAL(cond);
			
 
				 		return 1;
			
 
				 	}
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static int starpu_wakeup_worker(int workerid, starpu_pthread_cond_t *sched_cond, starpu_pthread_mutex_t *mutex)
			
 
				+int starpu_wakeup_worker(int workerid, starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex) /* Run starpu_wakeup_worker_locked() for workerid while holding mutex; returns that call's result. */
			
 
				 {
			
 
				 	int success;
			
 
				 	STARPU_PTHREAD_MUTEX_LOCK_SCHED(mutex); /* mutex is held only for the duration of the wake-up attempt */
			
 
				-	success = starpu_wakeup_worker_locked(workerid, sched_cond, mutex);
			
 
				+	success = starpu_wakeup_worker_locked(workerid, cond, mutex);
			
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(mutex);
			
 
				 	return success;
			
 
				 }
			
@@ -2171,6 +2160,33 @@ void starpu_get_version(int *major, int *minor, int *release)
 
				 	*release = STARPU_RELEASE_VERSION;
			
 
				 }
			
 
				 
			
 
				+void _starpu_unlock_mutex_if_prev_locked() /* If the caller is a worker whose sched_mutex_locked flag is set, unlock its sched_mutex. */
			
 
				+{
			
 
				+	int workerid = starpu_worker_get_id();
			
 
				+	if(workerid != -1) /* -1 is treated as "caller is not a worker": nothing to do */
			
 
				+	{
			
 
				+		struct _starpu_worker *w = _starpu_get_worker_struct(workerid);
			
 
				+		if(w->sched_mutex_locked)
			
 
				+		{
			
 
				+			STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&w->sched_mutex);
			
 
				+			_starpu_worker_set_flag_sched_mutex_locked(workerid, 1); /* flag is set back to 1 after the unlock -- presumably so _starpu_relock_mutex_if_prev_locked() knows it must re-take the mutex; confirm */
			
 
				+		}
			
 
				+	}
			
 
				+	return;
			
 
				+}
			
 
				+
			
 
				+void _starpu_relock_mutex_if_prev_locked() /* If the caller is a worker whose sched_mutex_locked flag is set, lock its sched_mutex. */
			
 
				+{
			
 
				+	int workerid = starpu_worker_get_id();
			
 
				+	if(workerid != -1) /* -1 is treated as "caller is not a worker": nothing to do */
			
 
				+	{
			
 
				+		struct _starpu_worker *w = _starpu_get_worker_struct(workerid);
			
 
				+		if(w->sched_mutex_locked) /* the flag is read without holding sched_mutex at this point */
			
 
				+			STARPU_PTHREAD_MUTEX_LOCK_SCHED(&w->sched_mutex);
			
 
				+	}
			
 
				+	return;
			
 
				+}
			
 
				+
			
 
				 unsigned starpu_worker_get_sched_ctx_list(int workerid, unsigned **sched_ctxs)
			
 
				 {
			
 
				 	unsigned s = 0;
			
--- a/src/core/workers.h
+++ b/src/core/workers.h
@@ -20,6 +20,8 @@
 
				 #ifndef __WORKERS_H__
			
 
				 #define __WORKERS_H__
			
 
				 
			
 
				+#include <limits.h>
			
 
				+
			
 
				 #include <starpu.h>
			
 
				 #include <common/config.h>
			
 
				 #include <common/timing.h>
			
@@ -83,11 +85,6 @@ LIST_TYPE(_starpu_worker,
 
				 	unsigned numa_memory_node; /* which numa memory node is the worker associated with? (logical index) */
			
 
				 	starpu_pthread_cond_t sched_cond; /* condition variable used when the worker waits for tasks. */
			
 
				         starpu_pthread_mutex_t sched_mutex; /* mutex protecting sched_cond */
			
 
				-	int state_sched_op_pending:1; /* a task pop is ongoing even though sched_mutex may temporarily be unlocked */
			
 
				-	int state_changing_ctx_waiting:1; /* a thread is waiting for transient operations such as pop to complete before acquiring sched_mutex and modifying the worker ctx*/
			
 
				-	int state_blocked:1;
			
 
				-	int state_wait_ack__blocked:1;
			
 
				-	int state_wait_handshake__blocked:1;
			
 
				 	struct starpu_task_list local_tasks; /* this queue contains tasks that have been explicitely submitted to that queue */
			
 
				 	struct starpu_task **local_ordered_tasks; /* this queue contains tasks that have been explicitely submitted to that queue with an explicit order */
			
 
				 	unsigned local_ordered_tasks_size; /* this records the size of local_ordered_tasks */
			
@@ -141,8 +138,11 @@ LIST_TYPE(_starpu_worker,
 
				 	/* indicate which priority of ctx is currently active: the values are 0 or 1*/
			
 
				 	unsigned pop_ctx_priority;
			
 
				 
			
 
				-	/* sched mutex local worker locking depth */
			
 
				-	unsigned sched_mutex_depth;
			
 
				+	/* flag to know if sched_mutex is locked or not */
			
 
				+	unsigned sched_mutex_locked;
			
 
				+
			
 
				+	/* bool to indicate if the worker is blocked in a ctx */
			
 
				+	unsigned blocked;
			
 
				 
			
 
				 	/* bool to indicate if the worker is slave in a ctx */
			
 
				 	unsigned is_slave_somewhere;
			
@@ -509,7 +509,7 @@ static inline struct _starpu_worker *_starpu_get_worker_struct(unsigned id)
 
				 	return &_starpu_config.workers[id];
			
 
				 }
			
 
				 
			
 
				-/* Returns the starpu_sched_ctx structure that describes the state of the 
			
 
				+/* Returns the starpu_sched_ctx structure that describes the state of the 
			
 
				  * specified ctx */
			
 
				 static inline struct _starpu_sched_ctx *_starpu_get_sched_ctx_struct(unsigned id)
			
 
				 {
			
@@ -559,6 +559,18 @@ int starpu_worker_get_nids_by_type(enum starpu_worker_archtype type, int *worker
 
				    the list might not be updated */
			
 
				 int starpu_worker_get_nids_ctx_free_by_type(enum starpu_worker_archtype type, int *workerids, int maxsize);
			
 
				 
			
 
				+/* if the current worker has the lock release it */
			
 
				+void _starpu_unlock_mutex_if_prev_locked();
			
 
				+
			
 
				+/* if we prev released the lock relock it */
			
 
				+void _starpu_relock_mutex_if_prev_locked();
			
 
				+
			
 
				+static inline void _starpu_worker_set_flag_sched_mutex_locked(int workerid, unsigned flag)
			
 
				+{
			
 
				+	struct _starpu_worker *w = _starpu_get_worker_struct(workerid);
			
 
				+	w->sched_mutex_locked = flag;
			
 
				+}
			
 
				+
			
 
				 static inline unsigned _starpu_worker_mutex_is_sched_mutex(int workerid, starpu_pthread_mutex_t *mutex)
			
 
				 {
			
 
				 	struct _starpu_worker *w = _starpu_get_worker_struct(workerid);
			
@@ -611,43 +623,4 @@ void _starpu_worker_set_stream_ctx(unsigned workerid, struct _starpu_sched_ctx *
 
				 
			
 
				 struct _starpu_sched_ctx* _starpu_worker_get_ctx_stream(unsigned stream_workerid);
			
 
				 
			
 
				-/* Must be called with worker's sched_mutex held.
			
 
				- * Mark the beginning of a scheduling operation during which the sched_mutex
			
 
				- * lock may be temporarily released, but the scheduling context of the worker
			
 
				- * should not be modified */
			
 
				-static inline void _starpu_worker_enter_transient_sched_op(struct _starpu_worker * const worker)
			
 
				-{
			
 
				-	worker->state_sched_op_pending = 1;
			
 
				-}
			
 
				-
			
 
				-/* Must be called with worker's sched_mutex held.
			
 
				- * Mark the end of a scheduling operation, and notify potential waiters that
			
 
				- * scheduling context changes can safely be performed again.
			
 
				- */
			
 
				-static inline void  _starpu_worker_leave_transient_sched_op(struct _starpu_worker * const worker)
			
 
				-{
			
 
				-	worker->state_sched_op_pending = 0;
			
 
				-	if (worker->state_changing_ctx_waiting)
			
 
				-		/* cond_broadcast is required over cond_signal since
			
 
				-		 * the condition is share for multiple purpose */
			
 
				-		STARPU_PTHREAD_COND_BROADCAST(&worker->sched_cond);
			
 
				-}
			
 
				-
			
 
				-/* Must be called with worker's sched_mutex held.
			
 
				- * Passively wait until state_sched_op_pending is cleared.
			
 
				- */
			
 
				-static inline void _starpu_worker_wait_for_transient_sched_op_completion(struct _starpu_worker * const worker)
			
 
				-{
			
 
				-	if (worker->state_sched_op_pending)
			
 
				-	{
			
 
				-		worker->state_changing_ctx_waiting = 1;
			
 
				-		do
			
 
				-		{
			
 
				-			STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
			
 
				-		}
			
 
				-		while (worker->state_sched_op_pending);
			
 
				-		worker->state_changing_ctx_waiting = 0;
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				 #endif // __WORKERS_H__
			
--- a/src/datawizard/malloc.c
+++ b/src/datawizard/malloc.c
@@ -32,6 +32,7 @@
 
				 #ifdef STARPU_SIMGRID
			
 
				 #include <sys/mman.h>
			
 
				 #include <fcntl.h>
			
 
				+#include <smpi/smpi.h>
			
 
				 #endif
			
 
				 
			
 
				 #ifndef O_BINARY
			
@@ -48,9 +49,12 @@ static int malloc_on_node_default_flags[STARPU_MAXNODES];
 
				 
			
 
				 /* This file is used for implementing "folded" allocation */
			
 
				 #ifdef STARPU_SIMGRID
			
 
				+#if SIMGRID_VERSION_MAJOR < 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR < 15)
			
 
				+/* TODO: drop when simgrid 3.15 is reasonably largely used by people who need the feature */
			
 
				 static int bogusfile = -1;
			
 
				 static unsigned long _starpu_malloc_simulation_fold;
			
 
				 #endif
			
 
				+#endif
			
 
				 
			
 
				 void starpu_malloc_set_align(size_t align)
			
 
				 {
			
@@ -230,6 +234,10 @@ int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int fl
 
				 #ifdef STARPU_SIMGRID
			
 
				 	if (flags & STARPU_MALLOC_SIMULATION_FOLDED)
			
 
				 	{
			
 
				+#if SIMGRID_VERSION_MAJOR > 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR >= 15)
			
 
				+		*A = SMPI_SHARED_MALLOC(dim);
			
 
				+#else
			
 
				+		/* TODO: drop when simgrid 3.15 is reasonably largely used by people who need the feature */
			
 
				 		/* Use "folded" allocation: the same file is mapped several
			
 
				 		 * times contiguously, to get a memory area one can read/write,
			
 
				 		 * without consuming memory */
			
@@ -282,6 +290,7 @@ int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int fl
 
				 			}
			
 
				 			*A = buf;
			
 
				 		}
			
 
				+#endif
			
 
				 	}
			
 
				 	else
			
 
				 #endif
			
@@ -465,7 +474,12 @@ int _starpu_free_flags_on_node(unsigned dst_node, void *A, size_t dim, int flags
 
				 #ifdef STARPU_SIMGRID
			
 
				 	if (flags & STARPU_MALLOC_SIMULATION_FOLDED)
			
 
				 	{
			
 
				+#if SIMGRID_VERSION_MAJOR > 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR >= 15)
			
 
				+		SMPI_SHARED_FREE(A);
			
 
				+#else
			
 
				+		/* TODO: drop when simgrid 3.15 is reasonably largely used by people who need the feature */
			
 
				 		munmap(A, dim);
			
 
				+#endif
			
 
				 	}
			
 
				 	else
			
 
				 #endif
			
@@ -840,9 +854,11 @@ _starpu_malloc_init(unsigned dst_node)
 
				 	disable_pinning = starpu_get_env_number("STARPU_DISABLE_PINNING");
			
 
				 	malloc_on_node_default_flags[dst_node] = STARPU_MALLOC_PINNED | STARPU_MALLOC_COUNT;
			
 
				 #ifdef STARPU_SIMGRID
			
 
				+#if SIMGRID_VERSION_MAJOR < 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR < 15)
			
 
				 	/* Reasonably "costless" */
			
 
				 	_starpu_malloc_simulation_fold = starpu_get_env_number_default("STARPU_MALLOC_SIMULATION_FOLD", 1) << 20;
			
 
				 #endif
			
 
				+#endif
			
 
				 }
			
 
				 
			
 
				 void
			
--- a/src/datawizard/memory_nodes.c
+++ b/src/datawizard/memory_nodes.c
@@ -185,9 +185,12 @@ unsigned starpu_worker_get_memory_node(unsigned workerid)
 
				 /* same utility as _starpu_memory_node_add_nworkers */
			
 
				 void _starpu_worker_drives_memory_node(struct _starpu_worker *worker, unsigned memnode)
			
 
				 {
			
 
				-	_starpu_worker_drives_memory[worker->workerid][memnode] = 1;
			
 
				+	if (! _starpu_worker_drives_memory[worker->workerid][memnode])
			
 
				+	{
			
 
				+		_starpu_worker_drives_memory[worker->workerid][memnode] = 1;
			
 
				 #ifdef STARPU_SIMGRID
			
 
				-	starpu_pthread_queue_register(&worker->wait, &_starpu_simgrid_transfer_queue[memnode]);
			
 
				+		starpu_pthread_queue_register(&worker->wait, &_starpu_simgrid_transfer_queue[memnode]);
			
 
				 #endif
			
 
				+	}
			
 
				 }
			
 
				 
			
--- a/src/debug/traces/starpu_fxt.c
+++ b/src/debug/traces/starpu_fxt.c
@@ -157,6 +157,8 @@ static void task_dump(struct task_info *task)
 
				 
			
 
				 	if (task->exclude_from_dag)
			
 
				 		goto out;
			
 
				+	if (!tasks_file)
			
 
				+		goto out;
			
 
				 
			
 
				 	if (task->name)
			
 
				 	{
			
@@ -274,6 +276,8 @@ static struct data_info *get_data(unsigned long handle, int mpi_rank)
 
				 
			
 
				 static void data_dump(struct data_info *data)
			
 
				 {
			
 
				+	if (!data_file)
			
 
				+		goto out;
			
 
				 	fprintf(data_file, "Handle: %lx\n", data->handle);
			
 
				 	fprintf(data_file, "MPIRank: %d\n", data->mpi_rank);
			
 
				 	if (data->name)
			
@@ -291,6 +295,7 @@ static void data_dump(struct data_info *data)
 
				 	}
			
 
				 	fprintf(data_file, "MPIOwner: %d\n", data->mpi_owner);
			
 
				 	fprintf(data_file, "\n");
			
 
				+out:
			
 
				 	HASH_DEL(data_info, data);
			
 
				 	free(data);
			
 
				 }
			
@@ -2388,8 +2393,8 @@ static void handle_task_done(struct fxt_ev_64 *ev, struct starpu_fxt_options *op
 
				 	unsigned exclude_from_dag = ev->param[2];
			
 
				 	struct task_info *task = get_task(job_id, options->file_rank);
			
 
				 	task->exclude_from_dag = exclude_from_dag;
			
 
				-	if (tasks_file)
			
 
				-		task_dump(task);
			
 
				+
			
 
				+	task_dump(task);
			
 
				 
			
 
				 	if (!exclude_from_dag)
			
 
				 		_starpu_fxt_dag_set_task_done(options->file_prefix, job_id, name, colour);
			
@@ -2698,9 +2703,8 @@ static void handle_task_wait_for_all(void)
 
				 	_starpu_fxt_dag_add_sync_point();
			
 
				 }
			
 
				 
			
 
				-static void handle_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
			
 
				+static void handle_string_event(struct fxt_ev_64 *ev, const char *event, struct starpu_fxt_options *options)
			
 
				 {
			
 
				-	char *event = get_fxt_string(ev, 0);
			
 
				 	/* Add an event in the trace */
			
 
				 	if (out_paje_file)
			
 
				 	{
			
@@ -2717,6 +2721,12 @@ static void handle_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *option
 
				 		recfmt_dump_state(get_event_time_stamp(ev, options), "ProgEvent", -1, 0, event, "Program");
			
 
				 }
			
 
				 
			
 
				+static void handle_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
			
 
				+{
			
 
				+	char *event = get_fxt_string(ev, 0);
			
 
				+	handle_string_event(ev, event, options);
			
 
				+}
			
 
				+
			
 
				 static void handle_thread_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
			
 
				 {
			
 
				 	/* Add an event in the trace */
			
@@ -3418,6 +3428,13 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 
				 				fut_keymask = ev.param[0];
			
 
				 				break;
			
 
				 
			
 
				+			case FUT_START_FLUSH_CODE:
			
 
				+				handle_string_event(&ev, "fxt_start_flush", options);
			
 
				+				break;
			
 
				+			case FUT_STOP_FLUSH_CODE:
			
 
				+				handle_string_event(&ev, "fxt_stop_flush", options);
			
 
				+				break;
			
 
				+
			
 
				 			/* We can safely ignore FUT internal events */
			
 
				 			case FUT_CALIBRATE0_CODE:
			
 
				 			case FUT_CALIBRATE1_CODE:
			
@@ -3467,7 +3484,6 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 
				 #endif
			
 
				 	}
			
 
				 
			
 
				-	if (data_file)
			
 
				 	{
			
 
				 		/* TODO: move to handle_data_unregister */
			
 
				 		struct data_info *data, *tmp;
			
--- a/src/debug/traces/starpu_paje.c
+++ b/src/debug/traces/starpu_paje.c
@@ -193,7 +193,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 
				 	poti_DefineEntityValue("E", "S", "Executing", ".0 .6 .5");
			
 
				 	poti_DefineEntityValue("Sc", "S", "Scheduling", ".7 .36 .0");
			
 
				 	poti_DefineEntityValue("Sl", "S", "Sleeping", ".9 .1 .0");
			
 
				-	poti_DefineEntityValue("P", "S", "Progressing", ".4 .1 .6");
			
 
				+	poti_DefineEntityValue("P", "S", "Progressing", ".1 .3 .1");
			
 
				 	poti_DefineEntityValue("U", "S", "Unpartitioning", ".0 .0 1.0");
			
 
				 	poti_DefineEntityValue("H", "S", "Hypervisor", ".5 .18 .0");
			
 
				 	poti_DefineEntityValue("Bu", "S", "Building task", ".5 .18 .0");
			
@@ -213,7 +213,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 
				 	poti_DefineEntityValue("E", "WS", "Executing", ".0 .6 .5");
			
 
				 	poti_DefineEntityValue("Sc", "WS", "Scheduling", ".7 .36 .0");
			
 
				 	poti_DefineEntityValue("Sl", "WS", "Sleeping", ".9 .1 .0");
			
 
				-	poti_DefineEntityValue("P", "WS", "Progressing", ".4 .1 .6");
			
 
				+	poti_DefineEntityValue("P", "WS", "Progressing", ".1 .3 .1");
			
 
				 	poti_DefineEntityValue("U", "WS", "Unpartitioning", ".0 .0 1.0");
			
 
				 	poti_DefineEntityValue("H", "WS", "Hypervisor", ".5 .18 .0");
			
 
				 	poti_DefineEntityValue("Bu", "WS", "Building task", ".5 .18 .0");
			
@@ -268,7 +268,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 
				 		poti_DefineEntityValue("E", ctx, "Executing", ".0 .6 .5");
			
 
				 		poti_DefineEntityValue("Sc", ctx, "Scheduling", ".7 .36 .0");
			
 
				 		poti_DefineEntityValue("Sl", ctx, "Sleeping", ".9 .1 .0");
			
 
				-		poti_DefineEntityValue("P", ctx, "Progressing", ".4 .1 .6");
			
 
				+		poti_DefineEntityValue("P", ctx, "Progressing", ".1 .3 .1");
			
 
				 		poti_DefineEntityValue("U", ctx, "Unpartitioning", ".0 .0 1.0");
			
 
				 		poti_DefineEntityValue("H", ctx, "Hypervisor", ".5 .18 .0");
			
 
				 	}
			
@@ -331,7 +331,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 
				 6       E       S       Executing         \".0 .6 .5\"		\n\
			
 
				 6       Sc       S      Scheduling         \".7 .36 .0\"		\n\
			
 
				 6       Sl       S      Sleeping         \".9 .1 .0\"		\n\
			
 
				-6       P       S       Progressing         \".4 .1 .6\"		\n\
			
 
				+6       P       S       Progressing         \".1 .3 .1\"		\n\
			
 
				 6       U       S       Unpartitioning      \".0 .0 1.0\"		\n\
			
 
				 6       H       S       Hypervisor      \".5 .18 .0\"		\n\
			
 
				 6       Bu      S       \"Building task\"   \".5 .18 .0\"		\n\
			
@@ -351,7 +351,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 
				 6       E       WS       Executing         \".0 .6 .5\"		\n\
			
 
				 6       Sc       WS      Scheduling         \".7 .36 .0\"		\n\
			
 
				 6       Sl       WS      Sleeping         \".9 .1 .0\"		\n\
			
 
				-6       P       WS       Progressing         \".4 .1 .6\"		\n\
			
 
				+6       P       WS       Progressing         \".1 .3 .1\"		\n\
			
 
				 6       U       WS       Unpartitioning      \".0 .0 1.0\"		\n\
			
 
				 6       H       WS       Hypervisor      \".5 .18 .0\"		\n\
			
 
				 6       Bu      WS       \"Building task\"   \".5 .18 .0\"		\n\
			
@@ -394,7 +394,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 
				 6       E       Ctx%u       Executing         \".0 .6 .5\"		\n\
			
 
				 6       Sc       Ctx%u      Scheduling         \".7 .36 .0\"		\n\
			
 
				 6       Sl       Ctx%u      Sleeping         \".9 .1 .0\"		\n\
			
 
				-6       P       Ctx%u       Progressing         \".4 .1 .6\"		\n\
			
 
				+6       P       Ctx%u       Progressing         \".1 .3 .1\"		\n\
			
 
				 6       U       Ctx%u       Unpartitioning         \".0 .0 1.0\"	\n\
			
 
				 6       H       Ctx%u       Hypervisor         \".5 .18 .0\"		\n",
			
 
				 		i, i, i, i, i, i, i, i, i, i, i, i, i);
			
--- a/src/drivers/cpu/driver_cpu.c
+++ b/src/drivers/cpu/driver_cpu.c
@@ -89,6 +89,12 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 
				 #ifdef STARPU_SIMGRID
			
 
				 			if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE)
			
 
				 				func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
			
 
				+			else if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT)
			
 
				+			{
			
 
				+				_SIMGRID_TIMER_BEGIN(1);
			
 
				+				func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
			
 
				+				_SIMGRID_TIMER_END;
			
 
				+			}
			
 
				 			else
			
 
				 				_starpu_simgrid_submit_job(cpu_args->workerid, j, perf_arch, NAN, NULL);
			
 
				 #else
			
@@ -410,13 +416,13 @@ void *_starpu_cpu_worker(void *arg)
 
				 	struct _starpu_worker *worker = arg;
			
 
				 
			
 
				 	_starpu_cpu_driver_init(worker);
			
 
				-	_STARPU_TRACE_END_PROGRESS(worker->memory_node);
			
 
				+	_STARPU_TRACE_START_PROGRESS(worker->memory_node);
			
 
				 	while (_starpu_machine_is_running())
			
 
				 	{
			
 
				 		_starpu_may_pause();
			
 
				 		_starpu_cpu_driver_run_once(worker);
			
 
				 	}
			
 
				-	_STARPU_TRACE_START_PROGRESS(worker->memory_node);
			
 
				+	_STARPU_TRACE_END_PROGRESS(worker->memory_node);
			
 
				 	_starpu_cpu_driver_deinit(worker);
			
 
				 
			
 
				 	return NULL;
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -507,6 +507,12 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worke
 
				 		unsigned workerid = worker->workerid;
			
 
				 		if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE && !async)
			
 
				 			func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
			
 
				+		else if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT && !async)
			
 
				+			{
			
 
				+				_SIMGRID_TIMER_BEGIN(1);
			
 
				+				func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
			
 
				+				_SIMGRID_TIMER_END;
			
 
				+			}
			
 
				 		else
			
 
				 			_starpu_simgrid_submit_job(workerid, j, &worker->perf_arch, NAN,
			
 
				 				async ? &task_finished[workerid][pipeline_idx] : NULL);
			
@@ -763,6 +769,7 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 
				 		task = worker->task_transferring;
			
 
				 		if (task && worker->nb_buffers_transferred == worker->nb_buffers_totransfer)
			
 
				 		{
			
 
				+			_STARPU_TRACE_END_PROGRESS(memnode);
			
 
				 			j = _starpu_get_job_associated_to_task(task);
			
 
				 
			
 
				 			_starpu_set_local_worker_key(worker);
			
@@ -779,10 +786,9 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 
				 			}
			
 
				 			else
			
 
				 			{
			
 
				-				_STARPU_TRACE_END_PROGRESS(memnode);
			
 
				 				execute_job_on_cuda(task, worker);
			
 
				-				_STARPU_TRACE_START_PROGRESS(memnode);
			
 
				 			}
			
 
				+			_STARPU_TRACE_START_PROGRESS(memnode);
			
 
				 		}
			
 
				 
			
 
				 		/* Then test for termination of queued tasks */
			
@@ -811,6 +817,7 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 
				 		else
			
 
				 #endif /* !STARPU_SIMGRID */
			
 
				 		{
			
 
				+			_STARPU_TRACE_END_PROGRESS(memnode);
			
 
				 			/* Asynchronous task completed! */
			
 
				 			_starpu_set_local_worker_key(worker);
			
 
				 			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), worker);
			
@@ -831,11 +838,9 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 
				 					 * flushing the pipeline, we can now at
			
 
				 					 * last execute it.  */
			
 
				 
			
 
				-					_STARPU_TRACE_END_PROGRESS(memnode);
			
 
				 					_STARPU_TRACE_EVENT("sync_task");
			
 
				 					execute_job_on_cuda(task, worker);
			
 
				 					_STARPU_TRACE_EVENT("end_sync_task");
			
 
				-					_STARPU_TRACE_START_PROGRESS(memnode);
			
 
				 					worker->pipeline_stuck = 0;
			
 
				 				}
			
 
				 			}
			
@@ -848,6 +853,7 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 
				 				/* Everybody busy */
			
 
				 				_STARPU_TRACE_END_EXECUTING()
			
 
				 #endif
			
 
				+			_STARPU_TRACE_START_PROGRESS(memnode);
			
 
				 		}
			
 
				 
			
 
				 		if (!worker->pipeline_length || worker->ntasks < worker->pipeline_length)
			
--- a/src/drivers/driver_common/driver_common.c
+++ b/src/drivers/driver_common/driver_common.c
@@ -101,6 +101,7 @@ void _starpu_driver_start_job(struct _starpu_worker *worker, struct _starpu_job
 
				 	}
			
 
				 	else
			
 
				 		_STARPU_TRACE_START_CODELET_BODY(j, j->nimpl, perf_arch, workerid);
			
 
				+	_STARPU_TASK_BREAK_ON(task, exec);
			
 
				 }
			
 
				 
			
 
				 void _starpu_driver_end_job(struct _starpu_worker *worker, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)
			
@@ -358,6 +359,7 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *worker, int w
 
				 			sched_ctx = _starpu_get_sched_ctx_struct(e->sched_ctx);
			
 
				 			if(sched_ctx && sched_ctx->id > 0 && sched_ctx->id < STARPU_NMAX_SCHED_CTXS)
			
 
				 			{
			
 
				+				STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->parallel_sect_mutex[workerid]);
			
 
				 				if(!sched_ctx->sched_policy)
			
 
				 					worker->is_slave_somewhere = sched_ctx->main_master != workerid;
			
 
				 
			
@@ -366,23 +368,18 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *worker, int w
 
				 					/* don't let the worker sleep with the sched_mutex taken */
			
 
				 					/* we need it until here bc of the list of ctxs of the workers
			
 
				 					   that can change in another thread */
			
 
				+					STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
			
 
				 					needed = 0;
			
 
				-					worker->state_blocked = 1;
			
 
				-					worker->state_wait_ack__blocked = 1;
			
 
				-					STARPU_PTHREAD_COND_BROADCAST(&worker->sched_cond);
			
 
				-					do
			
 
				-					{
			
 
				-						STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
			
 
				-					}
			
 
				-					while (worker->state_wait_ack__blocked);
			
 
				-					worker->state_blocked = 0;
			
 
				+					_starpu_sched_ctx_signal_worker_blocked(sched_ctx->id, workerid);
			
 
				+					sched_ctx->busy[workerid] = 1;
			
 
				+					STARPU_PTHREAD_COND_WAIT(&sched_ctx->parallel_sect_cond[workerid], &sched_ctx->parallel_sect_mutex[workerid]);
			
 
				+					sched_ctx->busy[workerid] = 0;
			
 
				+					STARPU_PTHREAD_COND_SIGNAL(&sched_ctx->parallel_sect_cond_busy[workerid]);
			
 
				+					_starpu_sched_ctx_signal_worker_woke_up(sched_ctx->id, workerid);
			
 
				 					sched_ctx->parallel_sect[workerid] = 0;
			
 
				-					if (worker->state_wait_handshake__blocked)
			
 
				-					{
			
 
				-						worker->state_wait_handshake__blocked = 0;
			
 
				-						STARPU_PTHREAD_COND_BROADCAST(&worker->sched_cond);
			
 
				-					}
			
 
				+					STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
			
 
				 				}
			
 
				+				STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->parallel_sect_mutex[workerid]);
			
 
				 			}
			
 
				 			if(!needed)
			
 
				 				break;
			
@@ -391,25 +388,21 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *worker, int w
 
				 		if(worker->tmp_sched_ctx != -1)
			
 
				 		{
			
 
				 			sched_ctx = _starpu_get_sched_ctx_struct(worker->tmp_sched_ctx);
			
 
				+			STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->parallel_sect_mutex[workerid]);
			
 
				 			if(sched_ctx->parallel_sect[workerid])
			
 
				 			{
			
 
				 //				needed = 0;
			
 
				-				worker->state_blocked = 1;
			
 
				-				worker->state_wait_ack__blocked = 1;
			
 
				-				STARPU_PTHREAD_COND_BROADCAST(&worker->sched_cond);
			
 
				-				do
			
 
				-				{
			
 
				-					STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
			
 
				-				}
			
 
				-				while (worker->state_wait_ack__blocked);
			
 
				-				worker->state_blocked = 0;
			
 
				+				STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
			
 
				+				_starpu_sched_ctx_signal_worker_blocked(sched_ctx->id, workerid);
			
 
				+				sched_ctx->busy[workerid] = 1;
			
 
				+				STARPU_PTHREAD_COND_WAIT(&sched_ctx->parallel_sect_cond[workerid], &sched_ctx->parallel_sect_mutex[workerid]);
			
 
				+				sched_ctx->busy[workerid] = 0;
			
 
				+				STARPU_PTHREAD_COND_SIGNAL(&sched_ctx->parallel_sect_cond_busy[workerid]);
			
 
				+				_starpu_sched_ctx_signal_worker_woke_up(sched_ctx->id, workerid);
			
 
				 				sched_ctx->parallel_sect[workerid] = 0;
			
 
				-				if (worker->state_wait_handshake__blocked)
			
 
				-				{
			
 
				-					worker->state_wait_handshake__blocked = 0;
			
 
				-					STARPU_PTHREAD_COND_BROADCAST(&worker->sched_cond);
			
 
				-				}
			
 
				+				STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
			
 
				 			}
			
 
				+			STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->parallel_sect_mutex[workerid]);
			
 
				 		}
			
 
				 
			
 
				 		needed = !needed;
			
@@ -428,11 +421,7 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *worker, int w
 
				 		task = NULL;
			
 
				 	/*else try to pop a task*/
			
 
				 	else
			
 
				-	{
			
 
				-		_starpu_worker_enter_transient_sched_op(worker);
			
 
				 		task = _starpu_pop_task(worker);
			
 
				-		_starpu_worker_leave_transient_sched_op(worker);
			
 
				-	}
			
 
				 
			
 
				 #if !defined(STARPU_SIMGRID)
			
 
				 	if (task == NULL && !executing)
			
@@ -449,11 +438,7 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *worker, int w
 
				 		if (_starpu_worker_can_block(memnode, worker)
			
 
				 			&& !_starpu_sched_ctx_last_worker_awake(worker))
			
 
				 		{
			
 
				-			do
			
 
				-			{
			
 
				-				STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
			
 
				-			}
			
 
				-			while (worker->status == STATUS_SLEEPING);
			
 
				+			STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
			
 
				 			STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
			
 
				 		}
			
 
				 		else
			
@@ -527,9 +512,7 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 
				 #endif
			
 
				 			_starpu_worker_set_status_scheduling(workers[i].workerid);
			
 
				 			_starpu_set_local_worker_key(&workers[i]);
			
 
				-			_starpu_worker_enter_transient_sched_op(&workers[i]);
			
 
				 			tasks[i] = _starpu_pop_task(&workers[i]);
			
 
				-			_starpu_worker_leave_transient_sched_op(&workers[i]);
			
 
				 			if(tasks[i] != NULL)
			
 
				 			{
			
 
				 				_starpu_worker_set_status_scheduling_done(workers[i].workerid);
			
@@ -598,11 +581,7 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 
				 		if (_starpu_worker_can_block(memnode, worker)
			
 
				 				&& !_starpu_sched_ctx_last_worker_awake(worker))
			
 
				 		{
			
 
				-			do
			
 
				-			{
			
 
				-				STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
			
 
				-			}
			
 
				-			while (worker->status == STATUS_SLEEPING);
			
 
				+			STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
			
 
				 			STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
			
 
				 		}
			
 
				 		else
			
--- a/src/drivers/mic/driver_mic_sink.c
+++ b/src/drivers/mic/driver_mic_sink.c
@@ -39,7 +39,7 @@ void _starpu_mic_sink_init(struct _starpu_mp_node *node)
 
				 	cpu_set_t cpuset;
			
 
				 	/* We reserve one core for the communications */
			
 
				 	/*Bind on the first core*/
			
 
				-	self = pthread_self();
			
 
				+	self = starpu_pthread_self();
			
 
				 	CPU_ZERO(&cpuset);
			
 
				 	CPU_SET(0,&cpuset);
			
 
				 	pthread_setaffinity_np(self,sizeof(cpu_set_t),&cpuset);
			
--- a/src/drivers/mp_common/mp_common.c
+++ b/src/drivers/mp_common/mp_common.c
@@ -15,7 +15,6 @@
 
				  */
			
 
				 
			
 
				 #include <stdlib.h>
			
 
				-#include <pthread.h>
			
 
				 
			
 
				 #include <datawizard/interfaces/data_interface.h>
			
 
				 #include <drivers/mp_common/mp_common.h>
			
@@ -400,7 +399,7 @@ void _starpu_mp_common_send_command(const struct _starpu_mp_node *node,
 
				 {
			
 
				 	STARPU_ASSERT_MSG(arg_size <= BUFFER_SIZE, "Too much data (%d) for the static MIC buffer (%d), increase BUFFER_SIZE perhaps?", arg_size, BUFFER_SIZE);
			
 
				 
			
 
				-        //printf("SEND CMD : %d - arg_size %d by %lu \n", command, arg_size, pthread_self());
			
 
				+        //printf("SEND CMD : %d - arg_size %d by %lu \n", command, arg_size, starpu_pthread_self());
			
 
				 
			
 
				 	/* MIC and MPI sizes are given through a int */
			
 
				 	int command_size = sizeof(enum _starpu_mp_command);
			
@@ -436,7 +435,7 @@ enum _starpu_mp_command _starpu_mp_common_recv_command(const struct _starpu_mp_n
 
				 	command = *((enum _starpu_mp_command *) node->buffer);
			
 
				 	*arg_size = *((int *) ((uintptr_t)node->buffer + command_size));
			
 
				 
			
 
				-        //printf("RECV command : %d - arg_size %d by %lu \n", command, *arg_size, pthread_self());
			
 
				+        //printf("RECV command : %d - arg_size %d by %lu \n", command, *arg_size, starpu_pthread_self());
			
 
				 
			
 
				 	/* If there is no argument (ie. arg_size == 0),
			
 
				 	 * let's return the command right now */
			
--- a/src/drivers/mp_common/mp_common.h
+++ b/src/drivers/mp_common/mp_common.h
@@ -17,7 +17,6 @@
 
				 #ifndef __MP_COMMON_H__
			
 
				 #define __MP_COMMON_H__
			
 
				 
			
 
				-#include <pthread.h>
			
 
				 #include <semaphore.h>
			
 
				 
			
 
				 #include <starpu.h>
			
--- a/src/drivers/mpi/driver_mpi_common.c
+++ b/src/drivers/mpi/driver_mpi_common.c
@@ -80,7 +80,10 @@ int _starpu_mpi_common_mp_init()
 
				 #endif
			
 
				 
			
 
				                 int thread_support;
			
 
				-                STARPU_ASSERT(MPI_Init_thread(_starpu_get_argc(), _starpu_get_argv(), required, &thread_support) == MPI_SUCCESS);
			
 
				+                if (MPI_Init_thread(_starpu_get_argc(), _starpu_get_argv(), required, &thread_support) != MPI_SUCCESS)
			
 
				+		{
			
 
				+			STARPU_ABORT_MSG("Cannot Initialize MPI !");
			
 
				+		}
			
 
				 
			
 
				                 if (thread_support != required)
			
 
				                 {
			
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -954,6 +954,14 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 
				 			simulate = 1;
			
 
				 		#endif
			
 
				 		}
			
 
				+		else if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT && !async)
			
 
				+			{
			
 
				+				_SIMGRID_TIMER_BEGIN(1);
			
 
				+				func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
			
 
				+				_SIMGRID_TIMER_END;
			
 
				+				simulate=0;
			
 
				+			}
			
 
				+
			
 
				 		if (simulate)
			
 
				 			_starpu_simgrid_submit_job(worker->workerid, j, &worker->perf_arch, length,
			
 
				 						   async ? &task_finished[worker->devid][pipeline_idx] : NULL);
			
--- a/src/profiling/bound.c
+++ b/src/profiling/bound.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
			
 
				- * Copyright (C) 2010-2016  Université de Bordeaux
			
 
				+ * Copyright (C) 2010-2017  Université de Bordeaux
			
 
				  * Copyright (C) 2011  Télécom-SudParis
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -209,11 +209,11 @@ static double** initialize_arch_duration(int maxdevid, unsigned* maxncore_table)
 
				 static void initialize_duration(struct bound_task *task)
			
 
				 {
			
 
				 	struct _starpu_machine_config *conf = _starpu_get_machine_config();
			
 
				-	task->duration[STARPU_CPU_WORKER] = initialize_arch_duration(1,&conf->topology.ncpus); 
			
 
				-	task->duration[STARPU_CUDA_WORKER] = initialize_arch_duration(conf->topology.ncudagpus,NULL); 
			
 
				-	task->duration[STARPU_OPENCL_WORKER] = initialize_arch_duration(conf->topology.nopenclgpus,NULL); 
			
 
				-	task->duration[STARPU_MIC_WORKER] = initialize_arch_duration(conf->topology.nmicdevices,conf->topology.nmiccores); 
			
 
				-	task->duration[STARPU_SCC_WORKER] = initialize_arch_duration(conf->topology.nsccdevices,NULL); 
			
 
				+	task->duration[STARPU_CPU_WORKER] = initialize_arch_duration(1,&conf->topology.nhwcpus); 
			
 
				+	task->duration[STARPU_CUDA_WORKER] = initialize_arch_duration(conf->topology.nhwcudagpus,NULL); 
			
 
				+	task->duration[STARPU_OPENCL_WORKER] = initialize_arch_duration(conf->topology.nhwopenclgpus,NULL); 
			
 
				+	task->duration[STARPU_MIC_WORKER] = initialize_arch_duration(conf->topology.nhwmicdevices,conf->topology.nmiccores); 
			
 
				+	task->duration[STARPU_SCC_WORKER] = initialize_arch_duration(conf->topology.nhwscc,NULL); 
			
 
				 }
			
 
				 
			
 
				 static struct starpu_perfmodel_device device =
			
@@ -278,7 +278,7 @@ void _starpu_bound_record(struct _starpu_job *j)
 
				 	{
			
 
				 		struct bound_task_pool *tp;
			
 
				 
			
 
				-		_starpu_compute_buffers_footprint(j->task->cl?j->task->cl->model:NULL, STARPU_CPU_WORKER, 0, j);
			
 
				+		_starpu_compute_buffers_footprint(j->task->cl?j->task->cl->model:NULL, NULL, 0, j);
			
 
				 
			
 
				 		if (last && last->cl == j->task->cl && last->footprint == j->footprint)
			
 
				 			tp = last;
			
--- a/src/profiling/profiling_helpers.c
+++ b/src/profiling/profiling_helpers.c
@@ -60,6 +60,7 @@ void _starpu_profiling_bus_helper_display_summary(FILE *stream)
 
				 
			
 
				 		unsigned unit = 0;
			
 
				 		double d = convert_to_byte_units(transferred, max_unit, &unit);
			
 
				+		double avg = (transfer_cnt != 0) ? (d / transfer_cnt) : 0;
			
 
				 
			
 
				 		_starpu_memory_node_get_name(src, src_name, sizeof(src_name));
			
 
				 		_starpu_memory_node_get_name(dst, dst_name, sizeof(dst_name));
			
@@ -67,7 +68,7 @@ void _starpu_profiling_bus_helper_display_summary(FILE *stream)
 
				 		fprintf(stream, "\t%s -> %s", src_name, dst_name);
			
 
				 		fprintf(stream, "\t%.2lf %s", d, byte_units[unit]);
			
 
				 		fprintf(stream, "\t%.2lf %s/s", d / elapsed_time, byte_units[unit]);
			
 
				-		fprintf(stream, "\t(transfers : %lld - avg %.2lf %s)\n", transfer_cnt, d / transfer_cnt, byte_units[unit]);
			
 
				+		fprintf(stream, "\t(transfers : %lld - avg %.2lf %s)\n", transfer_cnt, avg, byte_units[unit]);
			
 
				 
			
 
				 		sum_transferred += transferred;
			
 
				 	}
			
--- a/src/sched_policies/component_worker.c
+++ b/src/sched_policies/component_worker.c
@@ -503,8 +503,11 @@ static void simple_worker_can_pull(struct starpu_sched_component * worker_compon
 
				 	}
			
 
				 	if(_starpu_sched_component_worker_is_sleeping_status(worker_component))
			
 
				 	{
			
 
				+		starpu_pthread_mutex_t *sched_mutex;
			
 
				+		starpu_pthread_cond_t *sched_cond;
			
 
				+		starpu_worker_get_sched_condition(w->workerid, &sched_mutex, &sched_cond);
			
 
				 		_starpu_sched_component_unlock_worker(worker_component->tree->sched_ctx_id, w->workerid);
			
 
				-		starpu_wake_worker(w->workerid);
			
 
				+		starpu_wakeup_worker(w->workerid, sched_cond, sched_mutex);
			
 
				 	}
			
 
				 	else
			
 
				 		_starpu_sched_component_unlock_worker(worker_component->tree->sched_ctx_id, w->workerid);
			
@@ -723,7 +726,10 @@ static void combined_worker_can_pull(struct starpu_sched_component * component)
 
				 		_starpu_sched_component_lock_worker(component->tree->sched_ctx_id, worker);
			
 
				 		if(_starpu_sched_component_worker_is_sleeping_status(component))
			
 
				 		{
			
 
				-			starpu_wake_worker(worker);
			
 
				+			starpu_pthread_mutex_t *sched_mutex;
			
 
				+			starpu_pthread_cond_t *sched_cond;
			
 
				+			starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
			
 
				+			starpu_wakeup_worker(worker, sched_cond, sched_mutex);
			
 
				 		}
			
 
				 		if(_starpu_sched_component_worker_is_reset_status(component))
			
 
				 			_starpu_sched_component_worker_set_changed_status(component);
			
--- a/src/sched_policies/deque_modeling_policy_data_aware.c
+++ b/src/sched_policies/deque_modeling_policy_data_aware.c
@@ -251,6 +251,7 @@ static struct starpu_task *dmda_pop_ready_task(unsigned sched_ctx_id)
 
				 
			
 
				 	/* Take the opportunity to update start time */
			
 
				 	fifo->exp_start = STARPU_MAX(starpu_timing_now(), fifo->exp_start);
			
 
				+	fifo->exp_end = fifo->exp_start + fifo->exp_len;
			
 
				 
			
 
				 	task = _starpu_fifo_pop_first_ready_task(fifo, node, dt->num_priorities);
			
 
				 	if (task)
			
@@ -285,6 +286,7 @@ static struct starpu_task *dmda_pop_task(unsigned sched_ctx_id)
 
				 
			
 
				 	/* Take the opportunity to update start time */
			
 
				 	fifo->exp_start = STARPU_MAX(starpu_timing_now(), fifo->exp_start);
			
 
				+	fifo->exp_end = fifo->exp_start + fifo->exp_len;
			
 
				 
			
 
				 	STARPU_ASSERT_MSG(fifo, "worker %u does not belong to ctx %u anymore.\n", workerid, sched_ctx_id);
			
 
				 
			
@@ -321,6 +323,7 @@ static struct starpu_task *dmda_pop_every_task(unsigned sched_ctx_id)
 
				 
			
 
				 	/* Take the opportunity to update start time */
			
 
				 	fifo->exp_start = STARPU_MAX(starpu_timing_now(), fifo->exp_start);
			
 
				+	fifo->exp_end = fifo->exp_start + fifo->exp_len;
			
 
				 
			
 
				 	starpu_pthread_mutex_t *sched_mutex;
			
 
				 	starpu_pthread_cond_t *sched_cond;
			
@@ -367,6 +370,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
				 
			
 
				         /* Sometimes workers didn't take the tasks as early as we expected */
			
 
				 	fifo->exp_start = isnan(fifo->exp_start) ? starpu_timing_now() + fifo->pipeline_len : STARPU_MAX(fifo->exp_start, starpu_timing_now());
			
 
				+	fifo->exp_end = fifo->exp_start + fifo->exp_len;
			
 
				 
			
 
				 	if ((starpu_timing_now() + predicted_transfer) < fifo->exp_end)
			
 
				 	{
			
@@ -448,7 +452,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
				 
			
 
				 
			
 
				 #if !defined(STARPU_NON_BLOCKING_DRIVERS) || defined(STARPU_SIMGRID)
			
 
				-		starpu_wake_worker_locked(best_workerid);
			
 
				+		starpu_wakeup_worker_locked(best_workerid, sched_cond, sched_mutex);
			
 
				 #endif
			
 
				 		starpu_push_task_end(task);
			
 
				 		STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(sched_mutex);
			
@@ -460,7 +464,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
				 		dt->queue_array[best_workerid]->ntasks++;
			
 
				 		dt->queue_array[best_workerid]->nprocessed++;
			
 
				 #if !defined(STARPU_NON_BLOCKING_DRIVERS) || defined(STARPU_SIMGRID)
			
 
				-		starpu_wake_worker_locked(best_workerid);
			
 
				+		starpu_wakeup_worker_locked(best_workerid, sched_cond, sched_mutex);
			
 
				 #endif
			
 
				 		starpu_push_task_end(task);
			
 
				 		STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(sched_mutex);
			
@@ -773,7 +777,9 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
				 			if (unknown)
			
 
				 				continue;
			
 
				 
			
 
				-			exp_end[worker_ctx][nimpl] = exp_start + prev_exp_len + local_task_length[worker_ctx][nimpl];
			
 
				+			double task_starting_time = STARPU_MAX(exp_start + prev_exp_len, starpu_timing_now() + local_data_penalty[worker_ctx][nimpl]); 
			
 
				+
			
 
				+			exp_end[worker_ctx][nimpl] = task_starting_time + local_task_length[worker_ctx][nimpl];
			
 
				 
			
 
				 			if (exp_end[worker_ctx][nimpl] < best_exp_end)
			
 
				 			{
			
@@ -1126,6 +1132,7 @@ static void dmda_pre_exec_hook(struct starpu_task *task, unsigned sched_ctx_id)
 
				 
			
 
				 	/* Take the opportunity to update start time */
			
 
				 	fifo->exp_start = STARPU_MAX(starpu_timing_now() + fifo->pipeline_len, fifo->exp_start);
			
 
				+	fifo->exp_end = fifo->exp_start + fifo->exp_len;
			
 
				 
			
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(sched_mutex);
			
 
				 }
			
--- a/src/sched_policies/eager_central_policy.c
+++ b/src/sched_policies/eager_central_policy.c
@@ -201,7 +201,7 @@ static void eager_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nw
 
				 		int workerid = workerids[i];
			
 
				 		int curr_workerid = _starpu_worker_get_id();
			
 
				 		if(workerid != curr_workerid)
			
 
				-			starpu_wake_worker_locked(workerid);
			
 
				+			starpu_wake_worker(workerid);
			
 
				 
			
 
				 		starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
			
 
				 	}
			
--- a/src/sched_policies/eager_central_priority_policy.c
+++ b/src/sched_policies/eager_central_priority_policy.c
@@ -308,7 +308,7 @@ static void eager_center_priority_add_workers(unsigned sched_ctx_id, int *worker
 
				 		int workerid = workerids[i];
			
 
				 		int curr_workerid = _starpu_worker_get_id();
			
 
				 		if(workerid != curr_workerid)
			
 
				-			starpu_wake_worker_locked(workerid);
			
 
				+			starpu_wake_worker(workerid);
			
 
				 
			
 
				                 starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
			
 
				         }
			
--- a/src/sched_policies/parallel_eager.c
+++ b/src/sched_policies/parallel_eager.c
@@ -265,7 +265,7 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 
				 				_starpu_fifo_push_task(data->local_fifo[local_worker], alias);
			
 
				 
			
 
				 #if !defined(STARPU_NON_BLOCKING_DRIVERS) || defined(STARPU_SIMGRID)
			
 
				-				starpu_wake_worker_locked(local_worker);
			
 
				+				starpu_wakeup_worker_locked(local_worker, sched_cond, sched_mutex);
			
 
				 #endif
			
 
				 				STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(sched_mutex);
			
 
				 
			
--- a/src/util/fstarpu.c
+++ b/src/util/fstarpu.c
@@ -85,6 +85,7 @@ static const intptr_t fstarpu_starpu_mic	= STARPU_MIC;
 
				 static const intptr_t fstarpu_starpu_scc	= STARPU_SCC;
			
 
				 
			
 
				 static const intptr_t fstarpu_starpu_codelet_simgrid_execute	= STARPU_CODELET_SIMGRID_EXECUTE;
			
 
				+static const intptr_t fstarpu_starpu_codelet_simgrid_execute_and_inject	= STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT;
			
 
				 static const intptr_t fstarpu_starpu_cuda_async	= STARPU_CUDA_ASYNC;
			
 
				 static const intptr_t fstarpu_starpu_opencl_async	= STARPU_OPENCL_ASYNC;
			
 
				 
			
@@ -153,6 +154,7 @@ intptr_t fstarpu_get_constant(char *s)
 
				 	else if (!strcmp(s, "FSTARPU_SCC"))	{ return fstarpu_starpu_scc; }
			
 
				 
			
 
				 	else if (!strcmp(s, "FSTARPU_CODELET_SIMGRID_EXECUTE"))	{ return fstarpu_starpu_codelet_simgrid_execute; }
			
 
				+	else if (!strcmp(s, "FSTARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT"))	{ return fstarpu_starpu_codelet_simgrid_execute_and_inject; }
			
 
				 	else if (!strcmp(s, "FSTARPU_CUDA_ASYNC"))	{ return fstarpu_starpu_cuda_async; }
			
 
				 	else if (!strcmp(s, "FSTARPU_OPENCL_ASYNC"))	{ return fstarpu_starpu_opencl_async; }
			
 
				 
			
@@ -542,7 +544,7 @@ void fstarpu_worker_get_type_as_string(intptr_t type, char *dst, size_t maxlen)
 
				 	snprintf(dst, maxlen, "%s", str);
			
 
				 }
			
 
				 
			
 
				-struct starpu_data_handle *fstarpu_data_handle_array_alloc(int nb)
			
 
				+starpu_data_handle_t *fstarpu_data_handle_array_alloc(int nb)
			
 
				 {
			
 
				 	void *ptr;
			
 
				 	_STARPU_CALLOC(ptr, (size_t)nb, sizeof(starpu_data_handle_t));
			
--- a/starpu.mk
+++ b/starpu.mk
@@ -16,7 +16,7 @@
 
				 
			
 
				 if STARPU_USE_MPI_MASTER_SLAVE
			
 
				 MPI_LAUNCHER 			= $(MPIEXEC)  $(MPIEXEC_ARGS) -np 4
			
 
				-MPI_RUN_ARGS			= STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4
			
 
				+MPI_RUN_ARGS			= STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 STARPU_NMPIMSTHREADS=4
			
 
				 endif
			
 
				 
			
 
				 showcheck:
			
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -34,6 +34,7 @@ AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@ $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFL
 
				 
			
 
				 EXTRA_DIST =					\
			
 
				 	helper.h				\
			
 
				+	datawizard/locality.sh			\
			
 
				 	datawizard/scal.h			\
			
 
				 	datawizard/mpi_like.h			\
			
 
				 	microbenchs/tasks_size_overhead.sh	\
			
@@ -381,6 +382,8 @@ if STARPU_SIMGRID
 
				 TESTS += $(MICROBENCHS:=.sh)
			
 
				 endif
			
 
				 
			
 
				+TESTS += datawizard/locality.sh
			
 
				+
			
 
				 #######################
			
 
				 # Source files        #
			
 
				 #######################
			
--- a/tests/datawizard/locality.sh
+++ b/tests/datawizard/locality.sh
@@ -0,0 +1,33 @@
 
				+#!/bin/bash -x
			
 
				+#
			
 
				+# StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				+#
			
 
				+# Copyright (C) 2017  Université de Bordeaux
			
 
				+#
			
 
				+# StarPU is free software; you can redistribute it and/or modify
			
 
				+# it under the terms of the GNU Lesser General Public License as published by
			
 
				+# the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+# your option) any later version.
			
 
				+#
			
 
				+# StarPU is distributed in the hope that it will be useful, but
			
 
				+# WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+#
			
 
				+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+
			
 
				+# Test generation of FxT traces
			
 
				+
			
 
				+set -e
			
 
				+
			
 
				+PREFIX=$(dirname $0)
			
 
				+test -x $PREFIX/../../tools/starpu_fxt_tool || exit 77
			
 
				+STARPU_FXT_PREFIX=$PREFIX/ $PREFIX/locality
			
 
				+$PREFIX/../../tools/starpu_fxt_tool -i $PREFIX/prof_file_${USER}_0
			
 
				+
			
 
				+# When pj_dump is available, check that it accepts paje.trace (i.e. that the trace is approved by Grenoble :)
			
 
				+
			
 
				+if type pj_dump > /dev/null 2> /dev/null
			
 
				+then
			
 
				+	$PREFIX/../../tools/starpu_paje_sort paje.trace
			
 
				+	pj_dump paje.trace
			
 
				+fi
			
--- a/tools/starpu_fxt_tool.c
+++ b/tools/starpu_fxt_tool.c
@@ -30,8 +30,9 @@ static void usage()
 
				 	fprintf(stderr, "Usage: %s [ options ]\n", PROGNAME);
			
 
				         fprintf(stderr, "\n");
			
 
				         fprintf(stderr, "Options:\n");
			
 
				-	fprintf(stderr, "   -i <input file>     specify the input file. This can be specified several\n");
			
 
				-	fprintf(stderr, "                       times for MPI execution case\n");
			
 
				+	fprintf(stderr, "   -i <input file[s]>  specify the input file[s]. Several files can be provided,\n");
			
 
				+	fprintf(stderr, "                       or the option specified several times for MPI execution\n");
			
 
				+	fprintf(stderr, "                       case\n");
			
 
				         fprintf(stderr, "   -o <output file>    specify the output file\n");
			
 
				         fprintf(stderr, "   -c                  use a different colour for every type of task\n");
			
 
				 	fprintf(stderr, "   -no-events          do not show events\n");