8 years ago · 134f1eb906
--- a/ChangeLog
+++ b/ChangeLog
@@ -273,6 +273,7 @@ Small features:
 
																     allows to copy in a new buffer values which have not been unpacked by
															
 
																     the current call
															
 
																   * Add STARPU_CODELET_SIMGRID_EXECUTE flag.
															
 
																+  * Add STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT flag.
															
 
																   * Add STARPU_CL_ARGS flag to starpu_task_insert() and
															
 
																     starpu_mpi_task_insert() functions call
															
--- a/configure.ac
+++ b/configure.ac
@@ -348,9 +348,10 @@ else
 
																     build_mpi_master_slave=no
															
 
																 fi
															
 
																-#Warn users that they cannot use both at the same time
															
 
																+#users cannot use both at the same time
															
 
																 if test x$build_mpi_master_slave = xyes -a x$enable_mpi = xyes; then
															
 
																-    AC_MSG_WARN(StarPU-MPI and MPI Master-Slave cannot be used at the same time !)
															
 
																+    AC_MSG_WARN(StarPU-MPI and MPI Master-Slave cannot be used at the same time ! Disabling StarPU-MPI...)
															
 
																+	enable_mpi=no
															
 
																 fi
															
 
																 if test x$build_mpi_master_slave = xyes; then
															
@@ -3117,6 +3118,12 @@ AC_CONFIG_COMMANDS([executable-scripts], [
 
																   test -e tests/microbenchs/parallel_independent_heterogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_heterogeneous_tasks.sh tests/microbenchs/
															
 
																   test -e tests/microbenchs/parallel_independent_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks_data.sh tests/microbenchs/
															
 
																   test -e tests/microbenchs/parallel_independent_homogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks.sh tests/microbenchs/
															
 
																+  mkdir -p tests/datawizard
															
 
																+  test -e tests/datawizard/locality.sh || ln -sf $ac_abs_top_srcdir/tests/datawizard/locality.sh tests/datawizard/
															
 
																+  mkdir -p examples/heat
															
 
																+  test -e examples/heat/heat.sh || ln -sf $ac_abs_top_srcdir/examples/heat/heat.sh examples/heat/
															
 
																+  mkdir -p examples/lu
															
 
																+  test -e examples/lu/lu.sh || ln -sf $ac_abs_top_srcdir/examples/lu/lu.sh examples/lu/
															
 
																 ])
															
 
																 # Create links to ICD files in build/socl/vendors directory. SOCL will use this
															
--- a/doc/doxygen/chapters/320_scheduling.doxy
+++ b/doc/doxygen/chapters/320_scheduling.doxy
@@ -285,8 +285,9 @@ be used to get information about how well the execution proceeded, and thus the
 
																 overall quality of the execution.
															
 
																 Precise debugging can also be performed by using the
															
 
																-\ref STARPU_TASK_BREAK_ON_SCHED, \ref STARPU_TASK_BREAK_ON_PUSH, and
															
 
																-\ref STARPU_TASK_BREAK_ON_POP environment variables. By setting the job_id of a task
															
 
																+\ref STARPU_TASK_BREAK_ON_PUSH, \ref STARPU_TASK_BREAK_ON_SCHED,
															
 
																+\ref STARPU_TASK_BREAK_ON_POP, and \ref STARPU_TASK_BREAK_ON_EXEC environment variables.
															
 
																+By setting the job_id of a task
															
 
																 in these environment variables, StarPU will raise <c>SIGTRAP</c> when the task is being
															
 
																 scheduled, pushed, or popped by the scheduler. That means that when one notices
															
 
																 that a task is being scheduled in a seemingly odd way, one can just reexecute
															
--- a/doc/doxygen/chapters/380_offline_performance_tools.doxy
+++ b/doc/doxygen/chapters/380_offline_performance_tools.doxy
@@ -129,7 +129,7 @@ collect the trace files from the MPI nodes, and
 
																 specify them all on the command <c>starpu_fxt_tool</c>, for instance:
															
 
																 \verbatim
															
 
																-$ starpu_fxt_tool -i /tmp/prof_file_something1 -i /tmp/prof_file_something2
															
 
																+$ starpu_fxt_tool -i /tmp/prof_file_something*
															
 
																 \endverbatim
															
 
																 By default, all tasks are displayed using a green color. To display tasks with
															
--- a/doc/doxygen/chapters/470_simgrid.doxy
+++ b/doc/doxygen/chapters/470_simgrid.doxy
@@ -9,8 +9,8 @@
 
																 /*! \page SimGridSupport SimGrid Support
															
 
																 StarPU can use Simgrid in order to simulate execution on an arbitrary
															
 
																-platform. This was tested with simgrid 3.11, 3.12, 3.13, 3.14, and 3.14.159, other versions may have
															
 
																-compatibility issues.
															
 
																+platform. This was tested with simgrid from 3.11 to 3.15,
															
 
																+other versions may have compatibility issues.
															
 
																 \section Preparing Preparing Your Application For Simulation
															
@@ -36,7 +36,8 @@ To be able to run the application with e.g. CUDA simulation on a system which
 
																 does not have CUDA installed, one can fill the cuda_funcs with (void*)1, to
															
 
																 express that there is a CUDA implementation, even if one does not actually
															
 
																 provide it. StarPU will not actually run it in Simgrid mode anyway by default
															
 
																-(unless the ::STARPU_CODELET_SIMGRID_EXECUTE flag is set in the codelet)
															
 
																+(unless the ::STARPU_CODELET_SIMGRID_EXECUTE or ::STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT
															
 
																+flags are set in the codelet)
															
 
																 \snippet simgrid.c To be included. You should update doxygen if you see this text.
															
--- a/doc/doxygen/chapters/501_environment_variables.doxy
+++ b/doc/doxygen/chapters/501_environment_variables.doxy
@@ -642,9 +642,10 @@ especially regarding data transfers.
 
																 <dd>
															
 
																 \anchor STARPU_SIMGRID_SCHED_COST
															
 
																 \addindex __env__STARPU_SIMGRID_SCHED_COST
															
 
																-When set to 1 (which is the default), scheduling costs are taken into
															
 
																+When set to 1 (0 is the default), scheduling costs are taken into
															
 
																 account in simgrid mode. This provides more accurate simgrid predictions,
															
 
																-and allows studying scheduling overhead of the runtime system.
															
 
																+and allows studying scheduling overhead of the runtime system. However,
															
 
																+it also makes simulation non-deterministic.
															
 
																 </dd>
															
 
																 </dl>
															
@@ -1021,6 +1022,15 @@ dog is reached, thus allowing to catch the situation in gdb, etc
 
																 (see \ref DetectionStuckConditions)
															
 
																 </dd>
															
 
																+<dt>STARPU_TASK_BREAK_ON_PUSH</dt>
															
 
																+<dd>
															
 
																+\anchor STARPU_TASK_BREAK_ON_PUSH
															
 
																+\addindex __env__STARPU_TASK_BREAK_ON_PUSH
															
 
																+When this variable contains a job id, StarPU will raise SIGTRAP when the task
															
 
																+with that job id is being pushed to the scheduler, which will be nicely catched by debuggers
															
 
																+(see \ref DebuggingScheduling)
															
 
																+</dd>
															
 
																+
															
 
																 <dt>STARPU_TASK_BREAK_ON_SCHED</dt>
															
 
																 <dd>
															
 
																 \anchor STARPU_TASK_BREAK_ON_SCHED
															
@@ -1032,21 +1042,21 @@ This only works for schedulers which have such a scheduling point defined
 
																 (see \ref DebuggingScheduling)
															
 
																 </dd>
															
 
																-<dt>STARPU_TASK_BREAK_ON_PUSH</dt>
															
 
																+<dt>STARPU_TASK_BREAK_ON_POP</dt>
															
 
																 <dd>
															
 
																-\anchor STARPU_TASK_BREAK_ON_PUSH
															
 
																-\addindex __env__STARPU_TASK_BREAK_ON_PUSH
															
 
																+\anchor STARPU_TASK_BREAK_ON_POP
															
 
																+\addindex __env__STARPU_TASK_BREAK_ON_POP
															
 
																 When this variable contains a job id, StarPU will raise SIGTRAP when the task
															
 
																-with that job id is being pushed to the scheduler, which will be nicely catched by debuggers
															
 
																+with that job id is being popped from the scheduler, which will be nicely catched by debuggers
															
 
																 (see \ref DebuggingScheduling)
															
 
																 </dd>
															
 
																-<dt>STARPU_TASK_BREAK_ON_POP</dt>
															
 
																+<dt>STARPU_TASK_BREAK_ON_EXEC</dt>
															
 
																 <dd>
															
 
																-\anchor STARPU_TASK_BREAK_ON_POP
															
 
																-\addindex __env__STARPU_TASK_BREAK_ON_POP
															
 
																+\anchor STARPU_TASK_BREAK_ON_EXEC
															
 
																+\addindex __env__STARPU_TASK_BREAK_ON_EXEC
															
 
																 When this variable contains a job id, StarPU will raise SIGTRAP when the task
															
 
																-with that job id is being popped from the scheduler, which will be nicely catched by debuggers
															
 
																+with that job id is being executed, which will be nicely catched by debuggers
															
 
																 (see \ref DebuggingScheduling)
															
 
																 </dd>
															
--- a/doc/doxygen/chapters/api/codelet_and_tasks.doxy
+++ b/doc/doxygen/chapters/api/codelet_and_tasks.doxy
@@ -135,6 +135,11 @@ Value to be set in starpu_codelet::opencl_flags to allow asynchronous OpenCL ker
 
																 \ingroup API_Codelet_And_Tasks
															
 
																 Value to be set in starpu_codelet::flags to execute the codelet functions even in simgrid mode.
															
 
																+\def STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT
															
 
																+\ingroup API_Codelet_And_Tasks
															
 
																+Value to be set in starpu_codelet::flags to execute the codelet functions even in simgrid mode,
															
 
																+and later inject the measured timing inside the simulation.
															
 
																+
															
 
																 \typedef starpu_cpu_func_t
															
 
																 \ingroup API_Codelet_And_Tasks
															
 
																 CPU implementation of a codelet.
															
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -77,11 +77,13 @@ EXTRA_DIST = 					\
 
																 	scheduler/schedulers.sh				\
															
 
																 	scheduler/schedulers_context.sh			\
															
 
																 	fortran/Makefile				\
															
 
																-	sched_ctx/axpy_partition_gpu.h				\
															
 
																-	sched_ctx/axpy_partition_gpu.cu
															
 
																+	sched_ctx/axpy_partition_gpu.h			\
															
 
																+	sched_ctx/axpy_partition_gpu.cu			\
															
 
																+	heat/heat.sh					\
															
 
																+	lu/lu.sh
															
 
																-CLEANFILES = *.gcno *.gcda *.linkinfo *.mod starpu_idle_microsec.log
															
 
																+CLEANFILES = *.gcno *.gcda *.linkinfo *.mod starpu_idle_microsec.log *.mps *.dot *.pl
															
 
																 if STARPU_USE_CUDA
															
@@ -300,6 +302,13 @@ STARPU_EXAMPLES +=				\
 
																 	heat/heat				\
															
 
																 	cg/cg					\
															
 
																 	pipeline/pipeline
															
 
																+
															
 
																+if !STARPU_USE_MPI_MASTER_SLAVE
															
 
																+TESTS += \
															
 
																+	heat/heat.sh				\
															
 
																+	lu/lu.sh
															
 
																+
															
 
																+endif
															
 
																 endif
															
 
																 endif
															
--- a/examples/heat/dw_sparse_cg.c
+++ b/examples/heat/dw_sparse_cg.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009, 2010, 2011, 2015  Université de Bordeaux
															
 
																+ * Copyright (C) 2009, 2010, 2011, 2015, 2017  Université de Bordeaux
															
 
																  * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -25,11 +25,7 @@
 
																 static struct starpu_task *create_task(starpu_tag_t id)
															
 
																 {
															
 
																-	struct starpu_codelet *cl = calloc(1,sizeof(struct starpu_codelet));
															
 
																-
															
 
																 	struct starpu_task *task = starpu_task_create();
															
 
																-		task->cl = cl;
															
 
																-		task->cl_arg = NULL;
															
 
																 		task->use_tag = 1;
															
 
																 		task->tag_id = id;
															
@@ -131,6 +127,30 @@ void init_problem(void)
 
																  *	cg initialization phase
															
 
																  */
															
 
																+static struct starpu_codelet cl1 = {
															
 
																+	.cpu_funcs = { cpu_codelet_func_1 },
															
 
																+	.cpu_funcs_name = { "cpu_codelet_func_1" },
															
 
																+	.nbuffers = 4,
															
 
																+	.modes = { STARPU_R, STARPU_R, STARPU_W, STARPU_R },
															
 
																+};
															
 
																+
															
 
																+static struct starpu_codelet cl2 = {
															
 
																+	.cpu_funcs = { cpu_codelet_func_2 },
															
 
																+	.cpu_funcs_name = { "cpu_codelet_func_2" },
															
 
																+	.nbuffers = 2,
															
 
																+	.modes = { STARPU_W, STARPU_R },
															
 
																+};
															
 
																+
															
 
																+static struct starpu_codelet cl3 = {
															
 
																+	.cpu_funcs = { cpu_codelet_func_3 },
															
 
																+	.cpu_funcs_name = { "cpu_codelet_func_3" },
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+	.cuda_funcs = { cublas_codelet_func_3 },
															
 
																+#endif
															
 
																+	.nbuffers = 1,
															
 
																+	.modes = { STARPU_R },
															
 
																+};
															
 
																+
															
 
																 void init_cg(struct cg_problem *problem)
															
 
																 {
															
 
																 	int ret;
															
@@ -139,14 +159,7 @@ void init_cg(struct cg_problem *problem)
 
																 	/* r = b  - A x */
															
 
																 	struct starpu_task *task1 = create_task(1UL);
															
 
																-	task1->cl->cpu_funcs[0] = cpu_codelet_func_1;
															
 
																-	task1->cl->cpu_funcs_name[0] = "cpu_codelet_func_1";
															
 
																-	task1->cl->nbuffers = 4;
															
 
																-	task1->cl->modes[0] = STARPU_R;
															
 
																-	task1->cl->modes[1] = STARPU_R;
															
 
																-	task1->cl->modes[2] = STARPU_W;
															
 
																-	task1->cl->modes[3] = STARPU_R;
															
 
																-
															
 
																+	task1->cl = &cl1;
															
 
																 	task1->handles[0] = problem->ds_matrixA;
															
 
																 	task1->handles[1] = problem->ds_vecx;
															
 
																 	task1->handles[2] = problem->ds_vecr;
															
@@ -154,12 +167,7 @@ void init_cg(struct cg_problem *problem)
 
																 	/* d = r */
															
 
																 	struct starpu_task *task2 = create_task(2UL);
															
 
																-	task2->cl->cpu_funcs[0] = cpu_codelet_func_2;
															
 
																-	task2->cl->cpu_funcs_name[0] = "cpu_codelet_func_2";
															
 
																-	task2->cl->nbuffers = 2;
															
 
																-	task2->cl->modes[0] = STARPU_W;
															
 
																-	task2->cl->modes[1] = STARPU_R;
															
 
																-
															
 
																+	task2->cl = &cl2;
															
 
																 	task2->handles[0] = problem->ds_vecd;
															
 
																 	task2->handles[1] = problem->ds_vecr;
															
@@ -167,15 +175,9 @@ void init_cg(struct cg_problem *problem)
 
																 	/* delta_new = trans(r) r */
															
 
																 	struct starpu_task *task3 = create_task(3UL);
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-	task3->cl->cuda_funcs[0] = cublas_codelet_func_3;
															
 
																-#endif
															
 
																-	task3->cl->cpu_funcs[0] = cpu_codelet_func_3;
															
 
																-	task3->cl->cpu_funcs_name[0] = "cpu_codelet_func_3";
															
 
																+	task3->cl = &cl3;
															
 
																 	task3->cl_arg = problem;
															
 
																 	task3->cl_arg_size = sizeof(*problem);
															
 
																-	task3->cl->nbuffers = 1;
															
 
																-	task3->cl->modes[0] = STARPU_R;
															
 
																 	task3->handles[0] = problem->ds_vecr;
															
 
																 	task3->callback_func = iteration_cg;
															
@@ -186,6 +188,11 @@ void init_cg(struct cg_problem *problem)
 
																 	/* launch the computation now */
															
 
																 	ret = starpu_task_submit(task1);
															
 
																+	if (STARPU_UNLIKELY(ret == -ENODEV))
															
 
																+	{
															
 
																+		FPRINTF(stderr, "No worker may execute this task\n");
															
 
																+		exit(0);
															
 
																+	}
															
 
																 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
															
 
																 	ret = starpu_task_submit(task2);
															
 
																 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
															
@@ -198,6 +205,66 @@ void init_cg(struct cg_problem *problem)
 
																  *		the codelet code launcher is its own callback !
															
 
																  */
															
 
																+static struct starpu_codelet cl4 = {
															
 
																+	.cpu_funcs = { cpu_codelet_func_4 },
															
 
																+	.cpu_funcs_name = { "cpu_codelet_func_4" },
															
 
																+	.nbuffers = 3,
															
 
																+	.modes = { STARPU_R, STARPU_R, STARPU_W },
															
 
																+};
															
 
																+
															
 
																+static struct starpu_codelet cl5 = {
															
 
																+	.cpu_funcs = { cpu_codelet_func_5 },
															
 
																+	.cpu_funcs_name = { "cpu_codelet_func_5" },
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+	.cuda_funcs = { cublas_codelet_func_5 },
															
 
																+#endif
															
 
																+	.nbuffers = 2,
															
 
																+	.modes = { STARPU_R, STARPU_R },
															
 
																+};
															
 
																+
															
 
																+static struct starpu_codelet cl6 = {
															
 
																+	.cpu_funcs = { cpu_codelet_func_6 },
															
 
																+	.cpu_funcs_name = { "cpu_codelet_func_6" },
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+	.cuda_funcs = { cublas_codelet_func_6 },
															
 
																+	.cuda_flags = { STARPU_CUDA_ASYNC },
															
 
																+#endif
															
 
																+	.nbuffers = 2,
															
 
																+	.modes = { STARPU_RW, STARPU_R },
															
 
																+};
															
 
																+
															
 
																+static struct starpu_codelet cl7 = {
															
 
																+	.cpu_funcs = { cpu_codelet_func_7 },
															
 
																+	.cpu_funcs_name = { "cpu_codelet_func_7" },
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+	.cuda_funcs = { cublas_codelet_func_7 },
															
 
																+	.cuda_flags = { STARPU_CUDA_ASYNC },
															
 
																+#endif
															
 
																+	.nbuffers = 2,
															
 
																+	.modes = { STARPU_RW, STARPU_R },
															
 
																+};
															
 
																+
															
 
																+static struct starpu_codelet cl8 = {
															
 
																+	.cpu_funcs = { cpu_codelet_func_8 },
															
 
																+	.cpu_funcs_name = { "cpu_codelet_func_8" },
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+	.cuda_funcs = { cublas_codelet_func_8 },
															
 
																+#endif
															
 
																+	.nbuffers = 1,
															
 
																+	.modes = { STARPU_R },
															
 
																+};
															
 
																+
															
 
																+static struct starpu_codelet cl9 = {
															
 
																+	.cpu_funcs = { cpu_codelet_func_9 },
															
 
																+	.cpu_funcs_name = { "cpu_codelet_func_9" },
															
 
																+#ifdef STARPU_USE_CUDA
															
 
																+	.cuda_funcs = { cublas_codelet_func_9 },
															
 
																+	.cuda_flags = { STARPU_CUDA_ASYNC },
															
 
																+#endif
															
 
																+	.nbuffers = 2,
															
 
																+	.modes = { STARPU_RW, STARPU_R },
															
 
																+};
															
 
																+
															
 
																 void launch_new_cg_iteration(struct cg_problem *problem)
															
 
																 {
															
 
																 	int ret;
															
@@ -208,30 +275,16 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
																 	/* q = A d */
															
 
																 	struct starpu_task *task4 = create_task(maskiter | 4UL);
															
 
																-	task4->cl->cpu_funcs[0] = cpu_codelet_func_4;
															
 
																-	task4->cl->cpu_funcs_name[0] = "cpu_codelet_func_4";
															
 
																-	task4->cl->nbuffers = 3;
															
 
																-	task4->cl->modes[0] = STARPU_R;
															
 
																-	task4->cl->modes[1] = STARPU_R;
															
 
																-	task4->cl->modes[2] = STARPU_W;
															
 
																-
															
 
																+	task4->cl = &cl4;
															
 
																 	task4->handles[0] = problem->ds_matrixA;
															
 
																 	task4->handles[1] = problem->ds_vecd;
															
 
																 	task4->handles[2] = problem->ds_vecq;
															
 
																 	/* alpha = delta_new / ( trans(d) q )*/
															
 
																 	struct starpu_task *task5 = create_task(maskiter | 5UL);
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-	task5->cl->cuda_funcs[0] = cublas_codelet_func_5;
															
 
																-#endif
															
 
																-	task5->cl->cpu_funcs[0] = cpu_codelet_func_5;
															
 
																-	task5->cl->cpu_funcs_name[0] = "cpu_codelet_func_5";
															
 
																+	task5->cl = &cl5;
															
 
																 	task5->cl_arg = problem;
															
 
																 	task5->cl_arg_size = sizeof(*problem);
															
 
																-	task5->cl->nbuffers = 2;
															
 
																-	task5->cl->modes[0] = STARPU_R;
															
 
																-	task5->cl->modes[1] = STARPU_R;
															
 
																-
															
 
																 	task5->handles[0] = problem->ds_vecd;
															
 
																 	task5->handles[1] = problem->ds_vecq;
															
@@ -239,18 +292,9 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
																 	/* x = x + alpha d */
															
 
																 	struct starpu_task *task6 = create_task(maskiter | 6UL);
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-	task6->cl->cuda_funcs[0] = cublas_codelet_func_6;
															
 
																-	task6->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
															
 
																-#endif
															
 
																-	task6->cl->cpu_funcs[0] = cpu_codelet_func_6;
															
 
																-	task6->cl->cpu_funcs_name[0] = "cpu_codelet_func_6";
															
 
																+	task6->cl = &cl6;
															
 
																 	task6->cl_arg = problem;
															
 
																 	task6->cl_arg_size = sizeof(*problem);
															
 
																-	task6->cl->nbuffers = 2;
															
 
																-	task6->cl->modes[0] = STARPU_RW;
															
 
																-	task6->cl->modes[1] = STARPU_R;
															
 
																-
															
 
																 	task6->handles[0] = problem->ds_vecx;
															
 
																 	task6->handles[1] = problem->ds_vecd;
															
@@ -258,18 +302,9 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
																 	/* r = r - alpha q */
															
 
																 	struct starpu_task *task7 = create_task(maskiter | 7UL);
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-	task7->cl->cuda_funcs[0] = cublas_codelet_func_7;
															
 
																-	task7->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
															
 
																-#endif
															
 
																-	task7->cl->cpu_funcs[0] = cpu_codelet_func_7;
															
 
																-	task7->cl->cpu_funcs_name[0] = "cpu_codelet_func_7";
															
 
																+	task7->cl = &cl7;
															
 
																 	task7->cl_arg = problem;
															
 
																 	task7->cl_arg_size = sizeof(*problem);
															
 
																-	task7->cl->nbuffers = 2;
															
 
																-	task7->cl->modes[0] = STARPU_RW;
															
 
																-	task7->cl->modes[1] = STARPU_R;
															
 
																-
															
 
																 	task7->handles[0] = problem->ds_vecr;
															
 
																 	task7->handles[1] = problem->ds_vecq;
															
@@ -277,33 +312,18 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
																 	/* update delta_* and compute beta */
															
 
																 	struct starpu_task *task8 = create_task(maskiter | 8UL);
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-	task8->cl->cuda_funcs[0] = cublas_codelet_func_8;
															
 
																-#endif
															
 
																-	task8->cl->cpu_funcs[0] = cpu_codelet_func_8;
															
 
																-	task8->cl->cpu_funcs_name[0] = "cpu_codelet_func_8";
															
 
																+	task8->cl = &cl8;
															
 
																 	task8->cl_arg = problem;
															
 
																 	task8->cl_arg_size = sizeof(*problem);
															
 
																-	task8->cl->nbuffers = 1;
															
 
																-	task8->cl->modes[0] = STARPU_R;
															
 
																 	task8->handles[0] = problem->ds_vecr;
															
 
																 	starpu_tag_declare_deps((starpu_tag_t)(maskiter | 8UL), 1, (starpu_tag_t)(maskiter | 7UL));
															
 
																 	/* d = r + beta d */
															
 
																 	struct starpu_task *task9 = create_task(maskiter | 9UL);
															
 
																-#ifdef STARPU_USE_CUDA
															
 
																-	task9->cl->cuda_funcs[0] = cublas_codelet_func_9;
															
 
																-	task9->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
															
 
																-#endif
															
 
																-	task9->cl->cpu_funcs[0] = cpu_codelet_func_9;
															
 
																-	task9->cl->cpu_funcs_name[0] = "cpu_codelet_func_9";
															
 
																+	task9->cl = &cl9;
															
 
																 	task9->cl_arg = problem;
															
 
																 	task9->cl_arg_size = sizeof(*problem);
															
 
																-	task9->cl->nbuffers = 2;
															
 
																-	task9->cl->modes[0] = STARPU_RW;
															
 
																-	task9->cl->modes[1] = STARPU_R;
															
 
																-
															
 
																 	task9->handles[0] = problem->ds_vecd;
															
 
																 	task9->handles[1] = problem->ds_vecr;
															
@@ -427,6 +447,10 @@ void conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz,
 
																 	starpu_data_unregister(ds_vecr);
															
 
																 	starpu_data_unregister(ds_vecd);
															
 
																 	starpu_data_unregister(ds_vecq);
															
 
																+
															
 
																+	free(ptr_vecr);
															
 
																+	free(ptr_vecd);
															
 
																+	free(ptr_vecq);
															
 
																 }
															
@@ -444,4 +468,6 @@ void do_conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz
 
																 	starpu_cublas_init();
															
 
																 	conjugate_gradient(nzvalA, vecb, vecx, nnz, nrow, colind, rowptr);
															
 
																+
															
 
																+	starpu_shutdown();
															
 
																 }
															
--- a/examples/heat/heat.c
+++ b/examples/heat/heat.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009, 2010, 2012, 2015  Université de Bordeaux
															
 
																+ * Copyright (C) 2009, 2010, 2012, 2015, 2017  Université de Bordeaux
															
 
																  * Copyright (C) 2010, 2011, 2012, 2016  CNRS
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -121,9 +121,9 @@ static void parse_args(int argc, char **argv)
 
																 			STARPU_ASSERT((nthick - 2)*(ntheta - 2) == size);
															
 
																 		}
															
 
																-		if (strcmp(argv[i], "-h") == 0)
															
 
																+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-help") == 0)
															
 
																 		{
															
 
																-			printf("usage : %s [-v1|-v2|-v3] [-pin] [-nthick number] [-ntheta number] [-shape [0|1|2]] [-cg] [-size number] [-no-prio]\n", argv[0]);
															
 
																+			printf("usage : %s [-v1|-v2|-v3|-v4] [-pin] [-nthick number] [-ntheta number] [-shape [0|1|2]] [-cg] [-size number] [-no-prio]\n", argv[0]);
															
 
																 		}
															
 
																 	}
															
 
																 }
															
@@ -751,6 +751,10 @@ int main(int argc, char **argv)
 
																 			result[TRANSLATE(i)] = Bformer[TRANSLATE(i)];
															
 
																 		}
															
 
																+		free(nzval);
															
 
																+		free(colind);
															
 
																+		free(rowptr);
															
 
																+		free(B);
															
 
																 	}
															
 
																 	else
															
 
																 	{
															
--- a/examples/heat/heat.sh
+++ b/examples/heat/heat.sh
@@ -0,0 +1,43 @@
 
																+#!/bin/bash
															
 
																+#
															
 
																+# StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+#
															
 
																+# Copyright (C) 2017  Université de Bordeaux
															
 
																+#
															
 
																+# StarPU is free software; you can redistribute it and/or modify
															
 
																+# it under the terms of the GNU Lesser General Public License as published by
															
 
																+# the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+# your option) any later version.
															
 
																+#
															
 
																+# StarPU is distributed in the hope that it will be useful, but
															
 
																+# WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+#
															
 
																+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+
															
 
																+# Test various LU options
															
 
																+
															
 
																+set -e
															
 
																+
															
 
																+PREFIX=$(dirname $0)
															
 
																+
															
 
																+$PREFIX/heat -shape 0
															
 
																+$PREFIX/heat -shape 1
															
 
																+# sometimes lead to pivot being 0
															
 
																+#$PREFIX/heat -shape 2
															
 
																+
															
 
																+$PREFIX/heat -cg
															
 
																+
															
 
																+# TODO: FIXME
															
 
																+
															
 
																+# segfault
															
 
																+#$PREFIX/heat -v1
															
 
																+
															
 
																+# (actually the default...)
															
 
																+$PREFIX/heat -v2
															
 
																+
															
 
																+# hang
															
 
																+#$PREFIX/heat -v3
															
 
																+
															
 
																+# hang
															
 
																+#$PREFIX/heat -v4
															
--- a/examples/lu/lu.sh
+++ b/examples/lu/lu.sh
@@ -0,0 +1,34 @@
 
																+#!/bin/bash
															
 
																+#
															
 
																+# StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+#
															
 
																+# Copyright (C) 2017  Université de Bordeaux
															
 
																+#
															
 
																+# StarPU is free software; you can redistribute it and/or modify
															
 
																+# it under the terms of the GNU Lesser General Public License as published by
															
 
																+# the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+# your option) any later version.
															
 
																+#
															
 
																+# StarPU is distributed in the hope that it will be useful, but
															
 
																+# WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+#
															
 
																+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+
															
 
																+# Test various LU options
															
 
																+
															
 
																+set -e
															
 
																+
															
 
																+PREFIX=$(dirname $0)
															
 
																+
															
 
																+$PREFIX/lu_implicit_example_float -size $((960 * 4)) -nblocks 4 -piv
															
 
																+$PREFIX/lu_implicit_example_float -size $((960 * 4)) -nblocks 4 -no-stride
															
 
																+$PREFIX/lu_implicit_example_float -size $((960 * 4)) -nblocks 4 -bound
															
 
																+$PREFIX/lu_implicit_example_float -size $((960 * 2)) -nblocks 2 -bounddeps
															
 
																+$PREFIX/lu_implicit_example_float -size $((960 * 2)) -nblocks 2 -bound -bounddeps -bounddepsprio
															
 
																+
															
 
																+$PREFIX/lu_example_float -size $((960 * 4)) -nblocks 4 -piv
															
 
																+$PREFIX/lu_example_float -size $((960 * 4)) -nblocks 4 -no-stride
															
 
																+$PREFIX/lu_example_float -size $((960 * 4)) -nblocks 4 -bound
															
 
																+$PREFIX/lu_example_float -size $((960 * 2)) -nblocks 2 -bounddeps
															
 
																+$PREFIX/lu_example_float -size $((960 * 2)) -nblocks 2 -bound -bounddeps -bounddepsprio
															
--- a/examples/lu/lu_example.c
+++ b/examples/lu/lu_example.c
@@ -422,13 +422,15 @@ int main(int argc, char **argv)
 
																 		if (pivot)
															
 
																 		{
															
 
																 			pivot_saved_matrix(ipiv);
															
 
																-			free(ipiv);
															
 
																 		}
															
 
																 		check_result();
															
 
																 	}
															
 
																 #endif
															
 
																+	if (pivot)
															
 
																+		free(ipiv);
															
 
																+
															
 
																 	starpu_free_flags(A, (size_t)size*size*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
															
 
																 	starpu_cublas_shutdown();
															
--- a/examples/lu/xlu_implicit_pivot.c
+++ b/examples/lu/xlu_implicit_pivot.c
@@ -232,6 +232,10 @@ starpu_data_handle_t get_block_with_striding(starpu_data_handle_t *dataAp,
 
																 int STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks)
															
 
																 {
															
 
																+	if (starpu_mic_worker_get_count() || starpu_scc_worker_get_count() || starpu_mpi_ms_worker_get_count())
															
 
																+		/* These won't work with pivoting: we pass a pointer in cl_args */
															
 
																+		return -ENODEV;
															
 
																+
															
 
																 	starpu_data_handle_t dataA;
															
 
																 	/* monitor and partition the A matrix into blocks :
															
--- a/examples/lu/xlu_pivot.c
+++ b/examples/lu/xlu_pivot.c
@@ -399,6 +399,7 @@ int STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size,
 
																 	/* gather all the data */
															
 
																 	starpu_data_unpartition(dataA, STARPU_MAIN_RAM);
															
 
																+	starpu_data_unregister(dataA);
															
 
																 	free(piv_description);
															
 
																 	return ret;
															
@@ -413,6 +414,10 @@ starpu_data_handle_t get_block_with_no_striding(starpu_data_handle_t *dataAp, un
 
																 int STARPU_LU(lu_decomposition_pivot_no_stride)(TYPE **matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks)
															
 
																 {
															
 
																+	if (starpu_mic_worker_get_count() || starpu_scc_worker_get_count() || starpu_mpi_ms_worker_get_count())
															
 
																+		/* These won't work with pivoting: we pass a pointer in cl_args */
															
 
																+		return -ENODEV;
															
 
																+
															
 
																 	starpu_data_handle_t *dataAp = malloc(nblocks*nblocks*sizeof(starpu_data_handle_t));
															
 
																 	/* monitor and partition the A matrix into blocks :
															
--- a/examples/mlr/mlr.c
+++ b/examples/mlr/mlr.c
@@ -50,7 +50,15 @@ static long sum;
 
																 static void cl_params(struct starpu_task *task, double *parameters)
															
 
																 {
															
 
																 	int m, n, k;
															
 
																-	starpu_codelet_unpack_args(task->cl_arg, &m, &n, &k);
															
 
																+	int* vector_mn;
															
 
																+	starpu_data_handle_t vector_mn_handle;
															
 
																+
															
 
																+	vector_mn = (int*)STARPU_VECTOR_GET_PTR(task->interfaces[0]);
															
 
																+	m = vector_mn[0];
															
 
																+	n = vector_mn[1];
															
 
																+
															
 
																+	starpu_codelet_unpack_args(task->cl_arg, &k);
															
 
																+
															
 
																 	parameters[0] = m;
															
 
																 	parameters[1] = n;
															
 
																 	parameters[2] = k;
															
@@ -61,10 +69,13 @@ void cpu_func(void *buffers[], void *cl_arg)
 
																 {
															
 
																 	long i;
															
 
																 	int m,n,k;
															
 
																-	starpu_codelet_unpack_args(cl_arg,
															
 
																-			     	  &m,
															
 
																-     			     	  &n,
															
 
																-     			     	  &k);
															
 
																+	int* vector_mn;
															
 
																+
															
 
																+	vector_mn = (int*)STARPU_VECTOR_GET_PTR(buffers[0]);
															
 
																+	m = vector_mn[0];
															
 
																+	n = vector_mn[1];
															
 
																+
															
 
																+	starpu_codelet_unpack_args(cl_arg, &k);
															
 
																 	for(i=0; i < (long) (m*m*n); i++)
															
 
																 		sum+=i;
															
@@ -123,7 +134,8 @@ static struct starpu_codelet cl_init =
 
																 {
															
 
																 	.cpu_funcs = { cpu_func },
															
 
																 	.cpu_funcs_name = { "cpu_func" },
															
 
																-	.nbuffers = 0,
															
 
																+	.nbuffers = 1,
															
 
																+	.modes = {STARPU_R},
															
 
																 	.model = &cl_model_init,
															
 
																 };
															
@@ -131,7 +143,8 @@ static struct starpu_codelet cl_final =
 
																 {
															
 
																 	.cpu_funcs = { cpu_func },
															
 
																 	.cpu_funcs_name = { "cpu_func" },
															
 
																-	.nbuffers = 0,
															
 
																+	.nbuffers = 1,
															
 
																+	.modes = {STARPU_R},
															
 
																 	.model = &cl_model_final,
															
 
																 };
															
@@ -147,29 +160,42 @@ int main(int argc, char **argv)
 
																 	sum=0;
															
 
																 	int m,n,k;
															
 
																+	int* vector_mn = malloc( 2 * sizeof(int) );
															
 
																+	starpu_data_handle_t vector_mn_handle;
															
 
																+
															
 
																+	starpu_vector_data_register( &vector_mn_handle,
															
 
																+				     STARPU_MAIN_RAM,
															
 
																+				     (uintptr_t)vector_mn, 2,
															
 
																+				     sizeof(int) );
															
 
																-        /* Giving pseudo-random values to the M,N,K parameters and inserting tasks */
															
 
																-	for(i=0; i < 42; i++)
															
 
																+	/* Giving pseudo-random values to the M,N,K parameters and inserting tasks */
															
 
																+	for ( i = 0; i < 42; i++)
															
 
																 	{
															
 
																 		m = (int) ((rand() % 10)+1);
															
 
																 		n = (int) ((rand() % 10)+1);
															
 
																 		k = (int) ((rand() % 10)+1);
															
 
																-		for(j=0; j < 42; j++)
															
 
																+		/* To illustrate the usage, M and N are stored in a data handle */
															
 
																+		starpu_data_acquire(vector_mn_handle, STARPU_W);
															
 
																+		vector_mn[0] = m;
															
 
																+		vector_mn[1] = n;
															
 
																+		starpu_data_release(vector_mn_handle);
															
 
																+
															
 
																+		for ( j = 0; j < 42; j++)
															
 
																 		{
															
 
																-			starpu_insert_task(&cl_init,
															
 
																-				   STARPU_VALUE, &m, sizeof(int),
															
 
																-				   STARPU_VALUE, &n, sizeof(int),
															
 
																-				   STARPU_VALUE, &k, sizeof(int),
															
 
																-				   0);
															
 
																-			starpu_insert_task(&cl_final,
															
 
																-				   STARPU_VALUE, &m, sizeof(int),
															
 
																-				   STARPU_VALUE, &n, sizeof(int),
															
 
																-				   STARPU_VALUE, &k, sizeof(int),
															
 
																-				   0);
															
 
																+			starpu_insert_task( &cl_init,
															
 
																+					    STARPU_R, vector_mn_handle,
															
 
																+					    STARPU_VALUE, &k, sizeof(int),
															
 
																+					    0 );
															
 
																+			starpu_insert_task( &cl_final,
															
 
																+					    STARPU_R, vector_mn_handle,
															
 
																+					    STARPU_VALUE, &k, sizeof(int),
															
 
																+					    0 );
															
 
																 		}
															
 
																 	}
															
 
																+	starpu_data_unregister(vector_mn_handle);
															
 
																+	free(vector_mn);
															
 
																 	starpu_shutdown();
															
 
																 	return 0;
															
--- a/examples/sched_ctx/gpu_partition.c
+++ b/examples/sched_ctx/gpu_partition.c
@@ -105,7 +105,9 @@ int main(int argc, char **argv)
 
																 	int ncuda = 0;
															
 
																 	int gpu_devid = -1;
															
 
																+#ifdef STARPU_DEVEL
															
 
																 #warning temporary fix: skip test as cuda computation fails
															
 
																+#endif
															
 
																  	return 77;
															
 
																 #ifndef STARPU_HAVE_SETENV
															
@@ -172,8 +174,8 @@ int main(int argc, char **argv)
 
																 	int ncpus = starpu_cpu_worker_get_count();
															
 
																 	int workers[ncpus+nstreams];
															
 
																 	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, workers, ncpus);
															
 
																-	
															
 
																-	int sched_ctxs[nstreams];
															
 
																+
															
 
																+	unsigned sched_ctxs[nstreams];
															
 
																 	int nsms[nstreams];
															
 
																 	nsms[0] = 6;
															
 
																 	nsms[1] = 7;
															
@@ -185,7 +187,7 @@ int main(int argc, char **argv)
 
																 	}
															
 
																 	unsigned sched_ctx1 = starpu_sched_ctx_create(workers, ncpus+nstreams, "ctx1", STARPU_SCHED_CTX_SUB_CTXS, sched_ctxs, nstreams, STARPU_SCHED_CTX_POLICY_NAME, "dmdas", 0);
															
 
																-	FPRINTF(stderr, "parent ctx %d\n", sched_ctx1);
															
 
																+	FPRINTF(stderr, "parent ctx %u\n", sched_ctx1);
															
 
																 	starpu_sched_ctx_set_context(&sched_ctx1);
															
 
																 #endif
															
--- a/examples/stencil/stencil-blocks.c
+++ b/examples/stencil/stencil-blocks.c
@@ -297,11 +297,10 @@ void allocate_memory_on_node(int rank)
 
																 		int node = block->mpi_node;
															
 
																-		unsigned size_bz = block_sizes_z[bz];
															
 
																-
															
 
																 		/* Main blocks */
															
 
																 		if (node == rank)
															
 
																 		{
															
 
																+			unsigned size_bz = block_sizes_z[bz];
															
 
																 			allocate_block_on_node(&block->layers_handle[0], bz, &block->layers[0],
															
 
																 						(sizex + 2*K), (sizey + 2*K), (size_bz + 2*K));
															
 
																 #ifndef STARPU_SIMGRID
															
@@ -389,8 +388,8 @@ void check(int rank)
 
																 		/* Main blocks */
															
 
																 		if (node == rank)
															
 
																 		{
															
 
																-			unsigned size_bz = block_sizes_z[bz];
															
 
																 #ifdef LIFE
															
 
																+			unsigned size_bz = block_sizes_z[bz];
															
 
																 			unsigned x, y, z;
															
 
																 			unsigned sum = 0;
															
 
																 			for (x = 0; x < sizex; x++)
															
--- a/include/fstarpu_mod.f90
+++ b/include/fstarpu_mod.f90
@@ -82,6 +82,7 @@ module fstarpu_mod
 
																         type(c_ptr), bind(C) :: FSTARPU_SCC
															
 
																         type(c_ptr), bind(C) :: FSTARPU_CODELET_SIMGRID_EXECUTE
															
 
																+        type(c_ptr), bind(C) :: FSTARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT
															
 
																         type(c_ptr), bind(C) :: FSTARPU_CUDA_ASYNC
															
 
																         type(c_ptr), bind(C) :: FSTARPU_OPENCL_ASYNC
															
@@ -1580,7 +1581,7 @@ module fstarpu_mod
 
																                 end subroutine fstarpu_memchunk_tidy
															
 
																                 ! == starpu_task_util.h ==
															
 
																-                ! struct starpu_data_handle *fstarpu_data_handle_array_alloc(int nb);
															
 
																+                ! starpu_data_handle_t *fstarpu_data_handle_array_alloc(int nb);
															
 
																                 function fstarpu_data_handle_array_alloc (nb) bind(C)
															
 
																                         use iso_c_binding, only: c_ptr, c_int
															
 
																                         type(c_ptr) :: fstarpu_data_handle_array_alloc
															
@@ -2331,7 +2332,9 @@ module fstarpu_mod
 
																                             fstarpu_get_constant(C_CHAR_"FSTARPU_SCC"//C_NULL_CHAR)
															
 
																                         FSTARPU_CODELET_SIMGRID_EXECUTE = &
															
 
																-                            fstarpu_get_constant(C_CHAR_"FSTARPU_CODELET_SIMGRID_EXECUTE"//C_NULL_CHAR)
															
 
																+                             fstarpu_get_constant(C_CHAR_"FSTARPU_CODELET_SIMGRID_EXECUTE"//C_NULL_CHAR)
															
 
																+                        FSTARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT = &
															
 
																+                             fstarpu_get_constant(C_CHAR_"FSTARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT"//C_NULL_CHAR)
															
 
																                         FSTARPU_CUDA_ASYNC = &
															
 
																                             fstarpu_get_constant(C_CHAR_"FSTARPU_CUDA_ASYNC"//C_NULL_CHAR)
															
 
																                         FSTARPU_OPENCL_ASYNC = &
															
--- a/include/starpu_config.h.in
+++ b/include/starpu_config.h.in
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2009-2016  Université de Bordeaux
															
 
																- * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016, 2017  CNRS
															
 
																  * Copyright (C) 2014  INRIA
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -152,5 +152,6 @@ typedef ssize_t starpu_ssize_t;
 
																 #undef STARPU_HAVE_DARWIN
															
 
																 #undef STARPU_HAVE_CXX11
															
 
																+#undef STARPU_HAVE_STRERROR_R
															
 
																 #endif
															
--- a/include/starpu_scheduler.h
+++ b/include/starpu_scheduler.h
@@ -62,8 +62,11 @@ unsigned long starpu_task_get_job_id(struct starpu_task *task);
 
																 /* This function must be called to wake up a worker that is sleeping on the cond. 
															
 
																  * It returns 0 whenever the worker is not in a sleeping state */
															
 
																 int starpu_wake_worker(int workerid);
															
 
																+int starpu_wakeup_worker(int workerid, starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex);
															
 
																 /* This is a version of starpu_wake_worker which assumes that the sched mutex is locked */
															
 
																 int starpu_wake_worker_locked(int workerid);
															
 
																+/* This is a version of starpu_wakeup_worker which assumes that the sched mutex is locked */
															
 
																+int starpu_wakeup_worker_locked(int workerid, starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex);
															
 
																 int starpu_worker_can_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
															
 
																 int starpu_worker_can_execute_task_impl(unsigned workerid, struct starpu_task *task, unsigned *impl_mask);
															
--- a/include/starpu_task.h
+++ b/include/starpu_task.h
@@ -46,6 +46,7 @@ extern "C"
 
																 #define STARPU_MPI_MS	((1ULL)<<9)
															
 
																 #define STARPU_CODELET_SIMGRID_EXECUTE	(1<<0)
															
 
																+#define STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT	(1<<1)
															
 
																 #define STARPU_CUDA_ASYNC	(1<<0)
															
 
																 #define STARPU_OPENCL_ASYNC	(1<<0)
															
--- a/include/starpu_thread.h
+++ b/include/starpu_thread.h
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010, 2012-2016  Université de Bordeaux
															
 
																- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
															
 
																+ * Copyright (C) 2010, 2012-2017  Université de Bordeaux
															
 
																+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -33,6 +33,7 @@
 
																 #endif
															
 
																 #elif !defined(_MSC_VER) || defined(BUILDING_STARPU)
															
 
																 #include <pthread.h>
															
 
																+#include <semaphore.h>
															
 
																 #endif
															
 
																 #include <stdint.h>
															
@@ -50,8 +51,9 @@ extern "C"
 
																 typedef msg_process_t starpu_pthread_t;
															
 
																 typedef int starpu_pthread_attr_t;
															
 
																+int starpu_pthread_equal(starpu_pthread_t t1, starpu_pthread_t t2);
															
 
																+starpu_pthread_t starpu_pthread_self(void);
															
 
																 int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg, msg_host_t host);
															
 
																-#define starpu_pthread_setname(name)
															
 
																 int starpu_pthread_create(starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg);
															
 
																 int starpu_pthread_join(starpu_pthread_t thread, void **retval);
															
 
																 int starpu_pthread_exit(void *retval) STARPU_ATTRIBUTE_NORETURN;
															
@@ -64,8 +66,18 @@ int starpu_pthread_attr_setdetachstate(starpu_pthread_attr_t *attr, int detachst
 
																 typedef pthread_t starpu_pthread_t;
															
 
																 typedef pthread_attr_t starpu_pthread_attr_t;
															
 
																+#define starpu_pthread_equal pthread_equal
															
 
																+#define starpu_pthread_self pthread_self
															
 
																 #define starpu_pthread_create pthread_create
															
 
																 #define starpu_pthread_create_on(name, thread, attr, routine, arg, where) starpu_pthread_create(thread, attr, routine, arg)
															
 
																+#define starpu_pthread_join pthread_join
															
 
																+#define starpu_pthread_exit pthread_exit
															
 
																+#define starpu_pthread_attr_init pthread_attr_init
															
 
																+#define starpu_pthread_attr_destroy pthread_attr_destroy
															
 
																+#define starpu_pthread_attr_setdetachstate pthread_attr_setdetachstate
															
 
																+
															
 
																+#endif /* STARPU_SIMGRID, _MSC_VER */
															
 
																+
															
 
																 #ifdef STARPU_HAVE_PTHREAD_SETNAME_NP
															
 
																 #ifdef STARPU_HAVE_DARWIN
															
 
																 #define starpu_pthread_setname(name) pthread_setname_np(name)
															
@@ -75,13 +87,6 @@ typedef pthread_attr_t starpu_pthread_attr_t;
 
																 #else
															
 
																 #define starpu_pthread_setname(name)
															
 
																 #endif
															
 
																-#define starpu_pthread_join pthread_join
															
 
																-#define starpu_pthread_exit pthread_exit
															
 
																-#define starpu_pthread_attr_init pthread_attr_init
															
 
																-#define starpu_pthread_attr_destroy pthread_attr_destroy
															
 
																-#define starpu_pthread_attr_setdetachstate pthread_attr_setdetachstate
															
 
																-
															
 
																-#endif /* STARPU_SIMGRID, _MSC_VER */
															
 
																 /*
															
 
																  * Encapsulation of the pthread_mutex_* functions.
															
@@ -403,6 +408,32 @@ int starpu_pthread_wait_wait(starpu_pthread_wait_t *w);
 
																 int starpu_pthread_wait_destroy(starpu_pthread_wait_t *w);
															
 
																 #endif
															
 
																+/*
															
 
																+ * Encapsulation of the semaphore functions.
															
 
																+ */
															
 
																+
															
 
																+#ifdef STARPU_SIMGRID
															
 
																+
															
 
																+typedef msg_sem_t starpu_sem_t;
															
 
																+int starpu_sem_destroy(starpu_sem_t *);
															
 
																+int starpu_sem_getvalue(starpu_sem_t *, int *);
															
 
																+int starpu_sem_init(starpu_sem_t *, int, unsigned);
															
 
																+int starpu_sem_post(starpu_sem_t *);
															
 
																+int starpu_sem_trywait(starpu_sem_t *);
															
 
																+int starpu_sem_wait(starpu_sem_t *);
															
 
																+
															
 
																+#elif !defined(_MSC_VER) || defined(BUILDING_STARPU) /* !STARPU_SIMGRID */
															
 
																+
															
 
																+typedef sem_t starpu_sem_t;
															
 
																+#define starpu_sem_destroy sem_destroy
															
 
																+#define starpu_sem_getvalue sem_getvalue
															
 
																+#define starpu_sem_init sem_init
															
 
																+#define starpu_sem_post sem_post
															
 
																+int starpu_sem_trywait(starpu_sem_t *);
															
 
																+int starpu_sem_wait(starpu_sem_t *);
															
 
																+
															
 
																+#endif
															
 
																+
															
 
																 #ifdef __cplusplus
															
 
																 }
															
 
																 #endif
															
--- a/mpi/dev/starpu_mpi_comm_check.sh
+++ b/mpi/dev/starpu_mpi_comm_check.sh
@@ -0,0 +1,109 @@
 
																+#!/bin/bash
															
 
																+
															
 
																+# StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+#
															
 
																+# Copyright (C) 2017 CNRS
															
 
																+#
															
 
																+# StarPU is free software; you can redistribute it and/or modify
															
 
																+# it under the terms of the GNU Lesser General Public License as published by
															
 
																+# the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+# your option) any later version.
															
 
																+#
															
 
																+# StarPU is distributed in the hope that it will be useful, but
															
 
																+# WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+#
															
 
																+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+
															
 
																+# Script to check MPI communications are done properly
															
 
																+# The application should be launched with STARPU_MPI_COMM=1
															
 
																+# e.g
															
 
																+#    $ export STARPU_MPI_COMM=1
															
 
																+#    $ mpirun --output-filename starpu_mpi.log appli parameters
															
 
																+# and then the script can be launched with the output files
															
 
																+#    $ starpu_mpi_comm_check.sh starpu_mpi.log.*
															
 
																+
															
 
																+if test -z "$1"
															
 
																+then
															
 
																+    echo Syntax error: parameter missing
															
 
																+    exit 1
															
 
																+fi
															
 
																+
															
 
																+# Get the nodes identifiers
															
 
																+nodes=$(for f in $*
															
 
																+	do
															
 
																+	    grep starpu_mpi $f | grep '\[' | awk '{print $1}'| sed 's/\[\(.*\)\]\[starpu_mpi\]/\1/' | grep "^[[:digit:]]*$"
															
 
																+	done |sort|uniq
															
 
																+     )
															
 
																+echo nodes $nodes
															
 
																+
															
 
																+DIR=/tmp
															
 
																+
															
 
																+# for each node, extract send and receive communications
															
 
																+for node in $nodes
															
 
																+do
															
 
																+    for f in $*
															
 
																+    do
															
 
																+	grep starpu_mpi $f |grep "\[$node"
															
 
																+    done > $DIR/starpu_mpi_node$node.log
															
 
																+    grep -- "-->" $DIR/starpu_mpi_node$node.log > $DIR/starpu_mpi_node${node}_send.log
															
 
																+    grep -- "<--" $DIR/starpu_mpi_node$node.log > $DIR/starpu_mpi_node${node}_recv.log
															
 
																+done
															
 
																+
															
 
																+# count the number of traced lines
															
 
																+#for node in $nodes
															
 
																+#do
															
 
																+#    wc -l $DIR/starpu_mpi_node${node}_recv.log
															
 
																+#    lines=$(grep :42:42 $DIR/starpu_mpi_node${node}_recv.log | wc -l)
															
 
																+#    lines2=$(( lines + lines ))
															
 
																+#    echo $lines2
															
 
																+#    lines3=$(( lines2 + lines ))
															
 
																+#    echo $lines3
															
 
																+#done
															
 
																+
															
 
																+# for each pair of nodes, check tags are sent and received in the same order
															
 
																+for src in $nodes
															
 
																+do
															
 
																+    for dst in $nodes
															
 
																+    do
															
 
																+	if test $src != $dst
															
 
																+	then
															
 
																+	    grep ":$dst:42:" $DIR/starpu_mpi_node${src}_send.log| awk -F':' '{print $6}' > $DIR/node${src}_send_to_${dst}.log
															
 
																+	    grep ":$src:42:" $DIR/starpu_mpi_node${dst}_recv.log|awk -F ':' '{print $6}'> $DIR/node${dst}_recv_from_${src}.log
															
 
																+ 	    diff --side-by-side  --suppress-common-lines $DIR/node${src}_send_to_${dst}.log $DIR/node${dst}_recv_from_${src}.log  > $DIR/check_$$
															
 
																+	    if test -s $DIR/check_$$
															
 
																+	    then
															
 
																+		echo $src $dst
															
 
																+		less $DIR/check_$$
															
 
																+	    fi
															
 
																+	fi
															
 
																+    done
															
 
																+done
															
 
																+
															
 
																+# check each envelope reception is followed by the appropriate data reception
															
 
																+# first line: MPI_Recv of the envelope
															
 
																+# second line: display envelope information
															
 
																+# third line: MPI_Recv of the data
															
 
																+for node in $nodes
															
 
																+do
															
 
																+    echo processing $DIR/starpu_mpi_node${node}_recv.log
															
 
																+    (
															
 
																+	while read line
															
 
																+	do
															
 
																+	    read line2
															
 
																+	    read line3
															
 
																+	    #echo processing
															
 
																+	    tag2=$(echo $line2 | awk -F ':' '{print $6}')
															
 
																+	    tag3=$(echo $line3 | awk -F ':' '{print $6}')
															
 
																+	    if test "$tag2" != "$tag3"
															
 
																+	    then
															
 
																+		echo erreur
															
 
																+		echo $tag2 $tag3
															
 
																+		echo $line
															
 
																+		echo $line2
															
 
																+		echo $line3
															
 
																+	    fi
															
 
																+	done
															
 
																+    ) < $DIR/starpu_mpi_node${node}_recv.log
															
 
																+done
															
 
																+
															
--- a/sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c
+++ b/sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c
@@ -27,7 +27,7 @@ static void _try_resizing(unsigned *sched_ctxs, int nsched_ctxs, int *workers, i
 
																 	/* for vite */
															
 
																 	int ns = sched_ctxs == NULL ? sc_hypervisor_get_nsched_ctxs() : nsched_ctxs;
															
 
																 #ifdef STARPU_SC_HYPERVISOR_DEBUG
															
 
																-	printf("resize_no = %u %d ctxs\n", resize_no, ns);
															
 
																+	printf("resize_no = %lu %d ctxs\n", resize_no, ns);
															
 
																 #endif
															
 
																 	if(ns <= 0) return;
															
--- a/socl/src/cl_createkernel.c
+++ b/socl/src/cl_createkernel.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010,2011 University of Bordeaux
															
 
																- * Copyright (C) 2016  CNRS
															
 
																+ * Copyright (C) 2016, 2017  CNRS
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -25,7 +25,7 @@ static void soclCreateKernel_task(void *data) {
 
																    if (k->program->cl_programs[range] == NULL) {
															
 
																       k->errcodes[range] = CL_SUCCESS;
															
 
																-      DEBUG_MSG("[Device %d] Kernel creation skipped: program has not been built for this device.\n", starpu_worker_get_id_check());
															
 
																+      DEBUG_MSG("[Device %u] Kernel creation skipped: program has not been built for this device.\n", starpu_worker_get_id_check());
															
 
																       return;
															
 
																    }
															
@@ -163,7 +163,7 @@ soclCreateKernel(cl_program    program,
 
																    }
															
 
																    /* Create kernel on each device */
															
 
																-   DEBUG_MSG("[Kernel %d] Create %d kernels (name \"%s\")\n", k->id, socl_device_count, kernel_name);
															
 
																+   DEBUG_MSG("[Kernel %d] Create %u kernels (name \"%s\")\n", k->id, socl_device_count, kernel_name);
															
 
																    starpu_execute_on_each_worker_ex(soclCreateKernel_task, k, STARPU_OPENCL, "SOCL_CREATE_KERNEL");
															
 
																    if (errcode_ret != NULL) {
															
--- a/socl/src/cl_createprogramwithsource.c
+++ b/socl/src/cl_createprogramwithsource.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010,2011 University of Bordeaux
															
 
																- * Copyright (C) 2016  CNRS
															
 
																+ * Copyright (C) 2016, 2017  CNRS
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -141,7 +141,7 @@ soclCreateProgramWithSource(cl_context      context,
 
																       *errcode_ret = CL_SUCCESS;
															
 
																       for (i=0; i<socl_device_count; i++) {
															
 
																          if (data->errcodes[i] != CL_SUCCESS) {
															
 
																-            DEBUG_MSG("Worker [%d] failed\n", i);
															
 
																+            DEBUG_MSG("Worker [%u] failed\n", i);
															
 
																             DEBUG_CL("clCreateProgramWithSource", data->errcodes[i]);
															
 
																             *errcode_ret = data->errcodes[i];
															
 
																             break;
															
--- a/socl/src/cl_enqueuendrangekernel.c
+++ b/socl/src/cl_enqueuendrangekernel.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010,2011, 2016-2017 University of Bordeaux
															
 
																- * Copyright (C) 2016  CNRS
															
 
																+ * Copyright (C) 2016, 2017  CNRS
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -71,13 +71,13 @@ void soclEnqueueNDRangeKernel_task(void *descr[], void *args) {
 
																    if (err != CL_SUCCESS) {
															
 
																 	   ERROR_MSG("Worker[%d] Unable to Enqueue kernel (error %d)\n", wid, err);
															
 
																 	   DEBUG_CL("clEnqueueNDRangeKernel", err);
															
 
																-	   DEBUG_MSG("Workdim %d, global_work_offset %p, global_work_size %p, local_work_size %p\n",
															
 
																+	   DEBUG_MSG("Workdim %u, global_work_offset %p, global_work_size %p, local_work_size %p\n",
															
 
																 			   cmd->work_dim, cmd->global_work_offset, cmd->global_work_size, cmd->local_work_size);
															
 
																-	   DEBUG_MSG("Global work size: %ld %ld %ld\n", cmd->global_work_size[0],
															
 
																-			   (cmd->work_dim > 1 ? cmd->global_work_size[1] : 1), (cmd->work_dim > 2 ? cmd->global_work_size[2] : 1)); 
															
 
																+	   DEBUG_MSG("Global work size: %ld %ld %ld\n", (long)cmd->global_work_size[0],
															
 
																+		     (long)(cmd->work_dim > 1 ? cmd->global_work_size[1] : 1), (long)(cmd->work_dim > 2 ? cmd->global_work_size[2] : 1)); 
															
 
																 	   if (cmd->local_work_size != NULL)
															
 
																-		   DEBUG_MSG("Local work size: %ld %ld %ld\n", cmd->local_work_size[0],
															
 
																-				   (cmd->work_dim > 1 ? cmd->local_work_size[1] : 1), (cmd->work_dim > 2 ? cmd->local_work_size[2] : 1)); 
															
 
																+		   DEBUG_MSG("Local work size: %ld %ld %ld\n", (long)cmd->local_work_size[0],
															
 
																+			     (long)(cmd->work_dim > 1 ? cmd->local_work_size[1] : 1), (long)(cmd->work_dim > 2 ? cmd->local_work_size[2] : 1)); 
															
 
																    }
															
 
																    else {
															
 
																       /* Waiting for kernel to terminate */
															
--- a/socl/src/cl_enqueuereadbuffer.c
+++ b/socl/src/cl_enqueuereadbuffer.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010,2011, 2014 University of Bordeaux
															
 
																- * Copyright (C) 2016  CNRS
															
 
																+ * Copyright (C) 2016, 2017  CNRS
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -25,7 +25,7 @@ static void soclEnqueueReadBuffer_cpu_task(void *descr[], void *args) {
 
																   gc_entity_release(ev);
															
 
																    char * ptr = (void*)STARPU_VARIABLE_GET_PTR(descr[0]);
															
 
																-   DEBUG_MSG("[Buffer %d] Reading %ld bytes from %p to %p\n", cmd->buffer->id, cmd->cb, ptr+cmd->offset, cmd->ptr);
															
 
																+   DEBUG_MSG("[Buffer %d] Reading %ld bytes from %p to %p\n", cmd->buffer->id, (long)cmd->cb, ptr+cmd->offset, cmd->ptr);
															
 
																    //This fix is for people who use USE_HOST_PTR and still use ReadBuffer to sync the buffer in host mem at host_ptr.
															
 
																    //They should use buffer mapping facilities instead.
															
@@ -44,7 +44,7 @@ static void soclEnqueueReadBuffer_opencl_task(void *descr[], void *args) {
 
																    cl_mem mem = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
															
 
																-   DEBUG_MSG("[Buffer %d] Reading %ld bytes from offset %ld into %p\n", cmd->buffer->id, cmd->cb, cmd->offset, cmd->ptr);
															
 
																+   DEBUG_MSG("[Buffer %d] Reading %ld bytes from offset %ld into %p\n", cmd->buffer->id, (long)cmd->cb, (long)cmd->offset, cmd->ptr);
															
 
																    int wid = starpu_worker_get_id_check();
															
 
																    cl_command_queue cq;
															
--- a/socl/src/cl_enqueuewritebuffer.c
+++ b/socl/src/cl_enqueuewritebuffer.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010,2011, 2014 University of Bordeaux
															
 
																- * Copyright (C) 2016  CNRS
															
 
																+ * Copyright (C) 2016, 2017  CNRS
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -26,7 +26,7 @@ static void soclEnqueueWriteBuffer_cpu_task(void *descr[], void *args) {
 
																   gc_entity_release(ev);
															
 
																    char * ptr = (void*)STARPU_VARIABLE_GET_PTR(descr[0]);
															
 
																-   DEBUG_MSG("[Buffer %d] Writing %ld bytes from %p to %p\n", cmd->buffer->id, cmd->cb, cmd->ptr, ptr+cmd->offset);
															
 
																+   DEBUG_MSG("[Buffer %d] Writing %ld bytes from %p to %p\n", cmd->buffer->id, (long)cmd->cb, cmd->ptr, ptr+cmd->offset);
															
 
																    //FIXME: Fix for people who use USE_HOST_PTR, modify data at host_ptr and use WriteBuffer to commit the change.
															
 
																    // StarPU may have erased host mem at host_ptr (for instance by retrieving current buffer data at host_ptr)
															
@@ -47,7 +47,7 @@ static void soclEnqueueWriteBuffer_opencl_task(void *descr[], void *args) {
 
																    cl_mem mem = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
															
 
																-   DEBUG_MSG("[Buffer %d] Writing %ld bytes to offset %ld from %p\n", cmd->buffer->id, cmd->cb, cmd->offset, cmd->ptr);
															
 
																+   DEBUG_MSG("[Buffer %d] Writing %ld bytes to offset %ld from %p\n", cmd->buffer->id, (long)cmd->cb, (long)cmd->offset, cmd->ptr);
															
 
																    int wid = starpu_worker_get_id_check();
															
 
																    cl_command_queue cq;
															
--- a/socl/src/cl_setkernelarg.c
+++ b/socl/src/cl_setkernelarg.c
@@ -68,7 +68,7 @@ soclSetKernelArg(cl_kernel  kernel,
 
																    kernel->arg_type[arg_index] = Null;
															
 
																    kernel->arg_size[arg_index] = arg_size;
															
 
																-   DEBUG_MSG("[Kernel %d] Set argument %d: argsize %ld argvalue %p\n", kernel->id, arg_index, arg_size, arg_value);
															
 
																+   DEBUG_MSG("[Kernel %d] Set argument %d: argsize %ld argvalue %p\n", kernel->id, arg_index, (long)arg_size, arg_value);
															
 
																    /* Argument is not Null */
															
 
																    if (arg_value != NULL) {
															
--- a/socl/src/task.c
+++ b/socl/src/task.c
@@ -77,7 +77,7 @@ void task_depends_on(starpu_task task, cl_uint num_events, cl_event *events) {
 
																     DEBUG_MSG("Task %p depends on events:", task);
															
 
																     for (i=0; i<num_events; i++) {
															
 
																        tags[i] = events[i]->id;
															
 
																-       DEBUG_MSG_NOHEAD(" %u", events[i]->id);
															
 
																+       DEBUG_MSG_NOHEAD(" %d", events[i]->id);
															
 
																     }
															
 
																     DEBUG_MSG_NOHEAD("\n");
															
--- a/src/common/fxt.c
+++ b/src/common/fxt.c
@@ -72,7 +72,7 @@ long _starpu_gettid(void)
 
																 #elif defined(_WIN32) && !defined(__CYGWIN__)
															
 
																 	return (long) GetCurrentThreadId();
															
 
																 #else
															
 
																-	return (long) pthread_self();
															
 
																+	return (long) starpu_pthread_self();
															
 
																 #endif
															
 
																 #endif
															
 
																 }
															
--- a/src/common/prio_list.h
+++ b/src/common/prio_list.h
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2015-2016  Université de Bordeaux
															
 
																+ * Copyright (C) 2015-2017  Université de Bordeaux
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -252,9 +252,31 @@
 
																 	static inline void ENAME##_prio_list_deinit(struct ENAME##_prio_list *priolist) \
															
 
																 	{ (void) (priolist); /* ENAME##_list_deinit(&(priolist)->list); */ } \
															
 
																 	static inline void ENAME##_prio_list_push_back(struct ENAME##_prio_list *priolist, struct ENAME *e) \
															
 
																-	{ ENAME##_list_push_back(&(priolist)->list, (e)); } \
															
 
																+	{ \
															
 
																+		struct ENAME *cur; \
															
 
																+		for (cur  = ENAME##_list_begin(&(priolist)->list); \
															
 
																+		     cur != ENAME##_list_end(&(priolist)->list); \
															
 
																+		     cur  = ENAME##_list_next(cur)) \
															
 
																+			if ((e)->PRIOFIELD > cur->PRIOFIELD) \
															
 
																+				break; \
															
 
																+		if (cur == ENAME##_list_end(&(priolist)->list)) \
															
 
																+			ENAME##_list_push_back(&(priolist)->list, (e)); \
															
 
																+		else \
															
 
																+			ENAME##_list_insert_before(&(priolist)->list, (e), cur); \
															
 
																+	} \
															
 
																 	static inline void ENAME##_prio_list_push_front(struct ENAME##_prio_list *priolist, struct ENAME *e) \
															
 
																-	{ ENAME##_list_push_front(&(priolist)->list, (e)); } \
															
 
																+	{ \
															
 
																+		struct ENAME *cur; \
															
 
																+		for (cur  = ENAME##_list_begin(&(priolist)->list); \
															
 
																+		     cur != ENAME##_list_end(&(priolist)->list); \
															
 
																+		     cur  = ENAME##_list_next(cur)) \
															
 
																+			if ((e)->PRIOFIELD >= cur->PRIOFIELD) \
															
 
																+				break; \
															
 
																+		if (cur == ENAME##_list_end(&(priolist)->list)) \
															
 
																+			ENAME##_list_push_back(&(priolist)->list, (e)); \
															
 
																+		else \
															
 
																+			ENAME##_list_insert_before(&(priolist)->list, (e), cur); \
															
 
																+	} \
															
 
																 	static inline int ENAME##_prio_list_empty(const struct ENAME##_prio_list *priolist) \
															
 
																 	{ return ENAME##_list_empty(&(priolist)->list); } \
															
 
																 	static inline void ENAME##_prio_list_erase(struct ENAME##_prio_list *priolist, struct ENAME *e) \
															
--- a/src/common/starpu_spinlock.c
+++ b/src/common/starpu_spinlock.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010, 2012-2014, 2016  Université de Bordeaux
															
 
																- * Copyright (C) 2010, 2011, 2013, 2014  CNRS
															
 
																+ * Copyright (C) 2010, 2011, 2013, 2014, 2017  CNRS
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -25,10 +25,9 @@
 
																 int _starpu_spin_init(struct _starpu_spinlock *lock)
															
 
																 {
															
 
																 	starpu_pthread_mutexattr_t errcheck_attr;
															
 
																-//	memcpy(&lock->errcheck_lock, PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP, sizeof(PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP));
															
 
																 	int ret;
															
 
																 	ret = starpu_pthread_mutexattr_init(&errcheck_attr);
															
 
																-	STARPU_CHECK_RETURN_VALUE(ret, "pthread_mutexattr_init");
															
 
																+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_pthread_mutexattr_init");
															
 
																 	ret = starpu_pthread_mutexattr_settype(&errcheck_attr, PTHREAD_MUTEX_ERRORCHECK);
															
 
																 	STARPU_ASSERT(!ret);
															
--- a/src/common/thread.c
+++ b/src/common/thread.c
@@ -19,6 +19,7 @@
 
																 #include <core/simgrid.h>
															
 
																 #include <core/workers.h>
															
 
																+#include <errno.h>
															
 
																 #include <limits.h>
															
 
																 #ifdef STARPU_SIMGRID
															
@@ -50,6 +51,16 @@ static int _starpu_futex_wake = FUTEX_WAKE;
 
																 extern int _starpu_simgrid_thread_start(int argc, char *argv[]);
															
 
																+int starpu_pthread_equal(starpu_pthread_t t1, starpu_pthread_t t2)
															
 
																+{
															
 
																+	return t1 == t2;
															
 
																+}
															
 
																+
															
 
																+starpu_pthread_t starpu_pthread_self(void)
															
 
																+{
															
 
																+	return MSG_process_self();
															
 
																+}
															
 
																+
															
 
																 int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_pthread_attr_t *attr STARPU_ATTRIBUTE_UNUSED, void *(*start_routine) (void *), void *arg, msg_host_t host)
															
 
																 {
															
 
																 	char **_args;
															
@@ -62,6 +73,9 @@ int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_
 
																 	void *tsd;
															
 
																 	_STARPU_CALLOC(tsd, MAX_TSD+1, sizeof(void*));
															
 
																 	*thread = MSG_process_create_with_arguments(name, _starpu_simgrid_thread_start, tsd, host, 2, _args);
															
 
																+#if SIMGRID_VERSION_MAJOR > 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR >= 15)
															
 
																+	MSG_process_ref(*thread);
															
 
																+#endif
															
 
																 	return 0;
															
 
																 }
															
@@ -74,6 +88,9 @@ int starpu_pthread_join(starpu_pthread_t thread STARPU_ATTRIBUTE_UNUSED, void **
 
																 {
															
 
																 #if SIMGRID_VERSION_MAJOR > 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR >= 14)
															
 
																 	MSG_process_join(thread, 1000000);
															
 
																+#if SIMGRID_VERSION_MAJOR > 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR >= 15)
															
 
																+	MSG_process_unref(thread);
															
 
																+#endif
															
 
																 #else
															
 
																 	MSG_process_sleep(1);
															
 
																 #endif
															
@@ -519,7 +536,7 @@ int starpu_pthread_queue_destroy(starpu_pthread_queue_t *q)
 
																 #endif /* STARPU_SIMGRID */
															
 
																 #if (defined(STARPU_SIMGRID) && !defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT)) || (!defined(STARPU_SIMGRID) && !defined(STARPU_HAVE_PTHREAD_BARRIER))
															
 
																-int starpu_pthread_barrier_init(starpu_pthread_barrier_t *restrict barrier, const starpu_pthread_barrierattr_t *restrict attr, unsigned count)
															
 
																+int starpu_pthread_barrier_init(starpu_pthread_barrier_t *restrict barrier, const starpu_pthread_barrierattr_t *restrict attr STARPU_ATTRIBUTE_UNUSED, unsigned count)
															
 
																 {
															
 
																 	int ret = starpu_pthread_mutex_init(&barrier->mutex, NULL);
															
 
																 	if (!ret)
															
@@ -703,47 +720,34 @@ int starpu_pthread_barrier_wait(starpu_pthread_barrier_t *barrier)
 
																  * macros of course) which record when the mutex is held or not */
															
 
																 int starpu_pthread_mutex_lock_sched(starpu_pthread_mutex_t *mutex)
															
 
																 {
															
 
																-	const int workerid = starpu_worker_get_id();
															
 
																-	struct _starpu_worker * const worker = (workerid != -1)?_starpu_get_worker_struct(workerid):NULL;
															
 
																-	if(worker && mutex == &worker->sched_mutex)
															
 
																-	{
															
 
																-		STARPU_ASSERT(worker->sched_mutex_depth < UINT_MAX);
															
 
																-		worker->sched_mutex_depth++;
															
 
																-		if (worker->sched_mutex_depth > 1)
															
 
																-			return 0;
															
 
																-	}
															
 
																-
															
 
																-	return starpu_pthread_mutex_lock(mutex);
															
 
																+	int p_ret = starpu_pthread_mutex_lock(mutex);
															
 
																+	int workerid = starpu_worker_get_id();
															
 
																+	if(workerid != -1 && _starpu_worker_mutex_is_sched_mutex(workerid, mutex))
															
 
																+		_starpu_worker_set_flag_sched_mutex_locked(workerid, 1);
															
 
																+	return p_ret;
															
 
																 }
															
 
																 int starpu_pthread_mutex_unlock_sched(starpu_pthread_mutex_t *mutex)
															
 
																 {
															
 
																-	const int workerid = starpu_worker_get_id();
															
 
																-	struct _starpu_worker * const worker = (workerid != -1)?_starpu_get_worker_struct(workerid):NULL;
															
 
																-	if(worker && mutex == &worker->sched_mutex)
															
 
																-	{
															
 
																-		STARPU_ASSERT(worker->sched_mutex_depth > 0);
															
 
																-		worker->sched_mutex_depth--;
															
 
																-		if (worker->sched_mutex_depth > 0)
															
 
																-			return 0;
															
 
																-	}
															
 
																+	int workerid = starpu_worker_get_id();
															
 
																+	if(workerid != -1 && _starpu_worker_mutex_is_sched_mutex(workerid, mutex))
															
 
																+		_starpu_worker_set_flag_sched_mutex_locked(workerid, 0);
															
 
																 	return starpu_pthread_mutex_unlock(mutex);
															
 
																 }
															
 
																 int starpu_pthread_mutex_trylock_sched(starpu_pthread_mutex_t *mutex)
															
 
																 {
															
 
																-	const int workerid = starpu_worker_get_id();
															
 
																-	struct _starpu_worker * const worker = (workerid != -1)?_starpu_get_worker_struct(workerid):NULL;
															
 
																-	if(worker && mutex == &worker->sched_mutex)
															
 
																+	int ret = starpu_pthread_mutex_trylock(mutex);
															
 
																+
															
 
																+	if (!ret)
															
 
																 	{
															
 
																-		STARPU_ASSERT(worker->sched_mutex_depth < UINT_MAX);
															
 
																-		worker->sched_mutex_depth++;
															
 
																-		if (worker->sched_mutex_depth > 1)
															
 
																-			return 0;
															
 
																+		int workerid = starpu_worker_get_id();
															
 
																+		if(workerid != -1 && _starpu_worker_mutex_is_sched_mutex(workerid, mutex))
															
 
																+			_starpu_worker_set_flag_sched_mutex_locked(workerid, 1);
															
 
																 	}
															
 
																-	return starpu_pthread_mutex_trylock(mutex);
															
 
																+	return ret;
															
 
																 }
															
 
																 #ifdef STARPU_DEBUG
															
@@ -870,3 +874,72 @@ void _starpu_pthread_spin_do_unlock(starpu_pthread_spinlock_t *lock)
 
																 #endif
															
 
																 #endif /* defined(STARPU_SIMGRID) || (defined(STARPU_LINUX_SYS) && defined(STARPU_HAVE_XCHG)) || !defined(STARPU_HAVE_PTHREAD_SPIN_LOCK) */
															
 
																+
															
 
																+#ifdef STARPU_SIMGRID
															
 
																+
															
 
																+int starpu_sem_destroy(starpu_sem_t *sem)
															
 
																+{
															
 
																+	MSG_sem_destroy(*sem);
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+int starpu_sem_init(starpu_sem_t *sem, int pshared, unsigned value)
															
 
																+{
															
 
																+	STARPU_ASSERT_MSG(pshared == 0, "pshared semaphores not supported under simgrid");
															
 
																+	*sem = MSG_sem_init(value);
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+int starpu_sem_post(starpu_sem_t *sem)
															
 
																+{
															
 
																+	MSG_sem_release(*sem);
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+int starpu_sem_wait(starpu_sem_t *sem)
															
 
																+{
															
 
																+	MSG_sem_acquire(*sem);
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+int starpu_sem_trywait(starpu_sem_t *sem)
															
 
																+{
															
 
																+	if (MSG_sem_would_block(*sem))
															
 
																+		return EAGAIN;
															
 
																+	starpu_sem_wait(sem);
															
 
																+	return 0;
															
 
																+}
															
 
																+
															
 
																+int starpu_sem_getvalue(starpu_sem_t *sem, int *sval)
															
 
																+{
															
 
																+#if SIMGRID_VERSION_MAJOR > 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR > 13)
															
 
																+	*sval = MSG_sem_get_capacity(*sem);
															
 
																+	return 0;
															
 
																+#else
															
 
																+	(void) sem;
															
 
																+	(void) sval;
															
 
																+	STARPU_ABORT_MSG("sigmrid up to 3.13 did not have working MSG_sem_get_capacity");
															
 
																+#endif
															
 
																+}
															
 
																+
															
 
																+#elif !defined(_MSC_VER) || defined(BUILDING_STARPU) /* !STARPU_SIMGRID */
															
 
																+
															
 
																+int starpu_sem_wait(starpu_sem_t *sem)
															
 
																+{
															
 
																+	int ret;
															
 
																+	while((ret = sem_wait(sem)) == -1 && errno == EINTR)
															
 
																+		;
															
 
																+
															
 
																+	return ret;
															
 
																+}
															
 
																+
															
 
																+int starpu_sem_trywait(starpu_sem_t *sem)
															
 
																+{
															
 
																+	int ret;
															
 
																+	while((ret = sem_trywait(sem)) == -1 && errno == EINTR)
															
 
																+		;
															
 
																+	
															
 
																+	return ret;
															
 
																+}
															
 
																+
															
 
																+#endif
															
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -24,7 +24,6 @@
 
																 #include <string.h>
															
 
																 #include <stdlib.h>
															
 
																 #include <math.h>
															
 
																-#include <pthread.h>
															
 
																 #ifdef STARPU_HAVE_SCHED_YIELD
															
 
																 #include <sched.h>
															
 
																 #endif
															
@@ -97,9 +96,9 @@
 
																 #endif
															
 
																 #ifdef STARPU_EXTRA_VERBOSE
															
 
																-#  define _STARPU_LOG_IN()             do { if (!_starpu_silent) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] -->\n", pthread_self(), __starpu_func__,__FILE__,  __LINE__); }} while(0)
															
 
																-#  define _STARPU_LOG_OUT()            do { if (!_starpu_silent) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] <--\n", pthread_self(), __starpu_func__, __FILE__,  __LINE__); }} while(0)
															
 
																-#  define _STARPU_LOG_OUT_TAG(outtag)  do { if (!_starpu_silent) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] <-- (%s)\n", pthread_self(), __starpu_func__, __FILE__, __LINE__, outtag); }} while(0)
															
 
																+#  define _STARPU_LOG_IN()             do { if (!_starpu_silent) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] -->\n", starpu_pthread_self(), __starpu_func__,__FILE__,  __LINE__); }} while(0)
															
 
																+#  define _STARPU_LOG_OUT()            do { if (!_starpu_silent) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] <--\n", starpu_pthread_self(), __starpu_func__, __FILE__,  __LINE__); }} while(0)
															
 
																+#  define _STARPU_LOG_OUT_TAG(outtag)  do { if (!_starpu_silent) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] <-- (%s)\n", starpu_pthread_self(), __starpu_func__, __FILE__, __LINE__, outtag); }} while(0)
															
 
																 #else
															
 
																 #  define _STARPU_LOG_IN()
															
 
																 #  define _STARPU_LOG_OUT()
															
--- a/src/core/jobs.c
+++ b/src/core/jobs.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009-2016  Université de Bordeaux
															
 
																+ * Copyright (C) 2009-2017  Université de Bordeaux
															
 
																  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
															
 
																  * Copyright (C) 2011  Télécom-SudParis
															
 
																  * Copyright (C) 2011, 2014, 2016  INRIA
															
@@ -88,7 +88,7 @@ struct _starpu_job* STARPU_ATTRIBUTE_MALLOC _starpu_job_create(struct starpu_tas
 
																 #ifndef STARPU_USE_FXT
															
 
																 	if (_starpu_bound_recording || _starpu_top_status_get() ||
															
 
																-		_starpu_task_break_on_push != -1 || _starpu_task_break_on_pop != -1 || _starpu_task_break_on_sched != -1
															
 
																+		_starpu_task_break_on_push != -1 || _starpu_task_break_on_sched != -1 || _starpu_task_break_on_pop != -1 || _starpu_task_break_on_exec != -1
															
 
																 		|| STARPU_AYU_EVENT)
															
 
																 #endif
															
 
																 	{
															
--- a/src/core/sched_ctx.c
+++ b/src/core/sched_ctx.c
--- a/src/core/sched_ctx.h
+++ b/src/core/sched_ctx.h
@@ -73,12 +73,27 @@ struct _starpu_sched_ctx
 
																 	long iterations[2];
															
 
																 	int iteration_level;
															
 
																+	/* cond to block push when there are no workers in the ctx */
															
 
																+	starpu_pthread_cond_t no_workers_cond;
															
 
																+
															
 
																+	/* mutex to block push when there are no workers in the ctx */
															
 
																+	starpu_pthread_mutex_t no_workers_mutex;
															
 
																+
															
 
																 	/*ready tasks that couldn't be pushed because the ctx has no workers*/
															
 
																 	struct starpu_task_list empty_ctx_tasks;
															
 
																+	/* mutext protecting empty_ctx_tasks list */
															
 
																+	starpu_pthread_mutex_t empty_ctx_mutex;
															
 
																+
															
 
																 	/*ready tasks that couldn't be pushed because the the window of tasks was already full*/
															
 
																 	struct starpu_task_list waiting_tasks;
															
 
																+	/* mutext protecting waiting_tasks list */
															
 
																+	starpu_pthread_mutex_t waiting_tasks_mutex;
															
 
																+
															
 
																+	/* mutext protecting write to all worker's sched_ctx_list structure for this sched_ctx */
															
 
																+	starpu_pthread_mutex_t sched_ctx_list_mutex;
															
 
																+
															
 
																 	/* min CPUs to execute*/
															
 
																 	int min_ncpus;
															
@@ -127,10 +142,27 @@ struct _starpu_sched_ctx
 
																 	   if not master is -1 */
															
 
																 	int main_master;
															
 
																+	/* conditions variables used when parallel sections are executed in contexts */
															
 
																+	starpu_pthread_cond_t parallel_sect_cond[STARPU_NMAXWORKERS];
															
 
																+	starpu_pthread_mutex_t parallel_sect_mutex[STARPU_NMAXWORKERS];
															
 
																+	starpu_pthread_cond_t parallel_sect_cond_busy[STARPU_NMAXWORKERS];
															
 
																+	int busy[STARPU_NMAXWORKERS];
															
 
																+
															
 
																 	/* boolean indicating that workers should block in order to allow
															
 
																 	   parallel sections to be executed on their allocated resources */
															
 
																 	unsigned parallel_sect[STARPU_NMAXWORKERS];
															
 
																+	/* semaphore that block appl thread until starpu threads are
															
 
																+	   all blocked and ready to exec the parallel code */
															
 
																+	starpu_sem_t fall_asleep_sem[STARPU_NMAXWORKERS];
															
 
																+
															
 
																+	/* semaphore that block appl thread until starpu threads are 
															
 
																+	   all woke up and ready continue appl */
															
 
																+	starpu_sem_t wake_up_sem[STARPU_NMAXWORKERS];
															
 
																+
															
 
																+	/* bool indicating if the workers is sleeping in this ctx */
															
 
																+	unsigned sleeping[STARPU_NMAXWORKERS];
															
 
																+
															
 
																 	/* ctx nesting the current ctx */
															
 
																 	unsigned nesting_sched_ctx;
															
@@ -158,9 +190,6 @@ struct _starpu_sched_ctx
 
																 	int sms_end_idx;
															
 
																 	int stream_worker;
															
 
																-
															
 
																-	starpu_pthread_rwlock_t rwlock;
															
 
																-	starpu_pthread_t lock_write_owner;
															
 
																 };
															
 
																 struct _starpu_machine_config;
															
@@ -212,10 +241,19 @@ void _starpu_worker_gets_out_of_ctx(unsigned sched_ctx_id, struct _starpu_worker
 
																 /* Check if the worker belongs to another sched_ctx */
															
 
																 unsigned _starpu_worker_belongs_to_a_sched_ctx(int workerid, unsigned sched_ctx_id);
															
 
																+/* mutex synchronising several simultaneous modifications of a context */
															
 
																+starpu_pthread_rwlock_t* _starpu_sched_ctx_get_changing_ctx_mutex(unsigned sched_ctx_id);
															
 
																+
															
 
																 /* indicates wheather this worker should go to sleep or not 
															
 
																    (if it is the last one awake in a context he should better keep awake) */
															
 
																 unsigned _starpu_sched_ctx_last_worker_awake(struct _starpu_worker *worker);
															
 
																+/* let the appl know that the worker blocked to execute parallel code */
															
 
																+void _starpu_sched_ctx_signal_worker_blocked(unsigned sched_ctx_id, int workerid);
															
 
																+
															
 
																+/* let the appl know that the worker woke up */
															
 
																+void _starpu_sched_ctx_signal_worker_woke_up(unsigned sched_ctx_id, int workerid);
															
 
																+
															
 
																 /* If starpu_sched_ctx_set_context() has been called, returns the context
															
 
																  * id set by its last call, or the id of the initial context */
															
 
																 unsigned _starpu_sched_ctx_get_current_context();
															
@@ -240,43 +278,4 @@ struct _starpu_sched_ctx *__starpu_sched_ctx_get_sched_ctx_for_worker_and_job(st
 
																 #define _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(w,j) \
															
 
																 	(_starpu_get_nsched_ctxs() <= 1 ? _starpu_get_sched_ctx_struct(0) : __starpu_sched_ctx_get_sched_ctx_for_worker_and_job((w),(j)))
															
 
																-static inline struct _starpu_sched_ctx *_starpu_get_sched_ctx_struct(unsigned id);
															
 
																-
															
 
																-static inline int _starpu_sched_ctx_check_write_locked(unsigned sched_ctx_id)
															
 
																-{
															
 
																-	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
 
																-	return sched_ctx->lock_write_owner == pthread_self();
															
 
																-}
															
 
																-#define STARPU_SCHED_CTX_CHECK_LOCK(sched_ctx_id) STARPU_ASSERT(_starpu_sched_ctx_check_write_locked((sched_ctx_id)))
															
 
																-
															
 
																-static inline void _starpu_sched_ctx_lock_write(unsigned sched_ctx_id)
															
 
																-{
															
 
																-	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
 
																-	STARPU_ASSERT(sched_ctx->lock_write_owner != pthread_self());
															
 
																-	STARPU_PTHREAD_RWLOCK_WRLOCK(&sched_ctx->rwlock);
															
 
																-	sched_ctx->lock_write_owner = pthread_self();
															
 
																-}
															
 
																-
															
 
																-static inline void _starpu_sched_ctx_unlock_write(unsigned sched_ctx_id)
															
 
																-{
															
 
																-	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
 
																-	STARPU_ASSERT(sched_ctx->lock_write_owner == pthread_self());
															
 
																-	sched_ctx->lock_write_owner = 0;
															
 
																-	STARPU_PTHREAD_RWLOCK_UNLOCK(&sched_ctx->rwlock);
															
 
																-}
															
 
																-
															
 
																-static inline void _starpu_sched_ctx_lock_read(unsigned sched_ctx_id)
															
 
																-{
															
 
																-	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
 
																-	STARPU_ASSERT(sched_ctx->lock_write_owner != pthread_self());
															
 
																-	STARPU_PTHREAD_RWLOCK_RDLOCK(&sched_ctx->rwlock);
															
 
																-}
															
 
																-
															
 
																-static inline void _starpu_sched_ctx_unlock_read(unsigned sched_ctx_id)
															
 
																-{
															
 
																-	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
															
 
																-	STARPU_ASSERT(sched_ctx->lock_write_owner != pthread_self());
															
 
																-	STARPU_PTHREAD_RWLOCK_UNLOCK(&sched_ctx->rwlock);
															
 
																-}
															
 
																-
															
 
																 #endif // __SCHED_CONTEXT_H__
															
--- a/src/core/sched_policy.c
+++ b/src/core/sched_policy.c
@@ -31,15 +31,17 @@ static double idle[STARPU_NMAXWORKERS];
 
																 static double idle_start[STARPU_NMAXWORKERS];
															
 
																 long _starpu_task_break_on_push = -1;
															
 
																-long _starpu_task_break_on_pop = -1;
															
 
																 long _starpu_task_break_on_sched = -1;
															
 
																+long _starpu_task_break_on_pop = -1;
															
 
																+long _starpu_task_break_on_exec = -1;
															
 
																 static const char *starpu_idle_file;
															
 
																 void _starpu_sched_init(void)
															
 
																 {
															
 
																 	_starpu_task_break_on_push = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_PUSH", -1);
															
 
																-	_starpu_task_break_on_pop = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_POP", -1);
															
 
																 	_starpu_task_break_on_sched = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_SCHED", -1);
															
 
																+	_starpu_task_break_on_pop = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_POP", -1);
															
 
																+	_starpu_task_break_on_exec = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_EXEC", -1);
															
 
																 	starpu_idle_file = starpu_getenv("STARPU_IDLE_FILE");
															
 
																 }
															
@@ -431,9 +433,9 @@ int _starpu_repush_task(struct _starpu_job *j)
 
																 		if(nworkers == 0)
															
 
																 		{
															
 
																-			_starpu_sched_ctx_lock_write(sched_ctx->id);
															
 
																+			STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->empty_ctx_mutex);
															
 
																 			starpu_task_list_push_front(&sched_ctx->empty_ctx_tasks, task);
															
 
																-			_starpu_sched_ctx_unlock_write(sched_ctx->id);
															
 
																+			STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->empty_ctx_mutex);
															
 
																 #ifdef STARPU_USE_SC_HYPERVISOR
															
 
																 			if(sched_ctx->id != 0 && sched_ctx->perf_counters != NULL
															
 
																 			   && sched_ctx->perf_counters->notify_empty_ctx)
															
@@ -497,9 +499,9 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 
																 		if (nworkers == 0)
															
 
																 		{
															
 
																-			_starpu_sched_ctx_lock_write(sched_ctx->id);
															
 
																+			STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->empty_ctx_mutex);
															
 
																 			starpu_task_list_push_back(&sched_ctx->empty_ctx_tasks, task);
															
 
																-			_starpu_sched_ctx_unlock_write(sched_ctx->id);
															
 
																+			STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->empty_ctx_mutex);
															
 
																 #ifdef STARPU_USE_SC_HYPERVISOR
															
 
																 			if(sched_ctx->id != 0 && sched_ctx->perf_counters != NULL
															
 
																 			   && sched_ctx->perf_counters->notify_empty_ctx)
															
@@ -589,6 +591,8 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 
																 		{
															
 
																 			STARPU_ASSERT(sched_ctx->sched_policy->push_task);
															
 
																 			/* check out if there are any workers in the context */
															
 
																+			starpu_pthread_rwlock_t *changing_ctx_mutex = _starpu_sched_ctx_get_changing_ctx_mutex(sched_ctx->id);
															
 
																+			STARPU_PTHREAD_RWLOCK_RDLOCK(changing_ctx_mutex);
															
 
																 			nworkers = starpu_sched_ctx_get_nworkers(sched_ctx->id);
															
 
																 			if (nworkers == 0)
															
 
																 				ret = -1;
															
@@ -599,6 +603,7 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 
																 				ret = sched_ctx->sched_policy->push_task(task);
															
 
																 				_STARPU_SCHED_END;
															
 
																 			}
															
 
																+			STARPU_PTHREAD_RWLOCK_UNLOCK(changing_ctx_mutex);
															
 
																 		}
															
 
																 		if(ret == -1)
															
--- a/src/core/sched_policy.h
+++ b/src/core/sched_policy.h
@@ -28,7 +28,7 @@
 
																 #define _STARPU_SCHED_BEGIN \
															
 
																 	_STARPU_TRACE_WORKER_SCHEDULING_PUSH;	\
															
 
																-	_SIMGRID_TIMER_BEGIN
															
 
																+	_SIMGRID_TIMER_BEGIN(_starpu_simgrid_sched_cost())
															
 
																 #define _STARPU_SCHED_END \
															
 
																 	_SIMGRID_TIMER_END;			\
															
 
																 	_STARPU_TRACE_WORKER_SCHEDULING_POP
															
@@ -103,8 +103,9 @@ extern struct starpu_sched_policy _starpu_sched_modular_heft2_policy;
 
																 extern struct starpu_sched_policy _starpu_sched_graph_test_policy;
															
 
																 extern long _starpu_task_break_on_push;
															
 
																-extern long _starpu_task_break_on_pop;
															
 
																 extern long _starpu_task_break_on_sched;
															
 
																+extern long _starpu_task_break_on_pop;
															
 
																+extern long _starpu_task_break_on_exec;
															
 
																 #ifdef SIGTRAP
															
 
																 #define _STARPU_TASK_BREAK_ON(task, what) do { \
															
--- a/src/core/simgrid.h
+++ b/src/core/simgrid.h
@@ -69,7 +69,7 @@ starpu_pthread_queue_t _starpu_simgrid_task_queue[STARPU_NMAXWORKERS];
 
																 #define _starpu_simgrid_queue_malloc_cost() starpu_get_env_number_default("STARPU_SIMGRID_QUEUE_MALLOC_COST", 1)
															
 
																 #define _starpu_simgrid_task_submit_cost() starpu_get_env_number_default("STARPU_SIMGRID_TASK_SUBMIT_COST", 1)
															
 
																 #define _starpu_simgrid_fetching_input_cost() starpu_get_env_number_default("STARPU_SIMGRID_FETCHING_INPUT_COST", 1)
															
 
																-#define _starpu_simgrid_sched_cost() starpu_get_env_number_default("STARPU_SIMGRID_SCHED_COST", 1)
															
 
																+#define _starpu_simgrid_sched_cost() starpu_get_env_number_default("STARPU_SIMGRID_SCHED_COST", 0)
															
 
																 /* Called at initialization to count how many GPUs are interfering with each
															
 
																  * bus */
															
@@ -78,10 +78,10 @@ void _starpu_simgrid_count_ngpus(void);
 
																 void _starpu_simgrid_xbt_thread_create(const char *name, void_f_pvoid_t code,
															
 
																 				       void *param);
															
 
																-#define _SIMGRID_TIMER_BEGIN		\
															
 
																+#define _SIMGRID_TIMER_BEGIN(cond)			\
															
 
																 	{		\
															
 
																 		xbt_os_timer_t __timer = NULL;		\
															
 
																-		if (_starpu_simgrid_sched_cost()) {		\
															
 
																+		if (cond) {		\
															
 
																 		  __timer = xbt_os_timer_new();		\
															
 
																 		  xbt_os_threadtimer_start(__timer);	\
															
 
																 		}
															
@@ -94,7 +94,7 @@ void _starpu_simgrid_xbt_thread_create(const char *name, void_f_pvoid_t code,
 
																 	}
															
 
																 #else // !STARPU_SIMGRID
															
 
																-#define _SIMGRID_TIMER_BEGIN {
															
 
																+#define _SIMGRID_TIMER_BEGIN(cond) {
															
 
																 #define _SIMGRID_TIMER_END }
															
 
																 #endif
															
--- a/src/core/topology.c
+++ b/src/core/topology.c
@@ -1717,7 +1717,7 @@ _starpu_bind_thread_on_cpu (
 
																 	CPU_ZERO(&aff_mask);
															
 
																 	CPU_SET(cpuid, &aff_mask);
															
 
																-	starpu_pthread_t self = pthread_self();
															
 
																+	starpu_pthread_t self = starpu_pthread_self();
															
 
																 	ret = pthread_setaffinity_np(self, sizeof(aff_mask), &aff_mask);
															
 
																 	if (ret)
															
@@ -2186,8 +2186,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 
																 				_starpu_memory_node_add_nworkers(memory_node);
															
 
																                                 _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
															
 
																-				if (memory_node != STARPU_MAIN_RAM)
															
 
																-					_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
															
 
																+				_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
															
 
																 				break;
															
 
																 #endif
															
@@ -2226,8 +2225,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 
																 				_starpu_memory_node_add_nworkers(memory_node);
															
 
																                                 _starpu_worker_drives_memory_node(workerarg, STARPU_MAIN_RAM);
															
 
																-				if (memory_node != STARPU_MAIN_RAM)
															
 
																-					_starpu_worker_drives_memory_node(workerarg, memory_node);
															
 
																+				_starpu_worker_drives_memory_node(workerarg, memory_node);
															
 
																 				break;
															
 
																 #endif
															
@@ -2259,8 +2257,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 
																 				_starpu_memory_node_add_nworkers(memory_node);
															
 
																                                 _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
															
 
																-				if (memory_node != STARPU_MAIN_RAM)
															
 
																-					_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
															
 
																+				_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
															
 
																 				break;
															
 
																 #endif /* STARPU_USE_MIC */
															
@@ -2275,8 +2272,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 
																 				_starpu_memory_node_add_nworkers(memory_node);
															
 
																                                 _starpu_worker_drives_memory_node(workerarg, STARPU_MAIN_RAM);
															
 
																-				if (memory_node != STARPU_MAIN_RAM)
															
 
																-					_starpu_worker_drives_memory_node(workerarg, memory_node);
															
 
																+				_starpu_worker_drives_memory_node(workerarg, memory_node);
															
 
																 			}
															
 
																 				break;
															
 
																 #endif /* STARPU_USE_SCC */
															
@@ -2298,8 +2294,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 
																 				}
															
 
																                                 _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
															
 
																-				if (memory_node != STARPU_MAIN_RAM)
															
 
																-					_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
															
 
																+				_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
															
 
																 #ifndef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
															
 
																                                 /* MPI driver thread can manage all slave memories if we disable the MPI multiple thread */
															
 
																                                 unsigned findworker;
															
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -576,15 +576,10 @@ static void _starpu_worker_init(struct _starpu_worker *workerarg, struct _starpu
 
																 	workerarg->reverse_phase[0] = 0;
															
 
																 	workerarg->reverse_phase[1] = 0;
															
 
																 	workerarg->pop_ctx_priority = 1;
															
 
																-	workerarg->sched_mutex_depth = 0;
															
 
																+	workerarg->sched_mutex_locked = 0;
															
 
																+	workerarg->blocked = 0;
															
 
																 	workerarg->is_slave_somewhere = 0;
															
 
																-	workerarg->state_sched_op_pending = 0;
															
 
																-	workerarg->state_changing_ctx_waiting = 0;
															
 
																-	workerarg->state_blocked = 0;
															
 
																-	workerarg->state_wait_ack__blocked = 0;
															
 
																-	workerarg->state_wait_handshake__blocked = 0;
															
 
																-
															
 
																 	/* cpu_set/hwloc_cpu_set initialized in topology.c */
															
 
																 }
															
@@ -1417,14 +1412,12 @@ static void _starpu_terminate_workers(struct _starpu_machine_config *pconfig)
 
																 		struct _starpu_worker *worker = &pconfig->workers[workerid];
															
 
																 		/* in case StarPU termination code is called from a callback,
															
 
																- 		 * we have to check if pthread_self() is the worker itself */
															
 
																+ 		 * we have to check if starpu_pthread_self() is the worker itself */
															
 
																 		if (set && set->nworkers > 0)
															
 
																 		{
															
 
																 			if (set->started)
															
 
																 			{
															
 
																-#ifndef STARPU_SIMGRID
															
 
																-				if (!pthread_equal(pthread_self(), set->worker_thread))
															
 
																-#endif
															
 
																+				if (!starpu_pthread_equal(starpu_pthread_self(), set->worker_thread))
															
 
																 					status = starpu_pthread_join(set->worker_thread, NULL);
															
 
																 				if (status)
															
 
																 				{
															
@@ -1440,9 +1433,7 @@ static void _starpu_terminate_workers(struct _starpu_machine_config *pconfig)
 
																 			if (!worker->run_by_starpu)
															
 
																 				goto out;
															
 
																-#ifndef STARPU_SIMGRID
															
 
																-			if (!pthread_equal(pthread_self(), worker->worker_thread))
															
 
																-#endif
															
 
																+			if (!starpu_pthread_equal(starpu_pthread_self(), worker->worker_thread))
															
 
																 				status = starpu_pthread_join(worker->worker_thread, NULL);
															
 
																 			if (status)
															
 
																 			{
															
@@ -1697,7 +1688,7 @@ unsigned starpu_worker_get_count(void)
 
																 unsigned starpu_worker_is_blocked(int workerid)
															
 
																 {
															
 
																-	return (unsigned)_starpu_config.workers[workerid].state_blocked;
															
 
																+	return _starpu_config.workers[workerid].blocked;
															
 
																 }
															
 
																 unsigned starpu_worker_is_slave_somewhere(int workerid)
															
@@ -2057,7 +2048,7 @@ void starpu_worker_get_sched_condition(int workerid, starpu_pthread_mutex_t **sc
 
																 	*sched_mutex = &_starpu_config.workers[workerid].sched_mutex;
															
 
																 }
															
 
																-static int starpu_wakeup_worker_locked(int workerid, starpu_pthread_cond_t *sched_cond, starpu_pthread_mutex_t *mutex STARPU_ATTRIBUTE_UNUSED)
															
 
																+int starpu_wakeup_worker_locked(int workerid, starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex STARPU_ATTRIBUTE_UNUSED)
															
 
																 {
															
 
																 #ifdef STARPU_SIMGRID
															
 
																 	starpu_pthread_queue_broadcast(&_starpu_simgrid_task_queue[workerid]);
															
@@ -2065,19 +2056,17 @@ static int starpu_wakeup_worker_locked(int workerid, starpu_pthread_cond_t *sche
 
																 	if (_starpu_config.workers[workerid].status == STATUS_SLEEPING)
															
 
																 	{
															
 
																 		_starpu_config.workers[workerid].status = STATUS_WAKING_UP;
															
 
																-		/* cond_broadcast is required over cond_signal since
															
 
																-		 * the condition is share for multiple purpose */
															
 
																-		STARPU_PTHREAD_COND_BROADCAST(sched_cond);
															
 
																+		STARPU_PTHREAD_COND_SIGNAL(cond);
															
 
																 		return 1;
															
 
																 	}
															
 
																 	return 0;
															
 
																 }
															
 
																-static int starpu_wakeup_worker(int workerid, starpu_pthread_cond_t *sched_cond, starpu_pthread_mutex_t *mutex)
															
 
																+int starpu_wakeup_worker(int workerid, starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex)
															
 
																 {
															
 
																 	int success;
															
 
																 	STARPU_PTHREAD_MUTEX_LOCK_SCHED(mutex);
															
 
																-	success = starpu_wakeup_worker_locked(workerid, sched_cond, mutex);
															
 
																+	success = starpu_wakeup_worker_locked(workerid, cond, mutex);
															
 
																 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(mutex);
															
 
																 	return success;
															
 
																 }
															
@@ -2171,6 +2160,33 @@ void starpu_get_version(int *major, int *minor, int *release)
 
																 	*release = STARPU_RELEASE_VERSION;
															
 
																 }
															
 
																+void _starpu_unlock_mutex_if_prev_locked()
															
 
																+{
															
 
																+	int workerid = starpu_worker_get_id();
															
 
																+	if(workerid != -1)
															
 
																+	{
															
 
																+		struct _starpu_worker *w = _starpu_get_worker_struct(workerid);
															
 
																+		if(w->sched_mutex_locked)
															
 
																+		{
															
 
																+			STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&w->sched_mutex);
															
 
																+			_starpu_worker_set_flag_sched_mutex_locked(workerid, 1);
															
 
																+		}
															
 
																+	}
															
 
																+	return;
															
 
																+}
															
 
																+
															
 
																+void _starpu_relock_mutex_if_prev_locked()
															
 
																+{
															
 
																+	int workerid = starpu_worker_get_id();
															
 
																+	if(workerid != -1)
															
 
																+	{
															
 
																+		struct _starpu_worker *w = _starpu_get_worker_struct(workerid);
															
 
																+		if(w->sched_mutex_locked)
															
 
																+			STARPU_PTHREAD_MUTEX_LOCK_SCHED(&w->sched_mutex);
															
 
																+	}
															
 
																+	return;
															
 
																+}
															
 
																+
															
 
																 unsigned starpu_worker_get_sched_ctx_list(int workerid, unsigned **sched_ctxs)
															
 
																 {
															
 
																 	unsigned s = 0;
															
--- a/src/core/workers.h
+++ b/src/core/workers.h
@@ -20,6 +20,8 @@
 
																 #ifndef __WORKERS_H__
															
 
																 #define __WORKERS_H__
															
 
																+#include <limits.h>
															
 
																+
															
 
																 #include <starpu.h>
															
 
																 #include <common/config.h>
															
 
																 #include <common/timing.h>
															
@@ -83,11 +85,6 @@ LIST_TYPE(_starpu_worker,
 
																 	unsigned numa_memory_node; /* which numa memory node is the worker associated with? (logical index) */
															
 
																 	starpu_pthread_cond_t sched_cond; /* condition variable used when the worker waits for tasks. */
															
 
																         starpu_pthread_mutex_t sched_mutex; /* mutex protecting sched_cond */
															
 
																-	int state_sched_op_pending:1; /* a task pop is ongoing even though sched_mutex may temporarily be unlocked */
															
 
																-	int state_changing_ctx_waiting:1; /* a thread is waiting for transient operations such as pop to complete before acquiring sched_mutex and modifying the worker ctx*/
															
 
																-	int state_blocked:1;
															
 
																-	int state_wait_ack__blocked:1;
															
 
																-	int state_wait_handshake__blocked:1;
															
 
																 	struct starpu_task_list local_tasks; /* this queue contains tasks that have been explicitely submitted to that queue */
															
 
																 	struct starpu_task **local_ordered_tasks; /* this queue contains tasks that have been explicitely submitted to that queue with an explicit order */
															
 
																 	unsigned local_ordered_tasks_size; /* this records the size of local_ordered_tasks */
															
@@ -141,8 +138,11 @@ LIST_TYPE(_starpu_worker,
 
																 	/* indicate which priority of ctx is currently active: the values are 0 or 1*/
															
 
																 	unsigned pop_ctx_priority;
															
 
																-	/* sched mutex local worker locking depth */
															
 
																-	unsigned sched_mutex_depth;
															
 
																+	/* flag to know if sched_mutex is locked or not */
															
 
																+	unsigned sched_mutex_locked;
															
 
																+
															
 
																+	/* bool to indicate if the worker is blocked in a ctx */
															
 
																+	unsigned blocked;
															
 
																 	/* bool to indicate if the worker is slave in a ctx */
															
 
																 	unsigned is_slave_somewhere;
															
@@ -509,7 +509,7 @@ static inline struct _starpu_worker *_starpu_get_worker_struct(unsigned id)
 
																 	return &_starpu_config.workers[id];
															
 
																 }
															
 
																-/* Returns the starpu_sched_ctx structure that describes the state of the 
															
 
																+/* Returns the starpu_sched_ctx structure that descriebes the state of the 
															
 
																  * specified ctx */
															
 
																 static inline struct _starpu_sched_ctx *_starpu_get_sched_ctx_struct(unsigned id)
															
 
																 {
															
@@ -559,6 +559,18 @@ int starpu_worker_get_nids_by_type(enum starpu_worker_archtype type, int *worker
 
																    the list might not be updated */
															
 
																 int starpu_worker_get_nids_ctx_free_by_type(enum starpu_worker_archtype type, int *workerids, int maxsize);
															
 
																+/* if the current worker has the lock release it */
															
 
																+void _starpu_unlock_mutex_if_prev_locked();
															
 
																+
															
 
																+/* if we prev released the lock relock it */
															
 
																+void _starpu_relock_mutex_if_prev_locked();
															
 
																+
															
 
																+static inline void _starpu_worker_set_flag_sched_mutex_locked(int workerid, unsigned flag)
															
 
																+{
															
 
																+	struct _starpu_worker *w = _starpu_get_worker_struct(workerid);
															
 
																+	w->sched_mutex_locked = flag;
															
 
																+}
															
 
																+
															
 
																 static inline unsigned _starpu_worker_mutex_is_sched_mutex(int workerid, starpu_pthread_mutex_t *mutex)
															
 
																 {
															
 
																 	struct _starpu_worker *w = _starpu_get_worker_struct(workerid);
															
@@ -611,43 +623,4 @@ void _starpu_worker_set_stream_ctx(unsigned workerid, struct _starpu_sched_ctx *
 
																 struct _starpu_sched_ctx* _starpu_worker_get_ctx_stream(unsigned stream_workerid);
															
 
																-/* Must be called with worker's sched_mutex held.
															
 
																- * Mark the beginning of a scheduling operation during which the sched_mutex
															
 
																- * lock may be temporarily released, but the scheduling context of the worker
															
 
																- * should not be modified */
															
 
																-static inline void _starpu_worker_enter_transient_sched_op(struct _starpu_worker * const worker)
															
 
																-{
															
 
																-	worker->state_sched_op_pending = 1;
															
 
																-}
															
 
																-
															
 
																-/* Must be called with worker's sched_mutex held.
															
 
																- * Mark the end of a scheduling operation, and notify potential waiters that
															
 
																- * scheduling context changes can safely be performed again.
															
 
																- */
															
 
																-static inline void  _starpu_worker_leave_transient_sched_op(struct _starpu_worker * const worker)
															
 
																-{
															
 
																-	worker->state_sched_op_pending = 0;
															
 
																-	if (worker->state_changing_ctx_waiting)
															
 
																-		/* cond_broadcast is required over cond_signal since
															
 
																-		 * the condition is share for multiple purpose */
															
 
																-		STARPU_PTHREAD_COND_BROADCAST(&worker->sched_cond);
															
 
																-}
															
 
																-
															
 
																-/* Must be called with worker's sched_mutex held.
															
 
																- * Passively wait until state_sched_op_pending is cleared.
															
 
																- */
															
 
																-static inline void _starpu_worker_wait_for_transient_sched_op_completion(struct _starpu_worker * const worker)
															
 
																-{
															
 
																-	if (worker->state_sched_op_pending)
															
 
																-	{
															
 
																-		worker->state_changing_ctx_waiting = 1;
															
 
																-		do
															
 
																-		{
															
 
																-			STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
															
 
																-		}
															
 
																-		while (worker->state_sched_op_pending);
															
 
																-		worker->state_changing_ctx_waiting = 0;
															
 
																-	}
															
 
																-}
															
 
																-
															
 
																 #endif // __WORKERS_H__
															
--- a/src/datawizard/malloc.c
+++ b/src/datawizard/malloc.c
@@ -32,6 +32,7 @@
 
																 #ifdef STARPU_SIMGRID
															
 
																 #include <sys/mman.h>
															
 
																 #include <fcntl.h>
															
 
																+#include <smpi/smpi.h>
															
 
																 #endif
															
 
																 #ifndef O_BINARY
															
@@ -48,9 +49,12 @@ static int malloc_on_node_default_flags[STARPU_MAXNODES];
 
																 /* This file is used for implementing "folded" allocation */
															
 
																 #ifdef STARPU_SIMGRID
															
 
																+#if SIMGRID_VERSION_MAJOR < 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR < 15)
															
 
																+/* TODO: drop when simgrid 3.15 is reasonably largely used by people who need the feature */
															
 
																 static int bogusfile = -1;
															
 
																 static unsigned long _starpu_malloc_simulation_fold;
															
 
																 #endif
															
 
																+#endif
															
 
																 void starpu_malloc_set_align(size_t align)
															
 
																 {
															
@@ -230,6 +234,10 @@ int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int fl
 
																 #ifdef STARPU_SIMGRID
															
 
																 	if (flags & STARPU_MALLOC_SIMULATION_FOLDED)
															
 
																 	{
															
 
																+#if SIMGRID_VERSION_MAJOR > 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR >= 15)
															
 
																+		*A = SMPI_SHARED_MALLOC(dim);
															
 
																+#else
															
 
																+		/* TODO: drop when simgrid 3.15 is reasonably largely used by people who need the feature */
															
 
																 		/* Use "folded" allocation: the same file is mapped several
															
 
																 		 * times contiguously, to get a memory area one can read/write,
															
 
																 		 * without consuming memory */
															
@@ -282,6 +290,7 @@ int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int fl
 
																 			}
															
 
																 			*A = buf;
															
 
																 		}
															
 
																+#endif
															
 
																 	}
															
 
																 	else
															
 
																 #endif
															
@@ -465,7 +474,12 @@ int _starpu_free_flags_on_node(unsigned dst_node, void *A, size_t dim, int flags
 
																 #ifdef STARPU_SIMGRID
															
 
																 	if (flags & STARPU_MALLOC_SIMULATION_FOLDED)
															
 
																 	{
															
 
																+#if SIMGRID_VERSION_MAJOR > 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR >= 15)
															
 
																+		SMPI_SHARED_FREE(A);
															
 
																+#else
															
 
																+		/* TODO: drop when simgrid 3.15 is reasonably largely used by people who need the feature */
															
 
																 		munmap(A, dim);
															
 
																+#endif
															
 
																 	}
															
 
																 	else
															
 
																 #endif
															
@@ -840,9 +854,11 @@ _starpu_malloc_init(unsigned dst_node)
 
																 	disable_pinning = starpu_get_env_number("STARPU_DISABLE_PINNING");
															
 
																 	malloc_on_node_default_flags[dst_node] = STARPU_MALLOC_PINNED | STARPU_MALLOC_COUNT;
															
 
																 #ifdef STARPU_SIMGRID
															
 
																+#if SIMGRID_VERSION_MAJOR < 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR < 15)
															
 
																 	/* Reasonably "costless" */
															
 
																 	_starpu_malloc_simulation_fold = starpu_get_env_number_default("STARPU_MALLOC_SIMULATION_FOLD", 1) << 20;
															
 
																 #endif
															
 
																+#endif
															
 
																 }
															
 
																 void
															
--- a/src/datawizard/memory_nodes.c
+++ b/src/datawizard/memory_nodes.c
@@ -185,9 +185,12 @@ unsigned starpu_worker_get_memory_node(unsigned workerid)
 
																 /* same utility as _starpu_memory_node_add_nworkers */
															
 
																 void _starpu_worker_drives_memory_node(struct _starpu_worker *worker, unsigned memnode)
															
 
																 {
															
 
																-	_starpu_worker_drives_memory[worker->workerid][memnode] = 1;
															
 
																+	if (! _starpu_worker_drives_memory[worker->workerid][memnode])
															
 
																+	{
															
 
																+		_starpu_worker_drives_memory[worker->workerid][memnode] = 1;
															
 
																 #ifdef STARPU_SIMGRID
															
 
																-	starpu_pthread_queue_register(&worker->wait, &_starpu_simgrid_transfer_queue[memnode]);
															
 
																+		starpu_pthread_queue_register(&worker->wait, &_starpu_simgrid_transfer_queue[memnode]);
															
 
																 #endif
															
 
																+	}
															
 
																 }
															
--- a/src/debug/traces/starpu_fxt.c
+++ b/src/debug/traces/starpu_fxt.c
@@ -157,6 +157,8 @@ static void task_dump(struct task_info *task)
 
																 	if (task->exclude_from_dag)
															
 
																 		goto out;
															
 
																+	if (!tasks_file)
															
 
																+		goto out;
															
 
																 	if (task->name)
															
 
																 	{
															
@@ -274,6 +276,8 @@ static struct data_info *get_data(unsigned long handle, int mpi_rank)
 
																 static void data_dump(struct data_info *data)
															
 
																 {
															
 
																+	if (!data_file)
															
 
																+		goto out;
															
 
																 	fprintf(data_file, "Handle: %lx\n", data->handle);
															
 
																 	fprintf(data_file, "MPIRank: %d\n", data->mpi_rank);
															
 
																 	if (data->name)
															
@@ -291,6 +295,7 @@ static void data_dump(struct data_info *data)
 
																 	}
															
 
																 	fprintf(data_file, "MPIOwner: %d\n", data->mpi_owner);
															
 
																 	fprintf(data_file, "\n");
															
 
																+out:
															
 
																 	HASH_DEL(data_info, data);
															
 
																 	free(data);
															
 
																 }
															
@@ -2388,8 +2393,8 @@ static void handle_task_done(struct fxt_ev_64 *ev, struct starpu_fxt_options *op
 
																 	unsigned exclude_from_dag = ev->param[2];
															
 
																 	struct task_info *task = get_task(job_id, options->file_rank);
															
 
																 	task->exclude_from_dag = exclude_from_dag;
															
 
																-	if (tasks_file)
															
 
																-		task_dump(task);
															
 
																+
															
 
																+	task_dump(task);
															
 
																 	if (!exclude_from_dag)
															
 
																 		_starpu_fxt_dag_set_task_done(options->file_prefix, job_id, name, colour);
															
@@ -2698,9 +2703,8 @@ static void handle_task_wait_for_all(void)
 
																 	_starpu_fxt_dag_add_sync_point();
															
 
																 }
															
 
																-static void handle_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
															
 
																+static void handle_string_event(struct fxt_ev_64 *ev, const char *event, struct starpu_fxt_options *options)
															
 
																 {
															
 
																-	char *event = get_fxt_string(ev, 0);
															
 
																 	/* Add an event in the trace */
															
 
																 	if (out_paje_file)
															
 
																 	{
															
@@ -2717,6 +2721,12 @@ static void handle_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *option
 
																 		recfmt_dump_state(get_event_time_stamp(ev, options), "ProgEvent", -1, 0, event, "Program");
															
 
																 }
															
 
																+static void handle_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
															
 
																+{
															
 
																+	char *event = get_fxt_string(ev, 0);
															
 
																+	handle_string_event(ev, event, options);
															
 
																+}
															
 
																+
															
 
																 static void handle_thread_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
															
 
																 {
															
 
																 	/* Add an event in the trace */
															
@@ -3418,6 +3428,13 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 
																 				fut_keymask = ev.param[0];
															
 
																 				break;
															
 
																+			case FUT_START_FLUSH_CODE:
															
 
																+				handle_string_event(&ev, "fxt_start_flush", options);
															
 
																+				break;
															
 
																+			case FUT_STOP_FLUSH_CODE:
															
 
																+				handle_string_event(&ev, "fxt_stop_flush", options);
															
 
																+				break;
															
 
																+
															
 
																 			/* We can safely ignore FUT internal events */
															
 
																 			case FUT_CALIBRATE0_CODE:
															
 
																 			case FUT_CALIBRATE1_CODE:
															
@@ -3467,7 +3484,6 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 
																 #endif
															
 
																 	}
															
 
																-	if (data_file)
															
 
																 	{
															
 
																 		/* TODO: move to handle_data_unregister */
															
 
																 		struct data_info *data, *tmp;
															
--- a/src/debug/traces/starpu_paje.c
+++ b/src/debug/traces/starpu_paje.c
@@ -193,7 +193,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 
																 	poti_DefineEntityValue("E", "S", "Executing", ".0 .6 .5");
															
 
																 	poti_DefineEntityValue("Sc", "S", "Scheduling", ".7 .36 .0");
															
 
																 	poti_DefineEntityValue("Sl", "S", "Sleeping", ".9 .1 .0");
															
 
																-	poti_DefineEntityValue("P", "S", "Progressing", ".4 .1 .6");
															
 
																+	poti_DefineEntityValue("P", "S", "Progressing", ".1 .3 .1");
															
 
																 	poti_DefineEntityValue("U", "S", "Unpartitioning", ".0 .0 1.0");
															
 
																 	poti_DefineEntityValue("H", "S", "Hypervisor", ".5 .18 .0");
															
 
																 	poti_DefineEntityValue("Bu", "S", "Building task", ".5 .18 .0");
															
@@ -213,7 +213,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 
																 	poti_DefineEntityValue("E", "WS", "Executing", ".0 .6 .5");
															
 
																 	poti_DefineEntityValue("Sc", "WS", "Scheduling", ".7 .36 .0");
															
 
																 	poti_DefineEntityValue("Sl", "WS", "Sleeping", ".9 .1 .0");
															
 
																-	poti_DefineEntityValue("P", "WS", "Progressing", ".4 .1 .6");
															
 
																+	poti_DefineEntityValue("P", "WS", "Progressing", ".1 .3 .1");
															
 
																 	poti_DefineEntityValue("U", "WS", "Unpartitioning", ".0 .0 1.0");
															
 
																 	poti_DefineEntityValue("H", "WS", "Hypervisor", ".5 .18 .0");
															
 
																 	poti_DefineEntityValue("Bu", "WS", "Building task", ".5 .18 .0");
															
@@ -268,7 +268,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 
																 		poti_DefineEntityValue("E", ctx, "Executing", ".0 .6 .5");
															
 
																 		poti_DefineEntityValue("Sc", ctx, "Scheduling", ".7 .36 .0");
															
 
																 		poti_DefineEntityValue("Sl", ctx, "Sleeping", ".9 .1 .0");
															
 
																-		poti_DefineEntityValue("P", ctx, "Progressing", ".4 .1 .6");
															
 
																+		poti_DefineEntityValue("P", ctx, "Progressing", ".1 .3 .1");
															
 
																 		poti_DefineEntityValue("U", ctx, "Unpartitioning", ".0 .0 1.0");
															
 
																 		poti_DefineEntityValue("H", ctx, "Hypervisor", ".5 .18 .0");
															
 
																 	}
															
@@ -331,7 +331,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 
																 6       E       S       Executing         \".0 .6 .5\"		\n\
															
 
																 6       Sc       S      Scheduling         \".7 .36 .0\"		\n\
															
 
																 6       Sl       S      Sleeping         \".9 .1 .0\"		\n\
															
 
																-6       P       S       Progressing         \".4 .1 .6\"		\n\
															
 
																+6       P       S       Progressing         \".1 .3 .1\"		\n\
															
 
																 6       U       S       Unpartitioning      \".0 .0 1.0\"		\n\
															
 
																 6       H       S       Hypervisor      \".5 .18 .0\"		\n\
															
 
																 6       Bu      S       \"Building task\"   \".5 .18 .0\"		\n\
															
@@ -351,7 +351,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 
																 6       E       WS       Executing         \".0 .6 .5\"		\n\
															
 
																 6       Sc       WS      Scheduling         \".7 .36 .0\"		\n\
															
 
																 6       Sl       WS      Sleeping         \".9 .1 .0\"		\n\
															
 
																-6       P       WS       Progressing         \".4 .1 .6\"		\n\
															
 
																+6       P       WS       Progressing         \".1 .3 .1\"		\n\
															
 
																 6       U       WS       Unpartitioning      \".0 .0 1.0\"		\n\
															
 
																 6       H       WS       Hypervisor      \".5 .18 .0\"		\n\
															
 
																 6       Bu      WS       \"Building task\"   \".5 .18 .0\"		\n\
															
@@ -394,7 +394,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 
																 6       E       Ctx%u       Executing         \".0 .6 .5\"		\n\
															
 
																 6       Sc       Ctx%u      Scheduling         \".7 .36 .0\"		\n\
															
 
																 6       Sl       Ctx%u      Sleeping         \".9 .1 .0\"		\n\
															
 
																-6       P       Ctx%u       Progressing         \".4 .1 .6\"		\n\
															
 
																+6       P       Ctx%u       Progressing         \".1 .3 .1\"		\n\
															
 
																 6       U       Ctx%u       Unpartitioning         \".0 .0 1.0\"	\n\
															
 
																 6       H       Ctx%u       Hypervisor         \".5 .18 .0\"		\n",
															
 
																 		i, i, i, i, i, i, i, i, i, i, i, i, i);
															
--- a/src/drivers/cpu/driver_cpu.c
+++ b/src/drivers/cpu/driver_cpu.c
@@ -89,6 +89,12 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 
																 #ifdef STARPU_SIMGRID
															
 
																 			if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE)
															
 
																 				func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
															
 
																+			else if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT)
															
 
																+			{
															
 
																+				_SIMGRID_TIMER_BEGIN(1);
															
 
																+				func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
															
 
																+				_SIMGRID_TIMER_END;
															
 
																+			}
															
 
																 			else
															
 
																 				_starpu_simgrid_submit_job(cpu_args->workerid, j, perf_arch, NAN, NULL);
															
 
																 #else
															
@@ -410,13 +416,13 @@ void *_starpu_cpu_worker(void *arg)
 
																 	struct _starpu_worker *worker = arg;
															
 
																 	_starpu_cpu_driver_init(worker);
															
 
																-	_STARPU_TRACE_END_PROGRESS(worker->memory_node);
															
 
																+	_STARPU_TRACE_START_PROGRESS(worker->memory_node);
															
 
																 	while (_starpu_machine_is_running())
															
 
																 	{
															
 
																 		_starpu_may_pause();
															
 
																 		_starpu_cpu_driver_run_once(worker);
															
 
																 	}
															
 
																-	_STARPU_TRACE_START_PROGRESS(worker->memory_node);
															
 
																+	_STARPU_TRACE_END_PROGRESS(worker->memory_node);
															
 
																 	_starpu_cpu_driver_deinit(worker);
															
 
																 	return NULL;
															
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -507,6 +507,12 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worke
 
																 		unsigned workerid = worker->workerid;
															
 
																 		if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE && !async)
															
 
																 			func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
															
 
																+		else if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT && !async)
															
 
																+			{
															
 
																+				_SIMGRID_TIMER_BEGIN(1);
															
 
																+				func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
															
 
																+				_SIMGRID_TIMER_END;
															
 
																+			}
															
 
																 		else
															
 
																 			_starpu_simgrid_submit_job(workerid, j, &worker->perf_arch, NAN,
															
 
																 				async ? &task_finished[workerid][pipeline_idx] : NULL);
															
@@ -763,6 +769,7 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 
																 		task = worker->task_transferring;
															
 
																 		if (task && worker->nb_buffers_transferred == worker->nb_buffers_totransfer)
															
 
																 		{
															
 
																+			_STARPU_TRACE_END_PROGRESS(memnode);
															
 
																 			j = _starpu_get_job_associated_to_task(task);
															
 
																 			_starpu_set_local_worker_key(worker);
															
@@ -779,10 +786,9 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 
																 			}
															
 
																 			else
															
 
																 			{
															
 
																-				_STARPU_TRACE_END_PROGRESS(memnode);
															
 
																 				execute_job_on_cuda(task, worker);
															
 
																-				_STARPU_TRACE_START_PROGRESS(memnode);
															
 
																 			}
															
 
																+			_STARPU_TRACE_START_PROGRESS(memnode);
															
 
																 		}
															
 
																 		/* Then test for termination of queued tasks */
															
@@ -811,6 +817,7 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 
																 		else
															
 
																 #endif /* !STARPU_SIMGRID */
															
 
																 		{
															
 
																+			_STARPU_TRACE_END_PROGRESS(memnode);
															
 
																 			/* Asynchronous task completed! */
															
 
																 			_starpu_set_local_worker_key(worker);
															
 
																 			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), worker);
															
@@ -831,11 +838,9 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 
																 					 * flushing the pipeline, we can now at
															
 
																 					 * last execute it.  */
															
 
																-					_STARPU_TRACE_END_PROGRESS(memnode);
															
 
																 					_STARPU_TRACE_EVENT("sync_task");
															
 
																 					execute_job_on_cuda(task, worker);
															
 
																 					_STARPU_TRACE_EVENT("end_sync_task");
															
 
																-					_STARPU_TRACE_START_PROGRESS(memnode);
															
 
																 					worker->pipeline_stuck = 0;
															
 
																 				}
															
 
																 			}
															
@@ -848,6 +853,7 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 
																 				/* Everybody busy */
															
 
																 				_STARPU_TRACE_END_EXECUTING()
															
 
																 #endif
															
 
																+			_STARPU_TRACE_START_PROGRESS(memnode);
															
 
																 		}
															
 
																 		if (!worker->pipeline_length || worker->ntasks < worker->pipeline_length)
															
--- a/src/drivers/driver_common/driver_common.c
+++ b/src/drivers/driver_common/driver_common.c
@@ -101,6 +101,7 @@ void _starpu_driver_start_job(struct _starpu_worker *worker, struct _starpu_job
 
																 	}
															
 
																 	else
															
 
																 		_STARPU_TRACE_START_CODELET_BODY(j, j->nimpl, perf_arch, workerid);
															
 
																+	_STARPU_TASK_BREAK_ON(task, exec);
															
 
																 }
															
 
																 void _starpu_driver_end_job(struct _starpu_worker *worker, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)
															
@@ -358,6 +359,7 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *worker, int w
 
																 			sched_ctx = _starpu_get_sched_ctx_struct(e->sched_ctx);
															
 
																 			if(sched_ctx && sched_ctx->id > 0 && sched_ctx->id < STARPU_NMAX_SCHED_CTXS)
															
 
																 			{
															
 
																+				STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->parallel_sect_mutex[workerid]);
															
 
																 				if(!sched_ctx->sched_policy)
															
 
																 					worker->is_slave_somewhere = sched_ctx->main_master != workerid;
															
@@ -366,23 +368,18 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *worker, int w
 
																 					/* don't let the worker sleep with the sched_mutex taken */
															
 
																 					/* we need it until here bc of the list of ctxs of the workers
															
 
																 					   that can change in another thread */
															
 
																+					STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
															
 
																 					needed = 0;
															
 
																-					worker->state_blocked = 1;
															
 
																-					worker->state_wait_ack__blocked = 1;
															
 
																-					STARPU_PTHREAD_COND_BROADCAST(&worker->sched_cond);
															
 
																-					do
															
 
																-					{
															
 
																-						STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
															
 
																-					}
															
 
																-					while (worker->state_wait_ack__blocked);
															
 
																-					worker->state_blocked = 0;
															
 
																+					_starpu_sched_ctx_signal_worker_blocked(sched_ctx->id, workerid);
															
 
																+					sched_ctx->busy[workerid] = 1;
															
 
																+					STARPU_PTHREAD_COND_WAIT(&sched_ctx->parallel_sect_cond[workerid], &sched_ctx->parallel_sect_mutex[workerid]);
															
 
																+					sched_ctx->busy[workerid] = 0;
															
 
																+					STARPU_PTHREAD_COND_SIGNAL(&sched_ctx->parallel_sect_cond_busy[workerid]);
															
 
																+					_starpu_sched_ctx_signal_worker_woke_up(sched_ctx->id, workerid);
															
 
																 					sched_ctx->parallel_sect[workerid] = 0;
															
 
																-					if (worker->state_wait_handshake__blocked)
															
 
																-					{
															
 
																-						worker->state_wait_handshake__blocked = 0;
															
 
																-						STARPU_PTHREAD_COND_BROADCAST(&worker->sched_cond);
															
 
																-					}
															
 
																+					STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
															
 
																 				}
															
 
																+				STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->parallel_sect_mutex[workerid]);
															
 
																 			}
															
 
																 			if(!needed)
															
 
																 				break;
															
@@ -391,25 +388,21 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *worker, int w
 
																 		if(worker->tmp_sched_ctx != -1)
															
 
																 		{
															
 
																 			sched_ctx = _starpu_get_sched_ctx_struct(worker->tmp_sched_ctx);
															
 
																+			STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->parallel_sect_mutex[workerid]);
															
 
																 			if(sched_ctx->parallel_sect[workerid])
															
 
																 			{
															
 
																 //				needed = 0;
															
 
																-				worker->state_blocked = 1;
															
 
																-				worker->state_wait_ack__blocked = 1;
															
 
																-				STARPU_PTHREAD_COND_BROADCAST(&worker->sched_cond);
															
 
																-				do
															
 
																-				{
															
 
																-					STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
															
 
																-				}
															
 
																-				while (worker->state_wait_ack__blocked);
															
 
																-				worker->state_blocked = 0;
															
 
																+				STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
															
 
																+				_starpu_sched_ctx_signal_worker_blocked(sched_ctx->id, workerid);
															
 
																+				sched_ctx->busy[workerid] = 1;
															
 
																+				STARPU_PTHREAD_COND_WAIT(&sched_ctx->parallel_sect_cond[workerid], &sched_ctx->parallel_sect_mutex[workerid]);
															
 
																+				sched_ctx->busy[workerid] = 0;
															
 
																+				STARPU_PTHREAD_COND_SIGNAL(&sched_ctx->parallel_sect_cond_busy[workerid]);
															
 
																+				_starpu_sched_ctx_signal_worker_woke_up(sched_ctx->id, workerid);
															
 
																 				sched_ctx->parallel_sect[workerid] = 0;
															
 
																-				if (worker->state_wait_handshake__blocked)
															
 
																-				{
															
 
																-					worker->state_wait_handshake__blocked = 0;
															
 
																-					STARPU_PTHREAD_COND_BROADCAST(&worker->sched_cond);
															
 
																-				}
															
 
																+				STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
															
 
																 			}
															
 
																+			STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->parallel_sect_mutex[workerid]);
															
 
																 		}
															
 
																 		needed = !needed;
															
@@ -428,11 +421,7 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *worker, int w
 
																 		task = NULL;
															
 
																 	/*else try to pop a task*/
															
 
																 	else
															
 
																-	{
															
 
																-		_starpu_worker_enter_transient_sched_op(worker);
															
 
																 		task = _starpu_pop_task(worker);
															
 
																-		_starpu_worker_leave_transient_sched_op(worker);
															
 
																-	}
															
 
																 #if !defined(STARPU_SIMGRID)
															
 
																 	if (task == NULL && !executing)
															
@@ -449,11 +438,7 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *worker, int w
 
																 		if (_starpu_worker_can_block(memnode, worker)
															
 
																 			&& !_starpu_sched_ctx_last_worker_awake(worker))
															
 
																 		{
															
 
																-			do
															
 
																-			{
															
 
																-				STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
															
 
																-			}
															
 
																-			while (worker->status == STATUS_SLEEPING);
															
 
																+			STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
															
 
																 			STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
															
 
																 		}
															
 
																 		else
															
@@ -527,9 +512,7 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 
																 #endif
															
 
																 			_starpu_worker_set_status_scheduling(workers[i].workerid);
															
 
																 			_starpu_set_local_worker_key(&workers[i]);
															
 
																-			_starpu_worker_enter_transient_sched_op(&workers[i]);
															
 
																 			tasks[i] = _starpu_pop_task(&workers[i]);
															
 
																-			_starpu_worker_leave_transient_sched_op(&workers[i]);
															
 
																 			if(tasks[i] != NULL)
															
 
																 			{
															
 
																 				_starpu_worker_set_status_scheduling_done(workers[i].workerid);
															
@@ -598,11 +581,7 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 
																 		if (_starpu_worker_can_block(memnode, worker)
															
 
																 				&& !_starpu_sched_ctx_last_worker_awake(worker))
															
 
																 		{
															
 
																-			do
															
 
																-			{
															
 
																-				STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
															
 
																-			}
															
 
																-			while (worker->status == STATUS_SLEEPING);
															
 
																+			STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
															
 
																 			STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
															
 
																 		}
															
 
																 		else
															
--- a/src/drivers/mic/driver_mic_sink.c
+++ b/src/drivers/mic/driver_mic_sink.c
@@ -39,7 +39,7 @@ void _starpu_mic_sink_init(struct _starpu_mp_node *node)
 
																 	cpu_set_t cpuset;
															
 
																 	/* We reserve one core for the communications */
															
 
																 	/*Bind on the first core*/
															
 
																-	self = pthread_self();
															
 
																+	self = starpu_pthread_self();
															
 
																 	CPU_ZERO(&cpuset);
															
 
																 	CPU_SET(0,&cpuset);
															
 
																 	pthread_setaffinity_np(self,sizeof(cpu_set_t),&cpuset);
															
--- a/src/drivers/mp_common/mp_common.c
+++ b/src/drivers/mp_common/mp_common.c
@@ -15,7 +15,6 @@
 
																  */
															
 
																 #include <stdlib.h>
															
 
																-#include <pthread.h>
															
 
																 #include <datawizard/interfaces/data_interface.h>
															
 
																 #include <drivers/mp_common/mp_common.h>
															
@@ -400,7 +399,7 @@ void _starpu_mp_common_send_command(const struct _starpu_mp_node *node,
 
																 {
															
 
																 	STARPU_ASSERT_MSG(arg_size <= BUFFER_SIZE, "Too much data (%d) for the static MIC buffer (%d), increase BUFFER_SIZE perhaps?", arg_size, BUFFER_SIZE);
															
 
																-        //printf("SEND CMD : %d - arg_size %d by %lu \n", command, arg_size, pthread_self());
															
 
																+        //printf("SEND CMD : %d - arg_size %d by %lu \n", command, arg_size, starpu_pthread_self());
															
 
																 	/* MIC and MPI sizes are given through a int */
															
 
																 	int command_size = sizeof(enum _starpu_mp_command);
															
@@ -436,7 +435,7 @@ enum _starpu_mp_command _starpu_mp_common_recv_command(const struct _starpu_mp_n
 
																 	command = *((enum _starpu_mp_command *) node->buffer);
															
 
																 	*arg_size = *((int *) ((uintptr_t)node->buffer + command_size));
															
 
																-        //printf("RECV command : %d - arg_size %d by %lu \n", command, *arg_size, pthread_self());
															
 
																+        //printf("RECV command : %d - arg_size %d by %lu \n", command, *arg_size, starpu_pthread_self());
															
 
																 	/* If there is no argument (ie. arg_size == 0),
															
 
																 	 * let's return the command right now */
															
--- a/src/drivers/mp_common/mp_common.h
+++ b/src/drivers/mp_common/mp_common.h
@@ -17,7 +17,6 @@
 
																 #ifndef __MP_COMMON_H__
															
 
																 #define __MP_COMMON_H__
															
 
																-#include <pthread.h>
															
 
																 #include <semaphore.h>
															
 
																 #include <starpu.h>
															
--- a/src/drivers/mpi/driver_mpi_common.c
+++ b/src/drivers/mpi/driver_mpi_common.c
@@ -80,7 +80,10 @@ int _starpu_mpi_common_mp_init()
 
																 #endif
															
 
																                 int thread_support;
															
 
																-                STARPU_ASSERT(MPI_Init_thread(_starpu_get_argc(), _starpu_get_argv(), required, &thread_support) == MPI_SUCCESS);
															
 
																+                if (MPI_Init_thread(_starpu_get_argc(), _starpu_get_argv(), required, &thread_support) != MPI_SUCCESS)
															
 
																+		{
															
 
																+			STARPU_ABORT_MSG("Cannot Initialize MPI !");
															
 
																+		}
															
 
																                 if (thread_support != required)
															
 
																                 {
															
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -954,6 +954,14 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 
																 			simulate = 1;
															
 
																 		#endif
															
 
																 		}
															
 
																+		else if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT && !async)
															
 
																+			{
															
 
																+				_SIMGRID_TIMER_BEGIN(1);
															
 
																+				func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
															
 
																+				_SIMGRID_TIMER_END;
															
 
																+				simulate=0;
															
 
																+			}
															
 
																+
															
 
																 		if (simulate)
															
 
																 			_starpu_simgrid_submit_job(worker->workerid, j, &worker->perf_arch, length,
															
 
																 						   async ? &task_finished[worker->devid][pipeline_idx] : NULL);
															
--- a/src/profiling/bound.c
+++ b/src/profiling/bound.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
															
 
																- * Copyright (C) 2010-2016  Université de Bordeaux
															
 
																+ * Copyright (C) 2010-2017  Université de Bordeaux
															
 
																  * Copyright (C) 2011  Télécom-SudParis
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -209,11 +209,11 @@ static double** initialize_arch_duration(int maxdevid, unsigned* maxncore_table)
 
																 static void initialize_duration(struct bound_task *task)
															
 
																 {
															
 
																 	struct _starpu_machine_config *conf = _starpu_get_machine_config();
															
 
																-	task->duration[STARPU_CPU_WORKER] = initialize_arch_duration(1,&conf->topology.ncpus); 
															
 
																-	task->duration[STARPU_CUDA_WORKER] = initialize_arch_duration(conf->topology.ncudagpus,NULL); 
															
 
																-	task->duration[STARPU_OPENCL_WORKER] = initialize_arch_duration(conf->topology.nopenclgpus,NULL); 
															
 
																-	task->duration[STARPU_MIC_WORKER] = initialize_arch_duration(conf->topology.nmicdevices,conf->topology.nmiccores); 
															
 
																-	task->duration[STARPU_SCC_WORKER] = initialize_arch_duration(conf->topology.nsccdevices,NULL); 
															
 
																+	task->duration[STARPU_CPU_WORKER] = initialize_arch_duration(1,&conf->topology.nhwcpus); 
															
 
																+	task->duration[STARPU_CUDA_WORKER] = initialize_arch_duration(conf->topology.nhwcudagpus,NULL); 
															
 
																+	task->duration[STARPU_OPENCL_WORKER] = initialize_arch_duration(conf->topology.nhwopenclgpus,NULL); 
															
 
																+	task->duration[STARPU_MIC_WORKER] = initialize_arch_duration(conf->topology.nhwmicdevices,conf->topology.nmiccores); 
															
 
																+	task->duration[STARPU_SCC_WORKER] = initialize_arch_duration(conf->topology.nhwscc,NULL); 
															
 
																 }
															
 
																 static struct starpu_perfmodel_device device =
															
@@ -278,7 +278,7 @@ void _starpu_bound_record(struct _starpu_job *j)
 
																 	{
															
 
																 		struct bound_task_pool *tp;
															
 
																-		_starpu_compute_buffers_footprint(j->task->cl?j->task->cl->model:NULL, STARPU_CPU_WORKER, 0, j);
															
 
																+		_starpu_compute_buffers_footprint(j->task->cl?j->task->cl->model:NULL, NULL, 0, j);
															
 
																 		if (last && last->cl == j->task->cl && last->footprint == j->footprint)
															
 
																 			tp = last;
															
--- a/src/profiling/profiling_helpers.c
+++ b/src/profiling/profiling_helpers.c
@@ -60,6 +60,7 @@ void _starpu_profiling_bus_helper_display_summary(FILE *stream)
 
																 		unsigned unit = 0;
															
 
																 		double d = convert_to_byte_units(transferred, max_unit, &unit);
															
 
																+		double avg = (transfer_cnt != 0) ? (d / transfer_cnt) : 0;
															
 
																 		_starpu_memory_node_get_name(src, src_name, sizeof(src_name));
															
 
																 		_starpu_memory_node_get_name(dst, dst_name, sizeof(dst_name));
															
@@ -67,7 +68,7 @@ void _starpu_profiling_bus_helper_display_summary(FILE *stream)
 
																 		fprintf(stream, "\t%s -> %s", src_name, dst_name);
															
 
																 		fprintf(stream, "\t%.2lf %s", d, byte_units[unit]);
															
 
																 		fprintf(stream, "\t%.2lf %s/s", d / elapsed_time, byte_units[unit]);
															
 
																-		fprintf(stream, "\t(transfers : %lld - avg %.2lf %s)\n", transfer_cnt, d / transfer_cnt, byte_units[unit]);
															
 
																+		fprintf(stream, "\t(transfers : %lld - avg %.2lf %s)\n", transfer_cnt, avg, byte_units[unit]);
															
 
																 		sum_transferred += transferred;
															
 
																 	}
															
--- a/src/sched_policies/component_worker.c
+++ b/src/sched_policies/component_worker.c
@@ -503,8 +503,11 @@ static void simple_worker_can_pull(struct starpu_sched_component * worker_compon
 
																 	}
															
 
																 	if(_starpu_sched_component_worker_is_sleeping_status(worker_component))
															
 
																 	{
															
 
																+		starpu_pthread_mutex_t *sched_mutex;
															
 
																+		starpu_pthread_cond_t *sched_cond;
															
 
																+		starpu_worker_get_sched_condition(w->workerid, &sched_mutex, &sched_cond);
															
 
																 		_starpu_sched_component_unlock_worker(worker_component->tree->sched_ctx_id, w->workerid);
															
 
																-		starpu_wake_worker(w->workerid);
															
 
																+		starpu_wakeup_worker(w->workerid, sched_cond, sched_mutex);
															
 
																 	}
															
 
																 	else
															
 
																 		_starpu_sched_component_unlock_worker(worker_component->tree->sched_ctx_id, w->workerid);
															
@@ -723,7 +726,10 @@ static void combined_worker_can_pull(struct starpu_sched_component * component)
 
																 		_starpu_sched_component_lock_worker(component->tree->sched_ctx_id, worker);
															
 
																 		if(_starpu_sched_component_worker_is_sleeping_status(component))
															
 
																 		{
															
 
																-			starpu_wake_worker(worker);
															
 
																+			starpu_pthread_mutex_t *sched_mutex;
															
 
																+			starpu_pthread_cond_t *sched_cond;
															
 
																+			starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
															
 
																+			starpu_wakeup_worker(worker, sched_cond, sched_mutex);
															
 
																 		}
															
 
																 		if(_starpu_sched_component_worker_is_reset_status(component))
															
 
																 			_starpu_sched_component_worker_set_changed_status(component);
															
--- a/src/sched_policies/deque_modeling_policy_data_aware.c
+++ b/src/sched_policies/deque_modeling_policy_data_aware.c
@@ -251,6 +251,7 @@ static struct starpu_task *dmda_pop_ready_task(unsigned sched_ctx_id)
 
																 	/* Take the opportunity to update start time */
															
 
																 	fifo->exp_start = STARPU_MAX(starpu_timing_now(), fifo->exp_start);
															
 
																+	fifo->exp_end = fifo->exp_start + fifo->exp_len;
															
 
																 	task = _starpu_fifo_pop_first_ready_task(fifo, node, dt->num_priorities);
															
 
																 	if (task)
															
@@ -285,6 +286,7 @@ static struct starpu_task *dmda_pop_task(unsigned sched_ctx_id)
 
																 	/* Take the opportunity to update start time */
															
 
																 	fifo->exp_start = STARPU_MAX(starpu_timing_now(), fifo->exp_start);
															
 
																+	fifo->exp_end = fifo->exp_start + fifo->exp_len;
															
 
																 	STARPU_ASSERT_MSG(fifo, "worker %u does not belong to ctx %u anymore.\n", workerid, sched_ctx_id);
															
@@ -321,6 +323,7 @@ static struct starpu_task *dmda_pop_every_task(unsigned sched_ctx_id)
 
																 	/* Take the opportunity to update start time */
															
 
																 	fifo->exp_start = STARPU_MAX(starpu_timing_now(), fifo->exp_start);
															
 
																+	fifo->exp_end = fifo->exp_start + fifo->exp_len;
															
 
																 	starpu_pthread_mutex_t *sched_mutex;
															
 
																 	starpu_pthread_cond_t *sched_cond;
															
@@ -367,6 +370,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
																         /* Sometimes workers didn't take the tasks as early as we expected */
															
 
																 	fifo->exp_start = isnan(fifo->exp_start) ? starpu_timing_now() + fifo->pipeline_len : STARPU_MAX(fifo->exp_start, starpu_timing_now());
															
 
																+	fifo->exp_end = fifo->exp_start + fifo->exp_len;
															
 
																 	if ((starpu_timing_now() + predicted_transfer) < fifo->exp_end)
															
 
																 	{
															
@@ -448,7 +452,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
																 #if !defined(STARPU_NON_BLOCKING_DRIVERS) || defined(STARPU_SIMGRID)
															
 
																-		starpu_wake_worker_locked(best_workerid);
															
 
																+		starpu_wakeup_worker_locked(best_workerid, sched_cond, sched_mutex);
															
 
																 #endif
															
 
																 		starpu_push_task_end(task);
															
 
																 		STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(sched_mutex);
															
@@ -460,7 +464,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
																 		dt->queue_array[best_workerid]->ntasks++;
															
 
																 		dt->queue_array[best_workerid]->nprocessed++;
															
 
																 #if !defined(STARPU_NON_BLOCKING_DRIVERS) || defined(STARPU_SIMGRID)
															
 
																-		starpu_wake_worker_locked(best_workerid);
															
 
																+		starpu_wakeup_worker_locked(best_workerid, sched_cond, sched_mutex);
															
 
																 #endif
															
 
																 		starpu_push_task_end(task);
															
 
																 		STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(sched_mutex);
															
@@ -773,7 +777,9 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 
																 			if (unknown)
															
 
																 				continue;
															
 
																-			exp_end[worker_ctx][nimpl] = exp_start + prev_exp_len + local_task_length[worker_ctx][nimpl];
															
 
																+			double task_starting_time = STARPU_MAX(exp_start + prev_exp_len, starpu_timing_now() + local_data_penalty[worker_ctx][nimpl]); 
															
 
																+
															
 
																+			exp_end[worker_ctx][nimpl] = task_starting_time + local_task_length[worker_ctx][nimpl];
															
 
																 			if (exp_end[worker_ctx][nimpl] < best_exp_end)
															
 
																 			{
															
@@ -1126,6 +1132,7 @@ static void dmda_pre_exec_hook(struct starpu_task *task, unsigned sched_ctx_id)
 
																 	/* Take the opportunity to update start time */
															
 
																 	fifo->exp_start = STARPU_MAX(starpu_timing_now() + fifo->pipeline_len, fifo->exp_start);
															
 
																+	fifo->exp_end = fifo->exp_start + fifo->exp_len;
															
 
																 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(sched_mutex);
															
 
																 }
															
--- a/src/sched_policies/eager_central_policy.c
+++ b/src/sched_policies/eager_central_policy.c
@@ -201,7 +201,7 @@ static void eager_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nw
 
																 		int workerid = workerids[i];
															
 
																 		int curr_workerid = _starpu_worker_get_id();
															
 
																 		if(workerid != curr_workerid)
															
 
																-			starpu_wake_worker_locked(workerid);
															
 
																+			starpu_wake_worker(workerid);
															
 
																 		starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
															
 
																 	}
															
--- a/src/sched_policies/eager_central_priority_policy.c
+++ b/src/sched_policies/eager_central_priority_policy.c
@@ -308,7 +308,7 @@ static void eager_center_priority_add_workers(unsigned sched_ctx_id, int *worker
 
																 		int workerid = workerids[i];
															
 
																 		int curr_workerid = _starpu_worker_get_id();
															
 
																 		if(workerid != curr_workerid)
															
 
																-			starpu_wake_worker_locked(workerid);
															
 
																+			starpu_wake_worker(workerid);
															
 
																                 starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
															
 
																         }
															
--- a/src/sched_policies/parallel_eager.c
+++ b/src/sched_policies/parallel_eager.c
@@ -265,7 +265,7 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 
																 				_starpu_fifo_push_task(data->local_fifo[local_worker], alias);
															
 
																 #if !defined(STARPU_NON_BLOCKING_DRIVERS) || defined(STARPU_SIMGRID)
															
 
																-				starpu_wake_worker_locked(local_worker);
															
 
																+				starpu_wakeup_worker_locked(local_worker, sched_cond, sched_mutex);
															
 
																 #endif
															
 
																 				STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(sched_mutex);
															
--- a/src/util/fstarpu.c
+++ b/src/util/fstarpu.c
@@ -85,6 +85,7 @@ static const intptr_t fstarpu_starpu_mic	= STARPU_MIC;
 
																 static const intptr_t fstarpu_starpu_scc	= STARPU_SCC;
															
 
																 static const intptr_t fstarpu_starpu_codelet_simgrid_execute	= STARPU_CODELET_SIMGRID_EXECUTE;
															
 
																+static const intptr_t fstarpu_starpu_codelet_simgrid_execute_and_inject	= STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT;
															
 
																 static const intptr_t fstarpu_starpu_cuda_async	= STARPU_CUDA_ASYNC;
															
 
																 static const intptr_t fstarpu_starpu_opencl_async	= STARPU_OPENCL_ASYNC;
															
@@ -153,6 +154,7 @@ intptr_t fstarpu_get_constant(char *s)
 
																 	else if (!strcmp(s, "FSTARPU_SCC"))	{ return fstarpu_starpu_scc; }
															
 
																 	else if (!strcmp(s, "FSTARPU_CODELET_SIMGRID_EXECUTE"))	{ return fstarpu_starpu_codelet_simgrid_execute; }
															
 
																+	else if (!strcmp(s, "FSTARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT"))	{ return fstarpu_starpu_codelet_simgrid_execute_and_inject; }
															
 
																 	else if (!strcmp(s, "FSTARPU_CUDA_ASYNC"))	{ return fstarpu_starpu_cuda_async; }
															
 
																 	else if (!strcmp(s, "FSTARPU_OPENCL_ASYNC"))	{ return fstarpu_starpu_opencl_async; }
															
@@ -542,7 +544,7 @@ void fstarpu_worker_get_type_as_string(intptr_t type, char *dst, size_t maxlen)
 
																 	snprintf(dst, maxlen, "%s", str);
															
 
																 }
															
 
																-struct starpu_data_handle *fstarpu_data_handle_array_alloc(int nb)
															
 
																+starpu_data_handle_t *fstarpu_data_handle_array_alloc(int nb)
															
 
																 {
															
 
																 	void *ptr;
															
 
																 	_STARPU_CALLOC(ptr, (size_t)nb, sizeof(starpu_data_handle_t));
															
--- a/starpu.mk
+++ b/starpu.mk
@@ -16,7 +16,7 @@
 
																 if STARPU_USE_MPI_MASTER_SLAVE
															
 
																 MPI_LAUNCHER 			= $(MPIEXEC)  $(MPIEXEC_ARGS) -np 4
															
 
																-MPI_RUN_ARGS			= STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4
															
 
																+MPI_RUN_ARGS			= STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 STARPU_NMPIMSTHREADS=4
															
 
																 endif
															
 
																 showcheck:
															
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -34,6 +34,7 @@ AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@ $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFL
 
																 EXTRA_DIST =					\
															
 
																 	helper.h				\
															
 
																+	datawizard/locality.sh			\
															
 
																 	datawizard/scal.h			\
															
 
																 	datawizard/mpi_like.h			\
															
 
																 	microbenchs/tasks_size_overhead.sh	\
															
@@ -381,6 +382,8 @@ if STARPU_SIMGRID
 
																 TESTS += $(MICROBENCHS:=.sh)
															
 
																 endif
															
 
																+TESTS += datawizard/locality.sh
															
 
																+
															
 
																 #######################
															
 
																 # Source files        #
															
 
																 #######################
															
--- a/tests/datawizard/locality.sh
+++ b/tests/datawizard/locality.sh
@@ -0,0 +1,33 @@
 
																+#!/bin/bash -x
															
 
																+#
															
 
																+# StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																+#
															
 
																+# Copyright (C) 2017  Université de Bordeaux
															
 
																+#
															
 
																+# StarPU is free software; you can redistribute it and/or modify
															
 
																+# it under the terms of the GNU Lesser General Public License as published by
															
 
																+# the Free Software Foundation; either version 2.1 of the License, or (at
															
 
																+# your option) any later version.
															
 
																+#
															
 
																+# StarPU is distributed in the hope that it will be useful, but
															
 
																+# WITHOUT ANY WARRANTY; without even the implied warranty of
															
 
																+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
															
 
																+#
															
 
																+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
															
 
																+
															
 
																+# Test generation of FxT traces
															
 
																+
															
 
																+set -e
															
 
																+
															
 
																+PREFIX=$(dirname $0)
															
 
																+test -x $PREFIX/../../tools/starpu_fxt_tool || exit 77
															
 
																+STARPU_FXT_PREFIX=$PREFIX/ $PREFIX/locality
															
 
																+$PREFIX/../../tools/starpu_fxt_tool -i $PREFIX/prof_file_${USER}_0
															
 
																+
															
 
																+# Check that they are approved by Grenoble :)
															
 
																+
															
 
																+if type pj_dump > /dev/null 2> /dev/null
															
 
																+then
															
 
																+	$PREFIX/../../tools/starpu_paje_sort paje.trace
															
 
																+	pj_dump paje.trace
															
 
																+fi
															
--- a/tools/starpu_fxt_tool.c
+++ b/tools/starpu_fxt_tool.c
@@ -30,8 +30,9 @@ static void usage()
 
																 	fprintf(stderr, "Usage: %s [ options ]\n", PROGNAME);
															
 
																         fprintf(stderr, "\n");
															
 
																         fprintf(stderr, "Options:\n");
															
 
																-	fprintf(stderr, "   -i <input file>     specify the input file. This can be specified several\n");
															
 
																-	fprintf(stderr, "                       times for MPI execution case\n");
															
 
																+	fprintf(stderr, "   -i <input file[s]>  specify the input file[s]. Several files can be provided,\n");
															
 
																+	fprintf(stderr, "                       or the option specified several times for MPI execution\n");
															
 
																+	fprintf(stderr, "                       case\n");
															
 
																         fprintf(stderr, "   -o <output file>    specify the output file\n");
															
 
																         fprintf(stderr, "   -c                  use a different colour for every type of task\n");
															
 
																 	fprintf(stderr, "   -no-events          do not show events\n");