Browse Source

Merge from trunk

Corentin Salingue 8 years ago
parent
commit
134f1eb906
70 changed files with 1232 additions and 716 deletions
  1. 1 0
      ChangeLog
  2. 9 2
      configure.ac
  3. 3 2
      doc/doxygen/chapters/320_scheduling.doxy
  4. 1 1
      doc/doxygen/chapters/380_offline_performance_tools.doxy
  5. 4 3
      doc/doxygen/chapters/470_simgrid.doxy
  6. 20 10
      doc/doxygen/chapters/501_environment_variables.doxy
  7. 5 0
      doc/doxygen/chapters/api/codelet_and_tasks.doxy
  8. 12 3
      examples/Makefile.am
  9. 105 79
      examples/heat/dw_sparse_cg.c
  10. 7 3
      examples/heat/heat.c
  11. 43 0
      examples/heat/heat.sh
  12. 34 0
      examples/lu/lu.sh
  13. 3 1
      examples/lu/lu_example.c
  14. 4 0
      examples/lu/xlu_implicit_pivot.c
  15. 5 0
      examples/lu/xlu_pivot.c
  16. 46 20
      examples/mlr/mlr.c
  17. 5 3
      examples/sched_ctx/gpu_partition.c
  18. 2 3
      examples/stencil/stencil-blocks.c
  19. 5 2
      include/fstarpu_mod.f90
  20. 2 1
      include/starpu_config.h.in
  21. 3 0
      include/starpu_scheduler.h
  22. 1 0
      include/starpu_task.h
  23. 41 10
      include/starpu_thread.h
  24. 109 0
      mpi/dev/starpu_mpi_comm_check.sh
  25. 1 1
      sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c
  26. 3 3
      socl/src/cl_createkernel.c
  27. 2 2
      socl/src/cl_createprogramwithsource.c
  28. 6 6
      socl/src/cl_enqueuendrangekernel.c
  29. 3 3
      socl/src/cl_enqueuereadbuffer.c
  30. 3 3
      socl/src/cl_enqueuewritebuffer.c
  31. 1 1
      socl/src/cl_setkernelarg.c
  32. 1 1
      socl/src/task.c
  33. 1 1
      src/common/fxt.c
  34. 25 3
      src/common/prio_list.h
  35. 2 3
      src/common/starpu_spinlock.c
  36. 102 29
      src/common/thread.c
  37. 3 4
      src/common/utils.h
  38. 2 2
      src/core/jobs.c
  39. 308 289
      src/core/sched_ctx.c
  40. 41 42
      src/core/sched_ctx.h
  41. 11 6
      src/core/sched_policy.c
  42. 3 2
      src/core/sched_policy.h
  43. 4 4
      src/core/simgrid.h
  44. 6 11
      src/core/topology.c
  45. 37 21
      src/core/workers.c
  46. 20 47
      src/core/workers.h
  47. 16 0
      src/datawizard/malloc.c
  48. 5 2
      src/datawizard/memory_nodes.c
  49. 21 5
      src/debug/traces/starpu_fxt.c
  50. 6 6
      src/debug/traces/starpu_paje.c
  51. 8 2
      src/drivers/cpu/driver_cpu.c
  52. 10 4
      src/drivers/cuda/driver_cuda.c
  53. 23 44
      src/drivers/driver_common/driver_common.c
  54. 1 1
      src/drivers/mic/driver_mic_sink.c
  55. 2 3
      src/drivers/mp_common/mp_common.c
  56. 0 1
      src/drivers/mp_common/mp_common.h
  57. 4 1
      src/drivers/mpi/driver_mpi_common.c
  58. 8 0
      src/drivers/opencl/driver_opencl.c
  59. 7 7
      src/profiling/bound.c
  60. 2 1
      src/profiling/profiling_helpers.c
  61. 8 2
      src/sched_policies/component_worker.c
  62. 10 3
      src/sched_policies/deque_modeling_policy_data_aware.c
  63. 1 1
      src/sched_policies/eager_central_policy.c
  64. 1 1
      src/sched_policies/eager_central_priority_policy.c
  65. 1 1
      src/sched_policies/parallel_eager.c
  66. 3 1
      src/util/fstarpu.c
  67. 1 1
      starpu.mk
  68. 3 0
      tests/Makefile.am
  69. 33 0
      tests/datawizard/locality.sh
  70. 3 2
      tools/starpu_fxt_tool.c

+ 1 - 0
ChangeLog

@@ -273,6 +273,7 @@ Small features:
     allows to copy in a new buffer values which have not been unpacked by
     allows to copy in a new buffer values which have not been unpacked by
     the current call
     the current call
   * Add STARPU_CODELET_SIMGRID_EXECUTE flag.
   * Add STARPU_CODELET_SIMGRID_EXECUTE flag.
+  * Add STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT flag.
   * Add STARPU_CL_ARGS flag to starpu_task_insert() and
   * Add STARPU_CL_ARGS flag to starpu_task_insert() and
     starpu_mpi_task_insert() functions call
     starpu_mpi_task_insert() functions call
 
 

+ 9 - 2
configure.ac

@@ -348,9 +348,10 @@ else
     build_mpi_master_slave=no
     build_mpi_master_slave=no
 fi
 fi
 
 
-#Warn users that they cannot use both at the same time
+#users cannot use both at the same time
 if test x$build_mpi_master_slave = xyes -a x$enable_mpi = xyes; then
 if test x$build_mpi_master_slave = xyes -a x$enable_mpi = xyes; then
-    AC_MSG_WARN(StarPU-MPI and MPI Master-Slave cannot be used at the same time !)
+    AC_MSG_WARN(StarPU-MPI and MPI Master-Slave cannot be used at the same time ! Disabling StarPU-MPI...)
+	enable_mpi=no
 fi
 fi
 
 
 if test x$build_mpi_master_slave = xyes; then
 if test x$build_mpi_master_slave = xyes; then
@@ -3117,6 +3118,12 @@ AC_CONFIG_COMMANDS([executable-scripts], [
   test -e tests/microbenchs/parallel_independent_heterogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_heterogeneous_tasks.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_independent_heterogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_heterogeneous_tasks.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_independent_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks_data.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_independent_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks_data.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_independent_homogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_independent_homogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks.sh tests/microbenchs/
+  mkdir -p tests/datawizard
+  test -e tests/datawizard/locality.sh || ln -sf $ac_abs_top_srcdir/tests/datawizard/locality.sh tests/datawizard/
+  mkdir -p examples/heat
+  test -e examples/heat/heat.sh || ln -sf $ac_abs_top_srcdir/examples/heat/heat.sh examples/heat/
+  mkdir -p examples/lu
+  test -e examples/lu/lu.sh || ln -sf $ac_abs_top_srcdir/examples/lu/lu.sh examples/lu/
 ])
 ])
 
 
 # Create links to ICD files in build/socl/vendors directory. SOCL will use this
 # Create links to ICD files in build/socl/vendors directory. SOCL will use this

+ 3 - 2
doc/doxygen/chapters/320_scheduling.doxy

@@ -285,8 +285,9 @@ be used to get information about how well the execution proceeded, and thus the
 overall quality of the execution.
 overall quality of the execution.
 
 
 Precise debugging can also be performed by using the
 Precise debugging can also be performed by using the
-\ref STARPU_TASK_BREAK_ON_SCHED, \ref STARPU_TASK_BREAK_ON_PUSH, and
-\ref STARPU_TASK_BREAK_ON_POP environment variables. By setting the job_id of a task
+\ref STARPU_TASK_BREAK_ON_PUSH, \ref STARPU_TASK_BREAK_ON_SCHED,
+\ref STARPU_TASK_BREAK_ON_POP, and \ref STARPU_TASK_BREAK_ON_EXEC environment variables.
+By setting the job_id of a task
 in these environment variables, StarPU will raise <c>SIGTRAP</c> when the task is being
 in these environment variables, StarPU will raise <c>SIGTRAP</c> when the task is being
 scheduled, pushed, or popped by the scheduler. That means that when one notices
 scheduled, pushed, or popped by the scheduler. That means that when one notices
 that a task is being scheduled in a seemingly odd way, one can just reexecute
 that a task is being scheduled in a seemingly odd way, one can just reexecute

+ 1 - 1
doc/doxygen/chapters/380_offline_performance_tools.doxy

@@ -129,7 +129,7 @@ collect the trace files from the MPI nodes, and
 specify them all on the command <c>starpu_fxt_tool</c>, for instance:
 specify them all on the command <c>starpu_fxt_tool</c>, for instance:
 
 
 \verbatim
 \verbatim
-$ starpu_fxt_tool -i /tmp/prof_file_something1 -i /tmp/prof_file_something2
+$ starpu_fxt_tool -i /tmp/prof_file_something*
 \endverbatim
 \endverbatim
 
 
 By default, all tasks are displayed using a green color. To display tasks with
 By default, all tasks are displayed using a green color. To display tasks with

+ 4 - 3
doc/doxygen/chapters/470_simgrid.doxy

@@ -9,8 +9,8 @@
 /*! \page SimGridSupport SimGrid Support
 /*! \page SimGridSupport SimGrid Support
 
 
 StarPU can use Simgrid in order to simulate execution on an arbitrary
 StarPU can use Simgrid in order to simulate execution on an arbitrary
-platform. This was tested with simgrid 3.11, 3.12, 3.13, 3.14, and 3.14.159, other versions may have
-compatibility issues.
+platform. This was tested with simgrid from 3.11 to 3.15,
+other versions may have compatibility issues.
 
 
 \section Preparing Preparing Your Application For Simulation
 \section Preparing Preparing Your Application For Simulation
 
 
@@ -36,7 +36,8 @@ To be able to run the application with e.g. CUDA simulation on a system which
 does not have CUDA installed, one can fill the cuda_funcs with (void*)1, to
 does not have CUDA installed, one can fill the cuda_funcs with (void*)1, to
 express that there is a CUDA implementation, even if one does not actually
 express that there is a CUDA implementation, even if one does not actually
 provide it. StarPU will not actually run it in Simgrid mode anyway by default
 provide it. StarPU will not actually run it in Simgrid mode anyway by default
-(unless the ::STARPU_CODELET_SIMGRID_EXECUTE flag is set in the codelet)
+(unless the ::STARPU_CODELET_SIMGRID_EXECUTE or ::STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT
+flags are set in the codelet)
 
 
 \snippet simgrid.c To be included. You should update doxygen if you see this text.
 \snippet simgrid.c To be included. You should update doxygen if you see this text.
 
 

+ 20 - 10
doc/doxygen/chapters/501_environment_variables.doxy

@@ -642,9 +642,10 @@ especially regarding data transfers.
 <dd>
 <dd>
 \anchor STARPU_SIMGRID_SCHED_COST
 \anchor STARPU_SIMGRID_SCHED_COST
 \addindex __env__STARPU_SIMGRID_SCHED_COST
 \addindex __env__STARPU_SIMGRID_SCHED_COST
-When set to 1 (which is the default), scheduling costs are taken into
+When set to 1 (0 is the default), scheduling costs are taken into
 account in simgrid mode. This provides more accurate simgrid predictions,
 account in simgrid mode. This provides more accurate simgrid predictions,
-and allows studying scheduling overhead of the runtime system.
+and allows studying scheduling overhead of the runtime system. However,
+it also makes simulation non-deterministic.
 </dd>
 </dd>
 
 
 </dl>
 </dl>
@@ -1021,6 +1022,15 @@ dog is reached, thus allowing to catch the situation in gdb, etc
 (see \ref DetectionStuckConditions)
 (see \ref DetectionStuckConditions)
 </dd>
 </dd>
 
 
+<dt>STARPU_TASK_BREAK_ON_PUSH</dt>
+<dd>
+\anchor STARPU_TASK_BREAK_ON_PUSH
+\addindex __env__STARPU_TASK_BREAK_ON_PUSH
+When this variable contains a job id, StarPU will raise SIGTRAP when the task
+with that job id is being pushed to the scheduler, which will be nicely catched by debuggers
+(see \ref DebuggingScheduling)
+</dd>
+
 <dt>STARPU_TASK_BREAK_ON_SCHED</dt>
 <dt>STARPU_TASK_BREAK_ON_SCHED</dt>
 <dd>
 <dd>
 \anchor STARPU_TASK_BREAK_ON_SCHED
 \anchor STARPU_TASK_BREAK_ON_SCHED
@@ -1032,21 +1042,21 @@ This only works for schedulers which have such a scheduling point defined
 (see \ref DebuggingScheduling)
 (see \ref DebuggingScheduling)
 </dd>
 </dd>
 
 
-<dt>STARPU_TASK_BREAK_ON_PUSH</dt>
+<dt>STARPU_TASK_BREAK_ON_POP</dt>
 <dd>
 <dd>
-\anchor STARPU_TASK_BREAK_ON_PUSH
-\addindex __env__STARPU_TASK_BREAK_ON_PUSH
+\anchor STARPU_TASK_BREAK_ON_POP
+\addindex __env__STARPU_TASK_BREAK_ON_POP
 When this variable contains a job id, StarPU will raise SIGTRAP when the task
 When this variable contains a job id, StarPU will raise SIGTRAP when the task
-with that job id is being pushed to the scheduler, which will be nicely catched by debuggers
+with that job id is being popped from the scheduler, which will be nicely catched by debuggers
 (see \ref DebuggingScheduling)
 (see \ref DebuggingScheduling)
 </dd>
 </dd>
 
 
-<dt>STARPU_TASK_BREAK_ON_POP</dt>
+<dt>STARPU_TASK_BREAK_ON_EXEC</dt>
 <dd>
 <dd>
-\anchor STARPU_TASK_BREAK_ON_POP
-\addindex __env__STARPU_TASK_BREAK_ON_POP
+\anchor STARPU_TASK_BREAK_ON_EXEC
+\addindex __env__STARPU_TASK_BREAK_ON_EXEC
 When this variable contains a job id, StarPU will raise SIGTRAP when the task
 When this variable contains a job id, StarPU will raise SIGTRAP when the task
-with that job id is being popped from the scheduler, which will be nicely catched by debuggers
+with that job id is being executed, which will be nicely catched by debuggers
 (see \ref DebuggingScheduling)
 (see \ref DebuggingScheduling)
 </dd>
 </dd>
 
 

+ 5 - 0
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -135,6 +135,11 @@ Value to be set in starpu_codelet::opencl_flags to allow asynchronous OpenCL ker
 \ingroup API_Codelet_And_Tasks
 \ingroup API_Codelet_And_Tasks
 Value to be set in starpu_codelet::flags to execute the codelet functions even in simgrid mode.
 Value to be set in starpu_codelet::flags to execute the codelet functions even in simgrid mode.
 
 
+\def STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT
+\ingroup API_Codelet_And_Tasks
+Value to be set in starpu_codelet::flags to execute the codelet functions even in simgrid mode,
+and later inject the measured timing inside the simulation.
+
 \typedef starpu_cpu_func_t
 \typedef starpu_cpu_func_t
 \ingroup API_Codelet_And_Tasks
 \ingroup API_Codelet_And_Tasks
 CPU implementation of a codelet.
 CPU implementation of a codelet.

+ 12 - 3
examples/Makefile.am

@@ -77,11 +77,13 @@ EXTRA_DIST = 					\
 	scheduler/schedulers.sh				\
 	scheduler/schedulers.sh				\
 	scheduler/schedulers_context.sh			\
 	scheduler/schedulers_context.sh			\
 	fortran/Makefile				\
 	fortran/Makefile				\
-	sched_ctx/axpy_partition_gpu.h				\
-	sched_ctx/axpy_partition_gpu.cu
+	sched_ctx/axpy_partition_gpu.h			\
+	sched_ctx/axpy_partition_gpu.cu			\
+	heat/heat.sh					\
+	lu/lu.sh
 
 
 
 
-CLEANFILES = *.gcno *.gcda *.linkinfo *.mod starpu_idle_microsec.log
+CLEANFILES = *.gcno *.gcda *.linkinfo *.mod starpu_idle_microsec.log *.mps *.dot *.pl
 
 
 if STARPU_USE_CUDA
 if STARPU_USE_CUDA
 
 
@@ -300,6 +302,13 @@ STARPU_EXAMPLES +=				\
 	heat/heat				\
 	heat/heat				\
 	cg/cg					\
 	cg/cg					\
 	pipeline/pipeline
 	pipeline/pipeline
+
+if !STARPU_USE_MPI_MASTER_SLAVE
+TESTS += \
+	heat/heat.sh				\
+	lu/lu.sh
+
+endif
 endif
 endif
 endif
 endif
 
 

+ 105 - 79
examples/heat/dw_sparse_cg.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010, 2011, 2015  Université de Bordeaux
+ * Copyright (C) 2009, 2010, 2011, 2015, 2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
  * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -25,11 +25,7 @@
 
 
 static struct starpu_task *create_task(starpu_tag_t id)
 static struct starpu_task *create_task(starpu_tag_t id)
 {
 {
-	struct starpu_codelet *cl = calloc(1,sizeof(struct starpu_codelet));
-
 	struct starpu_task *task = starpu_task_create();
 	struct starpu_task *task = starpu_task_create();
-		task->cl = cl;
-		task->cl_arg = NULL;
 		task->use_tag = 1;
 		task->use_tag = 1;
 		task->tag_id = id;
 		task->tag_id = id;
 
 
@@ -131,6 +127,30 @@ void init_problem(void)
  *	cg initialization phase
  *	cg initialization phase
  */
  */
 
 
+static struct starpu_codelet cl1 = {
+	.cpu_funcs = { cpu_codelet_func_1 },
+	.cpu_funcs_name = { "cpu_codelet_func_1" },
+	.nbuffers = 4,
+	.modes = { STARPU_R, STARPU_R, STARPU_W, STARPU_R },
+};
+
+static struct starpu_codelet cl2 = {
+	.cpu_funcs = { cpu_codelet_func_2 },
+	.cpu_funcs_name = { "cpu_codelet_func_2" },
+	.nbuffers = 2,
+	.modes = { STARPU_W, STARPU_R },
+};
+
+static struct starpu_codelet cl3 = {
+	.cpu_funcs = { cpu_codelet_func_3 },
+	.cpu_funcs_name = { "cpu_codelet_func_3" },
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = { cublas_codelet_func_3 },
+#endif
+	.nbuffers = 1,
+	.modes = { STARPU_R },
+};
+
 void init_cg(struct cg_problem *problem)
 void init_cg(struct cg_problem *problem)
 {
 {
 	int ret;
 	int ret;
@@ -139,14 +159,7 @@ void init_cg(struct cg_problem *problem)
 
 
 	/* r = b  - A x */
 	/* r = b  - A x */
 	struct starpu_task *task1 = create_task(1UL);
 	struct starpu_task *task1 = create_task(1UL);
-	task1->cl->cpu_funcs[0] = cpu_codelet_func_1;
-	task1->cl->cpu_funcs_name[0] = "cpu_codelet_func_1";
-	task1->cl->nbuffers = 4;
-	task1->cl->modes[0] = STARPU_R;
-	task1->cl->modes[1] = STARPU_R;
-	task1->cl->modes[2] = STARPU_W;
-	task1->cl->modes[3] = STARPU_R;
-
+	task1->cl = &cl1;
 	task1->handles[0] = problem->ds_matrixA;
 	task1->handles[0] = problem->ds_matrixA;
 	task1->handles[1] = problem->ds_vecx;
 	task1->handles[1] = problem->ds_vecx;
 	task1->handles[2] = problem->ds_vecr;
 	task1->handles[2] = problem->ds_vecr;
@@ -154,12 +167,7 @@ void init_cg(struct cg_problem *problem)
 
 
 	/* d = r */
 	/* d = r */
 	struct starpu_task *task2 = create_task(2UL);
 	struct starpu_task *task2 = create_task(2UL);
-	task2->cl->cpu_funcs[0] = cpu_codelet_func_2;
-	task2->cl->cpu_funcs_name[0] = "cpu_codelet_func_2";
-	task2->cl->nbuffers = 2;
-	task2->cl->modes[0] = STARPU_W;
-	task2->cl->modes[1] = STARPU_R;
-
+	task2->cl = &cl2;
 	task2->handles[0] = problem->ds_vecd;
 	task2->handles[0] = problem->ds_vecd;
 	task2->handles[1] = problem->ds_vecr;
 	task2->handles[1] = problem->ds_vecr;
 
 
@@ -167,15 +175,9 @@ void init_cg(struct cg_problem *problem)
 
 
 	/* delta_new = trans(r) r */
 	/* delta_new = trans(r) r */
 	struct starpu_task *task3 = create_task(3UL);
 	struct starpu_task *task3 = create_task(3UL);
-#ifdef STARPU_USE_CUDA
-	task3->cl->cuda_funcs[0] = cublas_codelet_func_3;
-#endif
-	task3->cl->cpu_funcs[0] = cpu_codelet_func_3;
-	task3->cl->cpu_funcs_name[0] = "cpu_codelet_func_3";
+	task3->cl = &cl3;
 	task3->cl_arg = problem;
 	task3->cl_arg = problem;
 	task3->cl_arg_size = sizeof(*problem);
 	task3->cl_arg_size = sizeof(*problem);
-	task3->cl->nbuffers = 1;
-	task3->cl->modes[0] = STARPU_R;
 	task3->handles[0] = problem->ds_vecr;
 	task3->handles[0] = problem->ds_vecr;
 
 
 	task3->callback_func = iteration_cg;
 	task3->callback_func = iteration_cg;
@@ -186,6 +188,11 @@ void init_cg(struct cg_problem *problem)
 
 
 	/* launch the computation now */
 	/* launch the computation now */
 	ret = starpu_task_submit(task1);
 	ret = starpu_task_submit(task1);
+	if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
+		FPRINTF(stderr, "No worker may execute this task\n");
+		exit(0);
+	}
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	ret = starpu_task_submit(task2);
 	ret = starpu_task_submit(task2);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
@@ -198,6 +205,66 @@ void init_cg(struct cg_problem *problem)
  *		the codelet code launcher is its own callback !
  *		the codelet code launcher is its own callback !
  */
  */
 
 
+static struct starpu_codelet cl4 = {
+	.cpu_funcs = { cpu_codelet_func_4 },
+	.cpu_funcs_name = { "cpu_codelet_func_4" },
+	.nbuffers = 3,
+	.modes = { STARPU_R, STARPU_R, STARPU_W },
+};
+
+static struct starpu_codelet cl5 = {
+	.cpu_funcs = { cpu_codelet_func_5 },
+	.cpu_funcs_name = { "cpu_codelet_func_5" },
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = { cublas_codelet_func_5 },
+#endif
+	.nbuffers = 2,
+	.modes = { STARPU_R, STARPU_R },
+};
+
+static struct starpu_codelet cl6 = {
+	.cpu_funcs = { cpu_codelet_func_6 },
+	.cpu_funcs_name = { "cpu_codelet_func_6" },
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = { cublas_codelet_func_6 },
+	.cuda_flags = { STARPU_CUDA_ASYNC },
+#endif
+	.nbuffers = 2,
+	.modes = { STARPU_RW, STARPU_R },
+};
+
+static struct starpu_codelet cl7 = {
+	.cpu_funcs = { cpu_codelet_func_7 },
+	.cpu_funcs_name = { "cpu_codelet_func_7" },
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = { cublas_codelet_func_7 },
+	.cuda_flags = { STARPU_CUDA_ASYNC },
+#endif
+	.nbuffers = 2,
+	.modes = { STARPU_RW, STARPU_R },
+};
+
+static struct starpu_codelet cl8 = {
+	.cpu_funcs = { cpu_codelet_func_8 },
+	.cpu_funcs_name = { "cpu_codelet_func_8" },
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = { cublas_codelet_func_8 },
+#endif
+	.nbuffers = 1,
+	.modes = { STARPU_R },
+};
+
+static struct starpu_codelet cl9 = {
+	.cpu_funcs = { cpu_codelet_func_9 },
+	.cpu_funcs_name = { "cpu_codelet_func_9" },
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = { cublas_codelet_func_9 },
+	.cuda_flags = { STARPU_CUDA_ASYNC },
+#endif
+	.nbuffers = 2,
+	.modes = { STARPU_RW, STARPU_R },
+};
+
 void launch_new_cg_iteration(struct cg_problem *problem)
 void launch_new_cg_iteration(struct cg_problem *problem)
 {
 {
 	int ret;
 	int ret;
@@ -208,30 +275,16 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
 
 	/* q = A d */
 	/* q = A d */
 	struct starpu_task *task4 = create_task(maskiter | 4UL);
 	struct starpu_task *task4 = create_task(maskiter | 4UL);
-	task4->cl->cpu_funcs[0] = cpu_codelet_func_4;
-	task4->cl->cpu_funcs_name[0] = "cpu_codelet_func_4";
-	task4->cl->nbuffers = 3;
-	task4->cl->modes[0] = STARPU_R;
-	task4->cl->modes[1] = STARPU_R;
-	task4->cl->modes[2] = STARPU_W;
-
+	task4->cl = &cl4;
 	task4->handles[0] = problem->ds_matrixA;
 	task4->handles[0] = problem->ds_matrixA;
 	task4->handles[1] = problem->ds_vecd;
 	task4->handles[1] = problem->ds_vecd;
 	task4->handles[2] = problem->ds_vecq;
 	task4->handles[2] = problem->ds_vecq;
 
 
 	/* alpha = delta_new / ( trans(d) q )*/
 	/* alpha = delta_new / ( trans(d) q )*/
 	struct starpu_task *task5 = create_task(maskiter | 5UL);
 	struct starpu_task *task5 = create_task(maskiter | 5UL);
-#ifdef STARPU_USE_CUDA
-	task5->cl->cuda_funcs[0] = cublas_codelet_func_5;
-#endif
-	task5->cl->cpu_funcs[0] = cpu_codelet_func_5;
-	task5->cl->cpu_funcs_name[0] = "cpu_codelet_func_5";
+	task5->cl = &cl5;
 	task5->cl_arg = problem;
 	task5->cl_arg = problem;
 	task5->cl_arg_size = sizeof(*problem);
 	task5->cl_arg_size = sizeof(*problem);
-	task5->cl->nbuffers = 2;
-	task5->cl->modes[0] = STARPU_R;
-	task5->cl->modes[1] = STARPU_R;
-
 	task5->handles[0] = problem->ds_vecd;
 	task5->handles[0] = problem->ds_vecd;
 	task5->handles[1] = problem->ds_vecq;
 	task5->handles[1] = problem->ds_vecq;
 
 
@@ -239,18 +292,9 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
 
 	/* x = x + alpha d */
 	/* x = x + alpha d */
 	struct starpu_task *task6 = create_task(maskiter | 6UL);
 	struct starpu_task *task6 = create_task(maskiter | 6UL);
-#ifdef STARPU_USE_CUDA
-	task6->cl->cuda_funcs[0] = cublas_codelet_func_6;
-	task6->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
-#endif
-	task6->cl->cpu_funcs[0] = cpu_codelet_func_6;
-	task6->cl->cpu_funcs_name[0] = "cpu_codelet_func_6";
+	task6->cl = &cl6;
 	task6->cl_arg = problem;
 	task6->cl_arg = problem;
 	task6->cl_arg_size = sizeof(*problem);
 	task6->cl_arg_size = sizeof(*problem);
-	task6->cl->nbuffers = 2;
-	task6->cl->modes[0] = STARPU_RW;
-	task6->cl->modes[1] = STARPU_R;
-
 	task6->handles[0] = problem->ds_vecx;
 	task6->handles[0] = problem->ds_vecx;
 	task6->handles[1] = problem->ds_vecd;
 	task6->handles[1] = problem->ds_vecd;
 
 
@@ -258,18 +302,9 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
 
 	/* r = r - alpha q */
 	/* r = r - alpha q */
 	struct starpu_task *task7 = create_task(maskiter | 7UL);
 	struct starpu_task *task7 = create_task(maskiter | 7UL);
-#ifdef STARPU_USE_CUDA
-	task7->cl->cuda_funcs[0] = cublas_codelet_func_7;
-	task7->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
-#endif
-	task7->cl->cpu_funcs[0] = cpu_codelet_func_7;
-	task7->cl->cpu_funcs_name[0] = "cpu_codelet_func_7";
+	task7->cl = &cl7;
 	task7->cl_arg = problem;
 	task7->cl_arg = problem;
 	task7->cl_arg_size = sizeof(*problem);
 	task7->cl_arg_size = sizeof(*problem);
-	task7->cl->nbuffers = 2;
-	task7->cl->modes[0] = STARPU_RW;
-	task7->cl->modes[1] = STARPU_R;
-
 	task7->handles[0] = problem->ds_vecr;
 	task7->handles[0] = problem->ds_vecr;
 	task7->handles[1] = problem->ds_vecq;
 	task7->handles[1] = problem->ds_vecq;
 
 
@@ -277,33 +312,18 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
 
 	/* update delta_* and compute beta */
 	/* update delta_* and compute beta */
 	struct starpu_task *task8 = create_task(maskiter | 8UL);
 	struct starpu_task *task8 = create_task(maskiter | 8UL);
-#ifdef STARPU_USE_CUDA
-	task8->cl->cuda_funcs[0] = cublas_codelet_func_8;
-#endif
-	task8->cl->cpu_funcs[0] = cpu_codelet_func_8;
-	task8->cl->cpu_funcs_name[0] = "cpu_codelet_func_8";
+	task8->cl = &cl8;
 	task8->cl_arg = problem;
 	task8->cl_arg = problem;
 	task8->cl_arg_size = sizeof(*problem);
 	task8->cl_arg_size = sizeof(*problem);
-	task8->cl->nbuffers = 1;
-	task8->cl->modes[0] = STARPU_R;
 	task8->handles[0] = problem->ds_vecr;
 	task8->handles[0] = problem->ds_vecr;
 
 
 	starpu_tag_declare_deps((starpu_tag_t)(maskiter | 8UL), 1, (starpu_tag_t)(maskiter | 7UL));
 	starpu_tag_declare_deps((starpu_tag_t)(maskiter | 8UL), 1, (starpu_tag_t)(maskiter | 7UL));
 
 
 	/* d = r + beta d */
 	/* d = r + beta d */
 	struct starpu_task *task9 = create_task(maskiter | 9UL);
 	struct starpu_task *task9 = create_task(maskiter | 9UL);
-#ifdef STARPU_USE_CUDA
-	task9->cl->cuda_funcs[0] = cublas_codelet_func_9;
-	task9->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
-#endif
-	task9->cl->cpu_funcs[0] = cpu_codelet_func_9;
-	task9->cl->cpu_funcs_name[0] = "cpu_codelet_func_9";
+	task9->cl = &cl9;
 	task9->cl_arg = problem;
 	task9->cl_arg = problem;
 	task9->cl_arg_size = sizeof(*problem);
 	task9->cl_arg_size = sizeof(*problem);
-	task9->cl->nbuffers = 2;
-	task9->cl->modes[0] = STARPU_RW;
-	task9->cl->modes[1] = STARPU_R;
-
 	task9->handles[0] = problem->ds_vecd;
 	task9->handles[0] = problem->ds_vecd;
 	task9->handles[1] = problem->ds_vecr;
 	task9->handles[1] = problem->ds_vecr;
 
 
@@ -427,6 +447,10 @@ void conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz,
 	starpu_data_unregister(ds_vecr);
 	starpu_data_unregister(ds_vecr);
 	starpu_data_unregister(ds_vecd);
 	starpu_data_unregister(ds_vecd);
 	starpu_data_unregister(ds_vecq);
 	starpu_data_unregister(ds_vecq);
+
+	free(ptr_vecr);
+	free(ptr_vecd);
+	free(ptr_vecq);
 }
 }
 
 
 
 
@@ -444,4 +468,6 @@ void do_conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz
 	starpu_cublas_init();
 	starpu_cublas_init();
 
 
 	conjugate_gradient(nzvalA, vecb, vecx, nnz, nrow, colind, rowptr);
 	conjugate_gradient(nzvalA, vecb, vecx, nnz, nrow, colind, rowptr);
+
+	starpu_shutdown();
 }
 }

+ 7 - 3
examples/heat/heat.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009, 2010, 2012, 2015  Université de Bordeaux
+ * Copyright (C) 2009, 2010, 2012, 2015, 2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2016  CNRS
  * Copyright (C) 2010, 2011, 2012, 2016  CNRS
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -121,9 +121,9 @@ static void parse_args(int argc, char **argv)
 			STARPU_ASSERT((nthick - 2)*(ntheta - 2) == size);
 			STARPU_ASSERT((nthick - 2)*(ntheta - 2) == size);
 		}
 		}
 
 
-		if (strcmp(argv[i], "-h") == 0)
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-help") == 0)
 		{
 		{
-			printf("usage : %s [-v1|-v2|-v3] [-pin] [-nthick number] [-ntheta number] [-shape [0|1|2]] [-cg] [-size number] [-no-prio]\n", argv[0]);
+			printf("usage : %s [-v1|-v2|-v3|-v4] [-pin] [-nthick number] [-ntheta number] [-shape [0|1|2]] [-cg] [-size number] [-no-prio]\n", argv[0]);
 		}
 		}
 	}
 	}
 }
 }
@@ -751,6 +751,10 @@ int main(int argc, char **argv)
 			result[TRANSLATE(i)] = Bformer[TRANSLATE(i)];
 			result[TRANSLATE(i)] = Bformer[TRANSLATE(i)];
 		}
 		}
 
 
+		free(nzval);
+		free(colind);
+		free(rowptr);
+		free(B);
 	}
 	}
 	else
 	else
 	{
 	{

+ 43 - 0
examples/heat/heat.sh

@@ -0,0 +1,43 @@
+#!/bin/bash
+#
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2017  Université de Bordeaux
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+# Test various LU options
+
+set -e
+
+PREFIX=$(dirname $0)
+
+$PREFIX/heat -shape 0
+$PREFIX/heat -shape 1
+# sometimes lead to pivot being 0
+#$PREFIX/heat -shape 2
+
+$PREFIX/heat -cg
+
+# TODO: FIXME
+
+# segfault
+#$PREFIX/heat -v1
+
+# (actually the default...)
+$PREFIX/heat -v2
+
+# hang
+#$PREFIX/heat -v3
+
+# hang
+#$PREFIX/heat -v4

+ 34 - 0
examples/lu/lu.sh

@@ -0,0 +1,34 @@
+#!/bin/bash
+#
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2017  Université de Bordeaux
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+# Test various LU options
+
+set -e
+
+PREFIX=$(dirname $0)
+
+$PREFIX/lu_implicit_example_float -size $((960 * 4)) -nblocks 4 -piv
+$PREFIX/lu_implicit_example_float -size $((960 * 4)) -nblocks 4 -no-stride
+$PREFIX/lu_implicit_example_float -size $((960 * 4)) -nblocks 4 -bound
+$PREFIX/lu_implicit_example_float -size $((960 * 2)) -nblocks 2 -bounddeps
+$PREFIX/lu_implicit_example_float -size $((960 * 2)) -nblocks 2 -bound -bounddeps -bounddepsprio
+
+$PREFIX/lu_example_float -size $((960 * 4)) -nblocks 4 -piv
+$PREFIX/lu_example_float -size $((960 * 4)) -nblocks 4 -no-stride
+$PREFIX/lu_example_float -size $((960 * 4)) -nblocks 4 -bound
+$PREFIX/lu_example_float -size $((960 * 2)) -nblocks 2 -bounddeps
+$PREFIX/lu_example_float -size $((960 * 2)) -nblocks 2 -bound -bounddeps -bounddepsprio

+ 3 - 1
examples/lu/lu_example.c

@@ -422,13 +422,15 @@ int main(int argc, char **argv)
 		if (pivot)
 		if (pivot)
 		{
 		{
 			pivot_saved_matrix(ipiv);
 			pivot_saved_matrix(ipiv);
-			free(ipiv);
 		}
 		}
 
 
 		check_result();
 		check_result();
 	}
 	}
 #endif
 #endif
 
 
+	if (pivot)
+		free(ipiv);
+
 	starpu_free_flags(A, (size_t)size*size*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
 	starpu_free_flags(A, (size_t)size*size*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
 
 
 	starpu_cublas_shutdown();
 	starpu_cublas_shutdown();

+ 4 - 0
examples/lu/xlu_implicit_pivot.c

@@ -232,6 +232,10 @@ starpu_data_handle_t get_block_with_striding(starpu_data_handle_t *dataAp,
 
 
 int STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks)
 int STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks)
 {
 {
+	if (starpu_mic_worker_get_count() || starpu_scc_worker_get_count() || starpu_mpi_ms_worker_get_count())
+		/* These won't work with pivoting: we pass a pointer in cl_args */
+		return -ENODEV;
+
 	starpu_data_handle_t dataA;
 	starpu_data_handle_t dataA;
 
 
 	/* monitor and partition the A matrix into blocks :
 	/* monitor and partition the A matrix into blocks :

+ 5 - 0
examples/lu/xlu_pivot.c

@@ -399,6 +399,7 @@ int STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size,
 
 
 	/* gather all the data */
 	/* gather all the data */
 	starpu_data_unpartition(dataA, STARPU_MAIN_RAM);
 	starpu_data_unpartition(dataA, STARPU_MAIN_RAM);
+	starpu_data_unregister(dataA);
 	free(piv_description);
 	free(piv_description);
 
 
 	return ret;
 	return ret;
@@ -413,6 +414,10 @@ starpu_data_handle_t get_block_with_no_striding(starpu_data_handle_t *dataAp, un
 
 
 int STARPU_LU(lu_decomposition_pivot_no_stride)(TYPE **matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks)
 int STARPU_LU(lu_decomposition_pivot_no_stride)(TYPE **matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks)
 {
 {
+	if (starpu_mic_worker_get_count() || starpu_scc_worker_get_count() || starpu_mpi_ms_worker_get_count())
+		/* These won't work with pivoting: we pass a pointer in cl_args */
+		return -ENODEV;
+
 	starpu_data_handle_t *dataAp = malloc(nblocks*nblocks*sizeof(starpu_data_handle_t));
 	starpu_data_handle_t *dataAp = malloc(nblocks*nblocks*sizeof(starpu_data_handle_t));
 
 
 	/* monitor and partition the A matrix into blocks :
 	/* monitor and partition the A matrix into blocks :

+ 46 - 20
examples/mlr/mlr.c

@@ -50,7 +50,15 @@ static long sum;
 static void cl_params(struct starpu_task *task, double *parameters)
 static void cl_params(struct starpu_task *task, double *parameters)
 {
 {
 	int m, n, k;
 	int m, n, k;
-	starpu_codelet_unpack_args(task->cl_arg, &m, &n, &k);
+	int* vector_mn;
+	starpu_data_handle_t vector_mn_handle;
+
+	vector_mn = (int*)STARPU_VECTOR_GET_PTR(task->interfaces[0]);
+	m = vector_mn[0];
+	n = vector_mn[1];
+
+	starpu_codelet_unpack_args(task->cl_arg, &k);
+
 	parameters[0] = m;
 	parameters[0] = m;
 	parameters[1] = n;
 	parameters[1] = n;
 	parameters[2] = k;
 	parameters[2] = k;
@@ -61,10 +69,13 @@ void cpu_func(void *buffers[], void *cl_arg)
 {
 {
 	long i;
 	long i;
 	int m,n,k;
 	int m,n,k;
-	starpu_codelet_unpack_args(cl_arg,
-			     	  &m,
-     			     	  &n,
-     			     	  &k);
+	int* vector_mn;
+
+	vector_mn = (int*)STARPU_VECTOR_GET_PTR(buffers[0]);
+	m = vector_mn[0];
+	n = vector_mn[1];
+
+	starpu_codelet_unpack_args(cl_arg, &k);
 
 
 	for(i=0; i < (long) (m*m*n); i++)
 	for(i=0; i < (long) (m*m*n); i++)
 		sum+=i;
 		sum+=i;
@@ -123,7 +134,8 @@ static struct starpu_codelet cl_init =
 {
 {
 	.cpu_funcs = { cpu_func },
 	.cpu_funcs = { cpu_func },
 	.cpu_funcs_name = { "cpu_func" },
 	.cpu_funcs_name = { "cpu_func" },
-	.nbuffers = 0,
+	.nbuffers = 1,
+	.modes = {STARPU_R},
 	.model = &cl_model_init,
 	.model = &cl_model_init,
 };
 };
 
 
@@ -131,7 +143,8 @@ static struct starpu_codelet cl_final =
 {
 {
 	.cpu_funcs = { cpu_func },
 	.cpu_funcs = { cpu_func },
 	.cpu_funcs_name = { "cpu_func" },
 	.cpu_funcs_name = { "cpu_func" },
-	.nbuffers = 0,
+	.nbuffers = 1,
+	.modes = {STARPU_R},
 	.model = &cl_model_final,
 	.model = &cl_model_final,
 };
 };
 
 
@@ -147,29 +160,42 @@ int main(int argc, char **argv)
 
 
 	sum=0;
 	sum=0;
 	int m,n,k;
 	int m,n,k;
+	int* vector_mn = malloc( 2 * sizeof(int) );
+	starpu_data_handle_t vector_mn_handle;
+
+	starpu_vector_data_register( &vector_mn_handle,
+				     STARPU_MAIN_RAM,
+				     (uintptr_t)vector_mn, 2,
+				     sizeof(int) );
 
 
-        /* Giving pseudo-random values to the M,N,K parameters and inserting tasks */
-	for(i=0; i < 42; i++)
+	/* Giving pseudo-random values to the M,N,K parameters and inserting tasks */
+	for ( i = 0; i < 42; i++)
 	{
 	{
 		m = (int) ((rand() % 10)+1);
 		m = (int) ((rand() % 10)+1);
 		n = (int) ((rand() % 10)+1);
 		n = (int) ((rand() % 10)+1);
 		k = (int) ((rand() % 10)+1);
 		k = (int) ((rand() % 10)+1);
 
 
-		for(j=0; j < 42; j++)
+		/* To illustrate the usage, M and N are stored in a data handle */
+		starpu_data_acquire(vector_mn_handle, STARPU_W);
+		vector_mn[0] = m;
+		vector_mn[1] = n;
+		starpu_data_release(vector_mn_handle);
+
+		for ( j = 0; j < 42; j++)
 		{
 		{
-			starpu_insert_task(&cl_init,
-				   STARPU_VALUE, &m, sizeof(int),
-				   STARPU_VALUE, &n, sizeof(int),
-				   STARPU_VALUE, &k, sizeof(int),
-				   0);
-			starpu_insert_task(&cl_final,
-				   STARPU_VALUE, &m, sizeof(int),
-				   STARPU_VALUE, &n, sizeof(int),
-				   STARPU_VALUE, &k, sizeof(int),
-				   0);
+			starpu_insert_task( &cl_init,
+					    STARPU_R, vector_mn_handle,
+					    STARPU_VALUE, &k, sizeof(int),
+					    0 );
+			starpu_insert_task( &cl_final,
+					    STARPU_R, vector_mn_handle,
+					    STARPU_VALUE, &k, sizeof(int),
+					    0 );
 		}
 		}
 	}
 	}
 
 
+	starpu_data_unregister(vector_mn_handle);
+	free(vector_mn);
 	starpu_shutdown();
 	starpu_shutdown();
 
 
 	return 0;
 	return 0;

+ 5 - 3
examples/sched_ctx/gpu_partition.c

@@ -105,7 +105,9 @@ int main(int argc, char **argv)
 	int ncuda = 0;
 	int ncuda = 0;
 	int gpu_devid = -1;
 	int gpu_devid = -1;
 
 
+#ifdef STARPU_DEVEL
 #warning temporary fix: skip test as cuda computation fails
 #warning temporary fix: skip test as cuda computation fails
+#endif
  	return 77;
  	return 77;
 
 
 #ifndef STARPU_HAVE_SETENV
 #ifndef STARPU_HAVE_SETENV
@@ -172,8 +174,8 @@ int main(int argc, char **argv)
 	int ncpus = starpu_cpu_worker_get_count();
 	int ncpus = starpu_cpu_worker_get_count();
 	int workers[ncpus+nstreams];
 	int workers[ncpus+nstreams];
 	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, workers, ncpus);
 	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, workers, ncpus);
-	
-	int sched_ctxs[nstreams];
+
+	unsigned sched_ctxs[nstreams];
 	int nsms[nstreams];
 	int nsms[nstreams];
 	nsms[0] = 6;
 	nsms[0] = 6;
 	nsms[1] = 7;
 	nsms[1] = 7;
@@ -185,7 +187,7 @@ int main(int argc, char **argv)
 	}
 	}
 	unsigned sched_ctx1 = starpu_sched_ctx_create(workers, ncpus+nstreams, "ctx1", STARPU_SCHED_CTX_SUB_CTXS, sched_ctxs, nstreams, STARPU_SCHED_CTX_POLICY_NAME, "dmdas", 0);
 	unsigned sched_ctx1 = starpu_sched_ctx_create(workers, ncpus+nstreams, "ctx1", STARPU_SCHED_CTX_SUB_CTXS, sched_ctxs, nstreams, STARPU_SCHED_CTX_POLICY_NAME, "dmdas", 0);
 
 
-	FPRINTF(stderr, "parent ctx %d\n", sched_ctx1);
+	FPRINTF(stderr, "parent ctx %u\n", sched_ctx1);
 	starpu_sched_ctx_set_context(&sched_ctx1);
 	starpu_sched_ctx_set_context(&sched_ctx1);
 
 
 #endif
 #endif

+ 2 - 3
examples/stencil/stencil-blocks.c

@@ -297,11 +297,10 @@ void allocate_memory_on_node(int rank)
 
 
 		int node = block->mpi_node;
 		int node = block->mpi_node;
 
 
-		unsigned size_bz = block_sizes_z[bz];
-
 		/* Main blocks */
 		/* Main blocks */
 		if (node == rank)
 		if (node == rank)
 		{
 		{
+			unsigned size_bz = block_sizes_z[bz];
 			allocate_block_on_node(&block->layers_handle[0], bz, &block->layers[0],
 			allocate_block_on_node(&block->layers_handle[0], bz, &block->layers[0],
 						(sizex + 2*K), (sizey + 2*K), (size_bz + 2*K));
 						(sizex + 2*K), (sizey + 2*K), (size_bz + 2*K));
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
@@ -389,8 +388,8 @@ void check(int rank)
 		/* Main blocks */
 		/* Main blocks */
 		if (node == rank)
 		if (node == rank)
 		{
 		{
-			unsigned size_bz = block_sizes_z[bz];
 #ifdef LIFE
 #ifdef LIFE
+			unsigned size_bz = block_sizes_z[bz];
 			unsigned x, y, z;
 			unsigned x, y, z;
 			unsigned sum = 0;
 			unsigned sum = 0;
 			for (x = 0; x < sizex; x++)
 			for (x = 0; x < sizex; x++)

+ 5 - 2
include/fstarpu_mod.f90

@@ -82,6 +82,7 @@ module fstarpu_mod
         type(c_ptr), bind(C) :: FSTARPU_SCC
         type(c_ptr), bind(C) :: FSTARPU_SCC
 
 
         type(c_ptr), bind(C) :: FSTARPU_CODELET_SIMGRID_EXECUTE
         type(c_ptr), bind(C) :: FSTARPU_CODELET_SIMGRID_EXECUTE
+        type(c_ptr), bind(C) :: FSTARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT
         type(c_ptr), bind(C) :: FSTARPU_CUDA_ASYNC
         type(c_ptr), bind(C) :: FSTARPU_CUDA_ASYNC
         type(c_ptr), bind(C) :: FSTARPU_OPENCL_ASYNC
         type(c_ptr), bind(C) :: FSTARPU_OPENCL_ASYNC
 
 
@@ -1580,7 +1581,7 @@ module fstarpu_mod
                 end subroutine fstarpu_memchunk_tidy
                 end subroutine fstarpu_memchunk_tidy
 
 
                 ! == starpu_task_util.h ==
                 ! == starpu_task_util.h ==
-                ! struct starpu_data_handle *fstarpu_data_handle_array_alloc(int nb);
+                ! starpu_data_handle_t *fstarpu_data_handle_array_alloc(int nb);
                 function fstarpu_data_handle_array_alloc (nb) bind(C)
                 function fstarpu_data_handle_array_alloc (nb) bind(C)
                         use iso_c_binding, only: c_ptr, c_int
                         use iso_c_binding, only: c_ptr, c_int
                         type(c_ptr) :: fstarpu_data_handle_array_alloc
                         type(c_ptr) :: fstarpu_data_handle_array_alloc
@@ -2331,7 +2332,9 @@ module fstarpu_mod
                             fstarpu_get_constant(C_CHAR_"FSTARPU_SCC"//C_NULL_CHAR)
                             fstarpu_get_constant(C_CHAR_"FSTARPU_SCC"//C_NULL_CHAR)
 
 
                         FSTARPU_CODELET_SIMGRID_EXECUTE = &
                         FSTARPU_CODELET_SIMGRID_EXECUTE = &
-                            fstarpu_get_constant(C_CHAR_"FSTARPU_CODELET_SIMGRID_EXECUTE"//C_NULL_CHAR)
+                             fstarpu_get_constant(C_CHAR_"FSTARPU_CODELET_SIMGRID_EXECUTE"//C_NULL_CHAR)
+                        FSTARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT = &
+                             fstarpu_get_constant(C_CHAR_"FSTARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT"//C_NULL_CHAR)
                         FSTARPU_CUDA_ASYNC = &
                         FSTARPU_CUDA_ASYNC = &
                             fstarpu_get_constant(C_CHAR_"FSTARPU_CUDA_ASYNC"//C_NULL_CHAR)
                             fstarpu_get_constant(C_CHAR_"FSTARPU_CUDA_ASYNC"//C_NULL_CHAR)
                         FSTARPU_OPENCL_ASYNC = &
                         FSTARPU_OPENCL_ASYNC = &

+ 2 - 1
include/starpu_config.h.in

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009-2016  Université de Bordeaux
  * Copyright (C) 2009-2016  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016, 2017  CNRS
  * Copyright (C) 2014  INRIA
  * Copyright (C) 2014  INRIA
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -152,5 +152,6 @@ typedef ssize_t starpu_ssize_t;
 #undef STARPU_HAVE_DARWIN
 #undef STARPU_HAVE_DARWIN
 
 
 #undef STARPU_HAVE_CXX11
 #undef STARPU_HAVE_CXX11
+#undef STARPU_HAVE_STRERROR_R
 
 
 #endif
 #endif

+ 3 - 0
include/starpu_scheduler.h

@@ -62,8 +62,11 @@ unsigned long starpu_task_get_job_id(struct starpu_task *task);
 /* This function must be called to wake up a worker that is sleeping on the cond. 
 /* This function must be called to wake up a worker that is sleeping on the cond. 
  * It returns 0 whenever the worker is not in a sleeping state */
  * It returns 0 whenever the worker is not in a sleeping state */
 int starpu_wake_worker(int workerid);
 int starpu_wake_worker(int workerid);
+int starpu_wakeup_worker(int workerid, starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex);
 /* This is a version of starpu_wake_worker which assumes that the sched mutex is locked */
 /* This is a version of starpu_wake_worker which assumes that the sched mutex is locked */
 int starpu_wake_worker_locked(int workerid);
 int starpu_wake_worker_locked(int workerid);
+/* This is a version of starpu_wakeup_worker which assumes that the sched mutex is locked */
+int starpu_wakeup_worker_locked(int workerid, starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex);
 
 
 int starpu_worker_can_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
 int starpu_worker_can_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
 int starpu_worker_can_execute_task_impl(unsigned workerid, struct starpu_task *task, unsigned *impl_mask);
 int starpu_worker_can_execute_task_impl(unsigned workerid, struct starpu_task *task, unsigned *impl_mask);

+ 1 - 0
include/starpu_task.h

@@ -46,6 +46,7 @@ extern "C"
 #define STARPU_MPI_MS	((1ULL)<<9)
 #define STARPU_MPI_MS	((1ULL)<<9)
 
 
 #define STARPU_CODELET_SIMGRID_EXECUTE	(1<<0)
 #define STARPU_CODELET_SIMGRID_EXECUTE	(1<<0)
+#define STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT	(1<<1)
 #define STARPU_CUDA_ASYNC	(1<<0)
 #define STARPU_CUDA_ASYNC	(1<<0)
 #define STARPU_OPENCL_ASYNC	(1<<0)
 #define STARPU_OPENCL_ASYNC	(1<<0)
 
 

+ 41 - 10
include/starpu_thread.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010, 2012-2016  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2012-2017  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -33,6 +33,7 @@
 #endif
 #endif
 #elif !defined(_MSC_VER) || defined(BUILDING_STARPU)
 #elif !defined(_MSC_VER) || defined(BUILDING_STARPU)
 #include <pthread.h>
 #include <pthread.h>
+#include <semaphore.h>
 #endif
 #endif
 #include <stdint.h>
 #include <stdint.h>
 
 
@@ -50,8 +51,9 @@ extern "C"
 typedef msg_process_t starpu_pthread_t;
 typedef msg_process_t starpu_pthread_t;
 typedef int starpu_pthread_attr_t;
 typedef int starpu_pthread_attr_t;
 
 
+int starpu_pthread_equal(starpu_pthread_t t1, starpu_pthread_t t2);
+starpu_pthread_t starpu_pthread_self(void);
 int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg, msg_host_t host);
 int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg, msg_host_t host);
-#define starpu_pthread_setname(name)
 int starpu_pthread_create(starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg);
 int starpu_pthread_create(starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg);
 int starpu_pthread_join(starpu_pthread_t thread, void **retval);
 int starpu_pthread_join(starpu_pthread_t thread, void **retval);
 int starpu_pthread_exit(void *retval) STARPU_ATTRIBUTE_NORETURN;
 int starpu_pthread_exit(void *retval) STARPU_ATTRIBUTE_NORETURN;
@@ -64,8 +66,18 @@ int starpu_pthread_attr_setdetachstate(starpu_pthread_attr_t *attr, int detachst
 typedef pthread_t starpu_pthread_t;
 typedef pthread_t starpu_pthread_t;
 typedef pthread_attr_t starpu_pthread_attr_t;
 typedef pthread_attr_t starpu_pthread_attr_t;
 
 
+#define starpu_pthread_equal pthread_equal
+#define starpu_pthread_self pthread_self
 #define starpu_pthread_create pthread_create
 #define starpu_pthread_create pthread_create
 #define starpu_pthread_create_on(name, thread, attr, routine, arg, where) starpu_pthread_create(thread, attr, routine, arg)
 #define starpu_pthread_create_on(name, thread, attr, routine, arg, where) starpu_pthread_create(thread, attr, routine, arg)
+#define starpu_pthread_join pthread_join
+#define starpu_pthread_exit pthread_exit
+#define starpu_pthread_attr_init pthread_attr_init
+#define starpu_pthread_attr_destroy pthread_attr_destroy
+#define starpu_pthread_attr_setdetachstate pthread_attr_setdetachstate
+
+#endif /* STARPU_SIMGRID, _MSC_VER */
+
 #ifdef STARPU_HAVE_PTHREAD_SETNAME_NP
 #ifdef STARPU_HAVE_PTHREAD_SETNAME_NP
 #ifdef STARPU_HAVE_DARWIN
 #ifdef STARPU_HAVE_DARWIN
 #define starpu_pthread_setname(name) pthread_setname_np(name)
 #define starpu_pthread_setname(name) pthread_setname_np(name)
@@ -75,13 +87,6 @@ typedef pthread_attr_t starpu_pthread_attr_t;
 #else
 #else
 #define starpu_pthread_setname(name)
 #define starpu_pthread_setname(name)
 #endif
 #endif
-#define starpu_pthread_join pthread_join
-#define starpu_pthread_exit pthread_exit
-#define starpu_pthread_attr_init pthread_attr_init
-#define starpu_pthread_attr_destroy pthread_attr_destroy
-#define starpu_pthread_attr_setdetachstate pthread_attr_setdetachstate
-
-#endif /* STARPU_SIMGRID, _MSC_VER */
 
 
 /*
 /*
  * Encapsulation of the pthread_mutex_* functions.
  * Encapsulation of the pthread_mutex_* functions.
@@ -403,6 +408,32 @@ int starpu_pthread_wait_wait(starpu_pthread_wait_t *w);
 int starpu_pthread_wait_destroy(starpu_pthread_wait_t *w);
 int starpu_pthread_wait_destroy(starpu_pthread_wait_t *w);
 #endif
 #endif
 
 
+/*
+ * Encapsulation of the semaphore functions.
+ */
+
+#ifdef STARPU_SIMGRID
+
+typedef msg_sem_t starpu_sem_t;
+int starpu_sem_destroy(starpu_sem_t *);
+int starpu_sem_getvalue(starpu_sem_t *, int *);
+int starpu_sem_init(starpu_sem_t *, int, unsigned);
+int starpu_sem_post(starpu_sem_t *);
+int starpu_sem_trywait(starpu_sem_t *);
+int starpu_sem_wait(starpu_sem_t *);
+
+#elif !defined(_MSC_VER) || defined(BUILDING_STARPU) /* !STARPU_SIMGRID */
+
+typedef sem_t starpu_sem_t;
+#define starpu_sem_destroy sem_destroy
+#define starpu_sem_getvalue sem_getvalue
+#define starpu_sem_init sem_init
+#define starpu_sem_post sem_post
+int starpu_sem_trywait(starpu_sem_t *);
+int starpu_sem_wait(starpu_sem_t *);
+
+#endif
+
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }
 #endif
 #endif

+ 109 - 0
mpi/dev/starpu_mpi_comm_check.sh

@@ -0,0 +1,109 @@
+#!/bin/bash
+
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2017 CNRS
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+# Script to check MPI communications are done properly
+# The application should be launched with STARPU_MPI_COMM=1
+# e.g
+#    $ export STARPU_MPI_COMM=1
+#    $ mpirun --output-filename starpu_mpi.log appli parameters
+# and then the script can be launched with the output files
+#    $ starpu_mpi_comm_check.sh starpu_mpi.log.*
+
+if test -z "$1"
+then
+    echo Syntax error: parameter missing
+    exit 1
+fi
+
+# Get the nodes identifiers
+nodes=$(for f in $*
+	do
+	    grep starpu_mpi $f | grep '\[' | awk '{print $1}'| sed 's/\[\(.*\)\]\[starpu_mpi\]/\1/' | grep "^[[:digit:]]*$"
+	done |sort|uniq
+     )
+echo nodes $nodes
+
+DIR=/tmp
+
+# for each node, extract send and receive communications
+for node in $nodes
+do
+    for f in $*
+    do
+	grep starpu_mpi $f |grep "\[$node"
+    done > $DIR/starpu_mpi_node$node.log
+    grep -- "-->" $DIR/starpu_mpi_node$node.log > $DIR/starpu_mpi_node${node}_send.log
+    grep -- "<--" $DIR/starpu_mpi_node$node.log > $DIR/starpu_mpi_node${node}_recv.log
+done
+
+# count the number of traced lines
+#for node in $nodes
+#do
+#    wc -l $DIR/starpu_mpi_node${node}_recv.log
+#    lines=$(grep :42:42 $DIR/starpu_mpi_node${node}_recv.log | wc -l)
+#    lines2=$(( lines + lines ))
+#    echo $lines2
+#    lines3=$(( lines2 + lines ))
+#    echo $lines3
+#done
+
+# for each pair of nodes, check tags are sent and received in the same order
+for src in $nodes
+do
+    for dst in $nodes
+    do
+	if test $src != $dst
+	then
+	    grep ":$dst:42:" $DIR/starpu_mpi_node${src}_send.log| awk -F':' '{print $6}' > $DIR/node${src}_send_to_${dst}.log
+	    grep ":$src:42:" $DIR/starpu_mpi_node${dst}_recv.log|awk -F ':' '{print $6}'> $DIR/node${dst}_recv_from_${src}.log
+ 	    diff --side-by-side  --suppress-common-lines $DIR/node${src}_send_to_${dst}.log $DIR/node${dst}_recv_from_${src}.log  > $DIR/check_$$
+	    if test -s $DIR/check_$$
+	    then
+		echo $src $dst
+		less $DIR/check_$$
+	    fi
+	fi
+    done
+done
+
+# check each envelope reception is followed by the appropriate data reception
+# first line: MPI_Recv of the envelope
+# second line: display envelope information
+# third line: MPI_Recv of the data
+for node in $nodes
+do
+    echo processing $DIR/starpu_mpi_node${node}_recv.log
+    (
+	while read line
+	do
+	    read line2
+	    read line3
+	    #echo processing
+	    tag2=$(echo $line2 | awk -F ':' '{print $6}')
+	    tag3=$(echo $line3 | awk -F ':' '{print $6}')
+	    if test "$tag2" != "$tag3"
+	    then
+		echo erreur
+		echo $tag2 $tag3
+		echo $line
+		echo $line2
+		echo $line3
+	    fi
+	done
+    ) < $DIR/starpu_mpi_node${node}_recv.log
+done
+

+ 1 - 1
sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c

@@ -27,7 +27,7 @@ static void _try_resizing(unsigned *sched_ctxs, int nsched_ctxs, int *workers, i
 	/* for vite */
 	/* for vite */
 	int ns = sched_ctxs == NULL ? sc_hypervisor_get_nsched_ctxs() : nsched_ctxs;
 	int ns = sched_ctxs == NULL ? sc_hypervisor_get_nsched_ctxs() : nsched_ctxs;
 #ifdef STARPU_SC_HYPERVISOR_DEBUG
 #ifdef STARPU_SC_HYPERVISOR_DEBUG
-	printf("resize_no = %u %d ctxs\n", resize_no, ns);
+	printf("resize_no = %lu %d ctxs\n", resize_no, ns);
 #endif
 #endif
 	if(ns <= 0) return;
 	if(ns <= 0) return;
 
 

+ 3 - 3
socl/src/cl_createkernel.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010,2011 University of Bordeaux
  * Copyright (C) 2010,2011 University of Bordeaux
- * Copyright (C) 2016  CNRS
+ * Copyright (C) 2016, 2017  CNRS
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -25,7 +25,7 @@ static void soclCreateKernel_task(void *data) {
 
 
    if (k->program->cl_programs[range] == NULL) {
    if (k->program->cl_programs[range] == NULL) {
       k->errcodes[range] = CL_SUCCESS;
       k->errcodes[range] = CL_SUCCESS;
-      DEBUG_MSG("[Device %d] Kernel creation skipped: program has not been built for this device.\n", starpu_worker_get_id_check());
+      DEBUG_MSG("[Device %u] Kernel creation skipped: program has not been built for this device.\n", starpu_worker_get_id_check());
       return;
       return;
    }
    }
 
 
@@ -163,7 +163,7 @@ soclCreateKernel(cl_program    program,
    }
    }
 
 
    /* Create kernel on each device */
    /* Create kernel on each device */
-   DEBUG_MSG("[Kernel %d] Create %d kernels (name \"%s\")\n", k->id, socl_device_count, kernel_name);
+   DEBUG_MSG("[Kernel %d] Create %u kernels (name \"%s\")\n", k->id, socl_device_count, kernel_name);
    starpu_execute_on_each_worker_ex(soclCreateKernel_task, k, STARPU_OPENCL, "SOCL_CREATE_KERNEL");
    starpu_execute_on_each_worker_ex(soclCreateKernel_task, k, STARPU_OPENCL, "SOCL_CREATE_KERNEL");
 
 
    if (errcode_ret != NULL) {
    if (errcode_ret != NULL) {

+ 2 - 2
socl/src/cl_createprogramwithsource.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010,2011 University of Bordeaux
  * Copyright (C) 2010,2011 University of Bordeaux
- * Copyright (C) 2016  CNRS
+ * Copyright (C) 2016, 2017  CNRS
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -141,7 +141,7 @@ soclCreateProgramWithSource(cl_context      context,
       *errcode_ret = CL_SUCCESS;
       *errcode_ret = CL_SUCCESS;
       for (i=0; i<socl_device_count; i++) {
       for (i=0; i<socl_device_count; i++) {
          if (data->errcodes[i] != CL_SUCCESS) {
          if (data->errcodes[i] != CL_SUCCESS) {
-            DEBUG_MSG("Worker [%d] failed\n", i);
+            DEBUG_MSG("Worker [%u] failed\n", i);
             DEBUG_CL("clCreateProgramWithSource", data->errcodes[i]);
             DEBUG_CL("clCreateProgramWithSource", data->errcodes[i]);
             *errcode_ret = data->errcodes[i];
             *errcode_ret = data->errcodes[i];
             break;
             break;

+ 6 - 6
socl/src/cl_enqueuendrangekernel.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010,2011, 2016-2017 University of Bordeaux
  * Copyright (C) 2010,2011, 2016-2017 University of Bordeaux
- * Copyright (C) 2016  CNRS
+ * Copyright (C) 2016, 2017  CNRS
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -71,13 +71,13 @@ void soclEnqueueNDRangeKernel_task(void *descr[], void *args) {
    if (err != CL_SUCCESS) {
    if (err != CL_SUCCESS) {
 	   ERROR_MSG("Worker[%d] Unable to Enqueue kernel (error %d)\n", wid, err);
 	   ERROR_MSG("Worker[%d] Unable to Enqueue kernel (error %d)\n", wid, err);
 	   DEBUG_CL("clEnqueueNDRangeKernel", err);
 	   DEBUG_CL("clEnqueueNDRangeKernel", err);
-	   DEBUG_MSG("Workdim %d, global_work_offset %p, global_work_size %p, local_work_size %p\n",
+	   DEBUG_MSG("Workdim %u, global_work_offset %p, global_work_size %p, local_work_size %p\n",
 			   cmd->work_dim, cmd->global_work_offset, cmd->global_work_size, cmd->local_work_size);
 			   cmd->work_dim, cmd->global_work_offset, cmd->global_work_size, cmd->local_work_size);
-	   DEBUG_MSG("Global work size: %ld %ld %ld\n", cmd->global_work_size[0],
-			   (cmd->work_dim > 1 ? cmd->global_work_size[1] : 1), (cmd->work_dim > 2 ? cmd->global_work_size[2] : 1)); 
+	   DEBUG_MSG("Global work size: %ld %ld %ld\n", (long)cmd->global_work_size[0],
+		     (long)(cmd->work_dim > 1 ? cmd->global_work_size[1] : 1), (long)(cmd->work_dim > 2 ? cmd->global_work_size[2] : 1)); 
 	   if (cmd->local_work_size != NULL)
 	   if (cmd->local_work_size != NULL)
-		   DEBUG_MSG("Local work size: %ld %ld %ld\n", cmd->local_work_size[0],
-				   (cmd->work_dim > 1 ? cmd->local_work_size[1] : 1), (cmd->work_dim > 2 ? cmd->local_work_size[2] : 1)); 
+		   DEBUG_MSG("Local work size: %ld %ld %ld\n", (long)cmd->local_work_size[0],
+			     (long)(cmd->work_dim > 1 ? cmd->local_work_size[1] : 1), (long)(cmd->work_dim > 2 ? cmd->local_work_size[2] : 1)); 
    }
    }
    else {
    else {
       /* Waiting for kernel to terminate */
       /* Waiting for kernel to terminate */

+ 3 - 3
socl/src/cl_enqueuereadbuffer.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010,2011, 2014 University of Bordeaux
  * Copyright (C) 2010,2011, 2014 University of Bordeaux
- * Copyright (C) 2016  CNRS
+ * Copyright (C) 2016, 2017  CNRS
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -25,7 +25,7 @@ static void soclEnqueueReadBuffer_cpu_task(void *descr[], void *args) {
   gc_entity_release(ev);
   gc_entity_release(ev);
 
 
    char * ptr = (void*)STARPU_VARIABLE_GET_PTR(descr[0]);
    char * ptr = (void*)STARPU_VARIABLE_GET_PTR(descr[0]);
-   DEBUG_MSG("[Buffer %d] Reading %ld bytes from %p to %p\n", cmd->buffer->id, cmd->cb, ptr+cmd->offset, cmd->ptr);
+   DEBUG_MSG("[Buffer %d] Reading %ld bytes from %p to %p\n", cmd->buffer->id, (long)cmd->cb, ptr+cmd->offset, cmd->ptr);
 
 
    //This fix is for people who use USE_HOST_PTR and still use ReadBuffer to sync the buffer in host mem at host_ptr.
    //This fix is for people who use USE_HOST_PTR and still use ReadBuffer to sync the buffer in host mem at host_ptr.
    //They should use buffer mapping facilities instead.
    //They should use buffer mapping facilities instead.
@@ -44,7 +44,7 @@ static void soclEnqueueReadBuffer_opencl_task(void *descr[], void *args) {
 
 
    cl_mem mem = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
    cl_mem mem = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 
-   DEBUG_MSG("[Buffer %d] Reading %ld bytes from offset %ld into %p\n", cmd->buffer->id, cmd->cb, cmd->offset, cmd->ptr);
+   DEBUG_MSG("[Buffer %d] Reading %ld bytes from offset %ld into %p\n", cmd->buffer->id, (long)cmd->cb, (long)cmd->offset, cmd->ptr);
 
 
    int wid = starpu_worker_get_id_check();
    int wid = starpu_worker_get_id_check();
    cl_command_queue cq;
    cl_command_queue cq;

+ 3 - 3
socl/src/cl_enqueuewritebuffer.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010,2011, 2014 University of Bordeaux
  * Copyright (C) 2010,2011, 2014 University of Bordeaux
- * Copyright (C) 2016  CNRS
+ * Copyright (C) 2016, 2017  CNRS
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -26,7 +26,7 @@ static void soclEnqueueWriteBuffer_cpu_task(void *descr[], void *args) {
   gc_entity_release(ev);
   gc_entity_release(ev);
 
 
    char * ptr = (void*)STARPU_VARIABLE_GET_PTR(descr[0]);
    char * ptr = (void*)STARPU_VARIABLE_GET_PTR(descr[0]);
-   DEBUG_MSG("[Buffer %d] Writing %ld bytes from %p to %p\n", cmd->buffer->id, cmd->cb, cmd->ptr, ptr+cmd->offset);
+   DEBUG_MSG("[Buffer %d] Writing %ld bytes from %p to %p\n", cmd->buffer->id, (long)cmd->cb, cmd->ptr, ptr+cmd->offset);
 
 
    //FIXME: Fix for people who use USE_HOST_PTR, modify data at host_ptr and use WriteBuffer to commit the change.
    //FIXME: Fix for people who use USE_HOST_PTR, modify data at host_ptr and use WriteBuffer to commit the change.
    // StarPU may have erased host mem at host_ptr (for instance by retrieving current buffer data at host_ptr)
    // StarPU may have erased host mem at host_ptr (for instance by retrieving current buffer data at host_ptr)
@@ -47,7 +47,7 @@ static void soclEnqueueWriteBuffer_opencl_task(void *descr[], void *args) {
 
 
    cl_mem mem = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
    cl_mem mem = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 
-   DEBUG_MSG("[Buffer %d] Writing %ld bytes to offset %ld from %p\n", cmd->buffer->id, cmd->cb, cmd->offset, cmd->ptr);
+   DEBUG_MSG("[Buffer %d] Writing %ld bytes to offset %ld from %p\n", cmd->buffer->id, (long)cmd->cb, (long)cmd->offset, cmd->ptr);
 
 
    int wid = starpu_worker_get_id_check();
    int wid = starpu_worker_get_id_check();
    cl_command_queue cq;
    cl_command_queue cq;

+ 1 - 1
socl/src/cl_setkernelarg.c

@@ -68,7 +68,7 @@ soclSetKernelArg(cl_kernel  kernel,
    kernel->arg_type[arg_index] = Null;
    kernel->arg_type[arg_index] = Null;
    kernel->arg_size[arg_index] = arg_size;
    kernel->arg_size[arg_index] = arg_size;
 
 
-   DEBUG_MSG("[Kernel %d] Set argument %d: argsize %ld argvalue %p\n", kernel->id, arg_index, arg_size, arg_value);
+   DEBUG_MSG("[Kernel %d] Set argument %d: argsize %ld argvalue %p\n", kernel->id, arg_index, (long)arg_size, arg_value);
 
 
    /* Argument is not Null */
    /* Argument is not Null */
    if (arg_value != NULL) {
    if (arg_value != NULL) {

+ 1 - 1
socl/src/task.c

@@ -77,7 +77,7 @@ void task_depends_on(starpu_task task, cl_uint num_events, cl_event *events) {
     DEBUG_MSG("Task %p depends on events:", task);
     DEBUG_MSG("Task %p depends on events:", task);
     for (i=0; i<num_events; i++) {
     for (i=0; i<num_events; i++) {
        tags[i] = events[i]->id;
        tags[i] = events[i]->id;
-       DEBUG_MSG_NOHEAD(" %u", events[i]->id);
+       DEBUG_MSG_NOHEAD(" %d", events[i]->id);
     }
     }
     DEBUG_MSG_NOHEAD("\n");
     DEBUG_MSG_NOHEAD("\n");
 
 

+ 1 - 1
src/common/fxt.c

@@ -72,7 +72,7 @@ long _starpu_gettid(void)
 #elif defined(_WIN32) && !defined(__CYGWIN__)
 #elif defined(_WIN32) && !defined(__CYGWIN__)
 	return (long) GetCurrentThreadId();
 	return (long) GetCurrentThreadId();
 #else
 #else
-	return (long) pthread_self();
+	return (long) starpu_pthread_self();
 #endif
 #endif
 #endif
 #endif
 }
 }

+ 25 - 3
src/common/prio_list.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2015-2016  Université de Bordeaux
+ * Copyright (C) 2015-2017  Université de Bordeaux
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -252,9 +252,31 @@
 	static inline void ENAME##_prio_list_deinit(struct ENAME##_prio_list *priolist) \
 	static inline void ENAME##_prio_list_deinit(struct ENAME##_prio_list *priolist) \
 	{ (void) (priolist); /* ENAME##_list_deinit(&(priolist)->list); */ } \
 	{ (void) (priolist); /* ENAME##_list_deinit(&(priolist)->list); */ } \
 	static inline void ENAME##_prio_list_push_back(struct ENAME##_prio_list *priolist, struct ENAME *e) \
 	static inline void ENAME##_prio_list_push_back(struct ENAME##_prio_list *priolist, struct ENAME *e) \
-	{ ENAME##_list_push_back(&(priolist)->list, (e)); } \
+	{ \
+		struct ENAME *cur; \
+		for (cur  = ENAME##_list_begin(&(priolist)->list); \
+		     cur != ENAME##_list_end(&(priolist)->list); \
+		     cur  = ENAME##_list_next(cur)) \
+			if ((e)->PRIOFIELD > cur->PRIOFIELD) \
+				break; \
+		if (cur == ENAME##_list_end(&(priolist)->list)) \
+			ENAME##_list_push_back(&(priolist)->list, (e)); \
+		else \
+			ENAME##_list_insert_before(&(priolist)->list, (e), cur); \
+	} \
 	static inline void ENAME##_prio_list_push_front(struct ENAME##_prio_list *priolist, struct ENAME *e) \
 	static inline void ENAME##_prio_list_push_front(struct ENAME##_prio_list *priolist, struct ENAME *e) \
-	{ ENAME##_list_push_front(&(priolist)->list, (e)); } \
+	{ \
+		struct ENAME *cur; \
+		for (cur  = ENAME##_list_begin(&(priolist)->list); \
+		     cur != ENAME##_list_end(&(priolist)->list); \
+		     cur  = ENAME##_list_next(cur)) \
+			if ((e)->PRIOFIELD >= cur->PRIOFIELD) \
+				break; \
+		if (cur == ENAME##_list_end(&(priolist)->list)) \
+			ENAME##_list_push_back(&(priolist)->list, (e)); \
+		else \
+			ENAME##_list_insert_before(&(priolist)->list, (e), cur); \
+	} \
 	static inline int ENAME##_prio_list_empty(const struct ENAME##_prio_list *priolist) \
 	static inline int ENAME##_prio_list_empty(const struct ENAME##_prio_list *priolist) \
 	{ return ENAME##_list_empty(&(priolist)->list); } \
 	{ return ENAME##_list_empty(&(priolist)->list); } \
 	static inline void ENAME##_prio_list_erase(struct ENAME##_prio_list *priolist, struct ENAME *e) \
 	static inline void ENAME##_prio_list_erase(struct ENAME##_prio_list *priolist, struct ENAME *e) \

+ 2 - 3
src/common/starpu_spinlock.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010, 2012-2014, 2016  Université de Bordeaux
  * Copyright (C) 2010, 2012-2014, 2016  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2013, 2014, 2017  CNRS
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -25,10 +25,9 @@
 int _starpu_spin_init(struct _starpu_spinlock *lock)
 int _starpu_spin_init(struct _starpu_spinlock *lock)
 {
 {
 	starpu_pthread_mutexattr_t errcheck_attr;
 	starpu_pthread_mutexattr_t errcheck_attr;
-//	memcpy(&lock->errcheck_lock, PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP, sizeof(PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP));
 	int ret;
 	int ret;
 	ret = starpu_pthread_mutexattr_init(&errcheck_attr);
 	ret = starpu_pthread_mutexattr_init(&errcheck_attr);
-	STARPU_CHECK_RETURN_VALUE(ret, "pthread_mutexattr_init");
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_pthread_mutexattr_init");
 
 
 	ret = starpu_pthread_mutexattr_settype(&errcheck_attr, PTHREAD_MUTEX_ERRORCHECK);
 	ret = starpu_pthread_mutexattr_settype(&errcheck_attr, PTHREAD_MUTEX_ERRORCHECK);
 	STARPU_ASSERT(!ret);
 	STARPU_ASSERT(!ret);

+ 102 - 29
src/common/thread.c

@@ -19,6 +19,7 @@
 #include <core/simgrid.h>
 #include <core/simgrid.h>
 #include <core/workers.h>
 #include <core/workers.h>
 
 
+#include <errno.h>
 #include <limits.h>
 #include <limits.h>
 
 
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
@@ -50,6 +51,16 @@ static int _starpu_futex_wake = FUTEX_WAKE;
 
 
 extern int _starpu_simgrid_thread_start(int argc, char *argv[]);
 extern int _starpu_simgrid_thread_start(int argc, char *argv[]);
 
 
+int starpu_pthread_equal(starpu_pthread_t t1, starpu_pthread_t t2)
+{
+	return t1 == t2;
+}
+
+starpu_pthread_t starpu_pthread_self(void)
+{
+	return MSG_process_self();
+}
+
 int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_pthread_attr_t *attr STARPU_ATTRIBUTE_UNUSED, void *(*start_routine) (void *), void *arg, msg_host_t host)
 int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_pthread_attr_t *attr STARPU_ATTRIBUTE_UNUSED, void *(*start_routine) (void *), void *arg, msg_host_t host)
 {
 {
 	char **_args;
 	char **_args;
@@ -62,6 +73,9 @@ int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_
 	void *tsd;
 	void *tsd;
 	_STARPU_CALLOC(tsd, MAX_TSD+1, sizeof(void*));
 	_STARPU_CALLOC(tsd, MAX_TSD+1, sizeof(void*));
 	*thread = MSG_process_create_with_arguments(name, _starpu_simgrid_thread_start, tsd, host, 2, _args);
 	*thread = MSG_process_create_with_arguments(name, _starpu_simgrid_thread_start, tsd, host, 2, _args);
+#if SIMGRID_VERSION_MAJOR > 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR >= 15)
+	MSG_process_ref(*thread);
+#endif
 	return 0;
 	return 0;
 }
 }
 
 
@@ -74,6 +88,9 @@ int starpu_pthread_join(starpu_pthread_t thread STARPU_ATTRIBUTE_UNUSED, void **
 {
 {
 #if SIMGRID_VERSION_MAJOR > 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR >= 14)
 #if SIMGRID_VERSION_MAJOR > 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR >= 14)
 	MSG_process_join(thread, 1000000);
 	MSG_process_join(thread, 1000000);
+#if SIMGRID_VERSION_MAJOR > 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR >= 15)
+	MSG_process_unref(thread);
+#endif
 #else
 #else
 	MSG_process_sleep(1);
 	MSG_process_sleep(1);
 #endif
 #endif
@@ -519,7 +536,7 @@ int starpu_pthread_queue_destroy(starpu_pthread_queue_t *q)
 #endif /* STARPU_SIMGRID */
 #endif /* STARPU_SIMGRID */
 
 
 #if (defined(STARPU_SIMGRID) && !defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT)) || (!defined(STARPU_SIMGRID) && !defined(STARPU_HAVE_PTHREAD_BARRIER))
 #if (defined(STARPU_SIMGRID) && !defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT)) || (!defined(STARPU_SIMGRID) && !defined(STARPU_HAVE_PTHREAD_BARRIER))
-int starpu_pthread_barrier_init(starpu_pthread_barrier_t *restrict barrier, const starpu_pthread_barrierattr_t *restrict attr, unsigned count)
+int starpu_pthread_barrier_init(starpu_pthread_barrier_t *restrict barrier, const starpu_pthread_barrierattr_t *restrict attr STARPU_ATTRIBUTE_UNUSED, unsigned count)
 {
 {
 	int ret = starpu_pthread_mutex_init(&barrier->mutex, NULL);
 	int ret = starpu_pthread_mutex_init(&barrier->mutex, NULL);
 	if (!ret)
 	if (!ret)
@@ -703,47 +720,34 @@ int starpu_pthread_barrier_wait(starpu_pthread_barrier_t *barrier)
  * macros of course) which record when the mutex is held or not */
  * macros of course) which record when the mutex is held or not */
 int starpu_pthread_mutex_lock_sched(starpu_pthread_mutex_t *mutex)
 int starpu_pthread_mutex_lock_sched(starpu_pthread_mutex_t *mutex)
 {
 {
-	const int workerid = starpu_worker_get_id();
-	struct _starpu_worker * const worker = (workerid != -1)?_starpu_get_worker_struct(workerid):NULL;
-	if(worker && mutex == &worker->sched_mutex)
-	{
-		STARPU_ASSERT(worker->sched_mutex_depth < UINT_MAX);
-		worker->sched_mutex_depth++;
-		if (worker->sched_mutex_depth > 1)
-			return 0;
-	}
-
-	return starpu_pthread_mutex_lock(mutex);
+	int p_ret = starpu_pthread_mutex_lock(mutex);
+	int workerid = starpu_worker_get_id();
+	if(workerid != -1 && _starpu_worker_mutex_is_sched_mutex(workerid, mutex))
+		_starpu_worker_set_flag_sched_mutex_locked(workerid, 1);
+	return p_ret;
 }
 }
 
 
 int starpu_pthread_mutex_unlock_sched(starpu_pthread_mutex_t *mutex)
 int starpu_pthread_mutex_unlock_sched(starpu_pthread_mutex_t *mutex)
 {
 {
-	const int workerid = starpu_worker_get_id();
-	struct _starpu_worker * const worker = (workerid != -1)?_starpu_get_worker_struct(workerid):NULL;
-	if(worker && mutex == &worker->sched_mutex)
-	{
-		STARPU_ASSERT(worker->sched_mutex_depth > 0);
-		worker->sched_mutex_depth--;
-		if (worker->sched_mutex_depth > 0)
-			return 0;
-	}
+	int workerid = starpu_worker_get_id();
+	if(workerid != -1 && _starpu_worker_mutex_is_sched_mutex(workerid, mutex))
+		_starpu_worker_set_flag_sched_mutex_locked(workerid, 0);
 
 
 	return starpu_pthread_mutex_unlock(mutex);
 	return starpu_pthread_mutex_unlock(mutex);
 }
 }
 
 
 int starpu_pthread_mutex_trylock_sched(starpu_pthread_mutex_t *mutex)
 int starpu_pthread_mutex_trylock_sched(starpu_pthread_mutex_t *mutex)
 {
 {
-	const int workerid = starpu_worker_get_id();
-	struct _starpu_worker * const worker = (workerid != -1)?_starpu_get_worker_struct(workerid):NULL;
-	if(worker && mutex == &worker->sched_mutex)
+	int ret = starpu_pthread_mutex_trylock(mutex);
+
+	if (!ret)
 	{
 	{
-		STARPU_ASSERT(worker->sched_mutex_depth < UINT_MAX);
-		worker->sched_mutex_depth++;
-		if (worker->sched_mutex_depth > 1)
-			return 0;
+		int workerid = starpu_worker_get_id();
+		if(workerid != -1 && _starpu_worker_mutex_is_sched_mutex(workerid, mutex))
+			_starpu_worker_set_flag_sched_mutex_locked(workerid, 1);
 	}
 	}
 
 
-	return starpu_pthread_mutex_trylock(mutex);
+	return ret;
 }
 }
 
 
 #ifdef STARPU_DEBUG
 #ifdef STARPU_DEBUG
@@ -870,3 +874,72 @@ void _starpu_pthread_spin_do_unlock(starpu_pthread_spinlock_t *lock)
 #endif
 #endif
 
 
 #endif /* defined(STARPU_SIMGRID) || (defined(STARPU_LINUX_SYS) && defined(STARPU_HAVE_XCHG)) || !defined(STARPU_HAVE_PTHREAD_SPIN_LOCK) */
 #endif /* defined(STARPU_SIMGRID) || (defined(STARPU_LINUX_SYS) && defined(STARPU_HAVE_XCHG)) || !defined(STARPU_HAVE_PTHREAD_SPIN_LOCK) */
+
+#ifdef STARPU_SIMGRID
+
+int starpu_sem_destroy(starpu_sem_t *sem)
+{
+	MSG_sem_destroy(*sem);
+	return 0;
+}
+
+int starpu_sem_init(starpu_sem_t *sem, int pshared, unsigned value)
+{
+	STARPU_ASSERT_MSG(pshared == 0, "pshared semaphores not supported under simgrid");
+	*sem = MSG_sem_init(value);
+	return 0;
+}
+
+int starpu_sem_post(starpu_sem_t *sem)
+{
+	MSG_sem_release(*sem);
+	return 0;
+}
+
+int starpu_sem_wait(starpu_sem_t *sem)
+{
+	MSG_sem_acquire(*sem);
+	return 0;
+}
+
+int starpu_sem_trywait(starpu_sem_t *sem)
+{
+	if (MSG_sem_would_block(*sem))
+		return EAGAIN;
+	starpu_sem_wait(sem);
+	return 0;
+}
+
+int starpu_sem_getvalue(starpu_sem_t *sem, int *sval)
+{
+#if SIMGRID_VERSION_MAJOR > 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR > 13)
+	*sval = MSG_sem_get_capacity(*sem);
+	return 0;
+#else
+	(void) sem;
+	(void) sval;
+	STARPU_ABORT_MSG("sigmrid up to 3.13 did not have working MSG_sem_get_capacity");
+#endif
+}
+
+#elif !defined(_MSC_VER) || defined(BUILDING_STARPU) /* !STARPU_SIMGRID */
+
+int starpu_sem_wait(starpu_sem_t *sem)
+{
+	int ret;
+	while((ret = sem_wait(sem)) == -1 && errno == EINTR)
+		;
+
+	return ret;
+}
+
+int starpu_sem_trywait(starpu_sem_t *sem)
+{
+	int ret;
+	while((ret = sem_trywait(sem)) == -1 && errno == EINTR)
+		;
+	
+	return ret;
+}
+
+#endif

+ 3 - 4
src/common/utils.h

@@ -24,7 +24,6 @@
 #include <string.h>
 #include <string.h>
 #include <stdlib.h>
 #include <stdlib.h>
 #include <math.h>
 #include <math.h>
-#include <pthread.h>
 #ifdef STARPU_HAVE_SCHED_YIELD
 #ifdef STARPU_HAVE_SCHED_YIELD
 #include <sched.h>
 #include <sched.h>
 #endif
 #endif
@@ -97,9 +96,9 @@
 #endif
 #endif
 
 
 #ifdef STARPU_EXTRA_VERBOSE
 #ifdef STARPU_EXTRA_VERBOSE
-#  define _STARPU_LOG_IN()             do { if (!_starpu_silent) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] -->\n", pthread_self(), __starpu_func__,__FILE__,  __LINE__); }} while(0)
-#  define _STARPU_LOG_OUT()            do { if (!_starpu_silent) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] <--\n", pthread_self(), __starpu_func__, __FILE__,  __LINE__); }} while(0)
-#  define _STARPU_LOG_OUT_TAG(outtag)  do { if (!_starpu_silent) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] <-- (%s)\n", pthread_self(), __starpu_func__, __FILE__, __LINE__, outtag); }} while(0)
+#  define _STARPU_LOG_IN()             do { if (!_starpu_silent) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] -->\n", starpu_pthread_self(), __starpu_func__,__FILE__,  __LINE__); }} while(0)
+#  define _STARPU_LOG_OUT()            do { if (!_starpu_silent) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] <--\n", starpu_pthread_self(), __starpu_func__, __FILE__,  __LINE__); }} while(0)
+#  define _STARPU_LOG_OUT_TAG(outtag)  do { if (!_starpu_silent) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] <-- (%s)\n", starpu_pthread_self(), __starpu_func__, __FILE__, __LINE__, outtag); }} while(0)
 #else
 #else
 #  define _STARPU_LOG_IN()
 #  define _STARPU_LOG_IN()
 #  define _STARPU_LOG_OUT()
 #  define _STARPU_LOG_OUT()

+ 2 - 2
src/core/jobs.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011, 2014, 2016  INRIA
  * Copyright (C) 2011, 2014, 2016  INRIA
@@ -88,7 +88,7 @@ struct _starpu_job* STARPU_ATTRIBUTE_MALLOC _starpu_job_create(struct starpu_tas
 
 
 #ifndef STARPU_USE_FXT
 #ifndef STARPU_USE_FXT
 	if (_starpu_bound_recording || _starpu_top_status_get() ||
 	if (_starpu_bound_recording || _starpu_top_status_get() ||
-		_starpu_task_break_on_push != -1 || _starpu_task_break_on_pop != -1 || _starpu_task_break_on_sched != -1
+		_starpu_task_break_on_push != -1 || _starpu_task_break_on_sched != -1 || _starpu_task_break_on_pop != -1 || _starpu_task_break_on_exec != -1
 		|| STARPU_AYU_EVENT)
 		|| STARPU_AYU_EVENT)
 #endif
 #endif
 	{
 	{

File diff suppressed because it is too large
+ 308 - 289
src/core/sched_ctx.c


+ 41 - 42
src/core/sched_ctx.h

@@ -73,12 +73,27 @@ struct _starpu_sched_ctx
 	long iterations[2];
 	long iterations[2];
 	int iteration_level;
 	int iteration_level;
 
 
+	/* cond to block push when there are no workers in the ctx */
+	starpu_pthread_cond_t no_workers_cond;
+
+	/* mutex to block push when there are no workers in the ctx */
+	starpu_pthread_mutex_t no_workers_mutex;
+
 	/*ready tasks that couldn't be pushed because the ctx has no workers*/
 	/*ready tasks that couldn't be pushed because the ctx has no workers*/
 	struct starpu_task_list empty_ctx_tasks;
 	struct starpu_task_list empty_ctx_tasks;
 
 
+	/* mutext protecting empty_ctx_tasks list */
+	starpu_pthread_mutex_t empty_ctx_mutex;
+
 	/*ready tasks that couldn't be pushed because the the window of tasks was already full*/
 	/*ready tasks that couldn't be pushed because the the window of tasks was already full*/
 	struct starpu_task_list waiting_tasks;
 	struct starpu_task_list waiting_tasks;
 
 
+	/* mutext protecting waiting_tasks list */
+	starpu_pthread_mutex_t waiting_tasks_mutex;
+
+	/* mutext protecting write to all worker's sched_ctx_list structure for this sched_ctx */
+	starpu_pthread_mutex_t sched_ctx_list_mutex;
+
 	/* min CPUs to execute*/
 	/* min CPUs to execute*/
 	int min_ncpus;
 	int min_ncpus;
 
 
@@ -127,10 +142,27 @@ struct _starpu_sched_ctx
 	   if not master is -1 */
 	   if not master is -1 */
 	int main_master;
 	int main_master;
 
 
+	/* conditions variables used when parallel sections are executed in contexts */
+	starpu_pthread_cond_t parallel_sect_cond[STARPU_NMAXWORKERS];
+	starpu_pthread_mutex_t parallel_sect_mutex[STARPU_NMAXWORKERS];
+	starpu_pthread_cond_t parallel_sect_cond_busy[STARPU_NMAXWORKERS];
+	int busy[STARPU_NMAXWORKERS];
+
 	/* boolean indicating that workers should block in order to allow
 	/* boolean indicating that workers should block in order to allow
 	   parallel sections to be executed on their allocated resources */
 	   parallel sections to be executed on their allocated resources */
 	unsigned parallel_sect[STARPU_NMAXWORKERS];
 	unsigned parallel_sect[STARPU_NMAXWORKERS];
 
 
+	/* semaphore that block appl thread until starpu threads are
+	   all blocked and ready to exec the parallel code */
+	starpu_sem_t fall_asleep_sem[STARPU_NMAXWORKERS];
+
+	/* semaphore that block appl thread until starpu threads are 
+	   all woke up and ready continue appl */
+	starpu_sem_t wake_up_sem[STARPU_NMAXWORKERS];
+
+	/* bool indicating if the workers is sleeping in this ctx */
+	unsigned sleeping[STARPU_NMAXWORKERS];
+
 	/* ctx nesting the current ctx */
 	/* ctx nesting the current ctx */
 	unsigned nesting_sched_ctx;
 	unsigned nesting_sched_ctx;
 
 
@@ -158,9 +190,6 @@ struct _starpu_sched_ctx
 	int sms_end_idx;
 	int sms_end_idx;
 
 
 	int stream_worker;
 	int stream_worker;
-
-	starpu_pthread_rwlock_t rwlock;
-	starpu_pthread_t lock_write_owner;
 };
 };
 
 
 struct _starpu_machine_config;
 struct _starpu_machine_config;
@@ -212,10 +241,19 @@ void _starpu_worker_gets_out_of_ctx(unsigned sched_ctx_id, struct _starpu_worker
 /* Check if the worker belongs to another sched_ctx */
 /* Check if the worker belongs to another sched_ctx */
 unsigned _starpu_worker_belongs_to_a_sched_ctx(int workerid, unsigned sched_ctx_id);
 unsigned _starpu_worker_belongs_to_a_sched_ctx(int workerid, unsigned sched_ctx_id);
 
 
+/* mutex synchronising several simultaneous modifications of a context */
+starpu_pthread_rwlock_t* _starpu_sched_ctx_get_changing_ctx_mutex(unsigned sched_ctx_id);
+
 /* indicates wheather this worker should go to sleep or not 
 /* indicates wheather this worker should go to sleep or not 
    (if it is the last one awake in a context he should better keep awake) */
    (if it is the last one awake in a context he should better keep awake) */
 unsigned _starpu_sched_ctx_last_worker_awake(struct _starpu_worker *worker);
 unsigned _starpu_sched_ctx_last_worker_awake(struct _starpu_worker *worker);
 
 
+/* let the appl know that the worker blocked to execute parallel code */
+void _starpu_sched_ctx_signal_worker_blocked(unsigned sched_ctx_id, int workerid);
+
+/* let the appl know that the worker woke up */
+void _starpu_sched_ctx_signal_worker_woke_up(unsigned sched_ctx_id, int workerid);
+
 /* If starpu_sched_ctx_set_context() has been called, returns the context
 /* If starpu_sched_ctx_set_context() has been called, returns the context
  * id set by its last call, or the id of the initial context */
  * id set by its last call, or the id of the initial context */
 unsigned _starpu_sched_ctx_get_current_context();
 unsigned _starpu_sched_ctx_get_current_context();
@@ -240,43 +278,4 @@ struct _starpu_sched_ctx *__starpu_sched_ctx_get_sched_ctx_for_worker_and_job(st
 #define _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(w,j) \
 #define _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(w,j) \
 	(_starpu_get_nsched_ctxs() <= 1 ? _starpu_get_sched_ctx_struct(0) : __starpu_sched_ctx_get_sched_ctx_for_worker_and_job((w),(j)))
 	(_starpu_get_nsched_ctxs() <= 1 ? _starpu_get_sched_ctx_struct(0) : __starpu_sched_ctx_get_sched_ctx_for_worker_and_job((w),(j)))
 
 
-static inline struct _starpu_sched_ctx *_starpu_get_sched_ctx_struct(unsigned id);
-
-static inline int _starpu_sched_ctx_check_write_locked(unsigned sched_ctx_id)
-{
-	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
-	return sched_ctx->lock_write_owner == pthread_self();
-}
-#define STARPU_SCHED_CTX_CHECK_LOCK(sched_ctx_id) STARPU_ASSERT(_starpu_sched_ctx_check_write_locked((sched_ctx_id)))
-
-static inline void _starpu_sched_ctx_lock_write(unsigned sched_ctx_id)
-{
-	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
-	STARPU_ASSERT(sched_ctx->lock_write_owner != pthread_self());
-	STARPU_PTHREAD_RWLOCK_WRLOCK(&sched_ctx->rwlock);
-	sched_ctx->lock_write_owner = pthread_self();
-}
-
-static inline void _starpu_sched_ctx_unlock_write(unsigned sched_ctx_id)
-{
-	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
-	STARPU_ASSERT(sched_ctx->lock_write_owner == pthread_self());
-	sched_ctx->lock_write_owner = 0;
-	STARPU_PTHREAD_RWLOCK_UNLOCK(&sched_ctx->rwlock);
-}
-
-static inline void _starpu_sched_ctx_lock_read(unsigned sched_ctx_id)
-{
-	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
-	STARPU_ASSERT(sched_ctx->lock_write_owner != pthread_self());
-	STARPU_PTHREAD_RWLOCK_RDLOCK(&sched_ctx->rwlock);
-}
-
-static inline void _starpu_sched_ctx_unlock_read(unsigned sched_ctx_id)
-{
-	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
-	STARPU_ASSERT(sched_ctx->lock_write_owner != pthread_self());
-	STARPU_PTHREAD_RWLOCK_UNLOCK(&sched_ctx->rwlock);
-}
-
 #endif // __SCHED_CONTEXT_H__
 #endif // __SCHED_CONTEXT_H__

+ 11 - 6
src/core/sched_policy.c

@@ -31,15 +31,17 @@ static double idle[STARPU_NMAXWORKERS];
 static double idle_start[STARPU_NMAXWORKERS];
 static double idle_start[STARPU_NMAXWORKERS];
 
 
 long _starpu_task_break_on_push = -1;
 long _starpu_task_break_on_push = -1;
-long _starpu_task_break_on_pop = -1;
 long _starpu_task_break_on_sched = -1;
 long _starpu_task_break_on_sched = -1;
+long _starpu_task_break_on_pop = -1;
+long _starpu_task_break_on_exec = -1;
 static const char *starpu_idle_file;
 static const char *starpu_idle_file;
 
 
 void _starpu_sched_init(void)
 void _starpu_sched_init(void)
 {
 {
 	_starpu_task_break_on_push = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_PUSH", -1);
 	_starpu_task_break_on_push = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_PUSH", -1);
-	_starpu_task_break_on_pop = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_POP", -1);
 	_starpu_task_break_on_sched = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_SCHED", -1);
 	_starpu_task_break_on_sched = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_SCHED", -1);
+	_starpu_task_break_on_pop = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_POP", -1);
+	_starpu_task_break_on_exec = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_EXEC", -1);
 	starpu_idle_file = starpu_getenv("STARPU_IDLE_FILE");
 	starpu_idle_file = starpu_getenv("STARPU_IDLE_FILE");
 }
 }
 
 
@@ -431,9 +433,9 @@ int _starpu_repush_task(struct _starpu_job *j)
 
 
 		if(nworkers == 0)
 		if(nworkers == 0)
 		{
 		{
-			_starpu_sched_ctx_lock_write(sched_ctx->id);
+			STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->empty_ctx_mutex);
 			starpu_task_list_push_front(&sched_ctx->empty_ctx_tasks, task);
 			starpu_task_list_push_front(&sched_ctx->empty_ctx_tasks, task);
-			_starpu_sched_ctx_unlock_write(sched_ctx->id);
+			STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->empty_ctx_mutex);
 #ifdef STARPU_USE_SC_HYPERVISOR
 #ifdef STARPU_USE_SC_HYPERVISOR
 			if(sched_ctx->id != 0 && sched_ctx->perf_counters != NULL
 			if(sched_ctx->id != 0 && sched_ctx->perf_counters != NULL
 			   && sched_ctx->perf_counters->notify_empty_ctx)
 			   && sched_ctx->perf_counters->notify_empty_ctx)
@@ -497,9 +499,9 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 
 
 		if (nworkers == 0)
 		if (nworkers == 0)
 		{
 		{
-			_starpu_sched_ctx_lock_write(sched_ctx->id);
+			STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->empty_ctx_mutex);
 			starpu_task_list_push_back(&sched_ctx->empty_ctx_tasks, task);
 			starpu_task_list_push_back(&sched_ctx->empty_ctx_tasks, task);
-			_starpu_sched_ctx_unlock_write(sched_ctx->id);
+			STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->empty_ctx_mutex);
 #ifdef STARPU_USE_SC_HYPERVISOR
 #ifdef STARPU_USE_SC_HYPERVISOR
 			if(sched_ctx->id != 0 && sched_ctx->perf_counters != NULL
 			if(sched_ctx->id != 0 && sched_ctx->perf_counters != NULL
 			   && sched_ctx->perf_counters->notify_empty_ctx)
 			   && sched_ctx->perf_counters->notify_empty_ctx)
@@ -589,6 +591,8 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 		{
 		{
 			STARPU_ASSERT(sched_ctx->sched_policy->push_task);
 			STARPU_ASSERT(sched_ctx->sched_policy->push_task);
 			/* check out if there are any workers in the context */
 			/* check out if there are any workers in the context */
+			starpu_pthread_rwlock_t *changing_ctx_mutex = _starpu_sched_ctx_get_changing_ctx_mutex(sched_ctx->id);
+			STARPU_PTHREAD_RWLOCK_RDLOCK(changing_ctx_mutex);
 			nworkers = starpu_sched_ctx_get_nworkers(sched_ctx->id);
 			nworkers = starpu_sched_ctx_get_nworkers(sched_ctx->id);
 			if (nworkers == 0)
 			if (nworkers == 0)
 				ret = -1;
 				ret = -1;
@@ -599,6 +603,7 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 				ret = sched_ctx->sched_policy->push_task(task);
 				ret = sched_ctx->sched_policy->push_task(task);
 				_STARPU_SCHED_END;
 				_STARPU_SCHED_END;
 			}
 			}
+			STARPU_PTHREAD_RWLOCK_UNLOCK(changing_ctx_mutex);
 		}
 		}
 
 
 		if(ret == -1)
 		if(ret == -1)

+ 3 - 2
src/core/sched_policy.h

@@ -28,7 +28,7 @@
 
 
 #define _STARPU_SCHED_BEGIN \
 #define _STARPU_SCHED_BEGIN \
 	_STARPU_TRACE_WORKER_SCHEDULING_PUSH;	\
 	_STARPU_TRACE_WORKER_SCHEDULING_PUSH;	\
-	_SIMGRID_TIMER_BEGIN
+	_SIMGRID_TIMER_BEGIN(_starpu_simgrid_sched_cost())
 #define _STARPU_SCHED_END \
 #define _STARPU_SCHED_END \
 	_SIMGRID_TIMER_END;			\
 	_SIMGRID_TIMER_END;			\
 	_STARPU_TRACE_WORKER_SCHEDULING_POP
 	_STARPU_TRACE_WORKER_SCHEDULING_POP
@@ -103,8 +103,9 @@ extern struct starpu_sched_policy _starpu_sched_modular_heft2_policy;
 extern struct starpu_sched_policy _starpu_sched_graph_test_policy;
 extern struct starpu_sched_policy _starpu_sched_graph_test_policy;
 
 
 extern long _starpu_task_break_on_push;
 extern long _starpu_task_break_on_push;
-extern long _starpu_task_break_on_pop;
 extern long _starpu_task_break_on_sched;
 extern long _starpu_task_break_on_sched;
+extern long _starpu_task_break_on_pop;
+extern long _starpu_task_break_on_exec;
 
 
 #ifdef SIGTRAP
 #ifdef SIGTRAP
 #define _STARPU_TASK_BREAK_ON(task, what) do { \
 #define _STARPU_TASK_BREAK_ON(task, what) do { \

+ 4 - 4
src/core/simgrid.h

@@ -69,7 +69,7 @@ starpu_pthread_queue_t _starpu_simgrid_task_queue[STARPU_NMAXWORKERS];
 #define _starpu_simgrid_queue_malloc_cost() starpu_get_env_number_default("STARPU_SIMGRID_QUEUE_MALLOC_COST", 1)
 #define _starpu_simgrid_queue_malloc_cost() starpu_get_env_number_default("STARPU_SIMGRID_QUEUE_MALLOC_COST", 1)
 #define _starpu_simgrid_task_submit_cost() starpu_get_env_number_default("STARPU_SIMGRID_TASK_SUBMIT_COST", 1)
 #define _starpu_simgrid_task_submit_cost() starpu_get_env_number_default("STARPU_SIMGRID_TASK_SUBMIT_COST", 1)
 #define _starpu_simgrid_fetching_input_cost() starpu_get_env_number_default("STARPU_SIMGRID_FETCHING_INPUT_COST", 1)
 #define _starpu_simgrid_fetching_input_cost() starpu_get_env_number_default("STARPU_SIMGRID_FETCHING_INPUT_COST", 1)
-#define _starpu_simgrid_sched_cost() starpu_get_env_number_default("STARPU_SIMGRID_SCHED_COST", 1)
+#define _starpu_simgrid_sched_cost() starpu_get_env_number_default("STARPU_SIMGRID_SCHED_COST", 0)
 
 
 /* Called at initialization to count how many GPUs are interfering with each
 /* Called at initialization to count how many GPUs are interfering with each
  * bus */
  * bus */
@@ -78,10 +78,10 @@ void _starpu_simgrid_count_ngpus(void);
 void _starpu_simgrid_xbt_thread_create(const char *name, void_f_pvoid_t code,
 void _starpu_simgrid_xbt_thread_create(const char *name, void_f_pvoid_t code,
 				       void *param);
 				       void *param);
 
 
-#define _SIMGRID_TIMER_BEGIN		\
+#define _SIMGRID_TIMER_BEGIN(cond)			\
 	{		\
 	{		\
 		xbt_os_timer_t __timer = NULL;		\
 		xbt_os_timer_t __timer = NULL;		\
-		if (_starpu_simgrid_sched_cost()) {		\
+		if (cond) {		\
 		  __timer = xbt_os_timer_new();		\
 		  __timer = xbt_os_timer_new();		\
 		  xbt_os_threadtimer_start(__timer);	\
 		  xbt_os_threadtimer_start(__timer);	\
 		}
 		}
@@ -94,7 +94,7 @@ void _starpu_simgrid_xbt_thread_create(const char *name, void_f_pvoid_t code,
 	}
 	}
 
 
 #else // !STARPU_SIMGRID
 #else // !STARPU_SIMGRID
-#define _SIMGRID_TIMER_BEGIN {
+#define _SIMGRID_TIMER_BEGIN(cond) {
 #define _SIMGRID_TIMER_END }
 #define _SIMGRID_TIMER_END }
 #endif
 #endif
 
 

+ 6 - 11
src/core/topology.c

@@ -1717,7 +1717,7 @@ _starpu_bind_thread_on_cpu (
 	CPU_ZERO(&aff_mask);
 	CPU_ZERO(&aff_mask);
 	CPU_SET(cpuid, &aff_mask);
 	CPU_SET(cpuid, &aff_mask);
 
 
-	starpu_pthread_t self = pthread_self();
+	starpu_pthread_t self = starpu_pthread_self();
 
 
 	ret = pthread_setaffinity_np(self, sizeof(aff_mask), &aff_mask);
 	ret = pthread_setaffinity_np(self, sizeof(aff_mask), &aff_mask);
 	if (ret)
 	if (ret)
@@ -2186,8 +2186,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 				_starpu_memory_node_add_nworkers(memory_node);
 				_starpu_memory_node_add_nworkers(memory_node);
 
 
                                 _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
                                 _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
-				if (memory_node != STARPU_MAIN_RAM)
-					_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
+				_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
 				break;
 				break;
 #endif
 #endif
 
 
@@ -2226,8 +2225,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 				_starpu_memory_node_add_nworkers(memory_node);
 				_starpu_memory_node_add_nworkers(memory_node);
 
 
                                 _starpu_worker_drives_memory_node(workerarg, STARPU_MAIN_RAM);
                                 _starpu_worker_drives_memory_node(workerarg, STARPU_MAIN_RAM);
-				if (memory_node != STARPU_MAIN_RAM)
-					_starpu_worker_drives_memory_node(workerarg, memory_node);
+				_starpu_worker_drives_memory_node(workerarg, memory_node);
 				break;
 				break;
 #endif
 #endif
 
 
@@ -2259,8 +2257,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 				_starpu_memory_node_add_nworkers(memory_node);
 				_starpu_memory_node_add_nworkers(memory_node);
 
 
                                 _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
                                 _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
-				if (memory_node != STARPU_MAIN_RAM)
-					_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
+				_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
 				break;
 				break;
 #endif /* STARPU_USE_MIC */
 #endif /* STARPU_USE_MIC */
 
 
@@ -2275,8 +2272,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 				_starpu_memory_node_add_nworkers(memory_node);
 				_starpu_memory_node_add_nworkers(memory_node);
 
 
                                 _starpu_worker_drives_memory_node(workerarg, STARPU_MAIN_RAM);
                                 _starpu_worker_drives_memory_node(workerarg, STARPU_MAIN_RAM);
-				if (memory_node != STARPU_MAIN_RAM)
-					_starpu_worker_drives_memory_node(workerarg, memory_node);
+				_starpu_worker_drives_memory_node(workerarg, memory_node);
 			}
 			}
 				break;
 				break;
 #endif /* STARPU_USE_SCC */
 #endif /* STARPU_USE_SCC */
@@ -2298,8 +2294,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 
 
 				}
 				}
                                 _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
                                 _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
-				if (memory_node != STARPU_MAIN_RAM)
-					_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
+				_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
 #ifndef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
 #ifndef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
                                 /* MPI driver thread can manage all slave memories if we disable the MPI multiple thread */
                                 /* MPI driver thread can manage all slave memories if we disable the MPI multiple thread */
                                 unsigned findworker;
                                 unsigned findworker;

+ 37 - 21
src/core/workers.c

@@ -576,15 +576,10 @@ static void _starpu_worker_init(struct _starpu_worker *workerarg, struct _starpu
 	workerarg->reverse_phase[0] = 0;
 	workerarg->reverse_phase[0] = 0;
 	workerarg->reverse_phase[1] = 0;
 	workerarg->reverse_phase[1] = 0;
 	workerarg->pop_ctx_priority = 1;
 	workerarg->pop_ctx_priority = 1;
-	workerarg->sched_mutex_depth = 0;
+	workerarg->sched_mutex_locked = 0;
+	workerarg->blocked = 0;
 	workerarg->is_slave_somewhere = 0;
 	workerarg->is_slave_somewhere = 0;
 
 
-	workerarg->state_sched_op_pending = 0;
-	workerarg->state_changing_ctx_waiting = 0;
-	workerarg->state_blocked = 0;
-	workerarg->state_wait_ack__blocked = 0;
-	workerarg->state_wait_handshake__blocked = 0;
-
 	/* cpu_set/hwloc_cpu_set initialized in topology.c */
 	/* cpu_set/hwloc_cpu_set initialized in topology.c */
 }
 }
 
 
@@ -1417,14 +1412,12 @@ static void _starpu_terminate_workers(struct _starpu_machine_config *pconfig)
 		struct _starpu_worker *worker = &pconfig->workers[workerid];
 		struct _starpu_worker *worker = &pconfig->workers[workerid];
 
 
 		/* in case StarPU termination code is called from a callback,
 		/* in case StarPU termination code is called from a callback,
- 		 * we have to check if pthread_self() is the worker itself */
+ 		 * we have to check if starpu_pthread_self() is the worker itself */
 		if (set && set->nworkers > 0)
 		if (set && set->nworkers > 0)
 		{
 		{
 			if (set->started)
 			if (set->started)
 			{
 			{
-#ifndef STARPU_SIMGRID
-				if (!pthread_equal(pthread_self(), set->worker_thread))
-#endif
+				if (!starpu_pthread_equal(starpu_pthread_self(), set->worker_thread))
 					status = starpu_pthread_join(set->worker_thread, NULL);
 					status = starpu_pthread_join(set->worker_thread, NULL);
 				if (status)
 				if (status)
 				{
 				{
@@ -1440,9 +1433,7 @@ static void _starpu_terminate_workers(struct _starpu_machine_config *pconfig)
 			if (!worker->run_by_starpu)
 			if (!worker->run_by_starpu)
 				goto out;
 				goto out;
 
 
-#ifndef STARPU_SIMGRID
-			if (!pthread_equal(pthread_self(), worker->worker_thread))
-#endif
+			if (!starpu_pthread_equal(starpu_pthread_self(), worker->worker_thread))
 				status = starpu_pthread_join(worker->worker_thread, NULL);
 				status = starpu_pthread_join(worker->worker_thread, NULL);
 			if (status)
 			if (status)
 			{
 			{
@@ -1697,7 +1688,7 @@ unsigned starpu_worker_get_count(void)
 
 
 unsigned starpu_worker_is_blocked(int workerid)
 unsigned starpu_worker_is_blocked(int workerid)
 {
 {
-	return (unsigned)_starpu_config.workers[workerid].state_blocked;
+	return _starpu_config.workers[workerid].blocked;
 }
 }
 
 
 unsigned starpu_worker_is_slave_somewhere(int workerid)
 unsigned starpu_worker_is_slave_somewhere(int workerid)
@@ -2057,7 +2048,7 @@ void starpu_worker_get_sched_condition(int workerid, starpu_pthread_mutex_t **sc
 	*sched_mutex = &_starpu_config.workers[workerid].sched_mutex;
 	*sched_mutex = &_starpu_config.workers[workerid].sched_mutex;
 }
 }
 
 
-static int starpu_wakeup_worker_locked(int workerid, starpu_pthread_cond_t *sched_cond, starpu_pthread_mutex_t *mutex STARPU_ATTRIBUTE_UNUSED)
+int starpu_wakeup_worker_locked(int workerid, starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex STARPU_ATTRIBUTE_UNUSED)
 {
 {
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 	starpu_pthread_queue_broadcast(&_starpu_simgrid_task_queue[workerid]);
 	starpu_pthread_queue_broadcast(&_starpu_simgrid_task_queue[workerid]);
@@ -2065,19 +2056,17 @@ static int starpu_wakeup_worker_locked(int workerid, starpu_pthread_cond_t *sche
 	if (_starpu_config.workers[workerid].status == STATUS_SLEEPING)
 	if (_starpu_config.workers[workerid].status == STATUS_SLEEPING)
 	{
 	{
 		_starpu_config.workers[workerid].status = STATUS_WAKING_UP;
 		_starpu_config.workers[workerid].status = STATUS_WAKING_UP;
-		/* cond_broadcast is required over cond_signal since
-		 * the condition is share for multiple purpose */
-		STARPU_PTHREAD_COND_BROADCAST(sched_cond);
+		STARPU_PTHREAD_COND_SIGNAL(cond);
 		return 1;
 		return 1;
 	}
 	}
 	return 0;
 	return 0;
 }
 }
 
 
-static int starpu_wakeup_worker(int workerid, starpu_pthread_cond_t *sched_cond, starpu_pthread_mutex_t *mutex)
+int starpu_wakeup_worker(int workerid, starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex)
 {
 {
 	int success;
 	int success;
 	STARPU_PTHREAD_MUTEX_LOCK_SCHED(mutex);
 	STARPU_PTHREAD_MUTEX_LOCK_SCHED(mutex);
-	success = starpu_wakeup_worker_locked(workerid, sched_cond, mutex);
+	success = starpu_wakeup_worker_locked(workerid, cond, mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(mutex);
 	return success;
 	return success;
 }
 }
@@ -2171,6 +2160,33 @@ void starpu_get_version(int *major, int *minor, int *release)
 	*release = STARPU_RELEASE_VERSION;
 	*release = STARPU_RELEASE_VERSION;
 }
 }
 
 
+void _starpu_unlock_mutex_if_prev_locked()
+{
+	int workerid = starpu_worker_get_id();
+	if(workerid != -1)
+	{
+		struct _starpu_worker *w = _starpu_get_worker_struct(workerid);
+		if(w->sched_mutex_locked)
+		{
+			STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&w->sched_mutex);
+			_starpu_worker_set_flag_sched_mutex_locked(workerid, 1);
+		}
+	}
+	return;
+}
+
+void _starpu_relock_mutex_if_prev_locked()
+{
+	int workerid = starpu_worker_get_id();
+	if(workerid != -1)
+	{
+		struct _starpu_worker *w = _starpu_get_worker_struct(workerid);
+		if(w->sched_mutex_locked)
+			STARPU_PTHREAD_MUTEX_LOCK_SCHED(&w->sched_mutex);
+	}
+	return;
+}
+
 unsigned starpu_worker_get_sched_ctx_list(int workerid, unsigned **sched_ctxs)
 unsigned starpu_worker_get_sched_ctx_list(int workerid, unsigned **sched_ctxs)
 {
 {
 	unsigned s = 0;
 	unsigned s = 0;

+ 20 - 47
src/core/workers.h

@@ -20,6 +20,8 @@
 #ifndef __WORKERS_H__
 #ifndef __WORKERS_H__
 #define __WORKERS_H__
 #define __WORKERS_H__
 
 
+#include <limits.h>
+
 #include <starpu.h>
 #include <starpu.h>
 #include <common/config.h>
 #include <common/config.h>
 #include <common/timing.h>
 #include <common/timing.h>
@@ -83,11 +85,6 @@ LIST_TYPE(_starpu_worker,
 	unsigned numa_memory_node; /* which numa memory node is the worker associated with? (logical index) */
 	unsigned numa_memory_node; /* which numa memory node is the worker associated with? (logical index) */
 	starpu_pthread_cond_t sched_cond; /* condition variable used when the worker waits for tasks. */
 	starpu_pthread_cond_t sched_cond; /* condition variable used when the worker waits for tasks. */
         starpu_pthread_mutex_t sched_mutex; /* mutex protecting sched_cond */
         starpu_pthread_mutex_t sched_mutex; /* mutex protecting sched_cond */
-	int state_sched_op_pending:1; /* a task pop is ongoing even though sched_mutex may temporarily be unlocked */
-	int state_changing_ctx_waiting:1; /* a thread is waiting for transient operations such as pop to complete before acquiring sched_mutex and modifying the worker ctx*/
-	int state_blocked:1;
-	int state_wait_ack__blocked:1;
-	int state_wait_handshake__blocked:1;
 	struct starpu_task_list local_tasks; /* this queue contains tasks that have been explicitely submitted to that queue */
 	struct starpu_task_list local_tasks; /* this queue contains tasks that have been explicitely submitted to that queue */
 	struct starpu_task **local_ordered_tasks; /* this queue contains tasks that have been explicitely submitted to that queue with an explicit order */
 	struct starpu_task **local_ordered_tasks; /* this queue contains tasks that have been explicitely submitted to that queue with an explicit order */
 	unsigned local_ordered_tasks_size; /* this records the size of local_ordered_tasks */
 	unsigned local_ordered_tasks_size; /* this records the size of local_ordered_tasks */
@@ -141,8 +138,11 @@ LIST_TYPE(_starpu_worker,
 	/* indicate which priority of ctx is currently active: the values are 0 or 1*/
 	/* indicate which priority of ctx is currently active: the values are 0 or 1*/
 	unsigned pop_ctx_priority;
 	unsigned pop_ctx_priority;
 
 
-	/* sched mutex local worker locking depth */
-	unsigned sched_mutex_depth;
+	/* flag to know if sched_mutex is locked or not */
+	unsigned sched_mutex_locked;
+
+	/* bool to indicate if the worker is blocked in a ctx */
+	unsigned blocked;
 
 
 	/* bool to indicate if the worker is slave in a ctx */
 	/* bool to indicate if the worker is slave in a ctx */
 	unsigned is_slave_somewhere;
 	unsigned is_slave_somewhere;
@@ -509,7 +509,7 @@ static inline struct _starpu_worker *_starpu_get_worker_struct(unsigned id)
 	return &_starpu_config.workers[id];
 	return &_starpu_config.workers[id];
 }
 }
 
 
-/* Returns the starpu_sched_ctx structure that describes the state of the 
+/* Returns the starpu_sched_ctx structure that descriebes the state of the 
  * specified ctx */
  * specified ctx */
 static inline struct _starpu_sched_ctx *_starpu_get_sched_ctx_struct(unsigned id)
 static inline struct _starpu_sched_ctx *_starpu_get_sched_ctx_struct(unsigned id)
 {
 {
@@ -559,6 +559,18 @@ int starpu_worker_get_nids_by_type(enum starpu_worker_archtype type, int *worker
    the list might not be updated */
    the list might not be updated */
 int starpu_worker_get_nids_ctx_free_by_type(enum starpu_worker_archtype type, int *workerids, int maxsize);
 int starpu_worker_get_nids_ctx_free_by_type(enum starpu_worker_archtype type, int *workerids, int maxsize);
 
 
+/* if the current worker has the lock release it */
+void _starpu_unlock_mutex_if_prev_locked();
+
+/* if we prev released the lock relock it */
+void _starpu_relock_mutex_if_prev_locked();
+
+static inline void _starpu_worker_set_flag_sched_mutex_locked(int workerid, unsigned flag)
+{
+	struct _starpu_worker *w = _starpu_get_worker_struct(workerid);
+	w->sched_mutex_locked = flag;
+}
+
 static inline unsigned _starpu_worker_mutex_is_sched_mutex(int workerid, starpu_pthread_mutex_t *mutex)
 static inline unsigned _starpu_worker_mutex_is_sched_mutex(int workerid, starpu_pthread_mutex_t *mutex)
 {
 {
 	struct _starpu_worker *w = _starpu_get_worker_struct(workerid);
 	struct _starpu_worker *w = _starpu_get_worker_struct(workerid);
@@ -611,43 +623,4 @@ void _starpu_worker_set_stream_ctx(unsigned workerid, struct _starpu_sched_ctx *
 
 
 struct _starpu_sched_ctx* _starpu_worker_get_ctx_stream(unsigned stream_workerid);
 struct _starpu_sched_ctx* _starpu_worker_get_ctx_stream(unsigned stream_workerid);
 
 
-/* Must be called with worker's sched_mutex held.
- * Mark the beginning of a scheduling operation during which the sched_mutex
- * lock may be temporarily released, but the scheduling context of the worker
- * should not be modified */
-static inline void _starpu_worker_enter_transient_sched_op(struct _starpu_worker * const worker)
-{
-	worker->state_sched_op_pending = 1;
-}
-
-/* Must be called with worker's sched_mutex held.
- * Mark the end of a scheduling operation, and notify potential waiters that
- * scheduling context changes can safely be performed again.
- */
-static inline void  _starpu_worker_leave_transient_sched_op(struct _starpu_worker * const worker)
-{
-	worker->state_sched_op_pending = 0;
-	if (worker->state_changing_ctx_waiting)
-		/* cond_broadcast is required over cond_signal since
-		 * the condition is share for multiple purpose */
-		STARPU_PTHREAD_COND_BROADCAST(&worker->sched_cond);
-}
-
-/* Must be called with worker's sched_mutex held.
- * Passively wait until state_sched_op_pending is cleared.
- */
-static inline void _starpu_worker_wait_for_transient_sched_op_completion(struct _starpu_worker * const worker)
-{
-	if (worker->state_sched_op_pending)
-	{
-		worker->state_changing_ctx_waiting = 1;
-		do
-		{
-			STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
-		}
-		while (worker->state_sched_op_pending);
-		worker->state_changing_ctx_waiting = 0;
-	}
-}
-
 #endif // __WORKERS_H__
 #endif // __WORKERS_H__

+ 16 - 0
src/datawizard/malloc.c

@@ -32,6 +32,7 @@
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 #include <sys/mman.h>
 #include <sys/mman.h>
 #include <fcntl.h>
 #include <fcntl.h>
+#include <smpi/smpi.h>
 #endif
 #endif
 
 
 #ifndef O_BINARY
 #ifndef O_BINARY
@@ -48,9 +49,12 @@ static int malloc_on_node_default_flags[STARPU_MAXNODES];
 
 
 /* This file is used for implementing "folded" allocation */
 /* This file is used for implementing "folded" allocation */
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
+#if SIMGRID_VERSION_MAJOR < 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR < 15)
+/* TODO: drop when simgrid 3.15 is reasonably largely used by people who need the feature */
 static int bogusfile = -1;
 static int bogusfile = -1;
 static unsigned long _starpu_malloc_simulation_fold;
 static unsigned long _starpu_malloc_simulation_fold;
 #endif
 #endif
+#endif
 
 
 void starpu_malloc_set_align(size_t align)
 void starpu_malloc_set_align(size_t align)
 {
 {
@@ -230,6 +234,10 @@ int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int fl
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 	if (flags & STARPU_MALLOC_SIMULATION_FOLDED)
 	if (flags & STARPU_MALLOC_SIMULATION_FOLDED)
 	{
 	{
+#if SIMGRID_VERSION_MAJOR > 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR >= 15)
+		*A = SMPI_SHARED_MALLOC(dim);
+#else
+		/* TODO: drop when simgrid 3.15 is reasonably largely used by people who need the feature */
 		/* Use "folded" allocation: the same file is mapped several
 		/* Use "folded" allocation: the same file is mapped several
 		 * times contiguously, to get a memory area one can read/write,
 		 * times contiguously, to get a memory area one can read/write,
 		 * without consuming memory */
 		 * without consuming memory */
@@ -282,6 +290,7 @@ int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int fl
 			}
 			}
 			*A = buf;
 			*A = buf;
 		}
 		}
+#endif
 	}
 	}
 	else
 	else
 #endif
 #endif
@@ -465,7 +474,12 @@ int _starpu_free_flags_on_node(unsigned dst_node, void *A, size_t dim, int flags
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 	if (flags & STARPU_MALLOC_SIMULATION_FOLDED)
 	if (flags & STARPU_MALLOC_SIMULATION_FOLDED)
 	{
 	{
+#if SIMGRID_VERSION_MAJOR > 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR >= 15)
+		SMPI_SHARED_FREE(A);
+#else
+		/* TODO: drop when simgrid 3.15 is reasonably largely used by people who need the feature */
 		munmap(A, dim);
 		munmap(A, dim);
+#endif
 	}
 	}
 	else
 	else
 #endif
 #endif
@@ -840,9 +854,11 @@ _starpu_malloc_init(unsigned dst_node)
 	disable_pinning = starpu_get_env_number("STARPU_DISABLE_PINNING");
 	disable_pinning = starpu_get_env_number("STARPU_DISABLE_PINNING");
 	malloc_on_node_default_flags[dst_node] = STARPU_MALLOC_PINNED | STARPU_MALLOC_COUNT;
 	malloc_on_node_default_flags[dst_node] = STARPU_MALLOC_PINNED | STARPU_MALLOC_COUNT;
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
+#if SIMGRID_VERSION_MAJOR < 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR < 15)
 	/* Reasonably "costless" */
 	/* Reasonably "costless" */
 	_starpu_malloc_simulation_fold = starpu_get_env_number_default("STARPU_MALLOC_SIMULATION_FOLD", 1) << 20;
 	_starpu_malloc_simulation_fold = starpu_get_env_number_default("STARPU_MALLOC_SIMULATION_FOLD", 1) << 20;
 #endif
 #endif
+#endif
 }
 }
 
 
 void
 void

+ 5 - 2
src/datawizard/memory_nodes.c

@@ -185,9 +185,12 @@ unsigned starpu_worker_get_memory_node(unsigned workerid)
 /* same utility as _starpu_memory_node_add_nworkers */
 /* same utility as _starpu_memory_node_add_nworkers */
 void _starpu_worker_drives_memory_node(struct _starpu_worker *worker, unsigned memnode)
 void _starpu_worker_drives_memory_node(struct _starpu_worker *worker, unsigned memnode)
 {
 {
-	_starpu_worker_drives_memory[worker->workerid][memnode] = 1;
+	if (! _starpu_worker_drives_memory[worker->workerid][memnode])
+	{
+		_starpu_worker_drives_memory[worker->workerid][memnode] = 1;
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
-	starpu_pthread_queue_register(&worker->wait, &_starpu_simgrid_transfer_queue[memnode]);
+		starpu_pthread_queue_register(&worker->wait, &_starpu_simgrid_transfer_queue[memnode]);
 #endif
 #endif
+	}
 }
 }
 
 

+ 21 - 5
src/debug/traces/starpu_fxt.c

@@ -157,6 +157,8 @@ static void task_dump(struct task_info *task)
 
 
 	if (task->exclude_from_dag)
 	if (task->exclude_from_dag)
 		goto out;
 		goto out;
+	if (!tasks_file)
+		goto out;
 
 
 	if (task->name)
 	if (task->name)
 	{
 	{
@@ -274,6 +276,8 @@ static struct data_info *get_data(unsigned long handle, int mpi_rank)
 
 
 static void data_dump(struct data_info *data)
 static void data_dump(struct data_info *data)
 {
 {
+	if (!data_file)
+		goto out;
 	fprintf(data_file, "Handle: %lx\n", data->handle);
 	fprintf(data_file, "Handle: %lx\n", data->handle);
 	fprintf(data_file, "MPIRank: %d\n", data->mpi_rank);
 	fprintf(data_file, "MPIRank: %d\n", data->mpi_rank);
 	if (data->name)
 	if (data->name)
@@ -291,6 +295,7 @@ static void data_dump(struct data_info *data)
 	}
 	}
 	fprintf(data_file, "MPIOwner: %d\n", data->mpi_owner);
 	fprintf(data_file, "MPIOwner: %d\n", data->mpi_owner);
 	fprintf(data_file, "\n");
 	fprintf(data_file, "\n");
+out:
 	HASH_DEL(data_info, data);
 	HASH_DEL(data_info, data);
 	free(data);
 	free(data);
 }
 }
@@ -2388,8 +2393,8 @@ static void handle_task_done(struct fxt_ev_64 *ev, struct starpu_fxt_options *op
 	unsigned exclude_from_dag = ev->param[2];
 	unsigned exclude_from_dag = ev->param[2];
 	struct task_info *task = get_task(job_id, options->file_rank);
 	struct task_info *task = get_task(job_id, options->file_rank);
 	task->exclude_from_dag = exclude_from_dag;
 	task->exclude_from_dag = exclude_from_dag;
-	if (tasks_file)
-		task_dump(task);
+
+	task_dump(task);
 
 
 	if (!exclude_from_dag)
 	if (!exclude_from_dag)
 		_starpu_fxt_dag_set_task_done(options->file_prefix, job_id, name, colour);
 		_starpu_fxt_dag_set_task_done(options->file_prefix, job_id, name, colour);
@@ -2698,9 +2703,8 @@ static void handle_task_wait_for_all(void)
 	_starpu_fxt_dag_add_sync_point();
 	_starpu_fxt_dag_add_sync_point();
 }
 }
 
 
-static void handle_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+static void handle_string_event(struct fxt_ev_64 *ev, const char *event, struct starpu_fxt_options *options)
 {
 {
-	char *event = get_fxt_string(ev, 0);
 	/* Add an event in the trace */
 	/* Add an event in the trace */
 	if (out_paje_file)
 	if (out_paje_file)
 	{
 	{
@@ -2717,6 +2721,12 @@ static void handle_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *option
 		recfmt_dump_state(get_event_time_stamp(ev, options), "ProgEvent", -1, 0, event, "Program");
 		recfmt_dump_state(get_event_time_stamp(ev, options), "ProgEvent", -1, 0, event, "Program");
 }
 }
 
 
+static void handle_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+	char *event = get_fxt_string(ev, 0);
+	handle_string_event(ev, event, options);
+}
+
 static void handle_thread_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 static void handle_thread_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 {
 {
 	/* Add an event in the trace */
 	/* Add an event in the trace */
@@ -3418,6 +3428,13 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 				fut_keymask = ev.param[0];
 				fut_keymask = ev.param[0];
 				break;
 				break;
 
 
+			case FUT_START_FLUSH_CODE:
+				handle_string_event(&ev, "fxt_start_flush", options);
+				break;
+			case FUT_STOP_FLUSH_CODE:
+				handle_string_event(&ev, "fxt_stop_flush", options);
+				break;
+
 			/* We can safely ignore FUT internal events */
 			/* We can safely ignore FUT internal events */
 			case FUT_CALIBRATE0_CODE:
 			case FUT_CALIBRATE0_CODE:
 			case FUT_CALIBRATE1_CODE:
 			case FUT_CALIBRATE1_CODE:
@@ -3467,7 +3484,6 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 #endif
 #endif
 	}
 	}
 
 
-	if (data_file)
 	{
 	{
 		/* TODO: move to handle_data_unregister */
 		/* TODO: move to handle_data_unregister */
 		struct data_info *data, *tmp;
 		struct data_info *data, *tmp;

+ 6 - 6
src/debug/traces/starpu_paje.c

@@ -193,7 +193,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 	poti_DefineEntityValue("E", "S", "Executing", ".0 .6 .5");
 	poti_DefineEntityValue("E", "S", "Executing", ".0 .6 .5");
 	poti_DefineEntityValue("Sc", "S", "Scheduling", ".7 .36 .0");
 	poti_DefineEntityValue("Sc", "S", "Scheduling", ".7 .36 .0");
 	poti_DefineEntityValue("Sl", "S", "Sleeping", ".9 .1 .0");
 	poti_DefineEntityValue("Sl", "S", "Sleeping", ".9 .1 .0");
-	poti_DefineEntityValue("P", "S", "Progressing", ".4 .1 .6");
+	poti_DefineEntityValue("P", "S", "Progressing", ".1 .3 .1");
 	poti_DefineEntityValue("U", "S", "Unpartitioning", ".0 .0 1.0");
 	poti_DefineEntityValue("U", "S", "Unpartitioning", ".0 .0 1.0");
 	poti_DefineEntityValue("H", "S", "Hypervisor", ".5 .18 .0");
 	poti_DefineEntityValue("H", "S", "Hypervisor", ".5 .18 .0");
 	poti_DefineEntityValue("Bu", "S", "Building task", ".5 .18 .0");
 	poti_DefineEntityValue("Bu", "S", "Building task", ".5 .18 .0");
@@ -213,7 +213,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 	poti_DefineEntityValue("E", "WS", "Executing", ".0 .6 .5");
 	poti_DefineEntityValue("E", "WS", "Executing", ".0 .6 .5");
 	poti_DefineEntityValue("Sc", "WS", "Scheduling", ".7 .36 .0");
 	poti_DefineEntityValue("Sc", "WS", "Scheduling", ".7 .36 .0");
 	poti_DefineEntityValue("Sl", "WS", "Sleeping", ".9 .1 .0");
 	poti_DefineEntityValue("Sl", "WS", "Sleeping", ".9 .1 .0");
-	poti_DefineEntityValue("P", "WS", "Progressing", ".4 .1 .6");
+	poti_DefineEntityValue("P", "WS", "Progressing", ".1 .3 .1");
 	poti_DefineEntityValue("U", "WS", "Unpartitioning", ".0 .0 1.0");
 	poti_DefineEntityValue("U", "WS", "Unpartitioning", ".0 .0 1.0");
 	poti_DefineEntityValue("H", "WS", "Hypervisor", ".5 .18 .0");
 	poti_DefineEntityValue("H", "WS", "Hypervisor", ".5 .18 .0");
 	poti_DefineEntityValue("Bu", "WS", "Building task", ".5 .18 .0");
 	poti_DefineEntityValue("Bu", "WS", "Building task", ".5 .18 .0");
@@ -268,7 +268,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 		poti_DefineEntityValue("E", ctx, "Executing", ".0 .6 .5");
 		poti_DefineEntityValue("E", ctx, "Executing", ".0 .6 .5");
 		poti_DefineEntityValue("Sc", ctx, "Scheduling", ".7 .36 .0");
 		poti_DefineEntityValue("Sc", ctx, "Scheduling", ".7 .36 .0");
 		poti_DefineEntityValue("Sl", ctx, "Sleeping", ".9 .1 .0");
 		poti_DefineEntityValue("Sl", ctx, "Sleeping", ".9 .1 .0");
-		poti_DefineEntityValue("P", ctx, "Progressing", ".4 .1 .6");
+		poti_DefineEntityValue("P", ctx, "Progressing", ".1 .3 .1");
 		poti_DefineEntityValue("U", ctx, "Unpartitioning", ".0 .0 1.0");
 		poti_DefineEntityValue("U", ctx, "Unpartitioning", ".0 .0 1.0");
 		poti_DefineEntityValue("H", ctx, "Hypervisor", ".5 .18 .0");
 		poti_DefineEntityValue("H", ctx, "Hypervisor", ".5 .18 .0");
 	}
 	}
@@ -331,7 +331,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 6       E       S       Executing         \".0 .6 .5\"		\n\
 6       E       S       Executing         \".0 .6 .5\"		\n\
 6       Sc       S      Scheduling         \".7 .36 .0\"		\n\
 6       Sc       S      Scheduling         \".7 .36 .0\"		\n\
 6       Sl       S      Sleeping         \".9 .1 .0\"		\n\
 6       Sl       S      Sleeping         \".9 .1 .0\"		\n\
-6       P       S       Progressing         \".4 .1 .6\"		\n\
+6       P       S       Progressing         \".1 .3 .1\"		\n\
 6       U       S       Unpartitioning      \".0 .0 1.0\"		\n\
 6       U       S       Unpartitioning      \".0 .0 1.0\"		\n\
 6       H       S       Hypervisor      \".5 .18 .0\"		\n\
 6       H       S       Hypervisor      \".5 .18 .0\"		\n\
 6       Bu      S       \"Building task\"   \".5 .18 .0\"		\n\
 6       Bu      S       \"Building task\"   \".5 .18 .0\"		\n\
@@ -351,7 +351,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 6       E       WS       Executing         \".0 .6 .5\"		\n\
 6       E       WS       Executing         \".0 .6 .5\"		\n\
 6       Sc       WS      Scheduling         \".7 .36 .0\"		\n\
 6       Sc       WS      Scheduling         \".7 .36 .0\"		\n\
 6       Sl       WS      Sleeping         \".9 .1 .0\"		\n\
 6       Sl       WS      Sleeping         \".9 .1 .0\"		\n\
-6       P       WS       Progressing         \".4 .1 .6\"		\n\
+6       P       WS       Progressing         \".1 .3 .1\"		\n\
 6       U       WS       Unpartitioning      \".0 .0 1.0\"		\n\
 6       U       WS       Unpartitioning      \".0 .0 1.0\"		\n\
 6       H       WS       Hypervisor      \".5 .18 .0\"		\n\
 6       H       WS       Hypervisor      \".5 .18 .0\"		\n\
 6       Bu      WS       \"Building task\"   \".5 .18 .0\"		\n\
 6       Bu      WS       \"Building task\"   \".5 .18 .0\"		\n\
@@ -394,7 +394,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 6       E       Ctx%u       Executing         \".0 .6 .5\"		\n\
 6       E       Ctx%u       Executing         \".0 .6 .5\"		\n\
 6       Sc       Ctx%u      Scheduling         \".7 .36 .0\"		\n\
 6       Sc       Ctx%u      Scheduling         \".7 .36 .0\"		\n\
 6       Sl       Ctx%u      Sleeping         \".9 .1 .0\"		\n\
 6       Sl       Ctx%u      Sleeping         \".9 .1 .0\"		\n\
-6       P       Ctx%u       Progressing         \".4 .1 .6\"		\n\
+6       P       Ctx%u       Progressing         \".1 .3 .1\"		\n\
 6       U       Ctx%u       Unpartitioning         \".0 .0 1.0\"	\n\
 6       U       Ctx%u       Unpartitioning         \".0 .0 1.0\"	\n\
 6       H       Ctx%u       Hypervisor         \".5 .18 .0\"		\n",
 6       H       Ctx%u       Hypervisor         \".5 .18 .0\"		\n",
 		i, i, i, i, i, i, i, i, i, i, i, i, i);
 		i, i, i, i, i, i, i, i, i, i, i, i, i);

+ 8 - 2
src/drivers/cpu/driver_cpu.c

@@ -89,6 +89,12 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 			if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE)
 			if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE)
 				func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
 				func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
+			else if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT)
+			{
+				_SIMGRID_TIMER_BEGIN(1);
+				func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
+				_SIMGRID_TIMER_END;
+			}
 			else
 			else
 				_starpu_simgrid_submit_job(cpu_args->workerid, j, perf_arch, NAN, NULL);
 				_starpu_simgrid_submit_job(cpu_args->workerid, j, perf_arch, NAN, NULL);
 #else
 #else
@@ -410,13 +416,13 @@ void *_starpu_cpu_worker(void *arg)
 	struct _starpu_worker *worker = arg;
 	struct _starpu_worker *worker = arg;
 
 
 	_starpu_cpu_driver_init(worker);
 	_starpu_cpu_driver_init(worker);
-	_STARPU_TRACE_END_PROGRESS(worker->memory_node);
+	_STARPU_TRACE_START_PROGRESS(worker->memory_node);
 	while (_starpu_machine_is_running())
 	while (_starpu_machine_is_running())
 	{
 	{
 		_starpu_may_pause();
 		_starpu_may_pause();
 		_starpu_cpu_driver_run_once(worker);
 		_starpu_cpu_driver_run_once(worker);
 	}
 	}
-	_STARPU_TRACE_START_PROGRESS(worker->memory_node);
+	_STARPU_TRACE_END_PROGRESS(worker->memory_node);
 	_starpu_cpu_driver_deinit(worker);
 	_starpu_cpu_driver_deinit(worker);
 
 
 	return NULL;
 	return NULL;

+ 10 - 4
src/drivers/cuda/driver_cuda.c

@@ -507,6 +507,12 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worke
 		unsigned workerid = worker->workerid;
 		unsigned workerid = worker->workerid;
 		if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE && !async)
 		if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE && !async)
 			func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
 			func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
+		else if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT && !async)
+			{
+				_SIMGRID_TIMER_BEGIN(1);
+				func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
+				_SIMGRID_TIMER_END;
+			}
 		else
 		else
 			_starpu_simgrid_submit_job(workerid, j, &worker->perf_arch, NAN,
 			_starpu_simgrid_submit_job(workerid, j, &worker->perf_arch, NAN,
 				async ? &task_finished[workerid][pipeline_idx] : NULL);
 				async ? &task_finished[workerid][pipeline_idx] : NULL);
@@ -763,6 +769,7 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 		task = worker->task_transferring;
 		task = worker->task_transferring;
 		if (task && worker->nb_buffers_transferred == worker->nb_buffers_totransfer)
 		if (task && worker->nb_buffers_transferred == worker->nb_buffers_totransfer)
 		{
 		{
+			_STARPU_TRACE_END_PROGRESS(memnode);
 			j = _starpu_get_job_associated_to_task(task);
 			j = _starpu_get_job_associated_to_task(task);
 
 
 			_starpu_set_local_worker_key(worker);
 			_starpu_set_local_worker_key(worker);
@@ -779,10 +786,9 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 			}
 			}
 			else
 			else
 			{
 			{
-				_STARPU_TRACE_END_PROGRESS(memnode);
 				execute_job_on_cuda(task, worker);
 				execute_job_on_cuda(task, worker);
-				_STARPU_TRACE_START_PROGRESS(memnode);
 			}
 			}
+			_STARPU_TRACE_START_PROGRESS(memnode);
 		}
 		}
 
 
 		/* Then test for termination of queued tasks */
 		/* Then test for termination of queued tasks */
@@ -811,6 +817,7 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 		else
 		else
 #endif /* !STARPU_SIMGRID */
 #endif /* !STARPU_SIMGRID */
 		{
 		{
+			_STARPU_TRACE_END_PROGRESS(memnode);
 			/* Asynchronous task completed! */
 			/* Asynchronous task completed! */
 			_starpu_set_local_worker_key(worker);
 			_starpu_set_local_worker_key(worker);
 			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), worker);
 			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), worker);
@@ -831,11 +838,9 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 					 * flushing the pipeline, we can now at
 					 * flushing the pipeline, we can now at
 					 * last execute it.  */
 					 * last execute it.  */
 
 
-					_STARPU_TRACE_END_PROGRESS(memnode);
 					_STARPU_TRACE_EVENT("sync_task");
 					_STARPU_TRACE_EVENT("sync_task");
 					execute_job_on_cuda(task, worker);
 					execute_job_on_cuda(task, worker);
 					_STARPU_TRACE_EVENT("end_sync_task");
 					_STARPU_TRACE_EVENT("end_sync_task");
-					_STARPU_TRACE_START_PROGRESS(memnode);
 					worker->pipeline_stuck = 0;
 					worker->pipeline_stuck = 0;
 				}
 				}
 			}
 			}
@@ -848,6 +853,7 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 				/* Everybody busy */
 				/* Everybody busy */
 				_STARPU_TRACE_END_EXECUTING()
 				_STARPU_TRACE_END_EXECUTING()
 #endif
 #endif
+			_STARPU_TRACE_START_PROGRESS(memnode);
 		}
 		}
 
 
 		if (!worker->pipeline_length || worker->ntasks < worker->pipeline_length)
 		if (!worker->pipeline_length || worker->ntasks < worker->pipeline_length)

+ 23 - 44
src/drivers/driver_common/driver_common.c

@@ -101,6 +101,7 @@ void _starpu_driver_start_job(struct _starpu_worker *worker, struct _starpu_job
 	}
 	}
 	else
 	else
 		_STARPU_TRACE_START_CODELET_BODY(j, j->nimpl, perf_arch, workerid);
 		_STARPU_TRACE_START_CODELET_BODY(j, j->nimpl, perf_arch, workerid);
+	_STARPU_TASK_BREAK_ON(task, exec);
 }
 }
 
 
 void _starpu_driver_end_job(struct _starpu_worker *worker, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)
 void _starpu_driver_end_job(struct _starpu_worker *worker, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)
@@ -358,6 +359,7 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *worker, int w
 			sched_ctx = _starpu_get_sched_ctx_struct(e->sched_ctx);
 			sched_ctx = _starpu_get_sched_ctx_struct(e->sched_ctx);
 			if(sched_ctx && sched_ctx->id > 0 && sched_ctx->id < STARPU_NMAX_SCHED_CTXS)
 			if(sched_ctx && sched_ctx->id > 0 && sched_ctx->id < STARPU_NMAX_SCHED_CTXS)
 			{
 			{
+				STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->parallel_sect_mutex[workerid]);
 				if(!sched_ctx->sched_policy)
 				if(!sched_ctx->sched_policy)
 					worker->is_slave_somewhere = sched_ctx->main_master != workerid;
 					worker->is_slave_somewhere = sched_ctx->main_master != workerid;
 
 
@@ -366,23 +368,18 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *worker, int w
 					/* don't let the worker sleep with the sched_mutex taken */
 					/* don't let the worker sleep with the sched_mutex taken */
 					/* we need it until here bc of the list of ctxs of the workers
 					/* we need it until here bc of the list of ctxs of the workers
 					   that can change in another thread */
 					   that can change in another thread */
+					STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
 					needed = 0;
 					needed = 0;
-					worker->state_blocked = 1;
-					worker->state_wait_ack__blocked = 1;
-					STARPU_PTHREAD_COND_BROADCAST(&worker->sched_cond);
-					do
-					{
-						STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
-					}
-					while (worker->state_wait_ack__blocked);
-					worker->state_blocked = 0;
+					_starpu_sched_ctx_signal_worker_blocked(sched_ctx->id, workerid);
+					sched_ctx->busy[workerid] = 1;
+					STARPU_PTHREAD_COND_WAIT(&sched_ctx->parallel_sect_cond[workerid], &sched_ctx->parallel_sect_mutex[workerid]);
+					sched_ctx->busy[workerid] = 0;
+					STARPU_PTHREAD_COND_SIGNAL(&sched_ctx->parallel_sect_cond_busy[workerid]);
+					_starpu_sched_ctx_signal_worker_woke_up(sched_ctx->id, workerid);
 					sched_ctx->parallel_sect[workerid] = 0;
 					sched_ctx->parallel_sect[workerid] = 0;
-					if (worker->state_wait_handshake__blocked)
-					{
-						worker->state_wait_handshake__blocked = 0;
-						STARPU_PTHREAD_COND_BROADCAST(&worker->sched_cond);
-					}
+					STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
 				}
 				}
+				STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->parallel_sect_mutex[workerid]);
 			}
 			}
 			if(!needed)
 			if(!needed)
 				break;
 				break;
@@ -391,25 +388,21 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *worker, int w
 		if(worker->tmp_sched_ctx != -1)
 		if(worker->tmp_sched_ctx != -1)
 		{
 		{
 			sched_ctx = _starpu_get_sched_ctx_struct(worker->tmp_sched_ctx);
 			sched_ctx = _starpu_get_sched_ctx_struct(worker->tmp_sched_ctx);
+			STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->parallel_sect_mutex[workerid]);
 			if(sched_ctx->parallel_sect[workerid])
 			if(sched_ctx->parallel_sect[workerid])
 			{
 			{
 //				needed = 0;
 //				needed = 0;
-				worker->state_blocked = 1;
-				worker->state_wait_ack__blocked = 1;
-				STARPU_PTHREAD_COND_BROADCAST(&worker->sched_cond);
-				do
-				{
-					STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
-				}
-				while (worker->state_wait_ack__blocked);
-				worker->state_blocked = 0;
+				STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
+				_starpu_sched_ctx_signal_worker_blocked(sched_ctx->id, workerid);
+				sched_ctx->busy[workerid] = 1;
+				STARPU_PTHREAD_COND_WAIT(&sched_ctx->parallel_sect_cond[workerid], &sched_ctx->parallel_sect_mutex[workerid]);
+				sched_ctx->busy[workerid] = 0;
+				STARPU_PTHREAD_COND_SIGNAL(&sched_ctx->parallel_sect_cond_busy[workerid]);
+				_starpu_sched_ctx_signal_worker_woke_up(sched_ctx->id, workerid);
 				sched_ctx->parallel_sect[workerid] = 0;
 				sched_ctx->parallel_sect[workerid] = 0;
-				if (worker->state_wait_handshake__blocked)
-				{
-					worker->state_wait_handshake__blocked = 0;
-					STARPU_PTHREAD_COND_BROADCAST(&worker->sched_cond);
-				}
+				STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
 			}
 			}
+			STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->parallel_sect_mutex[workerid]);
 		}
 		}
 
 
 		needed = !needed;
 		needed = !needed;
@@ -428,11 +421,7 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *worker, int w
 		task = NULL;
 		task = NULL;
 	/*else try to pop a task*/
 	/*else try to pop a task*/
 	else
 	else
-	{
-		_starpu_worker_enter_transient_sched_op(worker);
 		task = _starpu_pop_task(worker);
 		task = _starpu_pop_task(worker);
-		_starpu_worker_leave_transient_sched_op(worker);
-	}
 
 
 #if !defined(STARPU_SIMGRID)
 #if !defined(STARPU_SIMGRID)
 	if (task == NULL && !executing)
 	if (task == NULL && !executing)
@@ -449,11 +438,7 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *worker, int w
 		if (_starpu_worker_can_block(memnode, worker)
 		if (_starpu_worker_can_block(memnode, worker)
 			&& !_starpu_sched_ctx_last_worker_awake(worker))
 			&& !_starpu_sched_ctx_last_worker_awake(worker))
 		{
 		{
-			do
-			{
-				STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
-			}
-			while (worker->status == STATUS_SLEEPING);
+			STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
 			STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
 			STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
 		}
 		}
 		else
 		else
@@ -527,9 +512,7 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 #endif
 #endif
 			_starpu_worker_set_status_scheduling(workers[i].workerid);
 			_starpu_worker_set_status_scheduling(workers[i].workerid);
 			_starpu_set_local_worker_key(&workers[i]);
 			_starpu_set_local_worker_key(&workers[i]);
-			_starpu_worker_enter_transient_sched_op(&workers[i]);
 			tasks[i] = _starpu_pop_task(&workers[i]);
 			tasks[i] = _starpu_pop_task(&workers[i]);
-			_starpu_worker_leave_transient_sched_op(&workers[i]);
 			if(tasks[i] != NULL)
 			if(tasks[i] != NULL)
 			{
 			{
 				_starpu_worker_set_status_scheduling_done(workers[i].workerid);
 				_starpu_worker_set_status_scheduling_done(workers[i].workerid);
@@ -598,11 +581,7 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 		if (_starpu_worker_can_block(memnode, worker)
 		if (_starpu_worker_can_block(memnode, worker)
 				&& !_starpu_sched_ctx_last_worker_awake(worker))
 				&& !_starpu_sched_ctx_last_worker_awake(worker))
 		{
 		{
-			do
-			{
-				STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
-			}
-			while (worker->status == STATUS_SLEEPING);
+			STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
 			STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
 			STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
 		}
 		}
 		else
 		else

+ 1 - 1
src/drivers/mic/driver_mic_sink.c

@@ -39,7 +39,7 @@ void _starpu_mic_sink_init(struct _starpu_mp_node *node)
 	cpu_set_t cpuset;
 	cpu_set_t cpuset;
 	/* We reserve one core for the communications */
 	/* We reserve one core for the communications */
 	/*Bind on the first core*/
 	/*Bind on the first core*/
-	self = pthread_self();
+	self = starpu_pthread_self();
 	CPU_ZERO(&cpuset);
 	CPU_ZERO(&cpuset);
 	CPU_SET(0,&cpuset);
 	CPU_SET(0,&cpuset);
 	pthread_setaffinity_np(self,sizeof(cpu_set_t),&cpuset);
 	pthread_setaffinity_np(self,sizeof(cpu_set_t),&cpuset);

+ 2 - 3
src/drivers/mp_common/mp_common.c

@@ -15,7 +15,6 @@
  */
  */
 
 
 #include <stdlib.h>
 #include <stdlib.h>
-#include <pthread.h>
 
 
 #include <datawizard/interfaces/data_interface.h>
 #include <datawizard/interfaces/data_interface.h>
 #include <drivers/mp_common/mp_common.h>
 #include <drivers/mp_common/mp_common.h>
@@ -400,7 +399,7 @@ void _starpu_mp_common_send_command(const struct _starpu_mp_node *node,
 {
 {
 	STARPU_ASSERT_MSG(arg_size <= BUFFER_SIZE, "Too much data (%d) for the static MIC buffer (%d), increase BUFFER_SIZE perhaps?", arg_size, BUFFER_SIZE);
 	STARPU_ASSERT_MSG(arg_size <= BUFFER_SIZE, "Too much data (%d) for the static MIC buffer (%d), increase BUFFER_SIZE perhaps?", arg_size, BUFFER_SIZE);
 
 
-        //printf("SEND CMD : %d - arg_size %d by %lu \n", command, arg_size, pthread_self());
+        //printf("SEND CMD : %d - arg_size %d by %lu \n", command, arg_size, starpu_pthread_self());
 
 
 	/* MIC and MPI sizes are given through a int */
 	/* MIC and MPI sizes are given through a int */
 	int command_size = sizeof(enum _starpu_mp_command);
 	int command_size = sizeof(enum _starpu_mp_command);
@@ -436,7 +435,7 @@ enum _starpu_mp_command _starpu_mp_common_recv_command(const struct _starpu_mp_n
 	command = *((enum _starpu_mp_command *) node->buffer);
 	command = *((enum _starpu_mp_command *) node->buffer);
 	*arg_size = *((int *) ((uintptr_t)node->buffer + command_size));
 	*arg_size = *((int *) ((uintptr_t)node->buffer + command_size));
 
 
-        //printf("RECV command : %d - arg_size %d by %lu \n", command, *arg_size, pthread_self());
+        //printf("RECV command : %d - arg_size %d by %lu \n", command, *arg_size, starpu_pthread_self());
 
 
 	/* If there is no argument (ie. arg_size == 0),
 	/* If there is no argument (ie. arg_size == 0),
 	 * let's return the command right now */
 	 * let's return the command right now */

+ 0 - 1
src/drivers/mp_common/mp_common.h

@@ -17,7 +17,6 @@
 #ifndef __MP_COMMON_H__
 #ifndef __MP_COMMON_H__
 #define __MP_COMMON_H__
 #define __MP_COMMON_H__
 
 
-#include <pthread.h>
 #include <semaphore.h>
 #include <semaphore.h>
 
 
 #include <starpu.h>
 #include <starpu.h>

+ 4 - 1
src/drivers/mpi/driver_mpi_common.c

@@ -80,7 +80,10 @@ int _starpu_mpi_common_mp_init()
 #endif
 #endif
 
 
                 int thread_support;
                 int thread_support;
-                STARPU_ASSERT(MPI_Init_thread(_starpu_get_argc(), _starpu_get_argv(), required, &thread_support) == MPI_SUCCESS);
+                if (MPI_Init_thread(_starpu_get_argc(), _starpu_get_argv(), required, &thread_support) != MPI_SUCCESS)
+		{
+			STARPU_ABORT_MSG("Cannot Initialize MPI !");
+		}
 
 
                 if (thread_support != required)
                 if (thread_support != required)
                 {
                 {

+ 8 - 0
src/drivers/opencl/driver_opencl.c

@@ -954,6 +954,14 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 			simulate = 1;
 			simulate = 1;
 		#endif
 		#endif
 		}
 		}
+		else if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT && !async)
+			{
+				_SIMGRID_TIMER_BEGIN(1);
+				func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
+				_SIMGRID_TIMER_END;
+				simulate=0;
+			}
+
 		if (simulate)
 		if (simulate)
 			_starpu_simgrid_submit_job(worker->workerid, j, &worker->perf_arch, length,
 			_starpu_simgrid_submit_job(worker->workerid, j, &worker->perf_arch, length,
 						   async ? &task_finished[worker->devid][pipeline_idx] : NULL);
 						   async ? &task_finished[worker->devid][pipeline_idx] : NULL);

+ 7 - 7
src/profiling/bound.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
  * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
- * Copyright (C) 2010-2016  Université de Bordeaux
+ * Copyright (C) 2010-2017  Université de Bordeaux
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011  Télécom-SudParis
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -209,11 +209,11 @@ static double** initialize_arch_duration(int maxdevid, unsigned* maxncore_table)
 static void initialize_duration(struct bound_task *task)
 static void initialize_duration(struct bound_task *task)
 {
 {
 	struct _starpu_machine_config *conf = _starpu_get_machine_config();
 	struct _starpu_machine_config *conf = _starpu_get_machine_config();
-	task->duration[STARPU_CPU_WORKER] = initialize_arch_duration(1,&conf->topology.ncpus); 
-	task->duration[STARPU_CUDA_WORKER] = initialize_arch_duration(conf->topology.ncudagpus,NULL); 
-	task->duration[STARPU_OPENCL_WORKER] = initialize_arch_duration(conf->topology.nopenclgpus,NULL); 
-	task->duration[STARPU_MIC_WORKER] = initialize_arch_duration(conf->topology.nmicdevices,conf->topology.nmiccores); 
-	task->duration[STARPU_SCC_WORKER] = initialize_arch_duration(conf->topology.nsccdevices,NULL); 
+	task->duration[STARPU_CPU_WORKER] = initialize_arch_duration(1,&conf->topology.nhwcpus); 
+	task->duration[STARPU_CUDA_WORKER] = initialize_arch_duration(conf->topology.nhwcudagpus,NULL); 
+	task->duration[STARPU_OPENCL_WORKER] = initialize_arch_duration(conf->topology.nhwopenclgpus,NULL); 
+	task->duration[STARPU_MIC_WORKER] = initialize_arch_duration(conf->topology.nhwmicdevices,conf->topology.nmiccores); 
+	task->duration[STARPU_SCC_WORKER] = initialize_arch_duration(conf->topology.nhwscc,NULL); 
 }
 }
 
 
 static struct starpu_perfmodel_device device =
 static struct starpu_perfmodel_device device =
@@ -278,7 +278,7 @@ void _starpu_bound_record(struct _starpu_job *j)
 	{
 	{
 		struct bound_task_pool *tp;
 		struct bound_task_pool *tp;
 
 
-		_starpu_compute_buffers_footprint(j->task->cl?j->task->cl->model:NULL, STARPU_CPU_WORKER, 0, j);
+		_starpu_compute_buffers_footprint(j->task->cl?j->task->cl->model:NULL, NULL, 0, j);
 
 
 		if (last && last->cl == j->task->cl && last->footprint == j->footprint)
 		if (last && last->cl == j->task->cl && last->footprint == j->footprint)
 			tp = last;
 			tp = last;

+ 2 - 1
src/profiling/profiling_helpers.c

@@ -60,6 +60,7 @@ void _starpu_profiling_bus_helper_display_summary(FILE *stream)
 
 
 		unsigned unit = 0;
 		unsigned unit = 0;
 		double d = convert_to_byte_units(transferred, max_unit, &unit);
 		double d = convert_to_byte_units(transferred, max_unit, &unit);
+		double avg = (transfer_cnt != 0) ? (d / transfer_cnt) : 0;
 
 
 		_starpu_memory_node_get_name(src, src_name, sizeof(src_name));
 		_starpu_memory_node_get_name(src, src_name, sizeof(src_name));
 		_starpu_memory_node_get_name(dst, dst_name, sizeof(dst_name));
 		_starpu_memory_node_get_name(dst, dst_name, sizeof(dst_name));
@@ -67,7 +68,7 @@ void _starpu_profiling_bus_helper_display_summary(FILE *stream)
 		fprintf(stream, "\t%s -> %s", src_name, dst_name);
 		fprintf(stream, "\t%s -> %s", src_name, dst_name);
 		fprintf(stream, "\t%.2lf %s", d, byte_units[unit]);
 		fprintf(stream, "\t%.2lf %s", d, byte_units[unit]);
 		fprintf(stream, "\t%.2lf %s/s", d / elapsed_time, byte_units[unit]);
 		fprintf(stream, "\t%.2lf %s/s", d / elapsed_time, byte_units[unit]);
-		fprintf(stream, "\t(transfers : %lld - avg %.2lf %s)\n", transfer_cnt, d / transfer_cnt, byte_units[unit]);
+		fprintf(stream, "\t(transfers : %lld - avg %.2lf %s)\n", transfer_cnt, avg, byte_units[unit]);
 
 
 		sum_transferred += transferred;
 		sum_transferred += transferred;
 	}
 	}

+ 8 - 2
src/sched_policies/component_worker.c

@@ -503,8 +503,11 @@ static void simple_worker_can_pull(struct starpu_sched_component * worker_compon
 	}
 	}
 	if(_starpu_sched_component_worker_is_sleeping_status(worker_component))
 	if(_starpu_sched_component_worker_is_sleeping_status(worker_component))
 	{
 	{
+		starpu_pthread_mutex_t *sched_mutex;
+		starpu_pthread_cond_t *sched_cond;
+		starpu_worker_get_sched_condition(w->workerid, &sched_mutex, &sched_cond);
 		_starpu_sched_component_unlock_worker(worker_component->tree->sched_ctx_id, w->workerid);
 		_starpu_sched_component_unlock_worker(worker_component->tree->sched_ctx_id, w->workerid);
-		starpu_wake_worker(w->workerid);
+		starpu_wakeup_worker(w->workerid, sched_cond, sched_mutex);
 	}
 	}
 	else
 	else
 		_starpu_sched_component_unlock_worker(worker_component->tree->sched_ctx_id, w->workerid);
 		_starpu_sched_component_unlock_worker(worker_component->tree->sched_ctx_id, w->workerid);
@@ -723,7 +726,10 @@ static void combined_worker_can_pull(struct starpu_sched_component * component)
 		_starpu_sched_component_lock_worker(component->tree->sched_ctx_id, worker);
 		_starpu_sched_component_lock_worker(component->tree->sched_ctx_id, worker);
 		if(_starpu_sched_component_worker_is_sleeping_status(component))
 		if(_starpu_sched_component_worker_is_sleeping_status(component))
 		{
 		{
-			starpu_wake_worker(worker);
+			starpu_pthread_mutex_t *sched_mutex;
+			starpu_pthread_cond_t *sched_cond;
+			starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
+			starpu_wakeup_worker(worker, sched_cond, sched_mutex);
 		}
 		}
 		if(_starpu_sched_component_worker_is_reset_status(component))
 		if(_starpu_sched_component_worker_is_reset_status(component))
 			_starpu_sched_component_worker_set_changed_status(component);
 			_starpu_sched_component_worker_set_changed_status(component);

+ 10 - 3
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -251,6 +251,7 @@ static struct starpu_task *dmda_pop_ready_task(unsigned sched_ctx_id)
 
 
 	/* Take the opportunity to update start time */
 	/* Take the opportunity to update start time */
 	fifo->exp_start = STARPU_MAX(starpu_timing_now(), fifo->exp_start);
 	fifo->exp_start = STARPU_MAX(starpu_timing_now(), fifo->exp_start);
+	fifo->exp_end = fifo->exp_start + fifo->exp_len;
 
 
 	task = _starpu_fifo_pop_first_ready_task(fifo, node, dt->num_priorities);
 	task = _starpu_fifo_pop_first_ready_task(fifo, node, dt->num_priorities);
 	if (task)
 	if (task)
@@ -285,6 +286,7 @@ static struct starpu_task *dmda_pop_task(unsigned sched_ctx_id)
 
 
 	/* Take the opportunity to update start time */
 	/* Take the opportunity to update start time */
 	fifo->exp_start = STARPU_MAX(starpu_timing_now(), fifo->exp_start);
 	fifo->exp_start = STARPU_MAX(starpu_timing_now(), fifo->exp_start);
+	fifo->exp_end = fifo->exp_start + fifo->exp_len;
 
 
 	STARPU_ASSERT_MSG(fifo, "worker %u does not belong to ctx %u anymore.\n", workerid, sched_ctx_id);
 	STARPU_ASSERT_MSG(fifo, "worker %u does not belong to ctx %u anymore.\n", workerid, sched_ctx_id);
 
 
@@ -321,6 +323,7 @@ static struct starpu_task *dmda_pop_every_task(unsigned sched_ctx_id)
 
 
 	/* Take the opportunity to update start time */
 	/* Take the opportunity to update start time */
 	fifo->exp_start = STARPU_MAX(starpu_timing_now(), fifo->exp_start);
 	fifo->exp_start = STARPU_MAX(starpu_timing_now(), fifo->exp_start);
+	fifo->exp_end = fifo->exp_start + fifo->exp_len;
 
 
 	starpu_pthread_mutex_t *sched_mutex;
 	starpu_pthread_mutex_t *sched_mutex;
 	starpu_pthread_cond_t *sched_cond;
 	starpu_pthread_cond_t *sched_cond;
@@ -367,6 +370,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
 
         /* Sometimes workers didn't take the tasks as early as we expected */
         /* Sometimes workers didn't take the tasks as early as we expected */
 	fifo->exp_start = isnan(fifo->exp_start) ? starpu_timing_now() + fifo->pipeline_len : STARPU_MAX(fifo->exp_start, starpu_timing_now());
 	fifo->exp_start = isnan(fifo->exp_start) ? starpu_timing_now() + fifo->pipeline_len : STARPU_MAX(fifo->exp_start, starpu_timing_now());
+	fifo->exp_end = fifo->exp_start + fifo->exp_len;
 
 
 	if ((starpu_timing_now() + predicted_transfer) < fifo->exp_end)
 	if ((starpu_timing_now() + predicted_transfer) < fifo->exp_end)
 	{
 	{
@@ -448,7 +452,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
 
 
 
 #if !defined(STARPU_NON_BLOCKING_DRIVERS) || defined(STARPU_SIMGRID)
 #if !defined(STARPU_NON_BLOCKING_DRIVERS) || defined(STARPU_SIMGRID)
-		starpu_wake_worker_locked(best_workerid);
+		starpu_wakeup_worker_locked(best_workerid, sched_cond, sched_mutex);
 #endif
 #endif
 		starpu_push_task_end(task);
 		starpu_push_task_end(task);
 		STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(sched_mutex);
 		STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(sched_mutex);
@@ -460,7 +464,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 		dt->queue_array[best_workerid]->ntasks++;
 		dt->queue_array[best_workerid]->ntasks++;
 		dt->queue_array[best_workerid]->nprocessed++;
 		dt->queue_array[best_workerid]->nprocessed++;
 #if !defined(STARPU_NON_BLOCKING_DRIVERS) || defined(STARPU_SIMGRID)
 #if !defined(STARPU_NON_BLOCKING_DRIVERS) || defined(STARPU_SIMGRID)
-		starpu_wake_worker_locked(best_workerid);
+		starpu_wakeup_worker_locked(best_workerid, sched_cond, sched_mutex);
 #endif
 #endif
 		starpu_push_task_end(task);
 		starpu_push_task_end(task);
 		STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(sched_mutex);
 		STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(sched_mutex);
@@ -773,7 +777,9 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 			if (unknown)
 			if (unknown)
 				continue;
 				continue;
 
 
-			exp_end[worker_ctx][nimpl] = exp_start + prev_exp_len + local_task_length[worker_ctx][nimpl];
+			double task_starting_time = STARPU_MAX(exp_start + prev_exp_len, starpu_timing_now() + local_data_penalty[worker_ctx][nimpl]); 
+
+			exp_end[worker_ctx][nimpl] = task_starting_time + local_task_length[worker_ctx][nimpl];
 
 
 			if (exp_end[worker_ctx][nimpl] < best_exp_end)
 			if (exp_end[worker_ctx][nimpl] < best_exp_end)
 			{
 			{
@@ -1126,6 +1132,7 @@ static void dmda_pre_exec_hook(struct starpu_task *task, unsigned sched_ctx_id)
 
 
 	/* Take the opportunity to update start time */
 	/* Take the opportunity to update start time */
 	fifo->exp_start = STARPU_MAX(starpu_timing_now() + fifo->pipeline_len, fifo->exp_start);
 	fifo->exp_start = STARPU_MAX(starpu_timing_now() + fifo->pipeline_len, fifo->exp_start);
+	fifo->exp_end = fifo->exp_start + fifo->exp_len;
 
 
 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(sched_mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(sched_mutex);
 }
 }

+ 1 - 1
src/sched_policies/eager_central_policy.c

@@ -201,7 +201,7 @@ static void eager_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nw
 		int workerid = workerids[i];
 		int workerid = workerids[i];
 		int curr_workerid = _starpu_worker_get_id();
 		int curr_workerid = _starpu_worker_get_id();
 		if(workerid != curr_workerid)
 		if(workerid != curr_workerid)
-			starpu_wake_worker_locked(workerid);
+			starpu_wake_worker(workerid);
 
 
 		starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
 		starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
 	}
 	}

+ 1 - 1
src/sched_policies/eager_central_priority_policy.c

@@ -308,7 +308,7 @@ static void eager_center_priority_add_workers(unsigned sched_ctx_id, int *worker
 		int workerid = workerids[i];
 		int workerid = workerids[i];
 		int curr_workerid = _starpu_worker_get_id();
 		int curr_workerid = _starpu_worker_get_id();
 		if(workerid != curr_workerid)
 		if(workerid != curr_workerid)
-			starpu_wake_worker_locked(workerid);
+			starpu_wake_worker(workerid);
 
 
                 starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
                 starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
         }
         }

+ 1 - 1
src/sched_policies/parallel_eager.c

@@ -265,7 +265,7 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 				_starpu_fifo_push_task(data->local_fifo[local_worker], alias);
 				_starpu_fifo_push_task(data->local_fifo[local_worker], alias);
 
 
 #if !defined(STARPU_NON_BLOCKING_DRIVERS) || defined(STARPU_SIMGRID)
 #if !defined(STARPU_NON_BLOCKING_DRIVERS) || defined(STARPU_SIMGRID)
-				starpu_wake_worker_locked(local_worker);
+				starpu_wakeup_worker_locked(local_worker, sched_cond, sched_mutex);
 #endif
 #endif
 				STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(sched_mutex);
 				STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(sched_mutex);
 
 

+ 3 - 1
src/util/fstarpu.c

@@ -85,6 +85,7 @@ static const intptr_t fstarpu_starpu_mic	= STARPU_MIC;
 static const intptr_t fstarpu_starpu_scc	= STARPU_SCC;
 static const intptr_t fstarpu_starpu_scc	= STARPU_SCC;
 
 
 static const intptr_t fstarpu_starpu_codelet_simgrid_execute	= STARPU_CODELET_SIMGRID_EXECUTE;
 static const intptr_t fstarpu_starpu_codelet_simgrid_execute	= STARPU_CODELET_SIMGRID_EXECUTE;
+static const intptr_t fstarpu_starpu_codelet_simgrid_execute_and_inject	= STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT;
 static const intptr_t fstarpu_starpu_cuda_async	= STARPU_CUDA_ASYNC;
 static const intptr_t fstarpu_starpu_cuda_async	= STARPU_CUDA_ASYNC;
 static const intptr_t fstarpu_starpu_opencl_async	= STARPU_OPENCL_ASYNC;
 static const intptr_t fstarpu_starpu_opencl_async	= STARPU_OPENCL_ASYNC;
 
 
@@ -153,6 +154,7 @@ intptr_t fstarpu_get_constant(char *s)
 	else if (!strcmp(s, "FSTARPU_SCC"))	{ return fstarpu_starpu_scc; }
 	else if (!strcmp(s, "FSTARPU_SCC"))	{ return fstarpu_starpu_scc; }
 
 
 	else if (!strcmp(s, "FSTARPU_CODELET_SIMGRID_EXECUTE"))	{ return fstarpu_starpu_codelet_simgrid_execute; }
 	else if (!strcmp(s, "FSTARPU_CODELET_SIMGRID_EXECUTE"))	{ return fstarpu_starpu_codelet_simgrid_execute; }
+	else if (!strcmp(s, "FSTARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT"))	{ return fstarpu_starpu_codelet_simgrid_execute_and_inject; }
 	else if (!strcmp(s, "FSTARPU_CUDA_ASYNC"))	{ return fstarpu_starpu_cuda_async; }
 	else if (!strcmp(s, "FSTARPU_CUDA_ASYNC"))	{ return fstarpu_starpu_cuda_async; }
 	else if (!strcmp(s, "FSTARPU_OPENCL_ASYNC"))	{ return fstarpu_starpu_opencl_async; }
 	else if (!strcmp(s, "FSTARPU_OPENCL_ASYNC"))	{ return fstarpu_starpu_opencl_async; }
 
 
@@ -542,7 +544,7 @@ void fstarpu_worker_get_type_as_string(intptr_t type, char *dst, size_t maxlen)
 	snprintf(dst, maxlen, "%s", str);
 	snprintf(dst, maxlen, "%s", str);
 }
 }
 
 
-struct starpu_data_handle *fstarpu_data_handle_array_alloc(int nb)
+starpu_data_handle_t *fstarpu_data_handle_array_alloc(int nb)
 {
 {
 	void *ptr;
 	void *ptr;
 	_STARPU_CALLOC(ptr, (size_t)nb, sizeof(starpu_data_handle_t));
 	_STARPU_CALLOC(ptr, (size_t)nb, sizeof(starpu_data_handle_t));

+ 1 - 1
starpu.mk

@@ -16,7 +16,7 @@
 
 
 if STARPU_USE_MPI_MASTER_SLAVE
 if STARPU_USE_MPI_MASTER_SLAVE
 MPI_LAUNCHER 			= $(MPIEXEC)  $(MPIEXEC_ARGS) -np 4
 MPI_LAUNCHER 			= $(MPIEXEC)  $(MPIEXEC_ARGS) -np 4
-MPI_RUN_ARGS			= STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4
+MPI_RUN_ARGS			= STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 STARPU_NMPIMSTHREADS=4
 endif
 endif
 
 
 showcheck:
 showcheck:

+ 3 - 0
tests/Makefile.am

@@ -34,6 +34,7 @@ AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@ $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFL
 
 
 EXTRA_DIST =					\
 EXTRA_DIST =					\
 	helper.h				\
 	helper.h				\
+	datawizard/locality.sh			\
 	datawizard/scal.h			\
 	datawizard/scal.h			\
 	datawizard/mpi_like.h			\
 	datawizard/mpi_like.h			\
 	microbenchs/tasks_size_overhead.sh	\
 	microbenchs/tasks_size_overhead.sh	\
@@ -381,6 +382,8 @@ if STARPU_SIMGRID
 TESTS += $(MICROBENCHS:=.sh)
 TESTS += $(MICROBENCHS:=.sh)
 endif
 endif
 
 
+TESTS += datawizard/locality.sh
+
 #######################
 #######################
 # Source files        #
 # Source files        #
 #######################
 #######################

+ 33 - 0
tests/datawizard/locality.sh

@@ -0,0 +1,33 @@
+#!/bin/bash -x
+#
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2017  Université de Bordeaux
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+# Test generation of FxT traces
+
+set -e
+
+PREFIX=$(dirname $0)
+test -x $PREFIX/../../tools/starpu_fxt_tool || exit 77
+STARPU_FXT_PREFIX=$PREFIX/ $PREFIX/locality
+$PREFIX/../../tools/starpu_fxt_tool -i $PREFIX/prof_file_${USER}_0
+
+# Check that they are approved by Grenoble :)
+
+if type pj_dump > /dev/null 2> /dev/null
+then
+	$PREFIX/../../tools/starpu_paje_sort paje.trace
+	pj_dump paje.trace
+fi

+ 3 - 2
tools/starpu_fxt_tool.c

@@ -30,8 +30,9 @@ static void usage()
 	fprintf(stderr, "Usage: %s [ options ]\n", PROGNAME);
 	fprintf(stderr, "Usage: %s [ options ]\n", PROGNAME);
         fprintf(stderr, "\n");
         fprintf(stderr, "\n");
         fprintf(stderr, "Options:\n");
         fprintf(stderr, "Options:\n");
-	fprintf(stderr, "   -i <input file>     specify the input file. This can be specified several\n");
-	fprintf(stderr, "                       times for MPI execution case\n");
+	fprintf(stderr, "   -i <input file[s]>  specify the input file[s]. Several files can be provided,\n");
+	fprintf(stderr, "                       or the option specified several times for MPI execution\n");
+	fprintf(stderr, "                       case\n");
         fprintf(stderr, "   -o <output file>    specify the output file\n");
         fprintf(stderr, "   -o <output file>    specify the output file\n");
         fprintf(stderr, "   -c                  use a different colour for every type of task\n");
         fprintf(stderr, "   -c                  use a different colour for every type of task\n");
 	fprintf(stderr, "   -no-events          do not show events\n");
 	fprintf(stderr, "   -no-events          do not show events\n");