Corentin Salingue 8 éve
szülő
commit
134f1eb906
70 módosított fájl, 1232 hozzáadás és 716 törlés
  1. 1 0
      ChangeLog
  2. 9 2
      configure.ac
  3. 3 2
      doc/doxygen/chapters/320_scheduling.doxy
  4. 1 1
      doc/doxygen/chapters/380_offline_performance_tools.doxy
  5. 4 3
      doc/doxygen/chapters/470_simgrid.doxy
  6. 20 10
      doc/doxygen/chapters/501_environment_variables.doxy
  7. 5 0
      doc/doxygen/chapters/api/codelet_and_tasks.doxy
  8. 12 3
      examples/Makefile.am
  9. 105 79
      examples/heat/dw_sparse_cg.c
  10. 7 3
      examples/heat/heat.c
  11. 43 0
      examples/heat/heat.sh
  12. 34 0
      examples/lu/lu.sh
  13. 3 1
      examples/lu/lu_example.c
  14. 4 0
      examples/lu/xlu_implicit_pivot.c
  15. 5 0
      examples/lu/xlu_pivot.c
  16. 46 20
      examples/mlr/mlr.c
  17. 5 3
      examples/sched_ctx/gpu_partition.c
  18. 2 3
      examples/stencil/stencil-blocks.c
  19. 5 2
      include/fstarpu_mod.f90
  20. 2 1
      include/starpu_config.h.in
  21. 3 0
      include/starpu_scheduler.h
  22. 1 0
      include/starpu_task.h
  23. 41 10
      include/starpu_thread.h
  24. 109 0
      mpi/dev/starpu_mpi_comm_check.sh
  25. 1 1
      sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c
  26. 3 3
      socl/src/cl_createkernel.c
  27. 2 2
      socl/src/cl_createprogramwithsource.c
  28. 6 6
      socl/src/cl_enqueuendrangekernel.c
  29. 3 3
      socl/src/cl_enqueuereadbuffer.c
  30. 3 3
      socl/src/cl_enqueuewritebuffer.c
  31. 1 1
      socl/src/cl_setkernelarg.c
  32. 1 1
      socl/src/task.c
  33. 1 1
      src/common/fxt.c
  34. 25 3
      src/common/prio_list.h
  35. 2 3
      src/common/starpu_spinlock.c
  36. 102 29
      src/common/thread.c
  37. 3 4
      src/common/utils.h
  38. 2 2
      src/core/jobs.c
  39. 308 289
      src/core/sched_ctx.c
  40. 41 42
      src/core/sched_ctx.h
  41. 11 6
      src/core/sched_policy.c
  42. 3 2
      src/core/sched_policy.h
  43. 4 4
      src/core/simgrid.h
  44. 6 11
      src/core/topology.c
  45. 37 21
      src/core/workers.c
  46. 20 47
      src/core/workers.h
  47. 16 0
      src/datawizard/malloc.c
  48. 5 2
      src/datawizard/memory_nodes.c
  49. 21 5
      src/debug/traces/starpu_fxt.c
  50. 6 6
      src/debug/traces/starpu_paje.c
  51. 8 2
      src/drivers/cpu/driver_cpu.c
  52. 10 4
      src/drivers/cuda/driver_cuda.c
  53. 23 44
      src/drivers/driver_common/driver_common.c
  54. 1 1
      src/drivers/mic/driver_mic_sink.c
  55. 2 3
      src/drivers/mp_common/mp_common.c
  56. 0 1
      src/drivers/mp_common/mp_common.h
  57. 4 1
      src/drivers/mpi/driver_mpi_common.c
  58. 8 0
      src/drivers/opencl/driver_opencl.c
  59. 7 7
      src/profiling/bound.c
  60. 2 1
      src/profiling/profiling_helpers.c
  61. 8 2
      src/sched_policies/component_worker.c
  62. 10 3
      src/sched_policies/deque_modeling_policy_data_aware.c
  63. 1 1
      src/sched_policies/eager_central_policy.c
  64. 1 1
      src/sched_policies/eager_central_priority_policy.c
  65. 1 1
      src/sched_policies/parallel_eager.c
  66. 3 1
      src/util/fstarpu.c
  67. 1 1
      starpu.mk
  68. 3 0
      tests/Makefile.am
  69. 33 0
      tests/datawizard/locality.sh
  70. 3 2
      tools/starpu_fxt_tool.c

+ 1 - 0
ChangeLog

@@ -273,6 +273,7 @@ Small features:
     allows to copy in a new buffer values which have not been unpacked by
     the current call
   * Add STARPU_CODELET_SIMGRID_EXECUTE flag.
+  * Add STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT flag.
   * Add STARPU_CL_ARGS flag to starpu_task_insert() and
     starpu_mpi_task_insert() functions call
 

+ 9 - 2
configure.ac

@@ -348,9 +348,10 @@ else
     build_mpi_master_slave=no
 fi
 
-#Warn users that they cannot use both at the same time
+#users cannot use both at the same time
 if test x$build_mpi_master_slave = xyes -a x$enable_mpi = xyes; then
-    AC_MSG_WARN(StarPU-MPI and MPI Master-Slave cannot be used at the same time !)
+    AC_MSG_WARN(StarPU-MPI and MPI Master-Slave cannot be used at the same time ! Disabling StarPU-MPI...)
+	enable_mpi=no
 fi
 
 if test x$build_mpi_master_slave = xyes; then
@@ -3117,6 +3118,12 @@ AC_CONFIG_COMMANDS([executable-scripts], [
   test -e tests/microbenchs/parallel_independent_heterogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_heterogeneous_tasks.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_independent_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks_data.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_independent_homogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks.sh tests/microbenchs/
+  mkdir -p tests/datawizard
+  test -e tests/datawizard/locality.sh || ln -sf $ac_abs_top_srcdir/tests/datawizard/locality.sh tests/datawizard/
+  mkdir -p examples/heat
+  test -e examples/heat/heat.sh || ln -sf $ac_abs_top_srcdir/examples/heat/heat.sh examples/heat/
+  mkdir -p examples/lu
+  test -e examples/lu/lu.sh || ln -sf $ac_abs_top_srcdir/examples/lu/lu.sh examples/lu/
 ])
 
 # Create links to ICD files in build/socl/vendors directory. SOCL will use this

+ 3 - 2
doc/doxygen/chapters/320_scheduling.doxy

@@ -285,8 +285,9 @@ be used to get information about how well the execution proceeded, and thus the
 overall quality of the execution.
 
 Precise debugging can also be performed by using the
-\ref STARPU_TASK_BREAK_ON_SCHED, \ref STARPU_TASK_BREAK_ON_PUSH, and
-\ref STARPU_TASK_BREAK_ON_POP environment variables. By setting the job_id of a task
+\ref STARPU_TASK_BREAK_ON_PUSH, \ref STARPU_TASK_BREAK_ON_SCHED,
+\ref STARPU_TASK_BREAK_ON_POP, and \ref STARPU_TASK_BREAK_ON_EXEC environment variables.
+By setting the job_id of a task
 in these environment variables, StarPU will raise <c>SIGTRAP</c> when the task is being
 scheduled, pushed, or popped by the scheduler. That means that when one notices
 that a task is being scheduled in a seemingly odd way, one can just reexecute

+ 1 - 1
doc/doxygen/chapters/380_offline_performance_tools.doxy

@@ -129,7 +129,7 @@ collect the trace files from the MPI nodes, and
 specify them all on the command <c>starpu_fxt_tool</c>, for instance:
 
 \verbatim
-$ starpu_fxt_tool -i /tmp/prof_file_something1 -i /tmp/prof_file_something2
+$ starpu_fxt_tool -i /tmp/prof_file_something*
 \endverbatim
 
 By default, all tasks are displayed using a green color. To display tasks with

+ 4 - 3
doc/doxygen/chapters/470_simgrid.doxy

@@ -9,8 +9,8 @@
 /*! \page SimGridSupport SimGrid Support
 
 StarPU can use Simgrid in order to simulate execution on an arbitrary
-platform. This was tested with simgrid 3.11, 3.12, 3.13, 3.14, and 3.14.159, other versions may have
-compatibility issues.
+platform. This was tested with simgrid from 3.11 to 3.15,
+other versions may have compatibility issues.
 
 \section Preparing Preparing Your Application For Simulation
 
@@ -36,7 +36,8 @@ To be able to run the application with e.g. CUDA simulation on a system which
 does not have CUDA installed, one can fill the cuda_funcs with (void*)1, to
 express that there is a CUDA implementation, even if one does not actually
 provide it. StarPU will not actually run it in Simgrid mode anyway by default
-(unless the ::STARPU_CODELET_SIMGRID_EXECUTE flag is set in the codelet)
+(unless the ::STARPU_CODELET_SIMGRID_EXECUTE or ::STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT
+flags are set in the codelet)
 
 \snippet simgrid.c To be included. You should update doxygen if you see this text.
 

+ 20 - 10
doc/doxygen/chapters/501_environment_variables.doxy

@@ -642,9 +642,10 @@ especially regarding data transfers.
 <dd>
 \anchor STARPU_SIMGRID_SCHED_COST
 \addindex __env__STARPU_SIMGRID_SCHED_COST
-When set to 1 (which is the default), scheduling costs are taken into
+When set to 1 (0 is the default), scheduling costs are taken into
 account in simgrid mode. This provides more accurate simgrid predictions,
-and allows studying scheduling overhead of the runtime system.
+and allows studying scheduling overhead of the runtime system. However,
+it also makes simulation non-deterministic.
 </dd>
 
 </dl>
@@ -1021,6 +1022,15 @@ dog is reached, thus allowing to catch the situation in gdb, etc
 (see \ref DetectionStuckConditions)
 </dd>
 
+<dt>STARPU_TASK_BREAK_ON_PUSH</dt>
+<dd>
+\anchor STARPU_TASK_BREAK_ON_PUSH
+\addindex __env__STARPU_TASK_BREAK_ON_PUSH
+When this variable contains a job id, StarPU will raise SIGTRAP when the task
+with that job id is being pushed to the scheduler, which will be nicely catched by debuggers
+(see \ref DebuggingScheduling)
+</dd>
+
 <dt>STARPU_TASK_BREAK_ON_SCHED</dt>
 <dd>
 \anchor STARPU_TASK_BREAK_ON_SCHED
@@ -1032,21 +1042,21 @@ This only works for schedulers which have such a scheduling point defined
 (see \ref DebuggingScheduling)
 </dd>
 
-<dt>STARPU_TASK_BREAK_ON_PUSH</dt>
+<dt>STARPU_TASK_BREAK_ON_POP</dt>
 <dd>
-\anchor STARPU_TASK_BREAK_ON_PUSH
-\addindex __env__STARPU_TASK_BREAK_ON_PUSH
+\anchor STARPU_TASK_BREAK_ON_POP
+\addindex __env__STARPU_TASK_BREAK_ON_POP
 When this variable contains a job id, StarPU will raise SIGTRAP when the task
-with that job id is being pushed to the scheduler, which will be nicely catched by debuggers
+with that job id is being popped from the scheduler, which will be nicely catched by debuggers
 (see \ref DebuggingScheduling)
 </dd>
 
-<dt>STARPU_TASK_BREAK_ON_POP</dt>
+<dt>STARPU_TASK_BREAK_ON_EXEC</dt>
 <dd>
-\anchor STARPU_TASK_BREAK_ON_POP
-\addindex __env__STARPU_TASK_BREAK_ON_POP
+\anchor STARPU_TASK_BREAK_ON_EXEC
+\addindex __env__STARPU_TASK_BREAK_ON_EXEC
 When this variable contains a job id, StarPU will raise SIGTRAP when the task
-with that job id is being popped from the scheduler, which will be nicely catched by debuggers
+with that job id is being executed, which will be nicely catched by debuggers
 (see \ref DebuggingScheduling)
 </dd>
 

+ 5 - 0
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -135,6 +135,11 @@ Value to be set in starpu_codelet::opencl_flags to allow asynchronous OpenCL ker
 \ingroup API_Codelet_And_Tasks
 Value to be set in starpu_codelet::flags to execute the codelet functions even in simgrid mode.
 
+\def STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT
+\ingroup API_Codelet_And_Tasks
+Value to be set in starpu_codelet::flags to execute the codelet functions even in simgrid mode,
+and later inject the measured timing inside the simulation.
+
 \typedef starpu_cpu_func_t
 \ingroup API_Codelet_And_Tasks
 CPU implementation of a codelet.

+ 12 - 3
examples/Makefile.am

@@ -77,11 +77,13 @@ EXTRA_DIST = 					\
 	scheduler/schedulers.sh				\
 	scheduler/schedulers_context.sh			\
 	fortran/Makefile				\
-	sched_ctx/axpy_partition_gpu.h				\
-	sched_ctx/axpy_partition_gpu.cu
+	sched_ctx/axpy_partition_gpu.h			\
+	sched_ctx/axpy_partition_gpu.cu			\
+	heat/heat.sh					\
+	lu/lu.sh
 
 
-CLEANFILES = *.gcno *.gcda *.linkinfo *.mod starpu_idle_microsec.log
+CLEANFILES = *.gcno *.gcda *.linkinfo *.mod starpu_idle_microsec.log *.mps *.dot *.pl
 
 if STARPU_USE_CUDA
 
@@ -300,6 +302,13 @@ STARPU_EXAMPLES +=				\
 	heat/heat				\
 	cg/cg					\
 	pipeline/pipeline
+
+if !STARPU_USE_MPI_MASTER_SLAVE
+TESTS += \
+	heat/heat.sh				\
+	lu/lu.sh
+
+endif
 endif
 endif
 

+ 105 - 79
examples/heat/dw_sparse_cg.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2011, 2015  Université de Bordeaux
+ * Copyright (C) 2009, 2010, 2011, 2015, 2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -25,11 +25,7 @@
 
 static struct starpu_task *create_task(starpu_tag_t id)
 {
-	struct starpu_codelet *cl = calloc(1,sizeof(struct starpu_codelet));
-
 	struct starpu_task *task = starpu_task_create();
-		task->cl = cl;
-		task->cl_arg = NULL;
 		task->use_tag = 1;
 		task->tag_id = id;
 
@@ -131,6 +127,30 @@ void init_problem(void)
  *	cg initialization phase
  */
 
+static struct starpu_codelet cl1 = {
+	.cpu_funcs = { cpu_codelet_func_1 },
+	.cpu_funcs_name = { "cpu_codelet_func_1" },
+	.nbuffers = 4,
+	.modes = { STARPU_R, STARPU_R, STARPU_W, STARPU_R },
+};
+
+static struct starpu_codelet cl2 = {
+	.cpu_funcs = { cpu_codelet_func_2 },
+	.cpu_funcs_name = { "cpu_codelet_func_2" },
+	.nbuffers = 2,
+	.modes = { STARPU_W, STARPU_R },
+};
+
+static struct starpu_codelet cl3 = {
+	.cpu_funcs = { cpu_codelet_func_3 },
+	.cpu_funcs_name = { "cpu_codelet_func_3" },
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = { cublas_codelet_func_3 },
+#endif
+	.nbuffers = 1,
+	.modes = { STARPU_R },
+};
+
 void init_cg(struct cg_problem *problem)
 {
 	int ret;
@@ -139,14 +159,7 @@ void init_cg(struct cg_problem *problem)
 
 	/* r = b  - A x */
 	struct starpu_task *task1 = create_task(1UL);
-	task1->cl->cpu_funcs[0] = cpu_codelet_func_1;
-	task1->cl->cpu_funcs_name[0] = "cpu_codelet_func_1";
-	task1->cl->nbuffers = 4;
-	task1->cl->modes[0] = STARPU_R;
-	task1->cl->modes[1] = STARPU_R;
-	task1->cl->modes[2] = STARPU_W;
-	task1->cl->modes[3] = STARPU_R;
-
+	task1->cl = &cl1;
 	task1->handles[0] = problem->ds_matrixA;
 	task1->handles[1] = problem->ds_vecx;
 	task1->handles[2] = problem->ds_vecr;
@@ -154,12 +167,7 @@ void init_cg(struct cg_problem *problem)
 
 	/* d = r */
 	struct starpu_task *task2 = create_task(2UL);
-	task2->cl->cpu_funcs[0] = cpu_codelet_func_2;
-	task2->cl->cpu_funcs_name[0] = "cpu_codelet_func_2";
-	task2->cl->nbuffers = 2;
-	task2->cl->modes[0] = STARPU_W;
-	task2->cl->modes[1] = STARPU_R;
-
+	task2->cl = &cl2;
 	task2->handles[0] = problem->ds_vecd;
 	task2->handles[1] = problem->ds_vecr;
 
@@ -167,15 +175,9 @@ void init_cg(struct cg_problem *problem)
 
 	/* delta_new = trans(r) r */
 	struct starpu_task *task3 = create_task(3UL);
-#ifdef STARPU_USE_CUDA
-	task3->cl->cuda_funcs[0] = cublas_codelet_func_3;
-#endif
-	task3->cl->cpu_funcs[0] = cpu_codelet_func_3;
-	task3->cl->cpu_funcs_name[0] = "cpu_codelet_func_3";
+	task3->cl = &cl3;
 	task3->cl_arg = problem;
 	task3->cl_arg_size = sizeof(*problem);
-	task3->cl->nbuffers = 1;
-	task3->cl->modes[0] = STARPU_R;
 	task3->handles[0] = problem->ds_vecr;
 
 	task3->callback_func = iteration_cg;
@@ -186,6 +188,11 @@ void init_cg(struct cg_problem *problem)
 
 	/* launch the computation now */
 	ret = starpu_task_submit(task1);
+	if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
+		FPRINTF(stderr, "No worker may execute this task\n");
+		exit(0);
+	}
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 	ret = starpu_task_submit(task2);
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
@@ -198,6 +205,66 @@ void init_cg(struct cg_problem *problem)
  *		the codelet code launcher is its own callback !
  */
 
+static struct starpu_codelet cl4 = {
+	.cpu_funcs = { cpu_codelet_func_4 },
+	.cpu_funcs_name = { "cpu_codelet_func_4" },
+	.nbuffers = 3,
+	.modes = { STARPU_R, STARPU_R, STARPU_W },
+};
+
+static struct starpu_codelet cl5 = {
+	.cpu_funcs = { cpu_codelet_func_5 },
+	.cpu_funcs_name = { "cpu_codelet_func_5" },
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = { cublas_codelet_func_5 },
+#endif
+	.nbuffers = 2,
+	.modes = { STARPU_R, STARPU_R },
+};
+
+static struct starpu_codelet cl6 = {
+	.cpu_funcs = { cpu_codelet_func_6 },
+	.cpu_funcs_name = { "cpu_codelet_func_6" },
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = { cublas_codelet_func_6 },
+	.cuda_flags = { STARPU_CUDA_ASYNC },
+#endif
+	.nbuffers = 2,
+	.modes = { STARPU_RW, STARPU_R },
+};
+
+static struct starpu_codelet cl7 = {
+	.cpu_funcs = { cpu_codelet_func_7 },
+	.cpu_funcs_name = { "cpu_codelet_func_7" },
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = { cublas_codelet_func_7 },
+	.cuda_flags = { STARPU_CUDA_ASYNC },
+#endif
+	.nbuffers = 2,
+	.modes = { STARPU_RW, STARPU_R },
+};
+
+static struct starpu_codelet cl8 = {
+	.cpu_funcs = { cpu_codelet_func_8 },
+	.cpu_funcs_name = { "cpu_codelet_func_8" },
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = { cublas_codelet_func_8 },
+#endif
+	.nbuffers = 1,
+	.modes = { STARPU_R },
+};
+
+static struct starpu_codelet cl9 = {
+	.cpu_funcs = { cpu_codelet_func_9 },
+	.cpu_funcs_name = { "cpu_codelet_func_9" },
+#ifdef STARPU_USE_CUDA
+	.cuda_funcs = { cublas_codelet_func_9 },
+	.cuda_flags = { STARPU_CUDA_ASYNC },
+#endif
+	.nbuffers = 2,
+	.modes = { STARPU_RW, STARPU_R },
+};
+
 void launch_new_cg_iteration(struct cg_problem *problem)
 {
 	int ret;
@@ -208,30 +275,16 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
 	/* q = A d */
 	struct starpu_task *task4 = create_task(maskiter | 4UL);
-	task4->cl->cpu_funcs[0] = cpu_codelet_func_4;
-	task4->cl->cpu_funcs_name[0] = "cpu_codelet_func_4";
-	task4->cl->nbuffers = 3;
-	task4->cl->modes[0] = STARPU_R;
-	task4->cl->modes[1] = STARPU_R;
-	task4->cl->modes[2] = STARPU_W;
-
+	task4->cl = &cl4;
 	task4->handles[0] = problem->ds_matrixA;
 	task4->handles[1] = problem->ds_vecd;
 	task4->handles[2] = problem->ds_vecq;
 
 	/* alpha = delta_new / ( trans(d) q )*/
 	struct starpu_task *task5 = create_task(maskiter | 5UL);
-#ifdef STARPU_USE_CUDA
-	task5->cl->cuda_funcs[0] = cublas_codelet_func_5;
-#endif
-	task5->cl->cpu_funcs[0] = cpu_codelet_func_5;
-	task5->cl->cpu_funcs_name[0] = "cpu_codelet_func_5";
+	task5->cl = &cl5;
 	task5->cl_arg = problem;
 	task5->cl_arg_size = sizeof(*problem);
-	task5->cl->nbuffers = 2;
-	task5->cl->modes[0] = STARPU_R;
-	task5->cl->modes[1] = STARPU_R;
-
 	task5->handles[0] = problem->ds_vecd;
 	task5->handles[1] = problem->ds_vecq;
 
@@ -239,18 +292,9 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
 	/* x = x + alpha d */
 	struct starpu_task *task6 = create_task(maskiter | 6UL);
-#ifdef STARPU_USE_CUDA
-	task6->cl->cuda_funcs[0] = cublas_codelet_func_6;
-	task6->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
-#endif
-	task6->cl->cpu_funcs[0] = cpu_codelet_func_6;
-	task6->cl->cpu_funcs_name[0] = "cpu_codelet_func_6";
+	task6->cl = &cl6;
 	task6->cl_arg = problem;
 	task6->cl_arg_size = sizeof(*problem);
-	task6->cl->nbuffers = 2;
-	task6->cl->modes[0] = STARPU_RW;
-	task6->cl->modes[1] = STARPU_R;
-
 	task6->handles[0] = problem->ds_vecx;
 	task6->handles[1] = problem->ds_vecd;
 
@@ -258,18 +302,9 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
 	/* r = r - alpha q */
 	struct starpu_task *task7 = create_task(maskiter | 7UL);
-#ifdef STARPU_USE_CUDA
-	task7->cl->cuda_funcs[0] = cublas_codelet_func_7;
-	task7->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
-#endif
-	task7->cl->cpu_funcs[0] = cpu_codelet_func_7;
-	task7->cl->cpu_funcs_name[0] = "cpu_codelet_func_7";
+	task7->cl = &cl7;
 	task7->cl_arg = problem;
 	task7->cl_arg_size = sizeof(*problem);
-	task7->cl->nbuffers = 2;
-	task7->cl->modes[0] = STARPU_RW;
-	task7->cl->modes[1] = STARPU_R;
-
 	task7->handles[0] = problem->ds_vecr;
 	task7->handles[1] = problem->ds_vecq;
 
@@ -277,33 +312,18 @@ void launch_new_cg_iteration(struct cg_problem *problem)
 
 	/* update delta_* and compute beta */
 	struct starpu_task *task8 = create_task(maskiter | 8UL);
-#ifdef STARPU_USE_CUDA
-	task8->cl->cuda_funcs[0] = cublas_codelet_func_8;
-#endif
-	task8->cl->cpu_funcs[0] = cpu_codelet_func_8;
-	task8->cl->cpu_funcs_name[0] = "cpu_codelet_func_8";
+	task8->cl = &cl8;
 	task8->cl_arg = problem;
 	task8->cl_arg_size = sizeof(*problem);
-	task8->cl->nbuffers = 1;
-	task8->cl->modes[0] = STARPU_R;
 	task8->handles[0] = problem->ds_vecr;
 
 	starpu_tag_declare_deps((starpu_tag_t)(maskiter | 8UL), 1, (starpu_tag_t)(maskiter | 7UL));
 
 	/* d = r + beta d */
 	struct starpu_task *task9 = create_task(maskiter | 9UL);
-#ifdef STARPU_USE_CUDA
-	task9->cl->cuda_funcs[0] = cublas_codelet_func_9;
-	task9->cl->cuda_flags[0] = STARPU_CUDA_ASYNC;
-#endif
-	task9->cl->cpu_funcs[0] = cpu_codelet_func_9;
-	task9->cl->cpu_funcs_name[0] = "cpu_codelet_func_9";
+	task9->cl = &cl9;
 	task9->cl_arg = problem;
 	task9->cl_arg_size = sizeof(*problem);
-	task9->cl->nbuffers = 2;
-	task9->cl->modes[0] = STARPU_RW;
-	task9->cl->modes[1] = STARPU_R;
-
 	task9->handles[0] = problem->ds_vecd;
 	task9->handles[1] = problem->ds_vecr;
 
@@ -427,6 +447,10 @@ void conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz,
 	starpu_data_unregister(ds_vecr);
 	starpu_data_unregister(ds_vecd);
 	starpu_data_unregister(ds_vecq);
+
+	free(ptr_vecr);
+	free(ptr_vecd);
+	free(ptr_vecq);
 }
 
 
@@ -444,4 +468,6 @@ void do_conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz
 	starpu_cublas_init();
 
 	conjugate_gradient(nzvalA, vecb, vecx, nnz, nrow, colind, rowptr);
+
+	starpu_shutdown();
 }

+ 7 - 3
examples/heat/heat.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2012, 2015  Université de Bordeaux
+ * Copyright (C) 2009, 2010, 2012, 2015, 2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -121,9 +121,9 @@ static void parse_args(int argc, char **argv)
 			STARPU_ASSERT((nthick - 2)*(ntheta - 2) == size);
 		}
 
-		if (strcmp(argv[i], "-h") == 0)
+		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-help") == 0)
 		{
-			printf("usage : %s [-v1|-v2|-v3] [-pin] [-nthick number] [-ntheta number] [-shape [0|1|2]] [-cg] [-size number] [-no-prio]\n", argv[0]);
+			printf("usage : %s [-v1|-v2|-v3|-v4] [-pin] [-nthick number] [-ntheta number] [-shape [0|1|2]] [-cg] [-size number] [-no-prio]\n", argv[0]);
 		}
 	}
 }
@@ -751,6 +751,10 @@ int main(int argc, char **argv)
 			result[TRANSLATE(i)] = Bformer[TRANSLATE(i)];
 		}
 
+		free(nzval);
+		free(colind);
+		free(rowptr);
+		free(B);
 	}
 	else
 	{

+ 43 - 0
examples/heat/heat.sh

@@ -0,0 +1,43 @@
+#!/bin/bash
+#
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2017  Université de Bordeaux
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+# Test various LU options
+
+set -e
+
+PREFIX=$(dirname $0)
+
+$PREFIX/heat -shape 0
+$PREFIX/heat -shape 1
+# sometimes lead to pivot being 0
+#$PREFIX/heat -shape 2
+
+$PREFIX/heat -cg
+
+# TODO: FIXME
+
+# segfault
+#$PREFIX/heat -v1
+
+# (actually the default...)
+$PREFIX/heat -v2
+
+# hang
+#$PREFIX/heat -v3
+
+# hang
+#$PREFIX/heat -v4

+ 34 - 0
examples/lu/lu.sh

@@ -0,0 +1,34 @@
+#!/bin/bash
+#
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2017  Université de Bordeaux
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+# Test various LU options
+
+set -e
+
+PREFIX=$(dirname $0)
+
+$PREFIX/lu_implicit_example_float -size $((960 * 4)) -nblocks 4 -piv
+$PREFIX/lu_implicit_example_float -size $((960 * 4)) -nblocks 4 -no-stride
+$PREFIX/lu_implicit_example_float -size $((960 * 4)) -nblocks 4 -bound
+$PREFIX/lu_implicit_example_float -size $((960 * 2)) -nblocks 2 -bounddeps
+$PREFIX/lu_implicit_example_float -size $((960 * 2)) -nblocks 2 -bound -bounddeps -bounddepsprio
+
+$PREFIX/lu_example_float -size $((960 * 4)) -nblocks 4 -piv
+$PREFIX/lu_example_float -size $((960 * 4)) -nblocks 4 -no-stride
+$PREFIX/lu_example_float -size $((960 * 4)) -nblocks 4 -bound
+$PREFIX/lu_example_float -size $((960 * 2)) -nblocks 2 -bounddeps
+$PREFIX/lu_example_float -size $((960 * 2)) -nblocks 2 -bound -bounddeps -bounddepsprio

+ 3 - 1
examples/lu/lu_example.c

@@ -422,13 +422,15 @@ int main(int argc, char **argv)
 		if (pivot)
 		{
 			pivot_saved_matrix(ipiv);
-			free(ipiv);
 		}
 
 		check_result();
 	}
 #endif
 
+	if (pivot)
+		free(ipiv);
+
 	starpu_free_flags(A, (size_t)size*size*sizeof(TYPE), STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
 
 	starpu_cublas_shutdown();

+ 4 - 0
examples/lu/xlu_implicit_pivot.c

@@ -232,6 +232,10 @@ starpu_data_handle_t get_block_with_striding(starpu_data_handle_t *dataAp,
 
 int STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks)
 {
+	if (starpu_mic_worker_get_count() || starpu_scc_worker_get_count() || starpu_mpi_ms_worker_get_count())
+		/* These won't work with pivoting: we pass a pointer in cl_args */
+		return -ENODEV;
+
 	starpu_data_handle_t dataA;
 
 	/* monitor and partition the A matrix into blocks :

+ 5 - 0
examples/lu/xlu_pivot.c

@@ -399,6 +399,7 @@ int STARPU_LU(lu_decomposition_pivot)(TYPE *matA, unsigned *ipiv, unsigned size,
 
 	/* gather all the data */
 	starpu_data_unpartition(dataA, STARPU_MAIN_RAM);
+	starpu_data_unregister(dataA);
 	free(piv_description);
 
 	return ret;
@@ -413,6 +414,10 @@ starpu_data_handle_t get_block_with_no_striding(starpu_data_handle_t *dataAp, un
 
 int STARPU_LU(lu_decomposition_pivot_no_stride)(TYPE **matA, unsigned *ipiv, unsigned size, unsigned ld, unsigned nblocks)
 {
+	if (starpu_mic_worker_get_count() || starpu_scc_worker_get_count() || starpu_mpi_ms_worker_get_count())
+		/* These won't work with pivoting: we pass a pointer in cl_args */
+		return -ENODEV;
+
 	starpu_data_handle_t *dataAp = malloc(nblocks*nblocks*sizeof(starpu_data_handle_t));
 
 	/* monitor and partition the A matrix into blocks :

+ 46 - 20
examples/mlr/mlr.c

@@ -50,7 +50,15 @@ static long sum;
 static void cl_params(struct starpu_task *task, double *parameters)
 {
 	int m, n, k;
-	starpu_codelet_unpack_args(task->cl_arg, &m, &n, &k);
+	int* vector_mn;
+	starpu_data_handle_t vector_mn_handle;
+
+	vector_mn = (int*)STARPU_VECTOR_GET_PTR(task->interfaces[0]);
+	m = vector_mn[0];
+	n = vector_mn[1];
+
+	starpu_codelet_unpack_args(task->cl_arg, &k);
+
 	parameters[0] = m;
 	parameters[1] = n;
 	parameters[2] = k;
@@ -61,10 +69,13 @@ void cpu_func(void *buffers[], void *cl_arg)
 {
 	long i;
 	int m,n,k;
-	starpu_codelet_unpack_args(cl_arg,
-			     	  &m,
-     			     	  &n,
-     			     	  &k);
+	int* vector_mn;
+
+	vector_mn = (int*)STARPU_VECTOR_GET_PTR(buffers[0]);
+	m = vector_mn[0];
+	n = vector_mn[1];
+
+	starpu_codelet_unpack_args(cl_arg, &k);
 
 	for(i=0; i < (long) (m*m*n); i++)
 		sum+=i;
@@ -123,7 +134,8 @@ static struct starpu_codelet cl_init =
 {
 	.cpu_funcs = { cpu_func },
 	.cpu_funcs_name = { "cpu_func" },
-	.nbuffers = 0,
+	.nbuffers = 1,
+	.modes = {STARPU_R},
 	.model = &cl_model_init,
 };
 
@@ -131,7 +143,8 @@ static struct starpu_codelet cl_final =
 {
 	.cpu_funcs = { cpu_func },
 	.cpu_funcs_name = { "cpu_func" },
-	.nbuffers = 0,
+	.nbuffers = 1,
+	.modes = {STARPU_R},
 	.model = &cl_model_final,
 };
 
@@ -147,29 +160,42 @@ int main(int argc, char **argv)
 
 	sum=0;
 	int m,n,k;
+	int* vector_mn = malloc( 2 * sizeof(int) );
+	starpu_data_handle_t vector_mn_handle;
+
+	starpu_vector_data_register( &vector_mn_handle,
+				     STARPU_MAIN_RAM,
+				     (uintptr_t)vector_mn, 2,
+				     sizeof(int) );
 
-        /* Giving pseudo-random values to the M,N,K parameters and inserting tasks */
-	for(i=0; i < 42; i++)
+	/* Giving pseudo-random values to the M,N,K parameters and inserting tasks */
+	for ( i = 0; i < 42; i++)
 	{
 		m = (int) ((rand() % 10)+1);
 		n = (int) ((rand() % 10)+1);
 		k = (int) ((rand() % 10)+1);
 
-		for(j=0; j < 42; j++)
+		/* To illustrate the usage, M and N are stored in a data handle */
+		starpu_data_acquire(vector_mn_handle, STARPU_W);
+		vector_mn[0] = m;
+		vector_mn[1] = n;
+		starpu_data_release(vector_mn_handle);
+
+		for ( j = 0; j < 42; j++)
 		{
-			starpu_insert_task(&cl_init,
-				   STARPU_VALUE, &m, sizeof(int),
-				   STARPU_VALUE, &n, sizeof(int),
-				   STARPU_VALUE, &k, sizeof(int),
-				   0);
-			starpu_insert_task(&cl_final,
-				   STARPU_VALUE, &m, sizeof(int),
-				   STARPU_VALUE, &n, sizeof(int),
-				   STARPU_VALUE, &k, sizeof(int),
-				   0);
+			starpu_insert_task( &cl_init,
+					    STARPU_R, vector_mn_handle,
+					    STARPU_VALUE, &k, sizeof(int),
+					    0 );
+			starpu_insert_task( &cl_final,
+					    STARPU_R, vector_mn_handle,
+					    STARPU_VALUE, &k, sizeof(int),
+					    0 );
 		}
 	}
 
+	starpu_data_unregister(vector_mn_handle);
+	free(vector_mn);
 	starpu_shutdown();
 
 	return 0;

+ 5 - 3
examples/sched_ctx/gpu_partition.c

@@ -105,7 +105,9 @@ int main(int argc, char **argv)
 	int ncuda = 0;
 	int gpu_devid = -1;
 
+#ifdef STARPU_DEVEL
 #warning temporary fix: skip test as cuda computation fails
+#endif
  	return 77;
 
 #ifndef STARPU_HAVE_SETENV
@@ -172,8 +174,8 @@ int main(int argc, char **argv)
 	int ncpus = starpu_cpu_worker_get_count();
 	int workers[ncpus+nstreams];
 	starpu_worker_get_ids_by_type(STARPU_CPU_WORKER, workers, ncpus);
-	
-	int sched_ctxs[nstreams];
+
+	unsigned sched_ctxs[nstreams];
 	int nsms[nstreams];
 	nsms[0] = 6;
 	nsms[1] = 7;
@@ -185,7 +187,7 @@ int main(int argc, char **argv)
 	}
 	unsigned sched_ctx1 = starpu_sched_ctx_create(workers, ncpus+nstreams, "ctx1", STARPU_SCHED_CTX_SUB_CTXS, sched_ctxs, nstreams, STARPU_SCHED_CTX_POLICY_NAME, "dmdas", 0);
 
-	FPRINTF(stderr, "parent ctx %d\n", sched_ctx1);
+	FPRINTF(stderr, "parent ctx %u\n", sched_ctx1);
 	starpu_sched_ctx_set_context(&sched_ctx1);
 
 #endif

+ 2 - 3
examples/stencil/stencil-blocks.c

@@ -297,11 +297,10 @@ void allocate_memory_on_node(int rank)
 
 		int node = block->mpi_node;
 
-		unsigned size_bz = block_sizes_z[bz];
-
 		/* Main blocks */
 		if (node == rank)
 		{
+			unsigned size_bz = block_sizes_z[bz];
 			allocate_block_on_node(&block->layers_handle[0], bz, &block->layers[0],
 						(sizex + 2*K), (sizey + 2*K), (size_bz + 2*K));
 #ifndef STARPU_SIMGRID
@@ -389,8 +388,8 @@ void check(int rank)
 		/* Main blocks */
 		if (node == rank)
 		{
-			unsigned size_bz = block_sizes_z[bz];
 #ifdef LIFE
+			unsigned size_bz = block_sizes_z[bz];
 			unsigned x, y, z;
 			unsigned sum = 0;
 			for (x = 0; x < sizex; x++)

+ 5 - 2
include/fstarpu_mod.f90

@@ -82,6 +82,7 @@ module fstarpu_mod
         type(c_ptr), bind(C) :: FSTARPU_SCC
 
         type(c_ptr), bind(C) :: FSTARPU_CODELET_SIMGRID_EXECUTE
+        type(c_ptr), bind(C) :: FSTARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT
         type(c_ptr), bind(C) :: FSTARPU_CUDA_ASYNC
         type(c_ptr), bind(C) :: FSTARPU_OPENCL_ASYNC
 
@@ -1580,7 +1581,7 @@ module fstarpu_mod
                 end subroutine fstarpu_memchunk_tidy
 
                 ! == starpu_task_util.h ==
-                ! struct starpu_data_handle *fstarpu_data_handle_array_alloc(int nb);
+                ! starpu_data_handle_t *fstarpu_data_handle_array_alloc(int nb);
                 function fstarpu_data_handle_array_alloc (nb) bind(C)
                         use iso_c_binding, only: c_ptr, c_int
                         type(c_ptr) :: fstarpu_data_handle_array_alloc
@@ -2331,7 +2332,9 @@ module fstarpu_mod
                             fstarpu_get_constant(C_CHAR_"FSTARPU_SCC"//C_NULL_CHAR)
 
                         FSTARPU_CODELET_SIMGRID_EXECUTE = &
-                            fstarpu_get_constant(C_CHAR_"FSTARPU_CODELET_SIMGRID_EXECUTE"//C_NULL_CHAR)
+                             fstarpu_get_constant(C_CHAR_"FSTARPU_CODELET_SIMGRID_EXECUTE"//C_NULL_CHAR)
+                        FSTARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT = &
+                             fstarpu_get_constant(C_CHAR_"FSTARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT"//C_NULL_CHAR)
                         FSTARPU_CUDA_ASYNC = &
                             fstarpu_get_constant(C_CHAR_"FSTARPU_CUDA_ASYNC"//C_NULL_CHAR)
                         FSTARPU_OPENCL_ASYNC = &

+ 2 - 1
include/starpu_config.h.in

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2009-2016  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016, 2017  CNRS
  * Copyright (C) 2014  INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -152,5 +152,6 @@ typedef ssize_t starpu_ssize_t;
 #undef STARPU_HAVE_DARWIN
 
 #undef STARPU_HAVE_CXX11
+#undef STARPU_HAVE_STRERROR_R
 
 #endif

+ 3 - 0
include/starpu_scheduler.h

@@ -62,8 +62,11 @@ unsigned long starpu_task_get_job_id(struct starpu_task *task);
 /* This function must be called to wake up a worker that is sleeping on the cond. 
  * It returns 0 whenever the worker is not in a sleeping state */
 int starpu_wake_worker(int workerid);
+int starpu_wakeup_worker(int workerid, starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex);
 /* This is a version of starpu_wake_worker which assumes that the sched mutex is locked */
 int starpu_wake_worker_locked(int workerid);
+/* This is a version of starpu_wakeup_worker which assumes that the sched mutex is locked */
+int starpu_wakeup_worker_locked(int workerid, starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex);
 
 int starpu_worker_can_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl);
 int starpu_worker_can_execute_task_impl(unsigned workerid, struct starpu_task *task, unsigned *impl_mask);

+ 1 - 0
include/starpu_task.h

@@ -46,6 +46,7 @@ extern "C"
 #define STARPU_MPI_MS	((1ULL)<<9)
 
 #define STARPU_CODELET_SIMGRID_EXECUTE	(1<<0)
+#define STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT	(1<<1)
 #define STARPU_CUDA_ASYNC	(1<<0)
 #define STARPU_OPENCL_ASYNC	(1<<0)
 

+ 41 - 10
include/starpu_thread.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012-2016  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2012-2017  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -33,6 +33,7 @@
 #endif
 #elif !defined(_MSC_VER) || defined(BUILDING_STARPU)
 #include <pthread.h>
+#include <semaphore.h>
 #endif
 #include <stdint.h>
 
@@ -50,8 +51,9 @@ extern "C"
 typedef msg_process_t starpu_pthread_t;
 typedef int starpu_pthread_attr_t;
 
+int starpu_pthread_equal(starpu_pthread_t t1, starpu_pthread_t t2);
+starpu_pthread_t starpu_pthread_self(void);
 int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg, msg_host_t host);
-#define starpu_pthread_setname(name)
 int starpu_pthread_create(starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg);
 int starpu_pthread_join(starpu_pthread_t thread, void **retval);
 int starpu_pthread_exit(void *retval) STARPU_ATTRIBUTE_NORETURN;
@@ -64,8 +66,18 @@ int starpu_pthread_attr_setdetachstate(starpu_pthread_attr_t *attr, int detachst
 typedef pthread_t starpu_pthread_t;
 typedef pthread_attr_t starpu_pthread_attr_t;
 
+#define starpu_pthread_equal pthread_equal
+#define starpu_pthread_self pthread_self
 #define starpu_pthread_create pthread_create
 #define starpu_pthread_create_on(name, thread, attr, routine, arg, where) starpu_pthread_create(thread, attr, routine, arg)
+#define starpu_pthread_join pthread_join
+#define starpu_pthread_exit pthread_exit
+#define starpu_pthread_attr_init pthread_attr_init
+#define starpu_pthread_attr_destroy pthread_attr_destroy
+#define starpu_pthread_attr_setdetachstate pthread_attr_setdetachstate
+
+#endif /* STARPU_SIMGRID, _MSC_VER */
+
 #ifdef STARPU_HAVE_PTHREAD_SETNAME_NP
 #ifdef STARPU_HAVE_DARWIN
 #define starpu_pthread_setname(name) pthread_setname_np(name)
@@ -75,13 +87,6 @@ typedef pthread_attr_t starpu_pthread_attr_t;
 #else
 #define starpu_pthread_setname(name)
 #endif
-#define starpu_pthread_join pthread_join
-#define starpu_pthread_exit pthread_exit
-#define starpu_pthread_attr_init pthread_attr_init
-#define starpu_pthread_attr_destroy pthread_attr_destroy
-#define starpu_pthread_attr_setdetachstate pthread_attr_setdetachstate
-
-#endif /* STARPU_SIMGRID, _MSC_VER */
 
 /*
  * Encapsulation of the pthread_mutex_* functions.
@@ -403,6 +408,32 @@ int starpu_pthread_wait_wait(starpu_pthread_wait_t *w);
 int starpu_pthread_wait_destroy(starpu_pthread_wait_t *w);
 #endif
 
+/*
+ * Encapsulation of the semaphore functions.
+ */
+
+#ifdef STARPU_SIMGRID
+
+typedef msg_sem_t starpu_sem_t;
+int starpu_sem_destroy(starpu_sem_t *);
+int starpu_sem_getvalue(starpu_sem_t *, int *);
+int starpu_sem_init(starpu_sem_t *, int, unsigned);
+int starpu_sem_post(starpu_sem_t *);
+int starpu_sem_trywait(starpu_sem_t *);
+int starpu_sem_wait(starpu_sem_t *);
+
+#elif !defined(_MSC_VER) || defined(BUILDING_STARPU) /* !STARPU_SIMGRID */
+
+typedef sem_t starpu_sem_t;
+#define starpu_sem_destroy sem_destroy
+#define starpu_sem_getvalue sem_getvalue
+#define starpu_sem_init sem_init
+#define starpu_sem_post sem_post
+int starpu_sem_trywait(starpu_sem_t *);
+int starpu_sem_wait(starpu_sem_t *);
+
+#endif
+
 #ifdef __cplusplus
 }
 #endif

+ 109 - 0
mpi/dev/starpu_mpi_comm_check.sh

@@ -0,0 +1,109 @@
+#!/bin/bash
+
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2017 CNRS
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+# Script to check MPI communications are done properly
+# The application should be launched with STARPU_MPI_COMM=1
+# e.g
+#    $ export STARPU_MPI_COMM=1
+#    $ mpirun --output-filename starpu_mpi.log appli parameters
+# and then the script can be launched with the output files
+#    $ starpu_mpi_comm_check.sh starpu_mpi.log.*
+
+if test -z "$1"
+then
+    echo Syntax error: parameter missing
+    exit 1
+fi
+
+# Get the nodes identifiers
+nodes=$(for f in $*
+	do
+	    grep starpu_mpi $f | grep '\[' | awk '{print $1}'| sed 's/\[\(.*\)\]\[starpu_mpi\]/\1/' | grep "^[[:digit:]]*$"
+	done |sort|uniq
+     )
+echo nodes $nodes
+
+DIR=/tmp
+
+# for each node, extract send and receive communications
+for node in $nodes
+do
+    for f in $*
+    do
+	grep starpu_mpi $f |grep "\[$node"
+    done > $DIR/starpu_mpi_node$node.log
+    grep -- "-->" $DIR/starpu_mpi_node$node.log > $DIR/starpu_mpi_node${node}_send.log
+    grep -- "<--" $DIR/starpu_mpi_node$node.log > $DIR/starpu_mpi_node${node}_recv.log
+done
+
+# count the number of traced lines
+#for node in $nodes
+#do
+#    wc -l $DIR/starpu_mpi_node${node}_recv.log
+#    lines=$(grep :42:42 $DIR/starpu_mpi_node${node}_recv.log | wc -l)
+#    lines2=$(( lines + lines ))
+#    echo $lines2
+#    lines3=$(( lines2 + lines ))
+#    echo $lines3
+#done
+
+# for each pair of nodes, check tags are sent and received in the same order
+for src in $nodes
+do
+    for dst in $nodes
+    do
+	if test $src != $dst
+	then
+	    grep ":$dst:42:" $DIR/starpu_mpi_node${src}_send.log| awk -F':' '{print $6}' > $DIR/node${src}_send_to_${dst}.log
+	    grep ":$src:42:" $DIR/starpu_mpi_node${dst}_recv.log|awk -F ':' '{print $6}'> $DIR/node${dst}_recv_from_${src}.log
+ 	    diff --side-by-side  --suppress-common-lines $DIR/node${src}_send_to_${dst}.log $DIR/node${dst}_recv_from_${src}.log  > $DIR/check_$$
+	    if test -s $DIR/check_$$
+	    then
+		echo $src $dst
+		less $DIR/check_$$
+	    fi
+	fi
+    done
+done
+
+# check each envelope reception is followed by the appropriate data reception
+# first line: MPI_Recv of the envelope
+# second line: display envelope information
+# third line: MPI_Recv of the data
+for node in $nodes
+do
+    echo processing $DIR/starpu_mpi_node${node}_recv.log
+    (
+	while read line
+	do
+	    read line2
+	    read line3
+	    #echo processing
+	    tag2=$(echo $line2 | awk -F ':' '{print $6}')
+	    tag3=$(echo $line3 | awk -F ':' '{print $6}')
+	    if test "$tag2" != "$tag3"
+	    then
+		echo erreur
+		echo $tag2 $tag3
+		echo $line
+		echo $line2
+		echo $line3
+	    fi
+	done
+    ) < $DIR/starpu_mpi_node${node}_recv.log
+done
+

+ 1 - 1
sc_hypervisor/src/hypervisor_policies/feft_lp_policy.c

@@ -27,7 +27,7 @@ static void _try_resizing(unsigned *sched_ctxs, int nsched_ctxs, int *workers, i
 	/* for vite */
 	int ns = sched_ctxs == NULL ? sc_hypervisor_get_nsched_ctxs() : nsched_ctxs;
 #ifdef STARPU_SC_HYPERVISOR_DEBUG
-	printf("resize_no = %u %d ctxs\n", resize_no, ns);
+	printf("resize_no = %lu %d ctxs\n", resize_no, ns);
 #endif
 	if(ns <= 0) return;
 

+ 3 - 3
socl/src/cl_createkernel.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010,2011 University of Bordeaux
- * Copyright (C) 2016  CNRS
+ * Copyright (C) 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -25,7 +25,7 @@ static void soclCreateKernel_task(void *data) {
 
    if (k->program->cl_programs[range] == NULL) {
       k->errcodes[range] = CL_SUCCESS;
-      DEBUG_MSG("[Device %d] Kernel creation skipped: program has not been built for this device.\n", starpu_worker_get_id_check());
+      DEBUG_MSG("[Device %u] Kernel creation skipped: program has not been built for this device.\n", starpu_worker_get_id_check());
       return;
    }
 
@@ -163,7 +163,7 @@ soclCreateKernel(cl_program    program,
    }
 
    /* Create kernel on each device */
-   DEBUG_MSG("[Kernel %d] Create %d kernels (name \"%s\")\n", k->id, socl_device_count, kernel_name);
+   DEBUG_MSG("[Kernel %d] Create %u kernels (name \"%s\")\n", k->id, socl_device_count, kernel_name);
    starpu_execute_on_each_worker_ex(soclCreateKernel_task, k, STARPU_OPENCL, "SOCL_CREATE_KERNEL");
 
    if (errcode_ret != NULL) {

+ 2 - 2
socl/src/cl_createprogramwithsource.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010,2011 University of Bordeaux
- * Copyright (C) 2016  CNRS
+ * Copyright (C) 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -141,7 +141,7 @@ soclCreateProgramWithSource(cl_context      context,
       *errcode_ret = CL_SUCCESS;
       for (i=0; i<socl_device_count; i++) {
          if (data->errcodes[i] != CL_SUCCESS) {
-            DEBUG_MSG("Worker [%d] failed\n", i);
+            DEBUG_MSG("Worker [%u] failed\n", i);
             DEBUG_CL("clCreateProgramWithSource", data->errcodes[i]);
             *errcode_ret = data->errcodes[i];
             break;

+ 6 - 6
socl/src/cl_enqueuendrangekernel.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010,2011, 2016-2017 University of Bordeaux
- * Copyright (C) 2016  CNRS
+ * Copyright (C) 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -71,13 +71,13 @@ void soclEnqueueNDRangeKernel_task(void *descr[], void *args) {
    if (err != CL_SUCCESS) {
 	   ERROR_MSG("Worker[%d] Unable to Enqueue kernel (error %d)\n", wid, err);
 	   DEBUG_CL("clEnqueueNDRangeKernel", err);
-	   DEBUG_MSG("Workdim %d, global_work_offset %p, global_work_size %p, local_work_size %p\n",
+	   DEBUG_MSG("Workdim %u, global_work_offset %p, global_work_size %p, local_work_size %p\n",
 			   cmd->work_dim, cmd->global_work_offset, cmd->global_work_size, cmd->local_work_size);
-	   DEBUG_MSG("Global work size: %ld %ld %ld\n", cmd->global_work_size[0],
-			   (cmd->work_dim > 1 ? cmd->global_work_size[1] : 1), (cmd->work_dim > 2 ? cmd->global_work_size[2] : 1)); 
+	   DEBUG_MSG("Global work size: %ld %ld %ld\n", (long)cmd->global_work_size[0],
+		     (long)(cmd->work_dim > 1 ? cmd->global_work_size[1] : 1), (long)(cmd->work_dim > 2 ? cmd->global_work_size[2] : 1)); 
 	   if (cmd->local_work_size != NULL)
-		   DEBUG_MSG("Local work size: %ld %ld %ld\n", cmd->local_work_size[0],
-				   (cmd->work_dim > 1 ? cmd->local_work_size[1] : 1), (cmd->work_dim > 2 ? cmd->local_work_size[2] : 1)); 
+		   DEBUG_MSG("Local work size: %ld %ld %ld\n", (long)cmd->local_work_size[0],
+			     (long)(cmd->work_dim > 1 ? cmd->local_work_size[1] : 1), (long)(cmd->work_dim > 2 ? cmd->local_work_size[2] : 1)); 
    }
    else {
       /* Waiting for kernel to terminate */

+ 3 - 3
socl/src/cl_enqueuereadbuffer.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010,2011, 2014 University of Bordeaux
- * Copyright (C) 2016  CNRS
+ * Copyright (C) 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -25,7 +25,7 @@ static void soclEnqueueReadBuffer_cpu_task(void *descr[], void *args) {
   gc_entity_release(ev);
 
    char * ptr = (void*)STARPU_VARIABLE_GET_PTR(descr[0]);
-   DEBUG_MSG("[Buffer %d] Reading %ld bytes from %p to %p\n", cmd->buffer->id, cmd->cb, ptr+cmd->offset, cmd->ptr);
+   DEBUG_MSG("[Buffer %d] Reading %ld bytes from %p to %p\n", cmd->buffer->id, (long)cmd->cb, ptr+cmd->offset, cmd->ptr);
 
    //This fix is for people who use USE_HOST_PTR and still use ReadBuffer to sync the buffer in host mem at host_ptr.
    //They should use buffer mapping facilities instead.
@@ -44,7 +44,7 @@ static void soclEnqueueReadBuffer_opencl_task(void *descr[], void *args) {
 
    cl_mem mem = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
 
-   DEBUG_MSG("[Buffer %d] Reading %ld bytes from offset %ld into %p\n", cmd->buffer->id, cmd->cb, cmd->offset, cmd->ptr);
+   DEBUG_MSG("[Buffer %d] Reading %ld bytes from offset %ld into %p\n", cmd->buffer->id, (long)cmd->cb, (long)cmd->offset, cmd->ptr);
 
    int wid = starpu_worker_get_id_check();
    cl_command_queue cq;

+ 3 - 3
socl/src/cl_enqueuewritebuffer.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010,2011, 2014 University of Bordeaux
- * Copyright (C) 2016  CNRS
+ * Copyright (C) 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -26,7 +26,7 @@ static void soclEnqueueWriteBuffer_cpu_task(void *descr[], void *args) {
   gc_entity_release(ev);
 
    char * ptr = (void*)STARPU_VARIABLE_GET_PTR(descr[0]);
-   DEBUG_MSG("[Buffer %d] Writing %ld bytes from %p to %p\n", cmd->buffer->id, cmd->cb, cmd->ptr, ptr+cmd->offset);
+   DEBUG_MSG("[Buffer %d] Writing %ld bytes from %p to %p\n", cmd->buffer->id, (long)cmd->cb, cmd->ptr, ptr+cmd->offset);
 
    //FIXME: Fix for people who use USE_HOST_PTR, modify data at host_ptr and use WriteBuffer to commit the change.
    // StarPU may have erased host mem at host_ptr (for instance by retrieving current buffer data at host_ptr)
@@ -47,7 +47,7 @@ static void soclEnqueueWriteBuffer_opencl_task(void *descr[], void *args) {
 
    cl_mem mem = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[0]);
 
-   DEBUG_MSG("[Buffer %d] Writing %ld bytes to offset %ld from %p\n", cmd->buffer->id, cmd->cb, cmd->offset, cmd->ptr);
+   DEBUG_MSG("[Buffer %d] Writing %ld bytes to offset %ld from %p\n", cmd->buffer->id, (long)cmd->cb, (long)cmd->offset, cmd->ptr);
 
    int wid = starpu_worker_get_id_check();
    cl_command_queue cq;

+ 1 - 1
socl/src/cl_setkernelarg.c

@@ -68,7 +68,7 @@ soclSetKernelArg(cl_kernel  kernel,
    kernel->arg_type[arg_index] = Null;
    kernel->arg_size[arg_index] = arg_size;
 
-   DEBUG_MSG("[Kernel %d] Set argument %d: argsize %ld argvalue %p\n", kernel->id, arg_index, arg_size, arg_value);
+   DEBUG_MSG("[Kernel %d] Set argument %d: argsize %ld argvalue %p\n", kernel->id, arg_index, (long)arg_size, arg_value);
 
    /* Argument is not Null */
    if (arg_value != NULL) {

+ 1 - 1
socl/src/task.c

@@ -77,7 +77,7 @@ void task_depends_on(starpu_task task, cl_uint num_events, cl_event *events) {
     DEBUG_MSG("Task %p depends on events:", task);
     for (i=0; i<num_events; i++) {
        tags[i] = events[i]->id;
-       DEBUG_MSG_NOHEAD(" %u", events[i]->id);
+       DEBUG_MSG_NOHEAD(" %d", events[i]->id);
     }
     DEBUG_MSG_NOHEAD("\n");
 

+ 1 - 1
src/common/fxt.c

@@ -72,7 +72,7 @@ long _starpu_gettid(void)
 #elif defined(_WIN32) && !defined(__CYGWIN__)
 	return (long) GetCurrentThreadId();
 #else
-	return (long) pthread_self();
+	return (long) starpu_pthread_self();
 #endif
 #endif
 }

+ 25 - 3
src/common/prio_list.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2015-2016  Université de Bordeaux
+ * Copyright (C) 2015-2017  Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -252,9 +252,31 @@
 	static inline void ENAME##_prio_list_deinit(struct ENAME##_prio_list *priolist) \
 	{ (void) (priolist); /* ENAME##_list_deinit(&(priolist)->list); */ } \
 	static inline void ENAME##_prio_list_push_back(struct ENAME##_prio_list *priolist, struct ENAME *e) \
-	{ ENAME##_list_push_back(&(priolist)->list, (e)); } \
+	{ \
+		struct ENAME *cur; \
+		for (cur  = ENAME##_list_begin(&(priolist)->list); \
+		     cur != ENAME##_list_end(&(priolist)->list); \
+		     cur  = ENAME##_list_next(cur)) \
+			if ((e)->PRIOFIELD > cur->PRIOFIELD) \
+				break; \
+		if (cur == ENAME##_list_end(&(priolist)->list)) \
+			ENAME##_list_push_back(&(priolist)->list, (e)); \
+		else \
+			ENAME##_list_insert_before(&(priolist)->list, (e), cur); \
+	} \
 	static inline void ENAME##_prio_list_push_front(struct ENAME##_prio_list *priolist, struct ENAME *e) \
-	{ ENAME##_list_push_front(&(priolist)->list, (e)); } \
+	{ \
+		struct ENAME *cur; \
+		for (cur  = ENAME##_list_begin(&(priolist)->list); \
+		     cur != ENAME##_list_end(&(priolist)->list); \
+		     cur  = ENAME##_list_next(cur)) \
+			if ((e)->PRIOFIELD >= cur->PRIOFIELD) \
+				break; \
+		if (cur == ENAME##_list_end(&(priolist)->list)) \
+			ENAME##_list_push_back(&(priolist)->list, (e)); \
+		else \
+			ENAME##_list_insert_before(&(priolist)->list, (e), cur); \
+	} \
 	static inline int ENAME##_prio_list_empty(const struct ENAME##_prio_list *priolist) \
 	{ return ENAME##_list_empty(&(priolist)->list); } \
 	static inline void ENAME##_prio_list_erase(struct ENAME##_prio_list *priolist, struct ENAME *e) \

+ 2 - 3
src/common/starpu_spinlock.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012-2014, 2016  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2013, 2014  CNRS
+ * Copyright (C) 2010, 2011, 2013, 2014, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -25,10 +25,9 @@
 int _starpu_spin_init(struct _starpu_spinlock *lock)
 {
 	starpu_pthread_mutexattr_t errcheck_attr;
-//	memcpy(&lock->errcheck_lock, PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP, sizeof(PTHREAD_ERRORCHECK_MUTEX_INITIALIZER_NP));
 	int ret;
 	ret = starpu_pthread_mutexattr_init(&errcheck_attr);
-	STARPU_CHECK_RETURN_VALUE(ret, "pthread_mutexattr_init");
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_pthread_mutexattr_init");
 
 	ret = starpu_pthread_mutexattr_settype(&errcheck_attr, PTHREAD_MUTEX_ERRORCHECK);
 	STARPU_ASSERT(!ret);

+ 102 - 29
src/common/thread.c

@@ -19,6 +19,7 @@
 #include <core/simgrid.h>
 #include <core/workers.h>
 
+#include <errno.h>
 #include <limits.h>
 
 #ifdef STARPU_SIMGRID
@@ -50,6 +51,16 @@ static int _starpu_futex_wake = FUTEX_WAKE;
 
 extern int _starpu_simgrid_thread_start(int argc, char *argv[]);
 
+int starpu_pthread_equal(starpu_pthread_t t1, starpu_pthread_t t2)
+{
+	return t1 == t2;
+}
+
+starpu_pthread_t starpu_pthread_self(void)
+{
+	return MSG_process_self();
+}
+
 int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_pthread_attr_t *attr STARPU_ATTRIBUTE_UNUSED, void *(*start_routine) (void *), void *arg, msg_host_t host)
 {
 	char **_args;
@@ -62,6 +73,9 @@ int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_
 	void *tsd;
 	_STARPU_CALLOC(tsd, MAX_TSD+1, sizeof(void*));
 	*thread = MSG_process_create_with_arguments(name, _starpu_simgrid_thread_start, tsd, host, 2, _args);
+#if SIMGRID_VERSION_MAJOR > 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR >= 15)
+	MSG_process_ref(*thread);
+#endif
 	return 0;
 }
 
@@ -74,6 +88,9 @@ int starpu_pthread_join(starpu_pthread_t thread STARPU_ATTRIBUTE_UNUSED, void **
 {
 #if SIMGRID_VERSION_MAJOR > 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR >= 14)
 	MSG_process_join(thread, 1000000);
+#if SIMGRID_VERSION_MAJOR > 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR >= 15)
+	MSG_process_unref(thread);
+#endif
 #else
 	MSG_process_sleep(1);
 #endif
@@ -519,7 +536,7 @@ int starpu_pthread_queue_destroy(starpu_pthread_queue_t *q)
 #endif /* STARPU_SIMGRID */
 
 #if (defined(STARPU_SIMGRID) && !defined(STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT)) || (!defined(STARPU_SIMGRID) && !defined(STARPU_HAVE_PTHREAD_BARRIER))
-int starpu_pthread_barrier_init(starpu_pthread_barrier_t *restrict barrier, const starpu_pthread_barrierattr_t *restrict attr, unsigned count)
+int starpu_pthread_barrier_init(starpu_pthread_barrier_t *restrict barrier, const starpu_pthread_barrierattr_t *restrict attr STARPU_ATTRIBUTE_UNUSED, unsigned count)
 {
 	int ret = starpu_pthread_mutex_init(&barrier->mutex, NULL);
 	if (!ret)
@@ -703,47 +720,34 @@ int starpu_pthread_barrier_wait(starpu_pthread_barrier_t *barrier)
  * macros of course) which record when the mutex is held or not */
 int starpu_pthread_mutex_lock_sched(starpu_pthread_mutex_t *mutex)
 {
-	const int workerid = starpu_worker_get_id();
-	struct _starpu_worker * const worker = (workerid != -1)?_starpu_get_worker_struct(workerid):NULL;
-	if(worker && mutex == &worker->sched_mutex)
-	{
-		STARPU_ASSERT(worker->sched_mutex_depth < UINT_MAX);
-		worker->sched_mutex_depth++;
-		if (worker->sched_mutex_depth > 1)
-			return 0;
-	}
-
-	return starpu_pthread_mutex_lock(mutex);
+	int p_ret = starpu_pthread_mutex_lock(mutex);
+	int workerid = starpu_worker_get_id();
+	if(workerid != -1 && _starpu_worker_mutex_is_sched_mutex(workerid, mutex))
+		_starpu_worker_set_flag_sched_mutex_locked(workerid, 1);
+	return p_ret;
 }
 
 int starpu_pthread_mutex_unlock_sched(starpu_pthread_mutex_t *mutex)
 {
-	const int workerid = starpu_worker_get_id();
-	struct _starpu_worker * const worker = (workerid != -1)?_starpu_get_worker_struct(workerid):NULL;
-	if(worker && mutex == &worker->sched_mutex)
-	{
-		STARPU_ASSERT(worker->sched_mutex_depth > 0);
-		worker->sched_mutex_depth--;
-		if (worker->sched_mutex_depth > 0)
-			return 0;
-	}
+	int workerid = starpu_worker_get_id();
+	if(workerid != -1 && _starpu_worker_mutex_is_sched_mutex(workerid, mutex))
+		_starpu_worker_set_flag_sched_mutex_locked(workerid, 0);
 
 	return starpu_pthread_mutex_unlock(mutex);
 }
 
 int starpu_pthread_mutex_trylock_sched(starpu_pthread_mutex_t *mutex)
 {
-	const int workerid = starpu_worker_get_id();
-	struct _starpu_worker * const worker = (workerid != -1)?_starpu_get_worker_struct(workerid):NULL;
-	if(worker && mutex == &worker->sched_mutex)
+	int ret = starpu_pthread_mutex_trylock(mutex);
+
+	if (!ret)
 	{
-		STARPU_ASSERT(worker->sched_mutex_depth < UINT_MAX);
-		worker->sched_mutex_depth++;
-		if (worker->sched_mutex_depth > 1)
-			return 0;
+		int workerid = starpu_worker_get_id();
+		if(workerid != -1 && _starpu_worker_mutex_is_sched_mutex(workerid, mutex))
+			_starpu_worker_set_flag_sched_mutex_locked(workerid, 1);
 	}
 
-	return starpu_pthread_mutex_trylock(mutex);
+	return ret;
 }
 
 #ifdef STARPU_DEBUG
@@ -870,3 +874,72 @@ void _starpu_pthread_spin_do_unlock(starpu_pthread_spinlock_t *lock)
 #endif
 
 #endif /* defined(STARPU_SIMGRID) || (defined(STARPU_LINUX_SYS) && defined(STARPU_HAVE_XCHG)) || !defined(STARPU_HAVE_PTHREAD_SPIN_LOCK) */
+
+#ifdef STARPU_SIMGRID
+
+int starpu_sem_destroy(starpu_sem_t *sem)
+{
+	MSG_sem_destroy(*sem);
+	return 0;
+}
+
+int starpu_sem_init(starpu_sem_t *sem, int pshared, unsigned value)
+{
+	STARPU_ASSERT_MSG(pshared == 0, "pshared semaphores not supported under simgrid");
+	*sem = MSG_sem_init(value);
+	return 0;
+}
+
+int starpu_sem_post(starpu_sem_t *sem)
+{
+	MSG_sem_release(*sem);
+	return 0;
+}
+
+int starpu_sem_wait(starpu_sem_t *sem)
+{
+	MSG_sem_acquire(*sem);
+	return 0;
+}
+
+int starpu_sem_trywait(starpu_sem_t *sem)
+{
+	if (MSG_sem_would_block(*sem))
+		return EAGAIN;
+	starpu_sem_wait(sem);
+	return 0;
+}
+
+int starpu_sem_getvalue(starpu_sem_t *sem, int *sval)
+{
+#if SIMGRID_VERSION_MAJOR > 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR > 13)
+	*sval = MSG_sem_get_capacity(*sem);
+	return 0;
+#else
+	(void) sem;
+	(void) sval;
+	STARPU_ABORT_MSG("sigmrid up to 3.13 did not have working MSG_sem_get_capacity");
+#endif
+}
+
+#elif !defined(_MSC_VER) || defined(BUILDING_STARPU) /* !STARPU_SIMGRID */
+
+int starpu_sem_wait(starpu_sem_t *sem)
+{
+	int ret;
+	while((ret = sem_wait(sem)) == -1 && errno == EINTR)
+		;
+
+	return ret;
+}
+
+int starpu_sem_trywait(starpu_sem_t *sem)
+{
+	int ret;
+	while((ret = sem_trywait(sem)) == -1 && errno == EINTR)
+		;
+	
+	return ret;
+}
+
+#endif

+ 3 - 4
src/common/utils.h

@@ -24,7 +24,6 @@
 #include <string.h>
 #include <stdlib.h>
 #include <math.h>
-#include <pthread.h>
 #ifdef STARPU_HAVE_SCHED_YIELD
 #include <sched.h>
 #endif
@@ -97,9 +96,9 @@
 #endif
 
 #ifdef STARPU_EXTRA_VERBOSE
-#  define _STARPU_LOG_IN()             do { if (!_starpu_silent) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] -->\n", pthread_self(), __starpu_func__,__FILE__,  __LINE__); }} while(0)
-#  define _STARPU_LOG_OUT()            do { if (!_starpu_silent) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] <--\n", pthread_self(), __starpu_func__, __FILE__,  __LINE__); }} while(0)
-#  define _STARPU_LOG_OUT_TAG(outtag)  do { if (!_starpu_silent) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] <-- (%s)\n", pthread_self(), __starpu_func__, __FILE__, __LINE__, outtag); }} while(0)
+#  define _STARPU_LOG_IN()             do { if (!_starpu_silent) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] -->\n", starpu_pthread_self(), __starpu_func__,__FILE__,  __LINE__); }} while(0)
+#  define _STARPU_LOG_OUT()            do { if (!_starpu_silent) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] <--\n", starpu_pthread_self(), __starpu_func__, __FILE__,  __LINE__); }} while(0)
+#  define _STARPU_LOG_OUT_TAG(outtag)  do { if (!_starpu_silent) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s:%s@%d] <-- (%s)\n", starpu_pthread_self(), __starpu_func__, __FILE__, __LINE__, outtag); }} while(0)
 #else
 #  define _STARPU_LOG_IN()
 #  define _STARPU_LOG_OUT()

+ 2 - 2
src/core/jobs.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011, 2014, 2016  INRIA
@@ -88,7 +88,7 @@ struct _starpu_job* STARPU_ATTRIBUTE_MALLOC _starpu_job_create(struct starpu_tas
 
 #ifndef STARPU_USE_FXT
 	if (_starpu_bound_recording || _starpu_top_status_get() ||
-		_starpu_task_break_on_push != -1 || _starpu_task_break_on_pop != -1 || _starpu_task_break_on_sched != -1
+		_starpu_task_break_on_push != -1 || _starpu_task_break_on_sched != -1 || _starpu_task_break_on_pop != -1 || _starpu_task_break_on_exec != -1
 		|| STARPU_AYU_EVENT)
 #endif
 	{

A különbségek nem kerülnek megjelenítésre, a fájl túl nagy
+ 308 - 289
src/core/sched_ctx.c


+ 41 - 42
src/core/sched_ctx.h

@@ -73,12 +73,27 @@ struct _starpu_sched_ctx
 	long iterations[2];
 	int iteration_level;
 
+	/* cond to block push when there are no workers in the ctx */
+	starpu_pthread_cond_t no_workers_cond;
+
+	/* mutex to block push when there are no workers in the ctx */
+	starpu_pthread_mutex_t no_workers_mutex;
+
 	/*ready tasks that couldn't be pushed because the ctx has no workers*/
 	struct starpu_task_list empty_ctx_tasks;
 
+	/* mutext protecting empty_ctx_tasks list */
+	starpu_pthread_mutex_t empty_ctx_mutex;
+
 	/*ready tasks that couldn't be pushed because the the window of tasks was already full*/
 	struct starpu_task_list waiting_tasks;
 
+	/* mutext protecting waiting_tasks list */
+	starpu_pthread_mutex_t waiting_tasks_mutex;
+
+	/* mutext protecting write to all worker's sched_ctx_list structure for this sched_ctx */
+	starpu_pthread_mutex_t sched_ctx_list_mutex;
+
 	/* min CPUs to execute*/
 	int min_ncpus;
 
@@ -127,10 +142,27 @@ struct _starpu_sched_ctx
 	   if not master is -1 */
 	int main_master;
 
+	/* conditions variables used when parallel sections are executed in contexts */
+	starpu_pthread_cond_t parallel_sect_cond[STARPU_NMAXWORKERS];
+	starpu_pthread_mutex_t parallel_sect_mutex[STARPU_NMAXWORKERS];
+	starpu_pthread_cond_t parallel_sect_cond_busy[STARPU_NMAXWORKERS];
+	int busy[STARPU_NMAXWORKERS];
+
 	/* boolean indicating that workers should block in order to allow
 	   parallel sections to be executed on their allocated resources */
 	unsigned parallel_sect[STARPU_NMAXWORKERS];
 
+	/* semaphore that block appl thread until starpu threads are
+	   all blocked and ready to exec the parallel code */
+	starpu_sem_t fall_asleep_sem[STARPU_NMAXWORKERS];
+
+	/* semaphore that block appl thread until starpu threads are 
+	   all woke up and ready continue appl */
+	starpu_sem_t wake_up_sem[STARPU_NMAXWORKERS];
+
+	/* bool indicating if the workers is sleeping in this ctx */
+	unsigned sleeping[STARPU_NMAXWORKERS];
+
 	/* ctx nesting the current ctx */
 	unsigned nesting_sched_ctx;
 
@@ -158,9 +190,6 @@ struct _starpu_sched_ctx
 	int sms_end_idx;
 
 	int stream_worker;
-
-	starpu_pthread_rwlock_t rwlock;
-	starpu_pthread_t lock_write_owner;
 };
 
 struct _starpu_machine_config;
@@ -212,10 +241,19 @@ void _starpu_worker_gets_out_of_ctx(unsigned sched_ctx_id, struct _starpu_worker
 /* Check if the worker belongs to another sched_ctx */
 unsigned _starpu_worker_belongs_to_a_sched_ctx(int workerid, unsigned sched_ctx_id);
 
+/* mutex synchronising several simultaneous modifications of a context */
+starpu_pthread_rwlock_t* _starpu_sched_ctx_get_changing_ctx_mutex(unsigned sched_ctx_id);
+
 /* indicates wheather this worker should go to sleep or not 
    (if it is the last one awake in a context he should better keep awake) */
 unsigned _starpu_sched_ctx_last_worker_awake(struct _starpu_worker *worker);
 
+/* let the appl know that the worker blocked to execute parallel code */
+void _starpu_sched_ctx_signal_worker_blocked(unsigned sched_ctx_id, int workerid);
+
+/* let the appl know that the worker woke up */
+void _starpu_sched_ctx_signal_worker_woke_up(unsigned sched_ctx_id, int workerid);
+
 /* If starpu_sched_ctx_set_context() has been called, returns the context
  * id set by its last call, or the id of the initial context */
 unsigned _starpu_sched_ctx_get_current_context();
@@ -240,43 +278,4 @@ struct _starpu_sched_ctx *__starpu_sched_ctx_get_sched_ctx_for_worker_and_job(st
 #define _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(w,j) \
 	(_starpu_get_nsched_ctxs() <= 1 ? _starpu_get_sched_ctx_struct(0) : __starpu_sched_ctx_get_sched_ctx_for_worker_and_job((w),(j)))
 
-static inline struct _starpu_sched_ctx *_starpu_get_sched_ctx_struct(unsigned id);
-
-static inline int _starpu_sched_ctx_check_write_locked(unsigned sched_ctx_id)
-{
-	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
-	return sched_ctx->lock_write_owner == pthread_self();
-}
-#define STARPU_SCHED_CTX_CHECK_LOCK(sched_ctx_id) STARPU_ASSERT(_starpu_sched_ctx_check_write_locked((sched_ctx_id)))
-
-static inline void _starpu_sched_ctx_lock_write(unsigned sched_ctx_id)
-{
-	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
-	STARPU_ASSERT(sched_ctx->lock_write_owner != pthread_self());
-	STARPU_PTHREAD_RWLOCK_WRLOCK(&sched_ctx->rwlock);
-	sched_ctx->lock_write_owner = pthread_self();
-}
-
-static inline void _starpu_sched_ctx_unlock_write(unsigned sched_ctx_id)
-{
-	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
-	STARPU_ASSERT(sched_ctx->lock_write_owner == pthread_self());
-	sched_ctx->lock_write_owner = 0;
-	STARPU_PTHREAD_RWLOCK_UNLOCK(&sched_ctx->rwlock);
-}
-
-static inline void _starpu_sched_ctx_lock_read(unsigned sched_ctx_id)
-{
-	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
-	STARPU_ASSERT(sched_ctx->lock_write_owner != pthread_self());
-	STARPU_PTHREAD_RWLOCK_RDLOCK(&sched_ctx->rwlock);
-}
-
-static inline void _starpu_sched_ctx_unlock_read(unsigned sched_ctx_id)
-{
-	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
-	STARPU_ASSERT(sched_ctx->lock_write_owner != pthread_self());
-	STARPU_PTHREAD_RWLOCK_UNLOCK(&sched_ctx->rwlock);
-}
-
 #endif // __SCHED_CONTEXT_H__

+ 11 - 6
src/core/sched_policy.c

@@ -31,15 +31,17 @@ static double idle[STARPU_NMAXWORKERS];
 static double idle_start[STARPU_NMAXWORKERS];
 
 long _starpu_task_break_on_push = -1;
-long _starpu_task_break_on_pop = -1;
 long _starpu_task_break_on_sched = -1;
+long _starpu_task_break_on_pop = -1;
+long _starpu_task_break_on_exec = -1;
 static const char *starpu_idle_file;
 
 void _starpu_sched_init(void)
 {
 	_starpu_task_break_on_push = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_PUSH", -1);
-	_starpu_task_break_on_pop = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_POP", -1);
 	_starpu_task_break_on_sched = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_SCHED", -1);
+	_starpu_task_break_on_pop = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_POP", -1);
+	_starpu_task_break_on_exec = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_EXEC", -1);
 	starpu_idle_file = starpu_getenv("STARPU_IDLE_FILE");
 }
 
@@ -431,9 +433,9 @@ int _starpu_repush_task(struct _starpu_job *j)
 
 		if(nworkers == 0)
 		{
-			_starpu_sched_ctx_lock_write(sched_ctx->id);
+			STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->empty_ctx_mutex);
 			starpu_task_list_push_front(&sched_ctx->empty_ctx_tasks, task);
-			_starpu_sched_ctx_unlock_write(sched_ctx->id);
+			STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->empty_ctx_mutex);
 #ifdef STARPU_USE_SC_HYPERVISOR
 			if(sched_ctx->id != 0 && sched_ctx->perf_counters != NULL
 			   && sched_ctx->perf_counters->notify_empty_ctx)
@@ -497,9 +499,9 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 
 		if (nworkers == 0)
 		{
-			_starpu_sched_ctx_lock_write(sched_ctx->id);
+			STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->empty_ctx_mutex);
 			starpu_task_list_push_back(&sched_ctx->empty_ctx_tasks, task);
-			_starpu_sched_ctx_unlock_write(sched_ctx->id);
+			STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->empty_ctx_mutex);
 #ifdef STARPU_USE_SC_HYPERVISOR
 			if(sched_ctx->id != 0 && sched_ctx->perf_counters != NULL
 			   && sched_ctx->perf_counters->notify_empty_ctx)
@@ -589,6 +591,8 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 		{
 			STARPU_ASSERT(sched_ctx->sched_policy->push_task);
 			/* check out if there are any workers in the context */
+			starpu_pthread_rwlock_t *changing_ctx_mutex = _starpu_sched_ctx_get_changing_ctx_mutex(sched_ctx->id);
+			STARPU_PTHREAD_RWLOCK_RDLOCK(changing_ctx_mutex);
 			nworkers = starpu_sched_ctx_get_nworkers(sched_ctx->id);
 			if (nworkers == 0)
 				ret = -1;
@@ -599,6 +603,7 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 				ret = sched_ctx->sched_policy->push_task(task);
 				_STARPU_SCHED_END;
 			}
+			STARPU_PTHREAD_RWLOCK_UNLOCK(changing_ctx_mutex);
 		}
 
 		if(ret == -1)

+ 3 - 2
src/core/sched_policy.h

@@ -28,7 +28,7 @@
 
 #define _STARPU_SCHED_BEGIN \
 	_STARPU_TRACE_WORKER_SCHEDULING_PUSH;	\
-	_SIMGRID_TIMER_BEGIN
+	_SIMGRID_TIMER_BEGIN(_starpu_simgrid_sched_cost())
 #define _STARPU_SCHED_END \
 	_SIMGRID_TIMER_END;			\
 	_STARPU_TRACE_WORKER_SCHEDULING_POP
@@ -103,8 +103,9 @@ extern struct starpu_sched_policy _starpu_sched_modular_heft2_policy;
 extern struct starpu_sched_policy _starpu_sched_graph_test_policy;
 
 extern long _starpu_task_break_on_push;
-extern long _starpu_task_break_on_pop;
 extern long _starpu_task_break_on_sched;
+extern long _starpu_task_break_on_pop;
+extern long _starpu_task_break_on_exec;
 
 #ifdef SIGTRAP
 #define _STARPU_TASK_BREAK_ON(task, what) do { \

+ 4 - 4
src/core/simgrid.h

@@ -69,7 +69,7 @@ starpu_pthread_queue_t _starpu_simgrid_task_queue[STARPU_NMAXWORKERS];
 #define _starpu_simgrid_queue_malloc_cost() starpu_get_env_number_default("STARPU_SIMGRID_QUEUE_MALLOC_COST", 1)
 #define _starpu_simgrid_task_submit_cost() starpu_get_env_number_default("STARPU_SIMGRID_TASK_SUBMIT_COST", 1)
 #define _starpu_simgrid_fetching_input_cost() starpu_get_env_number_default("STARPU_SIMGRID_FETCHING_INPUT_COST", 1)
-#define _starpu_simgrid_sched_cost() starpu_get_env_number_default("STARPU_SIMGRID_SCHED_COST", 1)
+#define _starpu_simgrid_sched_cost() starpu_get_env_number_default("STARPU_SIMGRID_SCHED_COST", 0)
 
 /* Called at initialization to count how many GPUs are interfering with each
  * bus */
@@ -78,10 +78,10 @@ void _starpu_simgrid_count_ngpus(void);
 void _starpu_simgrid_xbt_thread_create(const char *name, void_f_pvoid_t code,
 				       void *param);
 
-#define _SIMGRID_TIMER_BEGIN		\
+#define _SIMGRID_TIMER_BEGIN(cond)			\
 	{		\
 		xbt_os_timer_t __timer = NULL;		\
-		if (_starpu_simgrid_sched_cost()) {		\
+		if (cond) {		\
 		  __timer = xbt_os_timer_new();		\
 		  xbt_os_threadtimer_start(__timer);	\
 		}
@@ -94,7 +94,7 @@ void _starpu_simgrid_xbt_thread_create(const char *name, void_f_pvoid_t code,
 	}
 
 #else // !STARPU_SIMGRID
-#define _SIMGRID_TIMER_BEGIN {
+#define _SIMGRID_TIMER_BEGIN(cond) {
 #define _SIMGRID_TIMER_END }
 #endif
 

+ 6 - 11
src/core/topology.c

@@ -1717,7 +1717,7 @@ _starpu_bind_thread_on_cpu (
 	CPU_ZERO(&aff_mask);
 	CPU_SET(cpuid, &aff_mask);
 
-	starpu_pthread_t self = pthread_self();
+	starpu_pthread_t self = starpu_pthread_self();
 
 	ret = pthread_setaffinity_np(self, sizeof(aff_mask), &aff_mask);
 	if (ret)
@@ -2186,8 +2186,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 				_starpu_memory_node_add_nworkers(memory_node);
 
                                 _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
-				if (memory_node != STARPU_MAIN_RAM)
-					_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
+				_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
 				break;
 #endif
 
@@ -2226,8 +2225,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 				_starpu_memory_node_add_nworkers(memory_node);
 
                                 _starpu_worker_drives_memory_node(workerarg, STARPU_MAIN_RAM);
-				if (memory_node != STARPU_MAIN_RAM)
-					_starpu_worker_drives_memory_node(workerarg, memory_node);
+				_starpu_worker_drives_memory_node(workerarg, memory_node);
 				break;
 #endif
 
@@ -2259,8 +2257,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 				_starpu_memory_node_add_nworkers(memory_node);
 
                                 _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
-				if (memory_node != STARPU_MAIN_RAM)
-					_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
+				_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
 				break;
 #endif /* STARPU_USE_MIC */
 
@@ -2275,8 +2272,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 				_starpu_memory_node_add_nworkers(memory_node);
 
                                 _starpu_worker_drives_memory_node(workerarg, STARPU_MAIN_RAM);
-				if (memory_node != STARPU_MAIN_RAM)
-					_starpu_worker_drives_memory_node(workerarg, memory_node);
+				_starpu_worker_drives_memory_node(workerarg, memory_node);
 			}
 				break;
 #endif /* STARPU_USE_SCC */
@@ -2298,8 +2294,7 @@ _starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config,
 
 				}
                                 _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
-				if (memory_node != STARPU_MAIN_RAM)
-					_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
+				_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
 #ifndef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
                                 /* MPI driver thread can manage all slave memories if we disable the MPI multiple thread */
                                 unsigned findworker;

+ 37 - 21
src/core/workers.c

@@ -576,15 +576,10 @@ static void _starpu_worker_init(struct _starpu_worker *workerarg, struct _starpu
 	workerarg->reverse_phase[0] = 0;
 	workerarg->reverse_phase[1] = 0;
 	workerarg->pop_ctx_priority = 1;
-	workerarg->sched_mutex_depth = 0;
+	workerarg->sched_mutex_locked = 0;
+	workerarg->blocked = 0;
 	workerarg->is_slave_somewhere = 0;
 
-	workerarg->state_sched_op_pending = 0;
-	workerarg->state_changing_ctx_waiting = 0;
-	workerarg->state_blocked = 0;
-	workerarg->state_wait_ack__blocked = 0;
-	workerarg->state_wait_handshake__blocked = 0;
-
 	/* cpu_set/hwloc_cpu_set initialized in topology.c */
 }
 
@@ -1417,14 +1412,12 @@ static void _starpu_terminate_workers(struct _starpu_machine_config *pconfig)
 		struct _starpu_worker *worker = &pconfig->workers[workerid];
 
 		/* in case StarPU termination code is called from a callback,
- 		 * we have to check if pthread_self() is the worker itself */
+ 		 * we have to check if starpu_pthread_self() is the worker itself */
 		if (set && set->nworkers > 0)
 		{
 			if (set->started)
 			{
-#ifndef STARPU_SIMGRID
-				if (!pthread_equal(pthread_self(), set->worker_thread))
-#endif
+				if (!starpu_pthread_equal(starpu_pthread_self(), set->worker_thread))
 					status = starpu_pthread_join(set->worker_thread, NULL);
 				if (status)
 				{
@@ -1440,9 +1433,7 @@ static void _starpu_terminate_workers(struct _starpu_machine_config *pconfig)
 			if (!worker->run_by_starpu)
 				goto out;
 
-#ifndef STARPU_SIMGRID
-			if (!pthread_equal(pthread_self(), worker->worker_thread))
-#endif
+			if (!starpu_pthread_equal(starpu_pthread_self(), worker->worker_thread))
 				status = starpu_pthread_join(worker->worker_thread, NULL);
 			if (status)
 			{
@@ -1697,7 +1688,7 @@ unsigned starpu_worker_get_count(void)
 
 unsigned starpu_worker_is_blocked(int workerid)
 {
-	return (unsigned)_starpu_config.workers[workerid].state_blocked;
+	return _starpu_config.workers[workerid].blocked;
 }
 
 unsigned starpu_worker_is_slave_somewhere(int workerid)
@@ -2057,7 +2048,7 @@ void starpu_worker_get_sched_condition(int workerid, starpu_pthread_mutex_t **sc
 	*sched_mutex = &_starpu_config.workers[workerid].sched_mutex;
 }
 
-static int starpu_wakeup_worker_locked(int workerid, starpu_pthread_cond_t *sched_cond, starpu_pthread_mutex_t *mutex STARPU_ATTRIBUTE_UNUSED)
+int starpu_wakeup_worker_locked(int workerid, starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex STARPU_ATTRIBUTE_UNUSED)
 {
 #ifdef STARPU_SIMGRID
 	starpu_pthread_queue_broadcast(&_starpu_simgrid_task_queue[workerid]);
@@ -2065,19 +2056,17 @@ static int starpu_wakeup_worker_locked(int workerid, starpu_pthread_cond_t *sche
 	if (_starpu_config.workers[workerid].status == STATUS_SLEEPING)
 	{
 		_starpu_config.workers[workerid].status = STATUS_WAKING_UP;
-		/* cond_broadcast is required over cond_signal since
-		 * the condition is share for multiple purpose */
-		STARPU_PTHREAD_COND_BROADCAST(sched_cond);
+		STARPU_PTHREAD_COND_SIGNAL(cond);
 		return 1;
 	}
 	return 0;
 }
 
-static int starpu_wakeup_worker(int workerid, starpu_pthread_cond_t *sched_cond, starpu_pthread_mutex_t *mutex)
+int starpu_wakeup_worker(int workerid, starpu_pthread_cond_t *cond, starpu_pthread_mutex_t *mutex)
 {
 	int success;
 	STARPU_PTHREAD_MUTEX_LOCK_SCHED(mutex);
-	success = starpu_wakeup_worker_locked(workerid, sched_cond, mutex);
+	success = starpu_wakeup_worker_locked(workerid, cond, mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(mutex);
 	return success;
 }
@@ -2171,6 +2160,33 @@ void starpu_get_version(int *major, int *minor, int *release)
 	*release = STARPU_RELEASE_VERSION;
 }
 
+void _starpu_unlock_mutex_if_prev_locked()
+{
+	int workerid = starpu_worker_get_id();
+	if(workerid != -1)
+	{
+		struct _starpu_worker *w = _starpu_get_worker_struct(workerid);
+		if(w->sched_mutex_locked)
+		{
+			STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&w->sched_mutex);
+			_starpu_worker_set_flag_sched_mutex_locked(workerid, 1);
+		}
+	}
+	return;
+}
+
+void _starpu_relock_mutex_if_prev_locked()
+{
+	int workerid = starpu_worker_get_id();
+	if(workerid != -1)
+	{
+		struct _starpu_worker *w = _starpu_get_worker_struct(workerid);
+		if(w->sched_mutex_locked)
+			STARPU_PTHREAD_MUTEX_LOCK_SCHED(&w->sched_mutex);
+	}
+	return;
+}
+
 unsigned starpu_worker_get_sched_ctx_list(int workerid, unsigned **sched_ctxs)
 {
 	unsigned s = 0;

+ 20 - 47
src/core/workers.h

@@ -20,6 +20,8 @@
 #ifndef __WORKERS_H__
 #define __WORKERS_H__
 
+#include <limits.h>
+
 #include <starpu.h>
 #include <common/config.h>
 #include <common/timing.h>
@@ -83,11 +85,6 @@ LIST_TYPE(_starpu_worker,
 	unsigned numa_memory_node; /* which numa memory node is the worker associated with? (logical index) */
 	starpu_pthread_cond_t sched_cond; /* condition variable used when the worker waits for tasks. */
         starpu_pthread_mutex_t sched_mutex; /* mutex protecting sched_cond */
-	int state_sched_op_pending:1; /* a task pop is ongoing even though sched_mutex may temporarily be unlocked */
-	int state_changing_ctx_waiting:1; /* a thread is waiting for transient operations such as pop to complete before acquiring sched_mutex and modifying the worker ctx*/
-	int state_blocked:1;
-	int state_wait_ack__blocked:1;
-	int state_wait_handshake__blocked:1;
 	struct starpu_task_list local_tasks; /* this queue contains tasks that have been explicitely submitted to that queue */
 	struct starpu_task **local_ordered_tasks; /* this queue contains tasks that have been explicitely submitted to that queue with an explicit order */
 	unsigned local_ordered_tasks_size; /* this records the size of local_ordered_tasks */
@@ -141,8 +138,11 @@ LIST_TYPE(_starpu_worker,
 	/* indicate which priority of ctx is currently active: the values are 0 or 1*/
 	unsigned pop_ctx_priority;
 
-	/* sched mutex local worker locking depth */
-	unsigned sched_mutex_depth;
+	/* flag to know if sched_mutex is locked or not */
+	unsigned sched_mutex_locked;
+
+	/* bool to indicate if the worker is blocked in a ctx */
+	unsigned blocked;
 
 	/* bool to indicate if the worker is slave in a ctx */
 	unsigned is_slave_somewhere;
@@ -509,7 +509,7 @@ static inline struct _starpu_worker *_starpu_get_worker_struct(unsigned id)
 	return &_starpu_config.workers[id];
 }
 
-/* Returns the starpu_sched_ctx structure that describes the state of the 
+/* Returns the starpu_sched_ctx structure that descriebes the state of the 
  * specified ctx */
 static inline struct _starpu_sched_ctx *_starpu_get_sched_ctx_struct(unsigned id)
 {
@@ -559,6 +559,18 @@ int starpu_worker_get_nids_by_type(enum starpu_worker_archtype type, int *worker
    the list might not be updated */
 int starpu_worker_get_nids_ctx_free_by_type(enum starpu_worker_archtype type, int *workerids, int maxsize);
 
+/* if the current worker has the lock release it */
+void _starpu_unlock_mutex_if_prev_locked();
+
+/* if we prev released the lock relock it */
+void _starpu_relock_mutex_if_prev_locked();
+
+static inline void _starpu_worker_set_flag_sched_mutex_locked(int workerid, unsigned flag)
+{
+	struct _starpu_worker *w = _starpu_get_worker_struct(workerid);
+	w->sched_mutex_locked = flag;
+}
+
 static inline unsigned _starpu_worker_mutex_is_sched_mutex(int workerid, starpu_pthread_mutex_t *mutex)
 {
 	struct _starpu_worker *w = _starpu_get_worker_struct(workerid);
@@ -611,43 +623,4 @@ void _starpu_worker_set_stream_ctx(unsigned workerid, struct _starpu_sched_ctx *
 
 struct _starpu_sched_ctx* _starpu_worker_get_ctx_stream(unsigned stream_workerid);
 
-/* Must be called with worker's sched_mutex held.
- * Mark the beginning of a scheduling operation during which the sched_mutex
- * lock may be temporarily released, but the scheduling context of the worker
- * should not be modified */
-static inline void _starpu_worker_enter_transient_sched_op(struct _starpu_worker * const worker)
-{
-	worker->state_sched_op_pending = 1;
-}
-
-/* Must be called with worker's sched_mutex held.
- * Mark the end of a scheduling operation, and notify potential waiters that
- * scheduling context changes can safely be performed again.
- */
-static inline void  _starpu_worker_leave_transient_sched_op(struct _starpu_worker * const worker)
-{
-	worker->state_sched_op_pending = 0;
-	if (worker->state_changing_ctx_waiting)
-		/* cond_broadcast is required over cond_signal since
-		 * the condition is share for multiple purpose */
-		STARPU_PTHREAD_COND_BROADCAST(&worker->sched_cond);
-}
-
-/* Must be called with worker's sched_mutex held.
- * Passively wait until state_sched_op_pending is cleared.
- */
-static inline void _starpu_worker_wait_for_transient_sched_op_completion(struct _starpu_worker * const worker)
-{
-	if (worker->state_sched_op_pending)
-	{
-		worker->state_changing_ctx_waiting = 1;
-		do
-		{
-			STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
-		}
-		while (worker->state_sched_op_pending);
-		worker->state_changing_ctx_waiting = 0;
-	}
-}
-
 #endif // __WORKERS_H__

+ 16 - 0
src/datawizard/malloc.c

@@ -32,6 +32,7 @@
 #ifdef STARPU_SIMGRID
 #include <sys/mman.h>
 #include <fcntl.h>
+#include <smpi/smpi.h>
 #endif
 
 #ifndef O_BINARY
@@ -48,9 +49,12 @@ static int malloc_on_node_default_flags[STARPU_MAXNODES];
 
 /* This file is used for implementing "folded" allocation */
 #ifdef STARPU_SIMGRID
+#if SIMGRID_VERSION_MAJOR < 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR < 15)
+/* TODO: drop when simgrid 3.15 is reasonably largely used by people who need the feature */
 static int bogusfile = -1;
 static unsigned long _starpu_malloc_simulation_fold;
 #endif
+#endif
 
 void starpu_malloc_set_align(size_t align)
 {
@@ -230,6 +234,10 @@ int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int fl
 #ifdef STARPU_SIMGRID
 	if (flags & STARPU_MALLOC_SIMULATION_FOLDED)
 	{
+#if SIMGRID_VERSION_MAJOR > 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR >= 15)
+		*A = SMPI_SHARED_MALLOC(dim);
+#else
+		/* TODO: drop when simgrid 3.15 is reasonably largely used by people who need the feature */
 		/* Use "folded" allocation: the same file is mapped several
 		 * times contiguously, to get a memory area one can read/write,
 		 * without consuming memory */
@@ -282,6 +290,7 @@ int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int fl
 			}
 			*A = buf;
 		}
+#endif
 	}
 	else
 #endif
@@ -465,7 +474,12 @@ int _starpu_free_flags_on_node(unsigned dst_node, void *A, size_t dim, int flags
 #ifdef STARPU_SIMGRID
 	if (flags & STARPU_MALLOC_SIMULATION_FOLDED)
 	{
+#if SIMGRID_VERSION_MAJOR > 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR >= 15)
+		SMPI_SHARED_FREE(A);
+#else
+		/* TODO: drop when simgrid 3.15 is reasonably largely used by people who need the feature */
 		munmap(A, dim);
+#endif
 	}
 	else
 #endif
@@ -840,9 +854,11 @@ _starpu_malloc_init(unsigned dst_node)
 	disable_pinning = starpu_get_env_number("STARPU_DISABLE_PINNING");
 	malloc_on_node_default_flags[dst_node] = STARPU_MALLOC_PINNED | STARPU_MALLOC_COUNT;
 #ifdef STARPU_SIMGRID
+#if SIMGRID_VERSION_MAJOR < 3 || (SIMGRID_VERSION_MAJOR == 3 && SIMGRID_VERSION_MINOR < 15)
 	/* Reasonably "costless" */
 	_starpu_malloc_simulation_fold = starpu_get_env_number_default("STARPU_MALLOC_SIMULATION_FOLD", 1) << 20;
 #endif
+#endif
 }
 
 void

+ 5 - 2
src/datawizard/memory_nodes.c

@@ -185,9 +185,12 @@ unsigned starpu_worker_get_memory_node(unsigned workerid)
 /* same utility as _starpu_memory_node_add_nworkers */
 void _starpu_worker_drives_memory_node(struct _starpu_worker *worker, unsigned memnode)
 {
-	_starpu_worker_drives_memory[worker->workerid][memnode] = 1;
+	if (! _starpu_worker_drives_memory[worker->workerid][memnode])
+	{
+		_starpu_worker_drives_memory[worker->workerid][memnode] = 1;
 #ifdef STARPU_SIMGRID
-	starpu_pthread_queue_register(&worker->wait, &_starpu_simgrid_transfer_queue[memnode]);
+		starpu_pthread_queue_register(&worker->wait, &_starpu_simgrid_transfer_queue[memnode]);
 #endif
+	}
 }
 

+ 21 - 5
src/debug/traces/starpu_fxt.c

@@ -157,6 +157,8 @@ static void task_dump(struct task_info *task)
 
 	if (task->exclude_from_dag)
 		goto out;
+	if (!tasks_file)
+		goto out;
 
 	if (task->name)
 	{
@@ -274,6 +276,8 @@ static struct data_info *get_data(unsigned long handle, int mpi_rank)
 
 static void data_dump(struct data_info *data)
 {
+	if (!data_file)
+		goto out;
 	fprintf(data_file, "Handle: %lx\n", data->handle);
 	fprintf(data_file, "MPIRank: %d\n", data->mpi_rank);
 	if (data->name)
@@ -291,6 +295,7 @@ static void data_dump(struct data_info *data)
 	}
 	fprintf(data_file, "MPIOwner: %d\n", data->mpi_owner);
 	fprintf(data_file, "\n");
+out:
 	HASH_DEL(data_info, data);
 	free(data);
 }
@@ -2388,8 +2393,8 @@ static void handle_task_done(struct fxt_ev_64 *ev, struct starpu_fxt_options *op
 	unsigned exclude_from_dag = ev->param[2];
 	struct task_info *task = get_task(job_id, options->file_rank);
 	task->exclude_from_dag = exclude_from_dag;
-	if (tasks_file)
-		task_dump(task);
+
+	task_dump(task);
 
 	if (!exclude_from_dag)
 		_starpu_fxt_dag_set_task_done(options->file_prefix, job_id, name, colour);
@@ -2698,9 +2703,8 @@ static void handle_task_wait_for_all(void)
 	_starpu_fxt_dag_add_sync_point();
 }
 
-static void handle_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+static void handle_string_event(struct fxt_ev_64 *ev, const char *event, struct starpu_fxt_options *options)
 {
-	char *event = get_fxt_string(ev, 0);
 	/* Add an event in the trace */
 	if (out_paje_file)
 	{
@@ -2717,6 +2721,12 @@ static void handle_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *option
 		recfmt_dump_state(get_event_time_stamp(ev, options), "ProgEvent", -1, 0, event, "Program");
 }
 
+static void handle_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+	char *event = get_fxt_string(ev, 0);
+	handle_string_event(ev, event, options);
+}
+
 static void handle_thread_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 {
 	/* Add an event in the trace */
@@ -3418,6 +3428,13 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 				fut_keymask = ev.param[0];
 				break;
 
+			case FUT_START_FLUSH_CODE:
+				handle_string_event(&ev, "fxt_start_flush", options);
+				break;
+			case FUT_STOP_FLUSH_CODE:
+				handle_string_event(&ev, "fxt_stop_flush", options);
+				break;
+
 			/* We can safely ignore FUT internal events */
 			case FUT_CALIBRATE0_CODE:
 			case FUT_CALIBRATE1_CODE:
@@ -3467,7 +3484,6 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 #endif
 	}
 
-	if (data_file)
 	{
 		/* TODO: move to handle_data_unregister */
 		struct data_info *data, *tmp;

+ 6 - 6
src/debug/traces/starpu_paje.c

@@ -193,7 +193,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 	poti_DefineEntityValue("E", "S", "Executing", ".0 .6 .5");
 	poti_DefineEntityValue("Sc", "S", "Scheduling", ".7 .36 .0");
 	poti_DefineEntityValue("Sl", "S", "Sleeping", ".9 .1 .0");
-	poti_DefineEntityValue("P", "S", "Progressing", ".4 .1 .6");
+	poti_DefineEntityValue("P", "S", "Progressing", ".1 .3 .1");
 	poti_DefineEntityValue("U", "S", "Unpartitioning", ".0 .0 1.0");
 	poti_DefineEntityValue("H", "S", "Hypervisor", ".5 .18 .0");
 	poti_DefineEntityValue("Bu", "S", "Building task", ".5 .18 .0");
@@ -213,7 +213,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 	poti_DefineEntityValue("E", "WS", "Executing", ".0 .6 .5");
 	poti_DefineEntityValue("Sc", "WS", "Scheduling", ".7 .36 .0");
 	poti_DefineEntityValue("Sl", "WS", "Sleeping", ".9 .1 .0");
-	poti_DefineEntityValue("P", "WS", "Progressing", ".4 .1 .6");
+	poti_DefineEntityValue("P", "WS", "Progressing", ".1 .3 .1");
 	poti_DefineEntityValue("U", "WS", "Unpartitioning", ".0 .0 1.0");
 	poti_DefineEntityValue("H", "WS", "Hypervisor", ".5 .18 .0");
 	poti_DefineEntityValue("Bu", "WS", "Building task", ".5 .18 .0");
@@ -268,7 +268,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 		poti_DefineEntityValue("E", ctx, "Executing", ".0 .6 .5");
 		poti_DefineEntityValue("Sc", ctx, "Scheduling", ".7 .36 .0");
 		poti_DefineEntityValue("Sl", ctx, "Sleeping", ".9 .1 .0");
-		poti_DefineEntityValue("P", ctx, "Progressing", ".4 .1 .6");
+		poti_DefineEntityValue("P", ctx, "Progressing", ".1 .3 .1");
 		poti_DefineEntityValue("U", ctx, "Unpartitioning", ".0 .0 1.0");
 		poti_DefineEntityValue("H", ctx, "Hypervisor", ".5 .18 .0");
 	}
@@ -331,7 +331,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 6       E       S       Executing         \".0 .6 .5\"		\n\
 6       Sc       S      Scheduling         \".7 .36 .0\"		\n\
 6       Sl       S      Sleeping         \".9 .1 .0\"		\n\
-6       P       S       Progressing         \".4 .1 .6\"		\n\
+6       P       S       Progressing         \".1 .3 .1\"		\n\
 6       U       S       Unpartitioning      \".0 .0 1.0\"		\n\
 6       H       S       Hypervisor      \".5 .18 .0\"		\n\
 6       Bu      S       \"Building task\"   \".5 .18 .0\"		\n\
@@ -351,7 +351,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 6       E       WS       Executing         \".0 .6 .5\"		\n\
 6       Sc       WS      Scheduling         \".7 .36 .0\"		\n\
 6       Sl       WS      Sleeping         \".9 .1 .0\"		\n\
-6       P       WS       Progressing         \".4 .1 .6\"		\n\
+6       P       WS       Progressing         \".1 .3 .1\"		\n\
 6       U       WS       Unpartitioning      \".0 .0 1.0\"		\n\
 6       H       WS       Hypervisor      \".5 .18 .0\"		\n\
 6       Bu      WS       \"Building task\"   \".5 .18 .0\"		\n\
@@ -394,7 +394,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 6       E       Ctx%u       Executing         \".0 .6 .5\"		\n\
 6       Sc       Ctx%u      Scheduling         \".7 .36 .0\"		\n\
 6       Sl       Ctx%u      Sleeping         \".9 .1 .0\"		\n\
-6       P       Ctx%u       Progressing         \".4 .1 .6\"		\n\
+6       P       Ctx%u       Progressing         \".1 .3 .1\"		\n\
 6       U       Ctx%u       Unpartitioning         \".0 .0 1.0\"	\n\
 6       H       Ctx%u       Hypervisor         \".5 .18 .0\"		\n",
 		i, i, i, i, i, i, i, i, i, i, i, i, i);

+ 8 - 2
src/drivers/cpu/driver_cpu.c

@@ -89,6 +89,12 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 #ifdef STARPU_SIMGRID
 			if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE)
 				func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
+			else if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT)
+			{
+				_SIMGRID_TIMER_BEGIN(1);
+				func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
+				_SIMGRID_TIMER_END;
+			}
 			else
 				_starpu_simgrid_submit_job(cpu_args->workerid, j, perf_arch, NAN, NULL);
 #else
@@ -410,13 +416,13 @@ void *_starpu_cpu_worker(void *arg)
 	struct _starpu_worker *worker = arg;
 
 	_starpu_cpu_driver_init(worker);
-	_STARPU_TRACE_END_PROGRESS(worker->memory_node);
+	_STARPU_TRACE_START_PROGRESS(worker->memory_node);
 	while (_starpu_machine_is_running())
 	{
 		_starpu_may_pause();
 		_starpu_cpu_driver_run_once(worker);
 	}
-	_STARPU_TRACE_START_PROGRESS(worker->memory_node);
+	_STARPU_TRACE_END_PROGRESS(worker->memory_node);
 	_starpu_cpu_driver_deinit(worker);
 
 	return NULL;

+ 10 - 4
src/drivers/cuda/driver_cuda.c

@@ -507,6 +507,12 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worke
 		unsigned workerid = worker->workerid;
 		if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE && !async)
 			func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
+		else if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT && !async)
+			{
+				_SIMGRID_TIMER_BEGIN(1);
+				func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
+				_SIMGRID_TIMER_END;
+			}
 		else
 			_starpu_simgrid_submit_job(workerid, j, &worker->perf_arch, NAN,
 				async ? &task_finished[workerid][pipeline_idx] : NULL);
@@ -763,6 +769,7 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 		task = worker->task_transferring;
 		if (task && worker->nb_buffers_transferred == worker->nb_buffers_totransfer)
 		{
+			_STARPU_TRACE_END_PROGRESS(memnode);
 			j = _starpu_get_job_associated_to_task(task);
 
 			_starpu_set_local_worker_key(worker);
@@ -779,10 +786,9 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 			}
 			else
 			{
-				_STARPU_TRACE_END_PROGRESS(memnode);
 				execute_job_on_cuda(task, worker);
-				_STARPU_TRACE_START_PROGRESS(memnode);
 			}
+			_STARPU_TRACE_START_PROGRESS(memnode);
 		}
 
 		/* Then test for termination of queued tasks */
@@ -811,6 +817,7 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 		else
 #endif /* !STARPU_SIMGRID */
 		{
+			_STARPU_TRACE_END_PROGRESS(memnode);
 			/* Asynchronous task completed! */
 			_starpu_set_local_worker_key(worker);
 			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), worker);
@@ -831,11 +838,9 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 					 * flushing the pipeline, we can now at
 					 * last execute it.  */
 
-					_STARPU_TRACE_END_PROGRESS(memnode);
 					_STARPU_TRACE_EVENT("sync_task");
 					execute_job_on_cuda(task, worker);
 					_STARPU_TRACE_EVENT("end_sync_task");
-					_STARPU_TRACE_START_PROGRESS(memnode);
 					worker->pipeline_stuck = 0;
 				}
 			}
@@ -848,6 +853,7 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 				/* Everybody busy */
 				_STARPU_TRACE_END_EXECUTING()
 #endif
+			_STARPU_TRACE_START_PROGRESS(memnode);
 		}
 
 		if (!worker->pipeline_length || worker->ntasks < worker->pipeline_length)

+ 23 - 44
src/drivers/driver_common/driver_common.c

@@ -101,6 +101,7 @@ void _starpu_driver_start_job(struct _starpu_worker *worker, struct _starpu_job
 	}
 	else
 		_STARPU_TRACE_START_CODELET_BODY(j, j->nimpl, perf_arch, workerid);
+	_STARPU_TASK_BREAK_ON(task, exec);
 }
 
 void _starpu_driver_end_job(struct _starpu_worker *worker, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)
@@ -358,6 +359,7 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *worker, int w
 			sched_ctx = _starpu_get_sched_ctx_struct(e->sched_ctx);
 			if(sched_ctx && sched_ctx->id > 0 && sched_ctx->id < STARPU_NMAX_SCHED_CTXS)
 			{
+				STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->parallel_sect_mutex[workerid]);
 				if(!sched_ctx->sched_policy)
 					worker->is_slave_somewhere = sched_ctx->main_master != workerid;
 
@@ -366,23 +368,18 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *worker, int w
 					/* don't let the worker sleep with the sched_mutex taken */
 					/* we need it until here bc of the list of ctxs of the workers
 					   that can change in another thread */
+					STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
 					needed = 0;
-					worker->state_blocked = 1;
-					worker->state_wait_ack__blocked = 1;
-					STARPU_PTHREAD_COND_BROADCAST(&worker->sched_cond);
-					do
-					{
-						STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
-					}
-					while (worker->state_wait_ack__blocked);
-					worker->state_blocked = 0;
+					_starpu_sched_ctx_signal_worker_blocked(sched_ctx->id, workerid);
+					sched_ctx->busy[workerid] = 1;
+					STARPU_PTHREAD_COND_WAIT(&sched_ctx->parallel_sect_cond[workerid], &sched_ctx->parallel_sect_mutex[workerid]);
+					sched_ctx->busy[workerid] = 0;
+					STARPU_PTHREAD_COND_SIGNAL(&sched_ctx->parallel_sect_cond_busy[workerid]);
+					_starpu_sched_ctx_signal_worker_woke_up(sched_ctx->id, workerid);
 					sched_ctx->parallel_sect[workerid] = 0;
-					if (worker->state_wait_handshake__blocked)
-					{
-						worker->state_wait_handshake__blocked = 0;
-						STARPU_PTHREAD_COND_BROADCAST(&worker->sched_cond);
-					}
+					STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
 				}
+				STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->parallel_sect_mutex[workerid]);
 			}
 			if(!needed)
 				break;
@@ -391,25 +388,21 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *worker, int w
 		if(worker->tmp_sched_ctx != -1)
 		{
 			sched_ctx = _starpu_get_sched_ctx_struct(worker->tmp_sched_ctx);
+			STARPU_PTHREAD_MUTEX_LOCK(&sched_ctx->parallel_sect_mutex[workerid]);
 			if(sched_ctx->parallel_sect[workerid])
 			{
 //				needed = 0;
-				worker->state_blocked = 1;
-				worker->state_wait_ack__blocked = 1;
-				STARPU_PTHREAD_COND_BROADCAST(&worker->sched_cond);
-				do
-				{
-					STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
-				}
-				while (worker->state_wait_ack__blocked);
-				worker->state_blocked = 0;
+				STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
+				_starpu_sched_ctx_signal_worker_blocked(sched_ctx->id, workerid);
+				sched_ctx->busy[workerid] = 1;
+				STARPU_PTHREAD_COND_WAIT(&sched_ctx->parallel_sect_cond[workerid], &sched_ctx->parallel_sect_mutex[workerid]);
+				sched_ctx->busy[workerid] = 0;
+				STARPU_PTHREAD_COND_SIGNAL(&sched_ctx->parallel_sect_cond_busy[workerid]);
+				_starpu_sched_ctx_signal_worker_woke_up(sched_ctx->id, workerid);
 				sched_ctx->parallel_sect[workerid] = 0;
-				if (worker->state_wait_handshake__blocked)
-				{
-					worker->state_wait_handshake__blocked = 0;
-					STARPU_PTHREAD_COND_BROADCAST(&worker->sched_cond);
-				}
+				STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
 			}
+			STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx->parallel_sect_mutex[workerid]);
 		}
 
 		needed = !needed;
@@ -428,11 +421,7 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *worker, int w
 		task = NULL;
 	/*else try to pop a task*/
 	else
-	{
-		_starpu_worker_enter_transient_sched_op(worker);
 		task = _starpu_pop_task(worker);
-		_starpu_worker_leave_transient_sched_op(worker);
-	}
 
 #if !defined(STARPU_SIMGRID)
 	if (task == NULL && !executing)
@@ -449,11 +438,7 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *worker, int w
 		if (_starpu_worker_can_block(memnode, worker)
 			&& !_starpu_sched_ctx_last_worker_awake(worker))
 		{
-			do
-			{
-				STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
-			}
-			while (worker->status == STATUS_SLEEPING);
+			STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
 			STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
 		}
 		else
@@ -527,9 +512,7 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 #endif
 			_starpu_worker_set_status_scheduling(workers[i].workerid);
 			_starpu_set_local_worker_key(&workers[i]);
-			_starpu_worker_enter_transient_sched_op(&workers[i]);
 			tasks[i] = _starpu_pop_task(&workers[i]);
-			_starpu_worker_leave_transient_sched_op(&workers[i]);
 			if(tasks[i] != NULL)
 			{
 				_starpu_worker_set_status_scheduling_done(workers[i].workerid);
@@ -598,11 +581,7 @@ int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_
 		if (_starpu_worker_can_block(memnode, worker)
 				&& !_starpu_sched_ctx_last_worker_awake(worker))
 		{
-			do
-			{
-				STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
-			}
-			while (worker->status == STATUS_SLEEPING);
+			STARPU_PTHREAD_COND_WAIT(&worker->sched_cond, &worker->sched_mutex);
 			STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
 		}
 		else

+ 1 - 1
src/drivers/mic/driver_mic_sink.c

@@ -39,7 +39,7 @@ void _starpu_mic_sink_init(struct _starpu_mp_node *node)
 	cpu_set_t cpuset;
 	/* We reserve one core for the communications */
 	/*Bind on the first core*/
-	self = pthread_self();
+	self = starpu_pthread_self();
 	CPU_ZERO(&cpuset);
 	CPU_SET(0,&cpuset);
 	pthread_setaffinity_np(self,sizeof(cpu_set_t),&cpuset);

+ 2 - 3
src/drivers/mp_common/mp_common.c

@@ -15,7 +15,6 @@
  */
 
 #include <stdlib.h>
-#include <pthread.h>
 
 #include <datawizard/interfaces/data_interface.h>
 #include <drivers/mp_common/mp_common.h>
@@ -400,7 +399,7 @@ void _starpu_mp_common_send_command(const struct _starpu_mp_node *node,
 {
 	STARPU_ASSERT_MSG(arg_size <= BUFFER_SIZE, "Too much data (%d) for the static MIC buffer (%d), increase BUFFER_SIZE perhaps?", arg_size, BUFFER_SIZE);
 
-        //printf("SEND CMD : %d - arg_size %d by %lu \n", command, arg_size, pthread_self());
+        //printf("SEND CMD : %d - arg_size %d by %lu \n", command, arg_size, starpu_pthread_self());
 
 	/* MIC and MPI sizes are given through a int */
 	int command_size = sizeof(enum _starpu_mp_command);
@@ -436,7 +435,7 @@ enum _starpu_mp_command _starpu_mp_common_recv_command(const struct _starpu_mp_n
 	command = *((enum _starpu_mp_command *) node->buffer);
 	*arg_size = *((int *) ((uintptr_t)node->buffer + command_size));
 
-        //printf("RECV command : %d - arg_size %d by %lu \n", command, *arg_size, pthread_self());
+        //printf("RECV command : %d - arg_size %d by %lu \n", command, *arg_size, starpu_pthread_self());
 
 	/* If there is no argument (ie. arg_size == 0),
 	 * let's return the command right now */

+ 0 - 1
src/drivers/mp_common/mp_common.h

@@ -17,7 +17,6 @@
 #ifndef __MP_COMMON_H__
 #define __MP_COMMON_H__
 
-#include <pthread.h>
 #include <semaphore.h>
 
 #include <starpu.h>

+ 4 - 1
src/drivers/mpi/driver_mpi_common.c

@@ -80,7 +80,10 @@ int _starpu_mpi_common_mp_init()
 #endif
 
                 int thread_support;
-                STARPU_ASSERT(MPI_Init_thread(_starpu_get_argc(), _starpu_get_argv(), required, &thread_support) == MPI_SUCCESS);
+                if (MPI_Init_thread(_starpu_get_argc(), _starpu_get_argv(), required, &thread_support) != MPI_SUCCESS)
+		{
+			STARPU_ABORT_MSG("Cannot Initialize MPI !");
+		}
 
                 if (thread_support != required)
                 {

+ 8 - 0
src/drivers/opencl/driver_opencl.c

@@ -954,6 +954,14 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 			simulate = 1;
 		#endif
 		}
+		else if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT && !async)
+			{
+				_SIMGRID_TIMER_BEGIN(1);
+				func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
+				_SIMGRID_TIMER_END;
+				simulate=0;
+			}
+
 		if (simulate)
 			_starpu_simgrid_submit_job(worker->workerid, j, &worker->perf_arch, length,
 						   async ? &task_finished[worker->devid][pipeline_idx] : NULL);

+ 7 - 7
src/profiling/bound.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
- * Copyright (C) 2010-2016  Université de Bordeaux
+ * Copyright (C) 2010-2017  Université de Bordeaux
  * Copyright (C) 2011  Télécom-SudParis
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -209,11 +209,11 @@ static double** initialize_arch_duration(int maxdevid, unsigned* maxncore_table)
 static void initialize_duration(struct bound_task *task)
 {
 	struct _starpu_machine_config *conf = _starpu_get_machine_config();
-	task->duration[STARPU_CPU_WORKER] = initialize_arch_duration(1,&conf->topology.ncpus); 
-	task->duration[STARPU_CUDA_WORKER] = initialize_arch_duration(conf->topology.ncudagpus,NULL); 
-	task->duration[STARPU_OPENCL_WORKER] = initialize_arch_duration(conf->topology.nopenclgpus,NULL); 
-	task->duration[STARPU_MIC_WORKER] = initialize_arch_duration(conf->topology.nmicdevices,conf->topology.nmiccores); 
-	task->duration[STARPU_SCC_WORKER] = initialize_arch_duration(conf->topology.nsccdevices,NULL); 
+	task->duration[STARPU_CPU_WORKER] = initialize_arch_duration(1,&conf->topology.nhwcpus); 
+	task->duration[STARPU_CUDA_WORKER] = initialize_arch_duration(conf->topology.nhwcudagpus,NULL); 
+	task->duration[STARPU_OPENCL_WORKER] = initialize_arch_duration(conf->topology.nhwopenclgpus,NULL); 
+	task->duration[STARPU_MIC_WORKER] = initialize_arch_duration(conf->topology.nhwmicdevices,conf->topology.nmiccores); 
+	task->duration[STARPU_SCC_WORKER] = initialize_arch_duration(conf->topology.nhwscc,NULL); 
 }
 
 static struct starpu_perfmodel_device device =
@@ -278,7 +278,7 @@ void _starpu_bound_record(struct _starpu_job *j)
 	{
 		struct bound_task_pool *tp;
 
-		_starpu_compute_buffers_footprint(j->task->cl?j->task->cl->model:NULL, STARPU_CPU_WORKER, 0, j);
+		_starpu_compute_buffers_footprint(j->task->cl?j->task->cl->model:NULL, NULL, 0, j);
 
 		if (last && last->cl == j->task->cl && last->footprint == j->footprint)
 			tp = last;

+ 2 - 1
src/profiling/profiling_helpers.c

@@ -60,6 +60,7 @@ void _starpu_profiling_bus_helper_display_summary(FILE *stream)
 
 		unsigned unit = 0;
 		double d = convert_to_byte_units(transferred, max_unit, &unit);
+		double avg = (transfer_cnt != 0) ? (d / transfer_cnt) : 0;
 
 		_starpu_memory_node_get_name(src, src_name, sizeof(src_name));
 		_starpu_memory_node_get_name(dst, dst_name, sizeof(dst_name));
@@ -67,7 +68,7 @@ void _starpu_profiling_bus_helper_display_summary(FILE *stream)
 		fprintf(stream, "\t%s -> %s", src_name, dst_name);
 		fprintf(stream, "\t%.2lf %s", d, byte_units[unit]);
 		fprintf(stream, "\t%.2lf %s/s", d / elapsed_time, byte_units[unit]);
-		fprintf(stream, "\t(transfers : %lld - avg %.2lf %s)\n", transfer_cnt, d / transfer_cnt, byte_units[unit]);
+		fprintf(stream, "\t(transfers : %lld - avg %.2lf %s)\n", transfer_cnt, avg, byte_units[unit]);
 
 		sum_transferred += transferred;
 	}

+ 8 - 2
src/sched_policies/component_worker.c

@@ -503,8 +503,11 @@ static void simple_worker_can_pull(struct starpu_sched_component * worker_compon
 	}
 	if(_starpu_sched_component_worker_is_sleeping_status(worker_component))
 	{
+		starpu_pthread_mutex_t *sched_mutex;
+		starpu_pthread_cond_t *sched_cond;
+		starpu_worker_get_sched_condition(w->workerid, &sched_mutex, &sched_cond);
 		_starpu_sched_component_unlock_worker(worker_component->tree->sched_ctx_id, w->workerid);
-		starpu_wake_worker(w->workerid);
+		starpu_wakeup_worker(w->workerid, sched_cond, sched_mutex);
 	}
 	else
 		_starpu_sched_component_unlock_worker(worker_component->tree->sched_ctx_id, w->workerid);
@@ -723,7 +726,10 @@ static void combined_worker_can_pull(struct starpu_sched_component * component)
 		_starpu_sched_component_lock_worker(component->tree->sched_ctx_id, worker);
 		if(_starpu_sched_component_worker_is_sleeping_status(component))
 		{
-			starpu_wake_worker(worker);
+			starpu_pthread_mutex_t *sched_mutex;
+			starpu_pthread_cond_t *sched_cond;
+			starpu_worker_get_sched_condition(worker, &sched_mutex, &sched_cond);
+			starpu_wakeup_worker(worker, sched_cond, sched_mutex);
 		}
 		if(_starpu_sched_component_worker_is_reset_status(component))
 			_starpu_sched_component_worker_set_changed_status(component);

+ 10 - 3
src/sched_policies/deque_modeling_policy_data_aware.c

@@ -251,6 +251,7 @@ static struct starpu_task *dmda_pop_ready_task(unsigned sched_ctx_id)
 
 	/* Take the opportunity to update start time */
 	fifo->exp_start = STARPU_MAX(starpu_timing_now(), fifo->exp_start);
+	fifo->exp_end = fifo->exp_start + fifo->exp_len;
 
 	task = _starpu_fifo_pop_first_ready_task(fifo, node, dt->num_priorities);
 	if (task)
@@ -285,6 +286,7 @@ static struct starpu_task *dmda_pop_task(unsigned sched_ctx_id)
 
 	/* Take the opportunity to update start time */
 	fifo->exp_start = STARPU_MAX(starpu_timing_now(), fifo->exp_start);
+	fifo->exp_end = fifo->exp_start + fifo->exp_len;
 
 	STARPU_ASSERT_MSG(fifo, "worker %u does not belong to ctx %u anymore.\n", workerid, sched_ctx_id);
 
@@ -321,6 +323,7 @@ static struct starpu_task *dmda_pop_every_task(unsigned sched_ctx_id)
 
 	/* Take the opportunity to update start time */
 	fifo->exp_start = STARPU_MAX(starpu_timing_now(), fifo->exp_start);
+	fifo->exp_end = fifo->exp_start + fifo->exp_len;
 
 	starpu_pthread_mutex_t *sched_mutex;
 	starpu_pthread_cond_t *sched_cond;
@@ -367,6 +370,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
         /* Sometimes workers didn't take the tasks as early as we expected */
 	fifo->exp_start = isnan(fifo->exp_start) ? starpu_timing_now() + fifo->pipeline_len : STARPU_MAX(fifo->exp_start, starpu_timing_now());
+	fifo->exp_end = fifo->exp_start + fifo->exp_len;
 
 	if ((starpu_timing_now() + predicted_transfer) < fifo->exp_end)
 	{
@@ -448,7 +452,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 
 
 #if !defined(STARPU_NON_BLOCKING_DRIVERS) || defined(STARPU_SIMGRID)
-		starpu_wake_worker_locked(best_workerid);
+		starpu_wakeup_worker_locked(best_workerid, sched_cond, sched_mutex);
 #endif
 		starpu_push_task_end(task);
 		STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(sched_mutex);
@@ -460,7 +464,7 @@ static int push_task_on_best_worker(struct starpu_task *task, int best_workerid,
 		dt->queue_array[best_workerid]->ntasks++;
 		dt->queue_array[best_workerid]->nprocessed++;
 #if !defined(STARPU_NON_BLOCKING_DRIVERS) || defined(STARPU_SIMGRID)
-		starpu_wake_worker_locked(best_workerid);
+		starpu_wakeup_worker_locked(best_workerid, sched_cond, sched_mutex);
 #endif
 		starpu_push_task_end(task);
 		STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(sched_mutex);
@@ -773,7 +777,9 @@ static void compute_all_performance_predictions(struct starpu_task *task,
 			if (unknown)
 				continue;
 
-			exp_end[worker_ctx][nimpl] = exp_start + prev_exp_len + local_task_length[worker_ctx][nimpl];
+			double task_starting_time = STARPU_MAX(exp_start + prev_exp_len, starpu_timing_now() + local_data_penalty[worker_ctx][nimpl]); 
+
+			exp_end[worker_ctx][nimpl] = task_starting_time + local_task_length[worker_ctx][nimpl];
 
 			if (exp_end[worker_ctx][nimpl] < best_exp_end)
 			{
@@ -1126,6 +1132,7 @@ static void dmda_pre_exec_hook(struct starpu_task *task, unsigned sched_ctx_id)
 
 	/* Take the opportunity to update start time */
 	fifo->exp_start = STARPU_MAX(starpu_timing_now() + fifo->pipeline_len, fifo->exp_start);
+	fifo->exp_end = fifo->exp_start + fifo->exp_len;
 
 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(sched_mutex);
 }

+ 1 - 1
src/sched_policies/eager_central_policy.c

@@ -201,7 +201,7 @@ static void eager_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nw
 		int workerid = workerids[i];
 		int curr_workerid = _starpu_worker_get_id();
 		if(workerid != curr_workerid)
-			starpu_wake_worker_locked(workerid);
+			starpu_wake_worker(workerid);
 
 		starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
 	}

+ 1 - 1
src/sched_policies/eager_central_priority_policy.c

@@ -308,7 +308,7 @@ static void eager_center_priority_add_workers(unsigned sched_ctx_id, int *worker
 		int workerid = workerids[i];
 		int curr_workerid = _starpu_worker_get_id();
 		if(workerid != curr_workerid)
-			starpu_wake_worker_locked(workerid);
+			starpu_wake_worker(workerid);
 
                 starpu_sched_ctx_worker_shares_tasks_lists(workerid, sched_ctx_id);
         }

+ 1 - 1
src/sched_policies/parallel_eager.c

@@ -265,7 +265,7 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 				_starpu_fifo_push_task(data->local_fifo[local_worker], alias);
 
 #if !defined(STARPU_NON_BLOCKING_DRIVERS) || defined(STARPU_SIMGRID)
-				starpu_wake_worker_locked(local_worker);
+				starpu_wakeup_worker_locked(local_worker, sched_cond, sched_mutex);
 #endif
 				STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(sched_mutex);
 

+ 3 - 1
src/util/fstarpu.c

@@ -85,6 +85,7 @@ static const intptr_t fstarpu_starpu_mic	= STARPU_MIC;
 static const intptr_t fstarpu_starpu_scc	= STARPU_SCC;
 
 static const intptr_t fstarpu_starpu_codelet_simgrid_execute	= STARPU_CODELET_SIMGRID_EXECUTE;
+static const intptr_t fstarpu_starpu_codelet_simgrid_execute_and_inject	= STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT;
 static const intptr_t fstarpu_starpu_cuda_async	= STARPU_CUDA_ASYNC;
 static const intptr_t fstarpu_starpu_opencl_async	= STARPU_OPENCL_ASYNC;
 
@@ -153,6 +154,7 @@ intptr_t fstarpu_get_constant(char *s)
 	else if (!strcmp(s, "FSTARPU_SCC"))	{ return fstarpu_starpu_scc; }
 
 	else if (!strcmp(s, "FSTARPU_CODELET_SIMGRID_EXECUTE"))	{ return fstarpu_starpu_codelet_simgrid_execute; }
+	else if (!strcmp(s, "FSTARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT"))	{ return fstarpu_starpu_codelet_simgrid_execute_and_inject; }
 	else if (!strcmp(s, "FSTARPU_CUDA_ASYNC"))	{ return fstarpu_starpu_cuda_async; }
 	else if (!strcmp(s, "FSTARPU_OPENCL_ASYNC"))	{ return fstarpu_starpu_opencl_async; }
 
@@ -542,7 +544,7 @@ void fstarpu_worker_get_type_as_string(intptr_t type, char *dst, size_t maxlen)
 	snprintf(dst, maxlen, "%s", str);
 }
 
-struct starpu_data_handle *fstarpu_data_handle_array_alloc(int nb)
+starpu_data_handle_t *fstarpu_data_handle_array_alloc(int nb)
 {
 	void *ptr;
 	_STARPU_CALLOC(ptr, (size_t)nb, sizeof(starpu_data_handle_t));

+ 1 - 1
starpu.mk

@@ -16,7 +16,7 @@
 
 if STARPU_USE_MPI_MASTER_SLAVE
 MPI_LAUNCHER 			= $(MPIEXEC)  $(MPIEXEC_ARGS) -np 4
-MPI_RUN_ARGS			= STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4
+MPI_RUN_ARGS			= STARPU_WORKERS_NOBIND=1 STARPU_NCPU=4 STARPU_NMPIMSTHREADS=4
 endif
 
 showcheck:

+ 3 - 0
tests/Makefile.am

@@ -34,6 +34,7 @@ AM_LDFLAGS = @STARPU_EXPORT_DYNAMIC@ $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFL
 
 EXTRA_DIST =					\
 	helper.h				\
+	datawizard/locality.sh			\
 	datawizard/scal.h			\
 	datawizard/mpi_like.h			\
 	microbenchs/tasks_size_overhead.sh	\
@@ -381,6 +382,8 @@ if STARPU_SIMGRID
 TESTS += $(MICROBENCHS:=.sh)
 endif
 
+TESTS += datawizard/locality.sh
+
 #######################
 # Source files        #
 #######################

+ 33 - 0
tests/datawizard/locality.sh

@@ -0,0 +1,33 @@
+#!/bin/bash -x
+#
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2017  Université de Bordeaux
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+# Test generation of FxT traces
+
+set -e
+
+PREFIX=$(dirname $0)
+test -x $PREFIX/../../tools/starpu_fxt_tool || exit 77
+STARPU_FXT_PREFIX=$PREFIX/ $PREFIX/locality
+$PREFIX/../../tools/starpu_fxt_tool -i $PREFIX/prof_file_${USER}_0
+
+# Check that they are approved by Grenoble :)
+
+if type pj_dump > /dev/null 2> /dev/null
+then
+	$PREFIX/../../tools/starpu_paje_sort paje.trace
+	pj_dump paje.trace
+fi

+ 3 - 2
tools/starpu_fxt_tool.c

@@ -30,8 +30,9 @@ static void usage()
 	fprintf(stderr, "Usage: %s [ options ]\n", PROGNAME);
         fprintf(stderr, "\n");
         fprintf(stderr, "Options:\n");
-	fprintf(stderr, "   -i <input file>     specify the input file. This can be specified several\n");
-	fprintf(stderr, "                       times for MPI execution case\n");
+	fprintf(stderr, "   -i <input file[s]>  specify the input file[s]. Several files can be provided,\n");
+	fprintf(stderr, "                       or the option specified several times for MPI execution\n");
+	fprintf(stderr, "                       case\n");
         fprintf(stderr, "   -o <output file>    specify the output file\n");
         fprintf(stderr, "   -c                  use a different colour for every type of task\n");
 	fprintf(stderr, "   -no-events          do not show events\n");