Pārlūkot izejas kodu

merge from trunk

Olivier Aumage 8 gadi atpakaļ
vecāks
revīzija
d4656fcb50

+ 1 - 0
ChangeLog

@@ -273,6 +273,7 @@ Small features:
     allows to copy in a new buffer values which have not been unpacked by
     the current call
   * Add STARPU_CODELET_SIMGRID_EXECUTE flag.
+  * Add STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT flag.
   * Add STARPU_CL_ARGS flag to starpu_task_insert() and
     starpu_mpi_task_insert() functions call
 

+ 3 - 2
doc/doxygen/chapters/320_scheduling.doxy

@@ -285,8 +285,9 @@ be used to get information about how well the execution proceeded, and thus the
 overall quality of the execution.
 
 Precise debugging can also be performed by using the
-\ref STARPU_TASK_BREAK_ON_SCHED, \ref STARPU_TASK_BREAK_ON_PUSH, and
-\ref STARPU_TASK_BREAK_ON_POP environment variables. By setting the job_id of a task
+\ref STARPU_TASK_BREAK_ON_PUSH, \ref STARPU_TASK_BREAK_ON_SCHED,
+\ref STARPU_TASK_BREAK_ON_POP, and \ref STARPU_TASK_BREAK_ON_EXEC environment variables.
+By setting the job_id of a task
 in these environment variables, StarPU will raise <c>SIGTRAP</c> when the task is being
 scheduled, pushed, or popped by the scheduler. That means that when one notices
 that a task is being scheduled in a seemingly odd way, one can just reexecute

+ 2 - 1
doc/doxygen/chapters/470_simgrid.doxy

@@ -36,7 +36,8 @@ To be able to run the application with e.g. CUDA simulation on a system which
 does not have CUDA installed, one can fill the cuda_funcs with (void*)1, to
 express that there is a CUDA implementation, even if one does not actually
 provide it. StarPU will not actually run it in Simgrid mode anyway by default
-(unless the ::STARPU_CODELET_SIMGRID_EXECUTE flag is set in the codelet)
+(unless the ::STARPU_CODELET_SIMGRID_EXECUTE or ::STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT
+flags are set in the codelet)
 
 \snippet simgrid.c To be included. You should update doxygen if you see this text.
 

+ 20 - 10
doc/doxygen/chapters/501_environment_variables.doxy

@@ -642,9 +642,10 @@ especially regarding data transfers.
 <dd>
 \anchor STARPU_SIMGRID_SCHED_COST
 \addindex __env__STARPU_SIMGRID_SCHED_COST
-When set to 1 (which is the default), scheduling costs are taken into
+When set to 1 (0 is the default), scheduling costs are taken into
 account in simgrid mode. This provides more accurate simgrid predictions,
-and allows studying scheduling overhead of the runtime system.
+and allows studying scheduling overhead of the runtime system. However,
+it also makes simulation non-deterministic.
 </dd>
 
 </dl>
@@ -1013,6 +1014,15 @@ dog is reached, thus allowing to catch the situation in gdb, etc
 (see \ref DetectionStuckConditions)
 </dd>
 
+<dt>STARPU_TASK_BREAK_ON_PUSH</dt>
+<dd>
+\anchor STARPU_TASK_BREAK_ON_PUSH
+\addindex __env__STARPU_TASK_BREAK_ON_PUSH
+When this variable contains a job id, StarPU will raise SIGTRAP when the task
+with that job id is being pushed to the scheduler, which will be nicely caught by debuggers
+(see \ref DebuggingScheduling)
+</dd>
+
 <dt>STARPU_TASK_BREAK_ON_SCHED</dt>
 <dd>
 \anchor STARPU_TASK_BREAK_ON_SCHED
@@ -1024,21 +1034,21 @@ This only works for schedulers which have such a scheduling point defined
 (see \ref DebuggingScheduling)
 </dd>
 
-<dt>STARPU_TASK_BREAK_ON_PUSH</dt>
+<dt>STARPU_TASK_BREAK_ON_POP</dt>
 <dd>
-\anchor STARPU_TASK_BREAK_ON_PUSH
-\addindex __env__STARPU_TASK_BREAK_ON_PUSH
+\anchor STARPU_TASK_BREAK_ON_POP
+\addindex __env__STARPU_TASK_BREAK_ON_POP
 When this variable contains a job id, StarPU will raise SIGTRAP when the task
-with that job id is being pushed to the scheduler, which will be nicely catched by debuggers
+with that job id is being popped from the scheduler, which will be nicely caught by debuggers
 (see \ref DebuggingScheduling)
 </dd>
 
-<dt>STARPU_TASK_BREAK_ON_POP</dt>
+<dt>STARPU_TASK_BREAK_ON_EXEC</dt>
 <dd>
-\anchor STARPU_TASK_BREAK_ON_POP
-\addindex __env__STARPU_TASK_BREAK_ON_POP
+\anchor STARPU_TASK_BREAK_ON_EXEC
+\addindex __env__STARPU_TASK_BREAK_ON_EXEC
 When this variable contains a job id, StarPU will raise SIGTRAP when the task
-with that job id is being popped from the scheduler, which will be nicely catched by debuggers
+with that job id is being executed, which will be nicely caught by debuggers
 (see \ref DebuggingScheduling)
 </dd>
 

+ 5 - 0
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -135,6 +135,11 @@ Value to be set in starpu_codelet::opencl_flags to allow asynchronous OpenCL ker
 \ingroup API_Codelet_And_Tasks
 Value to be set in starpu_codelet::flags to execute the codelet functions even in simgrid mode.
 
+\def STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT
+\ingroup API_Codelet_And_Tasks
+Value to be set in starpu_codelet::flags to execute the codelet functions even in simgrid mode,
+and later inject the measured timing inside the simulation.
+
 \typedef starpu_cpu_func_t
 \ingroup API_Codelet_And_Tasks
 CPU implementation of a codelet.

+ 4 - 1
include/fstarpu_mod.f90

@@ -82,6 +82,7 @@ module fstarpu_mod
         type(c_ptr), bind(C) :: FSTARPU_SCC
 
         type(c_ptr), bind(C) :: FSTARPU_CODELET_SIMGRID_EXECUTE
+        type(c_ptr), bind(C) :: FSTARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT
         type(c_ptr), bind(C) :: FSTARPU_CUDA_ASYNC
         type(c_ptr), bind(C) :: FSTARPU_OPENCL_ASYNC
 
@@ -2331,7 +2332,9 @@ module fstarpu_mod
                             fstarpu_get_constant(C_CHAR_"FSTARPU_SCC"//C_NULL_CHAR)
 
                         FSTARPU_CODELET_SIMGRID_EXECUTE = &
-                            fstarpu_get_constant(C_CHAR_"FSTARPU_CODELET_SIMGRID_EXECUTE"//C_NULL_CHAR)
+                             fstarpu_get_constant(C_CHAR_"FSTARPU_CODELET_SIMGRID_EXECUTE"//C_NULL_CHAR)
+                        FSTARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT = &
+                             fstarpu_get_constant(C_CHAR_"FSTARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT"//C_NULL_CHAR)
                         FSTARPU_CUDA_ASYNC = &
                             fstarpu_get_constant(C_CHAR_"FSTARPU_CUDA_ASYNC"//C_NULL_CHAR)
                         FSTARPU_OPENCL_ASYNC = &

+ 1 - 0
include/starpu_task.h

@@ -46,6 +46,7 @@ extern "C"
 #define STARPU_MPI_MS	((1ULL)<<9)
 
 #define STARPU_CODELET_SIMGRID_EXECUTE	(1<<0)
+#define STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT	(1<<1)
 #define STARPU_CUDA_ASYNC	(1<<0)
 #define STARPU_OPENCL_ASYNC	(1<<0)
 

+ 9 - 9
include/starpu_thread.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2012-2016  Université de Bordeaux
+ * Copyright (C) 2010, 2012-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -53,7 +53,6 @@ typedef int starpu_pthread_attr_t;
 int starpu_pthread_equal(starpu_pthread_t t1, starpu_pthread_t t2);
 starpu_pthread_t starpu_pthread_self(void);
 int starpu_pthread_create_on(char *name, starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg, msg_host_t host);
-#define starpu_pthread_setname(name)
 int starpu_pthread_create(starpu_pthread_t *thread, const starpu_pthread_attr_t *attr, void *(*start_routine) (void *), void *arg);
 int starpu_pthread_join(starpu_pthread_t thread, void **retval);
 int starpu_pthread_exit(void *retval) STARPU_ATTRIBUTE_NORETURN;
@@ -70,6 +69,14 @@ typedef pthread_attr_t starpu_pthread_attr_t;
 #define starpu_pthread_self pthread_self
 #define starpu_pthread_create pthread_create
 #define starpu_pthread_create_on(name, thread, attr, routine, arg, where) starpu_pthread_create(thread, attr, routine, arg)
+#define starpu_pthread_join pthread_join
+#define starpu_pthread_exit pthread_exit
+#define starpu_pthread_attr_init pthread_attr_init
+#define starpu_pthread_attr_destroy pthread_attr_destroy
+#define starpu_pthread_attr_setdetachstate pthread_attr_setdetachstate
+
+#endif /* STARPU_SIMGRID, _MSC_VER */
+
 #ifdef STARPU_HAVE_PTHREAD_SETNAME_NP
 #ifdef STARPU_HAVE_DARWIN
 #define starpu_pthread_setname(name) pthread_setname_np(name)
@@ -79,13 +86,6 @@ typedef pthread_attr_t starpu_pthread_attr_t;
 #else
 #define starpu_pthread_setname(name)
 #endif
-#define starpu_pthread_join pthread_join
-#define starpu_pthread_exit pthread_exit
-#define starpu_pthread_attr_init pthread_attr_init
-#define starpu_pthread_attr_destroy pthread_attr_destroy
-#define starpu_pthread_attr_setdetachstate pthread_attr_setdetachstate
-
-#endif /* STARPU_SIMGRID, _MSC_VER */
 
 /*
  * Encapsulation of the pthread_mutex_* functions.

+ 2 - 2
src/core/jobs.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
  * Copyright (C) 2011  Télécom-SudParis
  * Copyright (C) 2011, 2014, 2016  INRIA
@@ -88,7 +88,7 @@ struct _starpu_job* STARPU_ATTRIBUTE_MALLOC _starpu_job_create(struct starpu_tas
 
 #ifndef STARPU_USE_FXT
 	if (_starpu_bound_recording || _starpu_top_status_get() ||
-		_starpu_task_break_on_push != -1 || _starpu_task_break_on_pop != -1 || _starpu_task_break_on_sched != -1
+		_starpu_task_break_on_push != -1 || _starpu_task_break_on_sched != -1 || _starpu_task_break_on_pop != -1 || _starpu_task_break_on_exec != -1
 		|| STARPU_AYU_EVENT)
 #endif
 	{

+ 4 - 2
src/core/sched_policy.c

@@ -31,15 +31,17 @@ static double idle[STARPU_NMAXWORKERS];
 static double idle_start[STARPU_NMAXWORKERS];
 
 long _starpu_task_break_on_push = -1;
-long _starpu_task_break_on_pop = -1;
 long _starpu_task_break_on_sched = -1;
+long _starpu_task_break_on_pop = -1;
+long _starpu_task_break_on_exec = -1;
 static const char *starpu_idle_file;
 
 void _starpu_sched_init(void)
 {
 	_starpu_task_break_on_push = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_PUSH", -1);
-	_starpu_task_break_on_pop = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_POP", -1);
 	_starpu_task_break_on_sched = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_SCHED", -1);
+	_starpu_task_break_on_pop = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_POP", -1);
+	_starpu_task_break_on_exec = starpu_get_env_number_default("STARPU_TASK_BREAK_ON_EXEC", -1);
 	starpu_idle_file = starpu_getenv("STARPU_IDLE_FILE");
 }
 

+ 3 - 2
src/core/sched_policy.h

@@ -28,7 +28,7 @@
 
 #define _STARPU_SCHED_BEGIN \
 	_STARPU_TRACE_WORKER_SCHEDULING_PUSH;	\
-	_SIMGRID_TIMER_BEGIN
+	_SIMGRID_TIMER_BEGIN(_starpu_simgrid_sched_cost())
 #define _STARPU_SCHED_END \
 	_SIMGRID_TIMER_END;			\
 	_STARPU_TRACE_WORKER_SCHEDULING_POP
@@ -103,8 +103,9 @@ extern struct starpu_sched_policy _starpu_sched_modular_heft2_policy;
 extern struct starpu_sched_policy _starpu_sched_graph_test_policy;
 
 extern long _starpu_task_break_on_push;
-extern long _starpu_task_break_on_pop;
 extern long _starpu_task_break_on_sched;
+extern long _starpu_task_break_on_pop;
+extern long _starpu_task_break_on_exec;
 
 #ifdef SIGTRAP
 #define _STARPU_TASK_BREAK_ON(task, what) do { \

+ 4 - 4
src/core/simgrid.h

@@ -69,7 +69,7 @@ starpu_pthread_queue_t _starpu_simgrid_task_queue[STARPU_NMAXWORKERS];
 #define _starpu_simgrid_queue_malloc_cost() starpu_get_env_number_default("STARPU_SIMGRID_QUEUE_MALLOC_COST", 1)
 #define _starpu_simgrid_task_submit_cost() starpu_get_env_number_default("STARPU_SIMGRID_TASK_SUBMIT_COST", 1)
 #define _starpu_simgrid_fetching_input_cost() starpu_get_env_number_default("STARPU_SIMGRID_FETCHING_INPUT_COST", 1)
-#define _starpu_simgrid_sched_cost() starpu_get_env_number_default("STARPU_SIMGRID_SCHED_COST", 1)
+#define _starpu_simgrid_sched_cost() starpu_get_env_number_default("STARPU_SIMGRID_SCHED_COST", 0)
 
 /* Called at initialization to count how many GPUs are interfering with each
  * bus */
@@ -78,10 +78,10 @@ void _starpu_simgrid_count_ngpus(void);
 void _starpu_simgrid_xbt_thread_create(const char *name, void_f_pvoid_t code,
 				       void *param);
 
-#define _SIMGRID_TIMER_BEGIN		\
+#define _SIMGRID_TIMER_BEGIN(cond)			\
 	{		\
 		xbt_os_timer_t __timer = NULL;		\
-		if (_starpu_simgrid_sched_cost()) {		\
+		if (cond) {		\
 		  __timer = xbt_os_timer_new();		\
 		  xbt_os_threadtimer_start(__timer);	\
 		}
@@ -94,7 +94,7 @@ void _starpu_simgrid_xbt_thread_create(const char *name, void_f_pvoid_t code,
 	}
 
 #else // !STARPU_SIMGRID
-#define _SIMGRID_TIMER_BEGIN {
+#define _SIMGRID_TIMER_BEGIN(cond) {
 #define _SIMGRID_TIMER_END }
 #endif
 

+ 6 - 6
src/debug/traces/starpu_paje.c

@@ -193,7 +193,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 	poti_DefineEntityValue("E", "S", "Executing", ".0 .6 .5");
 	poti_DefineEntityValue("Sc", "S", "Scheduling", ".7 .36 .0");
 	poti_DefineEntityValue("Sl", "S", "Sleeping", ".9 .1 .0");
-	poti_DefineEntityValue("P", "S", "Progressing", ".0 .0 .4");
+	poti_DefineEntityValue("P", "S", "Progressing", ".1 .3 .1");
 	poti_DefineEntityValue("U", "S", "Unpartitioning", ".0 .0 1.0");
 	poti_DefineEntityValue("H", "S", "Hypervisor", ".5 .18 .0");
 	poti_DefineEntityValue("Bu", "S", "Building task", ".5 .18 .0");
@@ -213,7 +213,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 	poti_DefineEntityValue("E", "WS", "Executing", ".0 .6 .5");
 	poti_DefineEntityValue("Sc", "WS", "Scheduling", ".7 .36 .0");
 	poti_DefineEntityValue("Sl", "WS", "Sleeping", ".9 .1 .0");
-	poti_DefineEntityValue("P", "WS", "Progressing", ".0 .0 .4");
+	poti_DefineEntityValue("P", "WS", "Progressing", ".1 .3 .1");
 	poti_DefineEntityValue("U", "WS", "Unpartitioning", ".0 .0 1.0");
 	poti_DefineEntityValue("H", "WS", "Hypervisor", ".5 .18 .0");
 	poti_DefineEntityValue("Bu", "WS", "Building task", ".5 .18 .0");
@@ -268,7 +268,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 		poti_DefineEntityValue("E", ctx, "Executing", ".0 .6 .5");
 		poti_DefineEntityValue("Sc", ctx, "Scheduling", ".7 .36 .0");
 		poti_DefineEntityValue("Sl", ctx, "Sleeping", ".9 .1 .0");
-		poti_DefineEntityValue("P", ctx, "Progressing", ".0 .0 .4");
+		poti_DefineEntityValue("P", ctx, "Progressing", ".1 .3 .1");
 		poti_DefineEntityValue("U", ctx, "Unpartitioning", ".0 .0 1.0");
 		poti_DefineEntityValue("H", ctx, "Hypervisor", ".5 .18 .0");
 	}
@@ -331,7 +331,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 6       E       S       Executing         \".0 .6 .5\"		\n\
 6       Sc       S      Scheduling         \".7 .36 .0\"		\n\
 6       Sl       S      Sleeping         \".9 .1 .0\"		\n\
-6       P       S       Progressing         \".0 .0 .4\"		\n\
+6       P       S       Progressing         \".1 .3 .1\"		\n\
 6       U       S       Unpartitioning      \".0 .0 1.0\"		\n\
 6       H       S       Hypervisor      \".5 .18 .0\"		\n\
 6       Bu      S       \"Building task\"   \".5 .18 .0\"		\n\
@@ -351,7 +351,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 6       E       WS       Executing         \".0 .6 .5\"		\n\
 6       Sc       WS      Scheduling         \".7 .36 .0\"		\n\
 6       Sl       WS      Sleeping         \".9 .1 .0\"		\n\
-6       P       WS       Progressing         \".0 .0 .4\"		\n\
+6       P       WS       Progressing         \".1 .3 .1\"		\n\
 6       U       WS       Unpartitioning      \".0 .0 1.0\"		\n\
 6       H       WS       Hypervisor      \".5 .18 .0\"		\n\
 6       Bu      WS       \"Building task\"   \".5 .18 .0\"		\n\
@@ -394,7 +394,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 6       E       Ctx%u       Executing         \".0 .6 .5\"		\n\
 6       Sc       Ctx%u      Scheduling         \".7 .36 .0\"		\n\
 6       Sl       Ctx%u      Sleeping         \".9 .1 .0\"		\n\
-6       P       Ctx%u       Progressing         \".0 .0 .4\"		\n\
+6       P       Ctx%u       Progressing         \".1 .3 .1\"		\n\
 6       U       Ctx%u       Unpartitioning         \".0 .0 1.0\"	\n\
 6       H       Ctx%u       Hypervisor         \".5 .18 .0\"		\n",
 		i, i, i, i, i, i, i, i, i, i, i, i, i);

+ 6 - 0
src/drivers/cpu/driver_cpu.c

@@ -89,6 +89,12 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 #ifdef STARPU_SIMGRID
 			if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE)
 				func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
+			else if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT)
+			{
+				_SIMGRID_TIMER_BEGIN(1);
+				func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
+				_SIMGRID_TIMER_END;
+			}
 			else
 				_starpu_simgrid_submit_job(cpu_args->workerid, j, perf_arch, NAN, NULL);
 #else

+ 10 - 4
src/drivers/cuda/driver_cuda.c

@@ -507,6 +507,12 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worke
 		unsigned workerid = worker->workerid;
 		if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE && !async)
 			func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
+		else if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT && !async)
+			{
+				_SIMGRID_TIMER_BEGIN(1);
+				func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
+				_SIMGRID_TIMER_END;
+			}
 		else
 			_starpu_simgrid_submit_job(workerid, j, &worker->perf_arch, NAN,
 				async ? &task_finished[workerid][pipeline_idx] : NULL);
@@ -763,6 +769,7 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 		task = worker->task_transferring;
 		if (task && worker->nb_buffers_transferred == worker->nb_buffers_totransfer)
 		{
+			_STARPU_TRACE_END_PROGRESS(memnode);
 			j = _starpu_get_job_associated_to_task(task);
 
 			_starpu_set_local_worker_key(worker);
@@ -779,10 +786,9 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 			}
 			else
 			{
-				_STARPU_TRACE_END_PROGRESS(memnode);
 				execute_job_on_cuda(task, worker);
-				_STARPU_TRACE_START_PROGRESS(memnode);
 			}
+			_STARPU_TRACE_START_PROGRESS(memnode);
 		}
 
 		/* Then test for termination of queued tasks */
@@ -811,6 +817,7 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 		else
 #endif /* !STARPU_SIMGRID */
 		{
+			_STARPU_TRACE_END_PROGRESS(memnode);
 			/* Asynchronous task completed! */
 			_starpu_set_local_worker_key(worker);
 			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), worker);
@@ -831,11 +838,9 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 					 * flushing the pipeline, we can now at
 					 * last execute it.  */
 
-					_STARPU_TRACE_END_PROGRESS(memnode);
 					_STARPU_TRACE_EVENT("sync_task");
 					execute_job_on_cuda(task, worker);
 					_STARPU_TRACE_EVENT("end_sync_task");
-					_STARPU_TRACE_START_PROGRESS(memnode);
 					worker->pipeline_stuck = 0;
 				}
 			}
@@ -848,6 +853,7 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 				/* Everybody busy */
 				_STARPU_TRACE_END_EXECUTING()
 #endif
+			_STARPU_TRACE_START_PROGRESS(memnode);
 		}
 
 		if (!worker->pipeline_length || worker->ntasks < worker->pipeline_length)

+ 1 - 0
src/drivers/driver_common/driver_common.c

@@ -101,6 +101,7 @@ void _starpu_driver_start_job(struct _starpu_worker *worker, struct _starpu_job
 	}
 	else
 		_STARPU_TRACE_START_CODELET_BODY(j, j->nimpl, perf_arch, workerid);
+	_STARPU_TASK_BREAK_ON(task, exec);
 }
 
 void _starpu_driver_end_job(struct _starpu_worker *worker, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)

+ 8 - 0
src/drivers/opencl/driver_opencl.c

@@ -954,6 +954,14 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 			simulate = 1;
 		#endif
 		}
+		else if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT && !async)
+			{
+				_SIMGRID_TIMER_BEGIN(1);
+				func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
+				_SIMGRID_TIMER_END;
+				simulate=0;
+			}
+
 		if (simulate)
 			_starpu_simgrid_submit_job(worker->workerid, j, &worker->perf_arch, length,
 						   async ? &task_finished[worker->devid][pipeline_idx] : NULL);

+ 2 - 0
src/util/fstarpu.c

@@ -85,6 +85,7 @@ static const intptr_t fstarpu_starpu_mic	= STARPU_MIC;
 static const intptr_t fstarpu_starpu_scc	= STARPU_SCC;
 
 static const intptr_t fstarpu_starpu_codelet_simgrid_execute	= STARPU_CODELET_SIMGRID_EXECUTE;
+static const intptr_t fstarpu_starpu_codelet_simgrid_execute_and_inject	= STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT;
 static const intptr_t fstarpu_starpu_cuda_async	= STARPU_CUDA_ASYNC;
 static const intptr_t fstarpu_starpu_opencl_async	= STARPU_OPENCL_ASYNC;
 
@@ -153,6 +154,7 @@ intptr_t fstarpu_get_constant(char *s)
 	else if (!strcmp(s, "FSTARPU_SCC"))	{ return fstarpu_starpu_scc; }
 
 	else if (!strcmp(s, "FSTARPU_CODELET_SIMGRID_EXECUTE"))	{ return fstarpu_starpu_codelet_simgrid_execute; }
+	else if (!strcmp(s, "FSTARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT"))	{ return fstarpu_starpu_codelet_simgrid_execute_and_inject; }
 	else if (!strcmp(s, "FSTARPU_CUDA_ASYNC"))	{ return fstarpu_starpu_cuda_async; }
 	else if (!strcmp(s, "FSTARPU_OPENCL_ASYNC"))	{ return fstarpu_starpu_opencl_async; }