Просмотр исходного кода

Separate worker and thread state, to see streamed kernel executions

Samuel Thibault лет назад: 11
Родитель
Сommit
f14e68dc2c

+ 19 - 8
src/common/fxt.h

@@ -162,6 +162,9 @@
 #define _STARPU_FUT_WORKER_SCHEDULING_PUSH	0x5166
 #define _STARPU_FUT_WORKER_SCHEDULING_POP	0x5167
 
+#define	_STARPU_FUT_START_EXECUTING	0x5168
+#define	_STARPU_FUT_END_EXECUTING	0x5169
+
 #ifdef STARPU_USE_FXT
 #include <fxt/fxt.h>
 #include <fxt/fut.h>
@@ -406,31 +409,37 @@ do {									\
 #define _STARPU_TRACE_WORKER_INIT_END(workerid)				\
 	FUT_DO_PROBE2(_STARPU_FUT_WORKER_INIT_END, _starpu_gettid(), (workerid));
 
-#define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, archtype)				\
+/*
+ * Emit the FxT probes marking the start of a codelet body for `job`.
+ * First a _STARPU_FUT_START_CODELET_BODY probe carrying the job, its task's
+ * sched_ctx, `workerid` and a flag (1 followed by the model name when
+ * _starpu_job_get_model_name() returns one, 0 otherwise).  Then a
+ * _STARPU_FUT_CODELET_DETAILS probe carrying the job, sched_ctx, data size,
+ * buffers footprint, the task's tag_id and `workerid`.
+ * The events are keyed by the `workerid` argument, no longer by
+ * _starpu_gettid().
+ * NOTE(review): `job` is evaluated several times, `workerid` is expanded
+ * without parentheses, and the expansion already ends with a semicolon after
+ * `while(0)`.
+ */
+#define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, archtype, workerid)				\
 do {									\
         const char *model_name = _starpu_job_get_model_name((job));         \
 	if (model_name)                                                 \
 	{								\
 		/* we include the symbol name */			\
-		_STARPU_FUT_DO_PROBE4STR(_STARPU_FUT_START_CODELET_BODY, (job), ((job)->task)->sched_ctx, _starpu_gettid(), 1, model_name); \
+		_STARPU_FUT_DO_PROBE4STR(_STARPU_FUT_START_CODELET_BODY, (job), ((job)->task)->sched_ctx, workerid, 1, model_name); \
 	}								\
 	else {                                                          \
-		FUT_DO_PROBE4(_STARPU_FUT_START_CODELET_BODY, (job), ((job)->task)->sched_ctx, _starpu_gettid(), 0); \
+		FUT_DO_PROBE4(_STARPU_FUT_START_CODELET_BODY, (job), ((job)->task)->sched_ctx, workerid, 0); \
 	}								\
 	{								\
 		const size_t __job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));	\
 		const uint32_t __job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));\
-		FUT_DO_PROBE6(_STARPU_FUT_CODELET_DETAILS, (job), ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->tag_id, _starpu_gettid());	\
+		FUT_DO_PROBE6(_STARPU_FUT_CODELET_DETAILS, (job), ((job)->task)->sched_ctx, __job_size, __job_hash, (job)->task->tag_id, workerid);	\
 	}								\
 } while(0);
 
-#define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, archtype)			\
+/*
+ * Emit the _STARPU_FUT_END_CODELET_BODY probe for `job`: the job, its data
+ * size and buffers footprint (both recomputed here), the type, devid and
+ * ncore fields of `archtype`, and `workerid` as the last parameter.
+ * NOTE(review): the expansion already ends with a semicolon after
+ * `while(0)`, and `workerid` is expanded without parentheses.
+ */
+#define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, archtype, workerid)			\
 do {									\
 	const size_t job_size = _starpu_job_get_data_size((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));	\
 	const uint32_t job_hash = _starpu_compute_buffers_footprint((job)->task->cl?(job)->task->cl->model:NULL, archtype, nimpl, (job));\
-	FUT_DO_PROBE7(_STARPU_FUT_END_CODELET_BODY, (job), (job_size), (job_hash), (archtype)->type, (archtype)->devid, (archtype)->ncore, _starpu_gettid());	\
+	FUT_DO_PROBE7(_STARPU_FUT_END_CODELET_BODY, (job), (job_size), (job_hash), (archtype)->type, (archtype)->devid, (archtype)->ncore, workerid);	\
 } while(0);
 
+/*
+ * Emit a _STARPU_FUT_START_EXECUTING probe whose only parameter is the
+ * calling thread's id as returned by _starpu_gettid().
+ * The expansion includes its own trailing semicolon.
+ */
+#define _STARPU_TRACE_START_EXECUTING()				\
+	FUT_DO_PROBE1(_STARPU_FUT_START_EXECUTING, _starpu_gettid());
+
+/*
+ * Emit a _STARPU_FUT_END_EXECUTING probe whose only parameter is the
+ * calling thread's id as returned by _starpu_gettid().
+ * The expansion includes its own trailing semicolon.
+ */
+#define _STARPU_TRACE_END_EXECUTING()				\
+	FUT_DO_PROBE1(_STARPU_FUT_END_EXECUTING, _starpu_gettid());
+
 #define _STARPU_TRACE_START_CALLBACK(job)	\
 	FUT_DO_PROBE2(_STARPU_FUT_START_CALLBACK, job, _starpu_gettid());
 
@@ -782,8 +791,10 @@ do {										\
 #define _STARPU_TRACE_NEW_MEM_NODE(nodeid)	do {} while(0)
 #define _STARPU_TRACE_WORKER_INIT_START(a,b,c)	do {} while(0)
 #define _STARPU_TRACE_WORKER_INIT_END(workerid)	do {} while(0)
-#define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, archtype)	do {} while(0)
-#define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, a)	do {} while(0)
+#define _STARPU_TRACE_START_CODELET_BODY(job, nimpl, archtype, workerid)	do {} while(0)
+#define _STARPU_TRACE_END_CODELET_BODY(job, nimpl, a, workerid)	do {} while(0)
+#define _STARPU_TRACE_START_EXECUTING()	do {} while(0)
+#define _STARPU_TRACE_END_EXECUTING()	do {} while(0)
 #define _STARPU_TRACE_START_CALLBACK(job)	do {} while(0)
 #define _STARPU_TRACE_END_CALLBACK(job)		do {} while(0)
 #define _STARPU_TRACE_JOB_PUSH(task, prio)	do {} while(0)

+ 64 - 30
src/debug/traces/starpu_fxt.c

@@ -275,37 +275,48 @@ static void thread_set_state(double time, const char *prefix, long unsigned int
 #endif
 }
 
-static void thread_set_detailed_state(double time, const char *prefix, long unsigned int threadid, const char *name, unsigned long size, unsigned long footprint, unsigned long long tag)
+/*
+ * Record that worker `workerid` enters state `name` of the "WS" state type
+ * at `time`.  With STARPU_HAVE_POTI the container alias is built by
+ * worker_container_alias() and the record is emitted through
+ * poti_SetState(); otherwise a "10" record for container
+ * "<prefix>w<workerid>" is written to out_paje_file.
+ * out_paje_file is not checked here.
+ */
+static void worker_set_state(double time, const char *prefix, long unsigned int workerid, const char *name)
 {
 #ifdef STARPU_HAVE_POTI
 	char container[STARPU_POTI_STR_LEN];
-	thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, threadid);
-	/* TODO: set detailed state */
-	poti_SetState(time, container, "S", name);
+	worker_container_alias(container, STARPU_POTI_STR_LEN, prefix, workerid);
+	poti_SetState(time, container, "WS", name);
 #else
-	fprintf(out_paje_file, "20	%.9f	%st%lu	S	%s	%lu	%08lx	%016llx\n", time, prefix, threadid, name, size, footprint, tag);
+	fprintf(out_paje_file, "10	%.9f	%sw%lu	WS	%s\n", time, prefix, workerid, name);
 #endif
 }
 
-static void worker_push_state(double time, const char *prefix, long unsigned int workerid, const char *name)
+/*
+ * Emit a push-state record for state `name` of the "S" state type on the
+ * container of thread `threadid` at `time`: poti_PushState() with
+ * STARPU_HAVE_POTI, otherwise an "11" record for container
+ * "<prefix>t<threadid>" written to out_paje_file.
+ * out_paje_file is not checked here.
+ */
+static void thread_push_state(double time, const char *prefix, long unsigned int threadid, const char *name)
 {
 #ifdef STARPU_HAVE_POTI
 	char container[STARPU_POTI_STR_LEN];
-	thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, workerid);
+	thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, threadid);
 	poti_PushState(time, container, "S", name);
 #else
-	fprintf(out_paje_file, "11	%.9f	%st%lu	S	%s\n", time, prefix, workerid, name);
+	fprintf(out_paje_file, "11	%.9f	%st%lu	S	%s\n", time, prefix, threadid, name);
 #endif
 }
 
-static void worker_pop_state(double time, const char *prefix, long unsigned int workerid)
+/*
+ * Emit a pop-state record for the "S" state type on the container of thread
+ * `threadid` at `time`: poti_PopState() with STARPU_HAVE_POTI, otherwise a
+ * "12" record for container "<prefix>t<threadid>" written to out_paje_file.
+ * out_paje_file is not checked here.
+ */
+static void thread_pop_state(double time, const char *prefix, long unsigned int threadid)
 {
 #ifdef STARPU_HAVE_POTI
 	char container[STARPU_POTI_STR_LEN];
-	thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, workerid);
+	thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, threadid);
 	poti_PopState(time, container, "S");
 #else
-	fprintf(out_paje_file, "12	%.9f	%st%lu	S\n", time, prefix, workerid);
+	fprintf(out_paje_file, "12	%.9f	%st%lu	S\n", time, prefix, threadid);
+#endif
+}
+
+/*
+ * Record that worker `workerid` enters state `name` of the "WS" state type
+ * at `time`, together with the task's `size`, `footprint` and `tag`.
+ * Without poti, a "20" record for container "<prefix>w<workerid>" is written
+ * to out_paje_file with the three extra fields appended (size in decimal,
+ * footprint and tag in zero-padded hexadecimal).
+ * With STARPU_HAVE_POTI the extra fields are currently dropped and a plain
+ * poti_SetState() is issued (see the TODO below).
+ * out_paje_file is not checked here.
+ */
+static void worker_set_detailed_state(double time, const char *prefix, long unsigned int workerid, const char *name, unsigned long size, unsigned long footprint, unsigned long long tag)
+{
+#ifdef STARPU_HAVE_POTI
+	char container[STARPU_POTI_STR_LEN];
+	worker_container_alias(container, STARPU_POTI_STR_LEN, prefix, workerid);
+	/* TODO: set detailed state */
+	poti_SetState(time, container, "WS", name);
+#else
+	fprintf(out_paje_file, "20	%.9f	%sw%lu	WS	%s	%lu	%08lx	%016llx\n", time, prefix, workerid, name, size, footprint, tag);
 #endif
 }
 
@@ -472,6 +483,9 @@ static void handle_worker_init_end(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 	else
 		worker = ev->param[1];
 
+	if (out_paje_file)
+		worker_set_state(get_event_time_stamp(ev, options), prefix, worker, "I");
+
 	/* Initialize the accumulated time counters */
 	last_activity_flush_timestamp[worker] = get_event_time_stamp(ev, options);
 	accumulated_sleep_time[worker] = 0.0;
@@ -559,7 +573,7 @@ static void create_paje_state_if_not_found(char *name, struct starpu_fxt_options
 	if (out_paje_file)
 	{
 #ifdef STARPU_HAVE_POTI
-		create_paje_state_color(name, "S", red, green, blue);
+		create_paje_state_color(name, "WS", red, green, blue);
 		int i;
 		for(i = 1; i < STARPU_NMAX_SCHED_CTXS; i++)
 		{
@@ -598,7 +612,7 @@ static void create_paje_state_if_not_found(char *name, struct starpu_fxt_options
 /* 		create_paje_state_color(name, "Ctx9", .0, .0, 1.0); */
 /* 		create_paje_state_color(name, "Ctx10", 154.0, 205.0, 50.0); */
 #else
-		fprintf(out_paje_file, "6	%s	S	%s	\"%f %f %f\" \n", name, name, red, green, blue);
+		fprintf(out_paje_file, "6	%s	WS	%s	\"%f %f %f\" \n", name, name, red, green, blue);
 		int i;
 		for(i = 1; i < STARPU_NMAX_SCHED_CTXS; i++)
 		{
@@ -643,8 +657,7 @@ static void create_paje_state_if_not_found(char *name, struct starpu_fxt_options
 
 static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 {
-	int worker;
-	worker = find_worker_id(ev->param[2]);
+	int worker = ev->param[2];
 
 	if (worker < 0) return;
 
@@ -664,17 +677,17 @@ static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_op
 		char *prefix = options->file_prefix;
 		unsigned sched_ctx = ev->param[1];
 
-		thread_set_state(start_codelet_time, prefix, ev->param[2], name);
+		worker_set_state(start_codelet_time, prefix, ev->param[2], name);
 		if (sched_ctx != 0)
 		{
 #ifdef STARPU_HAVE_POTI
 			char container[STARPU_POTI_STR_LEN];
 			char ctx[6];
 			snprintf(ctx, sizeof(ctx), "Ctx%d", sched_ctx);
-			thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, ev->param[2]);
+			worker_container_alias(container, STARPU_POTI_STR_LEN, prefix, ev->param[2]);
 			poti_SetState(start_codelet_time, container, ctx, name);
 #else
-			fprintf(out_paje_file, "10	%.9f	%st%"PRIu64"	Ctx%d	%s\n", start_codelet_time, prefix, ev->param[2], sched_ctx, name);
+			fprintf(out_paje_file, "10	%.9f	%sw%"PRIu64"	Ctx%d	%s\n", start_codelet_time, prefix, ev->param[2], sched_ctx, name);
 #endif
 		}
 	}
@@ -685,8 +698,7 @@ static void handle_start_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_op
 static void handle_codelet_details(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 {
 #ifdef STARPU_ENABLE_PAJE_CODELET_DETAILS
-	int worker;
-	worker = find_worker_id(ev->param[5]);
+	int worker = ev->param[5];
 
 	unsigned sched_ctx = ev->param[1];
 	if (worker < 0) return;
@@ -695,17 +707,17 @@ static void handle_codelet_details(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 
 	if (out_paje_file)
 	{
-		thread_set_detailed_state(last_codelet_start[worker], prefix, ev->param[5], last_codelet_symbol[worker], ev->param[2], ev->param[3], ev->param[4]);
+		worker_set_detailed_state(last_codelet_start[worker], prefix, ev->param[5], last_codelet_symbol[worker], ev->param[2], ev->param[3], ev->param[4]);
 		if (sched_ctx != 0)
 		{
 #ifdef STARPU_HAVE_POTI
 			char container[STARPU_POTI_STR_LEN];
 			char ctx[6];
 			snprintf(ctx, sizeof(ctx), "Ctx%d", sched_ctx);
-			thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, ev->param[5]);
+			worker_container_alias(container, STARPU_POTI_STR_LEN, prefix, ev->param[5]);
 			poti_SetState(last_codelet_start[worker], container, ctx, last_codelet_symbol[worker]);
 #else
-			fprintf(out_paje_file, "20	%.9f	%st%"PRIu64"	Ctx%d	%s	%08lx	%lu	%016llx\n", last_codelet_start[worker], prefix, ev->param[2], sched_ctx, last_codelet_symbol[worker], (unsigned long) ev->param[2], (unsigned long) ev->param[3], (unsigned long long) ev->param[4]);
+			fprintf(out_paje_file, "20	%.9f	%sw%"PRIu64"	Ctx%d	%s	%08lx	%lu	%016llx\n", last_codelet_start[worker], prefix, ev->param[2], sched_ctx, last_codelet_symbol[worker], (unsigned long) ev->param[2], (unsigned long) ev->param[3], (unsigned long long) ev->param[4]);
 #endif
 		}
 	}
@@ -717,8 +729,7 @@ static struct starpu_fxt_codelet_event *dumped_codelets;
 
 static void handle_end_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 {
-	int worker;
-	worker = find_worker_id(ev->param[6]);
+	int worker = ev->param[6];
 	if (worker < 0) return;
 
 	char *prefix = options->file_prefix;
@@ -729,7 +740,7 @@ static void handle_end_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_opti
 	uint32_t codelet_hash = ev->param[2];
 
 	if (out_paje_file)
-		thread_set_state(end_codelet_time, prefix, ev->param[6], "B");
+		worker_set_state(end_codelet_time, prefix, ev->param[6], "I");
 
 	double codelet_length = (end_codelet_time - last_codelet_start[worker]);
 
@@ -756,6 +767,22 @@ static void handle_end_codelet_body(struct fxt_ev_64 *ev, struct starpu_fxt_opti
 	}
 }
 
+/*
+ * Handler for _STARPU_FUT_START_EXECUTING events.  When a Paje output file
+ * is open, set the state of the thread whose id is in ev->param[0] to "E"
+ * at the event's timestamp.  Nothing is done when out_paje_file is NULL.
+ */
+static void handle_start_thread_executing(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+	char *prefix = options->file_prefix;
+
+	if (out_paje_file)
+		thread_set_state(get_event_time_stamp(ev, options), prefix, ev->param[0], "E");
+}
+
+/*
+ * Handler for _STARPU_FUT_END_EXECUTING events.  When a Paje output file is
+ * open, set the state of the thread whose id is in ev->param[0] back to "B"
+ * at the event's timestamp.  Nothing is done when out_paje_file is NULL.
+ */
+static void handle_end_thread_executing(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
+{
+	char *prefix = options->file_prefix;
+
+	if (out_paje_file)
+		thread_set_state(get_event_time_stamp(ev, options), prefix, ev->param[0], "B");
+}
+
 static void handle_user_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 {
 	int worker;
@@ -876,7 +903,7 @@ static void handle_push_scheduling(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 	if (worker < 0) return;
 
 	if (out_paje_file)
-		worker_push_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "Sc");
+		thread_push_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "Sc");
 }
 
 static void handle_pop_scheduling(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
@@ -886,7 +913,7 @@ static void handle_pop_scheduling(struct fxt_ev_64 *ev, struct starpu_fxt_option
 	if (worker < 0) return;
 
 	if (out_paje_file)
-		worker_pop_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0]);
+		thread_pop_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0]);
 }
 
 static void handle_start_sleep(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
@@ -1612,6 +1639,13 @@ void starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *opt
 				handle_end_codelet_body(&ev, options);
 				break;
 
+			case _STARPU_FUT_START_EXECUTING:
+				handle_start_thread_executing(&ev, options);
+				break;
+			case _STARPU_FUT_END_EXECUTING:
+				handle_end_thread_executing(&ev, options);
+				break;
+
 			case _STARPU_FUT_START_CALLBACK:
 				handle_start_callback(&ev, options);
 				break;
@@ -2351,7 +2385,7 @@ void starpu_fxt_write_data_trace(char *filename_in)
 			break;
 
 		case _STARPU_FUT_START_CODELET_BODY:
-			workerid = find_worker_id(ev.param[2]);
+			workerid = ev.param[2];
 			tasks[workerid].exec_time = ev.time;
 			has_name = ev.param[3];
 			tasks[workerid].codelet_name = strdup(has_name ? (char *) &ev.param[4] : "unknown");
@@ -2359,7 +2393,7 @@ void starpu_fxt_write_data_trace(char *filename_in)
 			break;
 
 		case _STARPU_FUT_END_CODELET_BODY:
-			workerid = find_worker_id(ev.param[6]);
+			workerid = ev.param[6];
 			assert(workerid != -1);
 			tasks[workerid].exec_time = ev.time - tasks[workerid].exec_time;
 			write_task(tasks[workerid]);

+ 8 - 0
src/debug/traces/starpu_paje.c

@@ -176,10 +176,13 @@ void _starpu_fxt_write_paje_header(FILE *file)
 	poti_DefineEntityValue("Po", "S", "PushingOutput", "0.1 1.0 1.0");
 	poti_DefineEntityValue("C", "S", "Callback", ".0 .3 .8");
 	poti_DefineEntityValue("B", "S", "Overhead", ".5 .18 .0");
+	poti_DefineEntityValue("E", "S", "Executing", ".0 .6 .5");
 	poti_DefineEntityValue("Sc", "S", "Scheduling", ".7 .36 .0");
 	poti_DefineEntityValue("Sl", "S", "Sleeping", ".9 .1 .0");
 	poti_DefineEntityValue("P", "S", "Progressing", ".4 .1 .6");
 	poti_DefineEntityValue("U", "S", "Unpartitioning", ".0 .0 1.0");
+	poti_DefineStateType("WS", "W", "Worker State");
+	poti_DefineEntityValue("I", "WS", "Idle", ".9 .1 .0");
 
 	/* Types for the MPI Communication Thread of the Memory Node */
 	poti_DefineEventType("MPIev", "MPICt", "MPI event type");
@@ -205,6 +208,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 		poti_DefineEntityValue("Po", ctx, "PushingOutput", "0.1 1.0 1.0");
 		poti_DefineEntityValue("C", ctx, "Callback", ".0 .3 .8");
 		poti_DefineEntityValue("B", ctx, "Overhead", ".5 .18 .0");
+		poti_DefineEntityValue("E", ctx, "Executing", ".0 .6 .5");
 		poti_DefineEntityValue("Sc", ctx, "Scheduling", ".7 .36 .0");
 		poti_DefineEntityValue("Sl", ctx, "Sleeping", ".9 .1 .0");
 		poti_DefineEntityValue("P", ctx, "Progressing", ".4 .1 .6");
@@ -249,10 +253,13 @@ void _starpu_fxt_write_paje_header(FILE *file)
 6       Po       S      PushingOutput       \"0.1 1.0 1.0\"            \n\
 6       C       S       Callback       \".0 .3 .8\"            \n\
 6       B       S       Overhead         \".5 .18 .0\"		\n\
+6       E       S       Executing         \".0 .6 .5\"		\n\
 6       Sc       S      Scheduling         \".7 .36 .0\"		\n\
 6       Sl       S      Sleeping         \".9 .1 .0\"		\n\
 6       P       S       Progressing         \".4 .1 .6\"		\n\
 6       U       S       Unpartitioning      \".0 .0 1.0\"		\n\
+3       WS       W       \"Worker State\"                        \n\
+6       I       WS       Idle         \".9 .1 .0\"		\n\
 6       H       S       Hypervisor      \".5 .18 .0\"		\n");
 	fprintf(file, "\
 6       P       CtS       Processing         \"0 0 0\"		\n\
@@ -271,6 +278,7 @@ void _starpu_fxt_write_paje_header(FILE *file)
 6       Po       Ctx%u      PushingOutput       \"0.1 1.0 1.0\"            \n\
 6       C       Ctx%u       Callback       \".0 .3 .8\"            \n\
 6       B       Ctx%u       Overhead         \".5 .18 .0\"		\n\
+6       E       Ctx%u       Executing         \".0 .6 .5\"		\n\
 6       Sc       Ctx%u      Scheduling         \".7 .36 .0\"		\n\
 6       Sl       Ctx%u      Sleeping         \".9 .1 .0\"		\n\
 6       P       Ctx%u       Progressing         \".4 .1 .6\"		\n\

+ 2 - 0
src/drivers/cpu/driver_cpu.c

@@ -94,11 +94,13 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 		STARPU_ASSERT_MSG(func, "when STARPU_CPU is defined in 'where', cpu_func or cpu_funcs has to be defined");
 		if (starpu_get_env_number("STARPU_DISABLE_KERNELS") <= 0)
 		{
+			_STARPU_TRACE_START_EXECUTING();
 #ifdef STARPU_SIMGRID
 			_starpu_simgrid_execute_job(j, perf_arch, NAN);
 #else
 			func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
 #endif
+			_STARPU_TRACE_END_EXECUTING();
 		}
 		if (is_parallel_task && cl->type == STARPU_FORKJOIN)
 			/* rebind to single CPU */

+ 20 - 0
src/drivers/cuda/driver_cuda.c

@@ -408,11 +408,13 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *args)
 
 	if (starpu_get_env_number("STARPU_DISABLE_KERNELS") <= 0)
 	{
+		_STARPU_TRACE_START_EXECUTING();
 #ifdef STARPU_SIMGRID
 		_starpu_simgrid_execute_job(j, &args->perf_arch, NAN);
 #else
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
 #endif
+		_STARPU_TRACE_END_EXECUTING();
 	}
 
 	return 0;
@@ -550,6 +552,15 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 			_starpu_set_local_worker_key(args);
 			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), args);
 			idle++;
+#ifdef STARPU_USE_FXT
+			int k;
+			for (k = 0; k < (int) worker_set->nworkers; k++)
+				if (worker_set->workers[k].current_task)
+					break;
+			if (k == (int) worker_set->nworkers)
+				/* No worker of the set has a current task any more */
+				_STARPU_TRACE_END_EXECUTING()
+#endif
 		}
 	}
 
@@ -612,6 +623,15 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 		{
 			/* Record event to synchronize with task termination later */
 			cudaEventRecord(task_events[workerid], starpu_cuda_get_local_stream());
+#ifdef STARPU_USE_FXT
+			int k;
+			for (k = 0; k < (int) worker_set->nworkers; k++)
+				if (worker_set->workers[k].current_task)
+					break;
+			if (k < (int) worker_set->nworkers)
+				/* At least one worker of the set has a current task */
+				_STARPU_TRACE_START_EXECUTING()
+#endif
 		}
 		else
 #else

+ 2 - 2
src/drivers/driver_common/driver_common.c

@@ -73,7 +73,7 @@ void _starpu_driver_start_job(struct _starpu_worker *args, struct _starpu_job *j
 	if (starpu_top)
 		_starpu_top_task_started(task,workerid,codelet_start);
 
-	_STARPU_TRACE_START_CODELET_BODY(j, j->nimpl, perf_arch);
+	_STARPU_TRACE_START_CODELET_BODY(j, j->nimpl, perf_arch, workerid);
 }
 
 void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch STARPU_ATTRIBUTE_UNUSED, struct timespec *codelet_end, int rank, int profiling)
@@ -85,7 +85,7 @@ void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j,
 	int workerid = args->workerid;
 	unsigned calibrate_model = 0;
 
-	_STARPU_TRACE_END_CODELET_BODY(j, j->nimpl, perf_arch);
+	_STARPU_TRACE_END_CODELET_BODY(j, j->nimpl, perf_arch, workerid);
 
 	if (cl && cl->model && cl->model->benchmarking)
 		calibrate_model = 1;

+ 5 - 0
src/drivers/opencl/driver_opencl.c

@@ -635,12 +635,14 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 
 		if (status != CL_COMPLETE)
 		{
+			_STARPU_TRACE_START_EXECUTING();
 			/* Not ready yet, no better thing to do than waiting */
 			__starpu_datawizard_progress(memnode, 1, 0);
 			return 0;
 		}
 
 		/* Asynchronous task completed! */
+		_STARPU_TRACE_END_EXECUTING();
 		_starpu_opencl_stop_job(_starpu_get_job_associated_to_task(task), args);
 	}
 #endif /* STARPU_SIMGRID */
@@ -698,6 +700,7 @@ int _starpu_opencl_driver_run_once(struct _starpu_worker *args)
 		 */
 		err = clEnqueueMarker(queue, &task_events[args->devid]);
 		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
+		_STARPU_TRACE_START_EXECUTING();
 	}
 	else
 #else
@@ -832,6 +835,7 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 
 	if (starpu_get_env_number("STARPU_DISABLE_KERNELS") <= 0)
 	{
+		_STARPU_TRACE_START_EXECUTING();
 #ifdef STARPU_SIMGRID
 		double length = NAN;
 	  #ifdef STARPU_OPENCL_SIMULATOR
@@ -851,6 +855,7 @@ static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker
 #else
 		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
 #endif
+		_STARPU_TRACE_END_EXECUTING();
 	}
 	return 0;
 }