Kaynağa Gözat

* Include application threads in the trace.

Samuel Thibault 9 yıl önce
ebeveyn
işleme
09a6294ec0

+ 1 - 0
ChangeLog

@@ -23,6 +23,7 @@ New features:
   * New scheduler with heterogeneous priorities
   * Support priorities for data transfers.
   * Add STARPU_MALLOC_SIMULATION_FOLDED flag to save memory when simulating.
+  * Include application threads in the trace.
 
 Changes:
   * Vastly improve simgrid simulation time.

+ 11 - 1
mpi/src/starpu_mpi_task_insert.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011, 2012, 2013, 2014, 2015  CNRS
- * Copyright (C) 2011-2015  Université de Bordeaux
+ * Copyright (C) 2011-2016  Université de Bordeaux
  * Copyright (C) 2014 INRIA
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -192,6 +192,8 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
 	int nb_data;
 	int select_node_policy = STARPU_MPI_NODE_SELECTION_CURRENT_POLICY;
 
+	_STARPU_TRACE_TASK_MPI_DECODE_START();
+
 	descrs = (struct starpu_data_descr *)malloc(nb_allocated_data * sizeof(struct starpu_data_descr));
 	nb_data = 0;
 	*do_execute = -1;
@@ -237,6 +239,7 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
 				{
 					free(descrs);
 					va_end(varg_list_copy);
+					_STARPU_TRACE_TASK_MPI_DECODE_END();
 					return ret;
 				}
 			}
@@ -265,6 +268,7 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
 					{
 						free(descrs);
 						va_end(varg_list_copy);
+						_STARPU_TRACE_TASK_MPI_DECODE_END();
 						return ret;
 					}
 				}
@@ -294,6 +298,7 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
 					{
 						free(descrs);
 						va_end(varg_list_copy);
+						_STARPU_TRACE_TASK_MPI_DECODE_END();
 						return ret;
 					}
 				}
@@ -418,6 +423,7 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
 	*descrs_p = descrs;
 	*nb_data_p = nb_data;
 
+	_STARPU_TRACE_TASK_MPI_DECODE_END();
 	return 0;
 }
 
@@ -440,6 +446,7 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru
 	ret = _starpu_mpi_task_decode_v(codelet, me, nb_nodes, &xrank, &do_execute, &descrs, &nb_data, varg_list);
 	if (ret < 0) return ret;
 
+	_STARPU_TRACE_TASK_MPI_PRE_START();
 	/* Send and receive data as requested */
 	for(i=0 ; i<nb_data ; i++)
 	{
@@ -452,6 +459,7 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru
 		*descrs_p = descrs;
 	else
 		free(descrs);
+	_STARPU_TRACE_TASK_MPI_PRE_END();
 
 	if (do_execute == 0) return 1;
 	else
@@ -473,6 +481,7 @@ int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struc
 {
 	int me, i;
 
+	_STARPU_TRACE_TASK_MPI_POST_START();
 	starpu_mpi_comm_rank(comm, &me);
 
 	for(i=0 ; i<nb_data ; i++)
@@ -483,6 +492,7 @@ int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struc
 
 	free(descrs);
 
+	_STARPU_TRACE_TASK_MPI_POST_END();
 	_STARPU_MPI_LOG_OUT();
 	return 0;
 }

+ 78 - 1
src/common/fxt.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2015  Université de Bordeaux
+ * Copyright (C) 2009-2016  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2014  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -183,6 +183,27 @@
 #define _STARPU_FUT_SCHED_COMPONENT_PUSH	0x516c
 #define _STARPU_FUT_SCHED_COMPONENT_PULL	0x516d
 
+#define _STARPU_FUT_TASK_SUBMIT_START	0x516e	/* emitted on entry to starpu_task_submit() */
+#define _STARPU_FUT_TASK_SUBMIT_END	0x516f	/* emitted before starpu_task_submit() returns */
+
+#define _STARPU_FUT_TASK_BUILD_START	0x5170	/* emitted at the start of _starpu_task_insert_create() */
+#define _STARPU_FUT_TASK_BUILD_END	0x5171	/* emitted at the end of _starpu_task_insert_create() */
+
+#define _STARPU_FUT_TASK_MPI_DECODE_START	0x5172	/* emitted at the start of _starpu_mpi_task_decode_v() */
+#define _STARPU_FUT_TASK_MPI_DECODE_END	0x5173	/* emitted on every return path of _starpu_mpi_task_decode_v() */
+
+#define _STARPU_FUT_TASK_MPI_PRE_START	0x5174	/* emitted in _starpu_mpi_task_build_v() before data is sent/received */
+#define _STARPU_FUT_TASK_MPI_PRE_END	0x5175	/* emitted in _starpu_mpi_task_build_v() once descrs is handed over or freed */
+
+#define _STARPU_FUT_TASK_MPI_POST_START	0x5176	/* emitted at the start of _starpu_mpi_task_postbuild_v() */
+#define _STARPU_FUT_TASK_MPI_POST_END	0x5177	/* emitted at the end of _starpu_mpi_task_postbuild_v() */
+
+#define _STARPU_FUT_TASK_WAIT_START	0x5178	/* emitted in starpu_task_wait() before waiting for the job; carries job id and tid */
+#define _STARPU_FUT_TASK_WAIT_END	0x5179	/* emitted at the end of starpu_task_wait(); carries job id and tid */
+
+#define _STARPU_FUT_TASK_WAIT_FOR_ALL_START	0x517a	/* emitted in starpu_task_wait_for_all_in_ctx() before waiting */
+#define _STARPU_FUT_TASK_WAIT_FOR_ALL_END	0x517b	/* emitted in starpu_task_wait_for_all_in_ctx() after waiting */
+
 #ifdef STARPU_USE_FXT
 #include <fxt/fxt.h>
 #include <fxt/fut.h>
@@ -589,6 +610,48 @@ do {										\
 #define _STARPU_TRACE_TASK_SUBMIT(job)	\
 	FUT_DO_PROBE2(_STARPU_FUT_TASK_SUBMIT, (job)->job_id, _starpu_gettid());
 
+#define _STARPU_TRACE_TASK_SUBMIT_START()	\
+	FUT_DO_PROBE1(_STARPU_FUT_TASK_SUBMIT_START, _starpu_gettid());
+
+#define _STARPU_TRACE_TASK_SUBMIT_END()	\
+	FUT_DO_PROBE1(_STARPU_FUT_TASK_SUBMIT_END, _starpu_gettid());
+
+#define _STARPU_TRACE_TASK_BUILD_START()	\
+	FUT_DO_PROBE1(_STARPU_FUT_TASK_BUILD_START, _starpu_gettid());
+
+#define _STARPU_TRACE_TASK_BUILD_END()	\
+	FUT_DO_PROBE1(_STARPU_FUT_TASK_BUILD_END, _starpu_gettid());
+
+#define _STARPU_TRACE_TASK_MPI_DECODE_START()	\
+	FUT_DO_PROBE1(_STARPU_FUT_TASK_MPI_DECODE_START, _starpu_gettid());
+
+#define _STARPU_TRACE_TASK_MPI_DECODE_END()	\
+	FUT_DO_PROBE1(_STARPU_FUT_TASK_MPI_DECODE_END, _starpu_gettid());
+
+#define _STARPU_TRACE_TASK_MPI_PRE_START()	\
+	FUT_DO_PROBE1(_STARPU_FUT_TASK_MPI_PRE_START, _starpu_gettid());
+
+#define _STARPU_TRACE_TASK_MPI_PRE_END()	\
+	FUT_DO_PROBE1(_STARPU_FUT_TASK_MPI_PRE_END, _starpu_gettid());
+
+#define _STARPU_TRACE_TASK_MPI_POST_START()	\
+	FUT_DO_PROBE1(_STARPU_FUT_TASK_MPI_POST_START, _starpu_gettid());
+
+#define _STARPU_TRACE_TASK_MPI_POST_END()	\
+	FUT_DO_PROBE1(_STARPU_FUT_TASK_MPI_POST_END, _starpu_gettid());
+
+#define _STARPU_TRACE_TASK_WAIT_START(job)	\
+	FUT_DO_PROBE2(_STARPU_FUT_TASK_WAIT_START, (job)->job_id, _starpu_gettid());
+
+#define _STARPU_TRACE_TASK_WAIT_END(job)	\
+	FUT_DO_PROBE2(_STARPU_FUT_TASK_WAIT_END, (job)->job_id, _starpu_gettid());
+
+#define _STARPU_TRACE_TASK_WAIT_FOR_ALL_START()	\
+	FUT_DO_PROBE1(_STARPU_FUT_TASK_WAIT_FOR_ALL_START, _starpu_gettid());
+
+#define _STARPU_TRACE_TASK_WAIT_FOR_ALL_END()	\
+	FUT_DO_PROBE1(_STARPU_FUT_TASK_WAIT_FOR_ALL_END, _starpu_gettid());
+
 #define _STARPU_TRACE_USER_DEFINED_START	\
 	FUT_DO_PROBE1(_STARPU_FUT_USER_DEFINED_START, _starpu_gettid());
 
@@ -891,6 +954,20 @@ do {										\
 #define _STARPU_TRACE_WORKER_SLEEP_START		do {} while(0)
 #define _STARPU_TRACE_WORKER_SLEEP_END		do {} while(0)
 #define _STARPU_TRACE_TASK_SUBMIT(job)		do {} while(0)
+#define _STARPU_TRACE_TASK_SUBMIT_START()		do {} while(0)
+#define _STARPU_TRACE_TASK_SUBMIT_END()			do {} while(0)
+#define _STARPU_TRACE_TASK_BUILD_START()		do {} while(0)
+#define _STARPU_TRACE_TASK_BUILD_END()			do {} while(0)
+#define _STARPU_TRACE_TASK_MPI_DECODE_START()		do {} while(0)
+#define _STARPU_TRACE_TASK_MPI_DECODE_END()		do {} while(0)
+#define _STARPU_TRACE_TASK_MPI_PRE_START()		do {} while(0)
+#define _STARPU_TRACE_TASK_MPI_PRE_END()		do {} while(0)
+#define _STARPU_TRACE_TASK_MPI_POST_START()		do {} while(0)
+#define _STARPU_TRACE_TASK_MPI_POST_END()		do {} while(0)
+#define _STARPU_TRACE_TASK_WAIT_START(job)		do {} while(0)
+#define _STARPU_TRACE_TASK_WAIT_END(job)		do {} while(0)
+#define _STARPU_TRACE_TASK_WAIT_FOR_ALL_START()		do {} while(0)
+#define _STARPU_TRACE_TASK_WAIT_FOR_ALL_END()		do {} while(0)
 #define _STARPU_TRACE_USER_DEFINED_START		do {} while(0)
 #define _STARPU_TRACE_USER_DEFINED_END		do {} while(0)
 #define _STARPU_TRACE_START_ALLOC(memnode, size)	do {} while(0)

+ 11 - 1
src/core/task.c

@@ -230,6 +230,8 @@ int starpu_task_wait(struct starpu_task *task)
 
 	struct _starpu_job *j = (struct _starpu_job *)task->starpu_private;
 
+	_STARPU_TRACE_TASK_WAIT_START(j);
+
 	_starpu_wait_job(j);
 
 	/* as this is a synchronous task, the liberation of the job
@@ -237,6 +239,7 @@ int starpu_task_wait(struct starpu_task *task)
 	if (task->destroy)
 		_starpu_task_destroy(task);
 
+	_STARPU_TRACE_TASK_WAIT_END(j);
         _STARPU_LOG_OUT();
 	return 0;
 }
@@ -622,6 +625,8 @@ int starpu_task_submit(struct starpu_task *task)
 #endif
 		;
 
+	_STARPU_TRACE_TASK_SUBMIT_START();
+
 	if (!j->internal)
 	{
 		int nsubmitted_tasks = starpu_task_nsubmitted();
@@ -633,7 +638,10 @@ int starpu_task_submit(struct starpu_task *task)
 
 	ret = _starpu_task_submit_head(task);
 	if (ret)
+	{
+		_STARPU_TRACE_TASK_SUBMIT_END();
 		return ret;
+	}
 
 	if (!j->internal && !continuation)
 		_STARPU_TRACE_TASK_SUBMIT(j);
@@ -692,6 +700,7 @@ int starpu_task_submit(struct starpu_task *task)
 		     _starpu_task_destroy(task);
 	}
 
+	_STARPU_TRACE_TASK_SUBMIT_END();
         _STARPU_LOG_OUT();
 	return ret;
 }
@@ -859,8 +868,9 @@ int starpu_task_wait_for_all(void)
 
 int starpu_task_wait_for_all_in_ctx(unsigned sched_ctx)
 {
-	_STARPU_TRACE_EVENT("starpu_task_wait_for_all");
+	_STARPU_TRACE_TASK_WAIT_FOR_ALL_START();
 	_starpu_wait_for_all_tasks_of_sched_ctx(sched_ctx);
+	_STARPU_TRACE_TASK_WAIT_FOR_ALL_END();
 #ifdef HAVE_AYUDAME_H
 	/* TODO: improve Temanejo into knowing about contexts ... */
 	if (AYU_event) AYU_event(AYU_BARRIER, 0, NULL);

+ 227 - 32
src/debug/traces/starpu_fxt.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2015  Université de Bordeaux
+ * Copyright (C) 2009-2016  Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -317,15 +317,12 @@ struct worker_entry
 	int sync;
 } *worker_ids;
 
-static int register_worker_id(unsigned long tid, int workerid, int sync)
+static int register_thread(unsigned long tid, int workerid, int sync)
 {
-	nworkers++;
 	struct worker_entry *entry;
 
 	HASH_FIND(hh, worker_ids, &tid, sizeof(tid), entry);
 
-	STARPU_ASSERT_MSG(workerid < STARPU_NMAXWORKERS, "Too many workers in this trace, please increase in ./configure invocation the maximum number of CPUs and GPUs to the same value as was used for execution");
-
 	/* only register a thread once */
 	if (entry)
 		return 0;
@@ -339,6 +336,40 @@ static int register_worker_id(unsigned long tid, int workerid, int sync)
 	return 1;
 }
 
+/*
+ * Account for one more worker, check that its id fits within the
+ * compile-time worker limit, and record its thread.
+ * Returns the result of register_thread() for this tid.
+ */
+static int register_worker_id(unsigned long tid, int workerid, int sync)
+{
+	int registered;
+
+	nworkers++;
+	STARPU_ASSERT_MSG(workerid < STARPU_NMAXWORKERS, "Too many workers in this trace, please increase in ./configure invocation the maximum number of CPUs and GPUs to the same value as was used for execution");
+
+	registered = register_thread(tid, workerid, sync);
+	return registered;
+}
+
+/*
+ * Register an application (user) thread the first time it is seen.
+ *
+ * The thread is recorded with worker id -1.  When it was not known yet
+ * and a Paje output file is open, a "UT" container is created for it
+ * under the program container, so that later "US" state events have a
+ * container to refer to.
+ *
+ * timestamp: time at which the container is created in the trace
+ * tid:       thread id found in the event
+ * prefix:    per-file prefix used to build container aliases and names
+ */
+static void register_user_thread(double timestamp, unsigned long tid, const char *prefix)
+{
+	if (register_thread(tid, -1, 0) && out_paje_file)
+	{
+#ifdef STARPU_HAVE_POTI
+		char program_container[STARPU_POTI_STR_LEN];
+		program_container_alias(program_container, STARPU_POTI_STR_LEN, prefix);
+		char new_thread_container_alias[STARPU_POTI_STR_LEN];
+		thread_container_alias (new_thread_container_alias, STARPU_POTI_STR_LEN, prefix, tid);
+		char new_thread_container_name[STARPU_POTI_STR_LEN];
+		snprintf(new_thread_container_name, STARPU_POTI_STR_LEN, "%sUserThread%lu", prefix, tid);
+		/* The last argument is the display name: pass the "UserThread" name
+		 * built above (as the non-POTI branch does), not the alias again. */
+		poti_CreateContainer(timestamp, new_thread_container_alias, "UT", program_container, new_thread_container_name);
+#else
+		fprintf(out_paje_file, "7	%.9f	%st%lu	UT	%sp	%sUserThread%lu\n",
+			timestamp, prefix, tid, prefix, prefix, tid);
+#endif
+	}
+}
+
+static void register_mpi_thread(unsigned long tid)
+{
+	int ret = register_thread(tid, -2, 0);	/* -2 is the id the event handlers compare against to recognize the MPI thread */
+	STARPU_ASSERT(ret == 1);	/* register_thread() returns 0 for a tid that is already known */
+}
+
 static int find_worker_id(unsigned long tid)
 {
 	struct worker_entry *entry;
@@ -487,6 +518,45 @@ static void thread_set_state(double time, const char *prefix, long unsigned int
 #endif
 }
 
+#if 0
+/* Currently unused, hence compiled out: sets (rather than pushes) the "US" state of a user thread, registering the thread first. */
+static void user_thread_set_state(double time, const char *prefix, long unsigned int threadid, const char *name)
+{
+	register_user_thread(time, threadid, prefix);
+#ifdef STARPU_HAVE_POTI
+	char container[STARPU_POTI_STR_LEN];
+	thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, threadid);
+	poti_SetState(time, container, "US", name);
+#else
+	fprintf(out_paje_file, "10	%.9f	%st%lu	US	%s\n", time, prefix, threadid, name);
+#endif
+}
+#endif
+
+/*
+ * Push state `name` on the "US" (user thread state) of the container of
+ * application thread `threadid`.  The thread is registered first, which
+ * creates its container if this is the first event seen for it.
+ *
+ * Nothing is written when no Paje output file is open.
+ */
+static void user_thread_push_state(double time, const char *prefix, long unsigned int threadid, const char *name)
+{
+	register_user_thread(time, threadid, prefix);
+	/* Callers do not check this for user threads, so do it here */
+	if (!out_paje_file)
+		return;
+#ifdef STARPU_HAVE_POTI
+	char container[STARPU_POTI_STR_LEN];
+	thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, threadid);
+	/* Push, not set: must match the PushState record ("11") of the non-POTI
+	 * branch so that the corresponding pop restores the previous state */
+	poti_PushState(time, container, "US", name);
+#else
+	fprintf(out_paje_file, "11	%.9f	%st%lu	US	%s\n", time, prefix, threadid, name);
+#endif
+}
+
+/*
+ * Pop the current "US" (user thread state) of the container of
+ * application thread `threadid`.  The thread is registered first, which
+ * creates its container if this is the first event seen for it.
+ *
+ * Nothing is written when no Paje output file is open.
+ */
+static void user_thread_pop_state(double time, const char *prefix, long unsigned int threadid)
+{
+	register_user_thread(time, threadid, prefix);
+	/* Callers do not check this for user threads, so do it here */
+	if (!out_paje_file)
+		return;
+#ifdef STARPU_HAVE_POTI
+	char container[STARPU_POTI_STR_LEN];
+	thread_container_alias(container, STARPU_POTI_STR_LEN, prefix, threadid);
+	/* A pop takes no state value (there is no `name` in this function);
+	 * it matches the PopState record ("12") of the non-POTI branch */
+	poti_PopState(time, container, "US");
+#else
+	fprintf(out_paje_file, "12	%.9f	%st%lu	US\n", time, prefix, threadid);
+#endif
+}
+
 static void thread_push_state(double time, const char *prefix, long unsigned int threadid, const char *name)
 {
 	if (find_sync(threadid))
@@ -542,6 +612,28 @@ static void mpicommthread_set_state(double time, const char *prefix, const char
 #endif
 }
 
+/*
+ * Push state `name` on the "CtS" (communication thread state) of the
+ * MPI communication thread container identified by `prefix`.
+ *
+ * Nothing is written when no Paje output file is open.
+ */
+static void mpicommthread_push_state(double time, const char *prefix, const char *name)
+{
+	/* Callers do not check this for the MPI thread, so do it here */
+	if (!out_paje_file)
+		return;
+#ifdef STARPU_HAVE_POTI
+	char container[STARPU_POTI_STR_LEN];
+	mpicommthread_container_alias(container, STARPU_POTI_STR_LEN, prefix);
+	/* Push, not set: must match the PushState record ("11") of the non-POTI
+	 * branch so that the corresponding pop restores the previous state */
+	poti_PushState(time, container, "CtS", name);
+#else
+	fprintf(out_paje_file, "11	%.9f	%smpict	CtS 	%s\n", time, prefix, name);
+#endif
+}
+
+/*
+ * Pop the current "CtS" (communication thread state) of the MPI
+ * communication thread container identified by `prefix`.
+ *
+ * Nothing is written when no Paje output file is open.
+ */
+static void mpicommthread_pop_state(double time, const char *prefix)
+{
+	/* Callers do not check this for the MPI thread, so do it here */
+	if (!out_paje_file)
+		return;
+#ifdef STARPU_HAVE_POTI
+	char container[STARPU_POTI_STR_LEN];
+	mpicommthread_container_alias(container, STARPU_POTI_STR_LEN, prefix);
+	/* A pop takes no state value (there is no `name` in this function);
+	 * it matches the PopState record ("12") of the non-POTI branch */
+	poti_PopState(time, container, "CtS");
+#else
+	fprintf(out_paje_file, "12	%.9f	%smpict	CtS\n", time, prefix);
+#endif
+}
+
 static void recfmt_set_state(double time, const char *event, int workerid, long unsigned int threadid, const char *name, const char *type)
 {
 	fprintf(trace_file, "E: %s\n", event);
@@ -1157,52 +1249,81 @@ static void handle_start_callback(struct fxt_ev_64 *ev, struct starpu_fxt_option
 {
 	int worker;
 	worker = find_worker_id(ev->param[1]);
-	if (worker < 0)
-		return;
-
-	if (out_paje_file)
-		thread_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[1], "C");
-	if (trace_file)
-		recfmt_thread_set_state(get_event_time_stamp(ev, options), ev->param[1], "Callback", "Runtime");
+	if (worker >= 0)
+	{
+		if (out_paje_file)
+			thread_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[1], "C");
+		if (trace_file)
+			recfmt_thread_set_state(get_event_time_stamp(ev, options), ev->param[1], "Callback", "Runtime");
+	}
+	else if (worker == -2)
+	{
+		/* MPI thread */
+		mpicommthread_push_state(get_event_time_stamp(ev, options), options->file_prefix, "C");
+	}
+	else
+		user_thread_push_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[1], "C");
 }
 
 static void handle_end_callback(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 {
 	int worker;
 	worker = find_worker_id(ev->param[1]);
-	if (worker < 0)
-		return;
 
-	if (out_paje_file)
-		thread_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[1], "B");
-	if (trace_file)
-		recfmt_thread_set_state(get_event_time_stamp(ev, options), ev->param[1], "Blocked", "Runtime");
+	if (worker >= 0)	/* regular worker thread: go back to the "B" (Blocked) state */
+	{
+		if (out_paje_file)
+			thread_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[1], "B");
+		if (trace_file)
+			recfmt_thread_set_state(get_event_time_stamp(ev, options), ev->param[1], "Blocked", "Runtime");
+	}
+	else if (worker == -2)
+	{
+		/* MPI communication thread (registered with id -2): pop the state pushed by the start event */
+		mpicommthread_pop_state(get_event_time_stamp(ev, options), options->file_prefix);
+	}
+	else	/* any other thread is handled as an application (user) thread */
+		user_thread_pop_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[1]);
 }
 
 static void handle_hypervisor_begin(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 {
 	int worker;
 	worker = find_worker_id(ev->param[0]);
-	if (worker < 0)
-		return;
-
-	if (out_paje_file)
-		thread_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "H");
-	if (trace_file)
-		recfmt_thread_set_state(get_event_time_stamp(ev, options), ev->param[0], "Hypervisor", "Runtime");
+	/* The thread id of this event is param[0]; the state "H" (Hypervisor)
+	 * is recorded on whichever kind of thread emitted it. */
+	if (worker >= 0)
+	{
+		if (out_paje_file)
+			thread_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "H");
+		if (trace_file)
+			recfmt_thread_set_state(get_event_time_stamp(ev, options), ev->param[0], "Hypervisor", "Runtime");
+	}
+	else if (worker == -2)
+	{
+		/* MPI thread */
+		mpicommthread_push_state(get_event_time_stamp(ev, options), options->file_prefix, "H");
+	}
+	else
+		/* User thread: use the same thread id (param[0]) that was looked up above */
+		user_thread_push_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "H");
 }
 
 static void handle_hypervisor_end(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 {
 	int worker;
 	worker = find_worker_id(ev->param[0]);
-	if (worker < 0)
-		return;
-
-	if (out_paje_file)
-		thread_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "B");
-	if (trace_file)
-		recfmt_thread_set_state(get_event_time_stamp(ev, options), ev->param[0], "Blocked", "Runtime");
+	/* The thread id of this event is param[0]; workers go back to "B"
+	 * (Blocked), other threads pop the state pushed by the begin event. */
+	if (worker >= 0)
+	{
+		if (out_paje_file)
+			thread_set_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0], "B");
+		if (trace_file)
+			recfmt_thread_set_state(get_event_time_stamp(ev, options), ev->param[0], "Blocked", "Runtime");
+	}
+	else if (worker == -2)
+	{
+		/* MPI thread */
+		mpicommthread_pop_state(get_event_time_stamp(ev, options), options->file_prefix);
+	}
+	else
+		/* User thread: use the same thread id (param[0]) that was looked up above */
+		user_thread_pop_state(get_event_time_stamp(ev, options), options->file_prefix, ev->param[0]);
 }
 
 static void handle_worker_status(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, const char *newstatus)
@@ -1495,6 +1616,37 @@ static void handle_memnode_event(struct fxt_ev_64 *ev, struct starpu_fxt_options
 		memnode_set_state(get_event_time_stamp(ev, options), options->file_prefix, memnode, eventstr);
 }
 
+/*
+ * Reflect a task build/submit/wait start or end event on the thread that
+ * emitted it.
+ *
+ * A non-NULL eventstr pushes that state on the thread; a NULL eventstr
+ * pops the thread's current state.  The kind of thread is taken from
+ * find_worker_id(tid): >= 0 is a normal worker, -2 is the MPI thread,
+ * anything else is handled as a user thread.
+ */
+static void handle_task_submit_event(struct fxt_ev_64 *ev, struct starpu_fxt_options *options, unsigned long tid, const char *eventstr)
+{
+	const int id = find_worker_id(tid);
+	const double when = get_event_time_stamp(ev, options);
+	char *file_prefix = options->file_prefix;
+	const int is_start = (eventstr != NULL);
+
+	if (id == -2)
+	{
+		/* MPI thread */
+		if (is_start)
+			mpicommthread_push_state(when, file_prefix, eventstr);
+		else
+			mpicommthread_pop_state(when, file_prefix);
+		return;
+	}
+
+	if (id < 0)
+	{
+		/* Neither a worker nor the MPI thread */
+		if (is_start)
+			user_thread_push_state(when, file_prefix, tid, eventstr);
+		else
+			user_thread_pop_state(when, file_prefix, tid);
+		return;
+	}
+
+	/* Normal worker */
+	if (is_start)
+		thread_push_state(when, file_prefix, tid, eventstr);
+	else
+		thread_pop_state(when, file_prefix, tid);
+}
+
 /*
  *	Number of task submitted to the scheduler
  */
@@ -1747,6 +1899,8 @@ static void handle_mpi_start(struct fxt_ev_64 *ev, struct starpu_fxt_options *op
 
 	char *prefix = options->file_prefix;
 
+	register_mpi_thread(ev->param[2]);
+
 	if (out_paje_file)
 	{
 #ifdef STARPU_HAVE_POTI
@@ -2208,6 +2362,47 @@ void _starpu_fxt_parse_new_file(char *filename_in, struct starpu_fxt_options *op
 				handle_task_submit(&ev, options);
 				break;
 
+			case _STARPU_FUT_TASK_BUILD_START:
+				handle_task_submit_event(&ev, options, ev.param[0], "Bu");
+				break;
+
+			case _STARPU_FUT_TASK_SUBMIT_START:
+				handle_task_submit_event(&ev, options, ev.param[0], "Su");
+				break;
+
+			case _STARPU_FUT_TASK_MPI_DECODE_START:
+				handle_task_submit_event(&ev, options, ev.param[0], "MD");
+				break;
+
+			case _STARPU_FUT_TASK_MPI_PRE_START:
+				handle_task_submit_event(&ev, options, ev.param[0], "MPr");
+				break;
+
+			case _STARPU_FUT_TASK_MPI_POST_START:
+				handle_task_submit_event(&ev, options, ev.param[0], "MPo");
+				break;
+
+			case _STARPU_FUT_TASK_WAIT_START:
+				handle_task_submit_event(&ev, options, ev.param[1], "W");
+				break;
+
+			case _STARPU_FUT_TASK_WAIT_FOR_ALL_START:
+				handle_task_submit_event(&ev, options, ev.param[0], "WA");
+				break;
+
+			case _STARPU_FUT_TASK_BUILD_END:
+			case _STARPU_FUT_TASK_SUBMIT_END:
+			case _STARPU_FUT_TASK_MPI_DECODE_END:
+			case _STARPU_FUT_TASK_MPI_PRE_END:
+			case _STARPU_FUT_TASK_MPI_POST_END:
+			case _STARPU_FUT_TASK_WAIT_FOR_ALL_END:
+				handle_task_submit_event(&ev, options, ev.param[0], NULL);
+				break;
+
+			case _STARPU_FUT_TASK_WAIT_END:
+				handle_task_submit_event(&ev, options, ev.param[1], NULL);
+				break;
+
 			case _STARPU_FUT_TASK_DONE:
 				handle_task_done(&ev, options);
 				break;

+ 46 - 4
src/debug/traces/starpu_paje.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2015  Université de Bordeaux
+ * Copyright (C) 2010-2016  Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -150,6 +150,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 	poti_DefineContainerType("P", "MPIP", "Program");
 	poti_DefineContainerType("Mn", "P", "Memory Node");
 	poti_DefineContainerType("T", "Mn", "Thread");
+	poti_DefineContainerType("UT", "P", "User Thread");
 	poti_DefineContainerType("Mm", "Mn", "Memory Manager");
 	poti_DefineContainerType("W", "T", "Worker");
 	poti_DefineContainerType("MPICt", "T", "MPI Communication Thread");
@@ -186,6 +187,11 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 	poti_DefineEntityValue("P", "S", "Progressing", ".4 .1 .6");
 	poti_DefineEntityValue("U", "S", "Unpartitioning", ".0 .0 1.0");
 	poti_DefineEntityValue("H", "S", "Hypervisor", ".5 .18 .0");
+	poti_DefineEntityValue("Bu", "S", "Building task", ".5 .18 .0");
+	poti_DefineEntityValue("Su", "S", "Submiting task", ".3 .09 .0");
+	poti_DefineEntityValue("MD", "S", "Decoding task for MPI", ".5 .18 .2");
+	poti_DefineEntityValue("MPr", "S", "Preparing task for MPI", ".4 .14 .2");
+	poti_DefineEntityValue("MPo", "S", "Post-processing task for MPI", ".3 .09 .2");
 	poti_DefineStateType("WS", "W", "Worker State");
 	poti_DefineEntityValue("I", "WS", "Idle", ".9 .1 .0");
 	poti_DefineEntityValue("In", "WS", "Initializing", "0.0 .7 1.0");
@@ -211,7 +217,22 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 	poti_DefineEntityValue("SdS", "CtS", "SendSubmitted", "1.0 .1 1.0");
 	poti_DefineEntityValue("RvS", "CtS", "RecieveSubmitted", "0.1 1.0 1.0");
 	poti_DefineEntityValue("SdC", "CtS", "SendCompleted", "1.0 .5 1.0");
-	poti_DefineEntityValue("RvC", "CtS", "RecieveCompleted", "0.5 1.0 1.0");
+	poti_DefineEntityValue("RvC", "CtS", "ReceiveCompleted", "0.5 1.0 1.0");
+	poti_DefineEntityValue("Bu", "CtS", "Building task", ".5 .18 .0");
+	poti_DefineEntityValue("Su", "CtS", "Submiting task", ".3 .09 .0");
+
+	/* Type for other threads */
+	poti_DefineEventType("user_event", "UT", "user event type");
+	poti_DefineEventType("thread_event", "UT", "thread event type");
+	poti_DefineStateType("US", "UT", "User Thread State");
+	poti_DefineEntityValue("Bu", "US", "Building task", ".5 .18 .0");
+	poti_DefineEntityValue("Su", "US", "Submiting task", ".3 .09 .0");
+	poti_DefineEntityValue("MD", "US", "Decoding task for MPI", ".5 .18 .2");
+	poti_DefineEntityValue("MPr", "US", "Preparing task for MPI", ".4 .14 .2");
+	poti_DefineEntityValue("MPo", "US", "Post-processing task for MPI", ".3 .09 .2");
+	poti_DefineEntityValue("W", "US", "Waiting task", ".9 .1 .0");
+	poti_DefineEntityValue("WA", "US", "Waiting all tasks", ".9 .1 .0");
+	poti_DefineEntityValue("No", "US", "Nothing", ".0 .0 .0");
 
 	for (i=1; i<STARPU_NMAX_SCHED_CTXS; i++)
 	{
@@ -253,6 +274,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 1       P      MPIP       \"Program\"                      	\n\
 1       Mn      P       \"Memory Node\"                         \n\
 1       T      Mn       \"Thread\"                               \n\
+1       UT      P       \"User Thread\"                               \n\
 1       Mm      Mn       \"Memory Manager\"                         \n\
 1       W      T       \"Worker\"                               \n\
 1       MPICt   T       \"MPI Communication Thread\"              \n\
@@ -260,6 +282,8 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 2       prog_event   P       \"program event type\"				\n\
 2       user_event   T       \"user event type\"				\n\
 2       thread_event   T       \"thread event type\"				\n\
+2       user_event   UT       \"user event type\"				\n\
+2       thread_event   UT       \"thread event type\"				\n\
 2       MPIev   MPICt    \"MPI event type\"			\n\
 3       S       T       \"Thread State\"                        \n\
 3       CtS     MPICt    \"Communication Thread State\"          \n");
@@ -283,6 +307,11 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 6       P       S       Progressing         \".4 .1 .6\"		\n\
 6       U       S       Unpartitioning      \".0 .0 1.0\"		\n\
 6       H       S       Hypervisor      \".5 .18 .0\"		\n\
+6       Bu      S       \"Building task\"   \".5 .18 .0\"		\n\
+6       Su      S       \"Submittings task\" \".3 .09 .0\"		\n\
+6       MD      S       \"Decoding task for MPI\" \".5 .18 .2\"		\n\
+6       MPr     S       \"Preparing task for MPI\" \".4 .14 .2\"		\n\
+6       MPo     S       \"Post-processing task for MPI\" \".3 .09 .2\"		\n\
 3       WS       W       \"Worker State\"                        \n\
 6       I       WS       Idle         \".9 .1 .0\"		\n\
 6       In       WS      Initializing       \"0.0 .7 1.0\"            \n\
@@ -296,7 +325,17 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 6       Sl       WS      Sleeping         \".9 .1 .0\"		\n\
 6       P       WS       Progressing         \".4 .1 .6\"		\n\
 6       U       WS       Unpartitioning      \".0 .0 1.0\"		\n\
-6       H       WS       Hypervisor      \".5 .18 .0\"		\n");
+6       H       WS       Hypervisor      \".5 .18 .0\"		\n\
+3       US       UT       \"User Thread State\"                        \n\
+6       Bu      US      \"Building task\"   \".5 .18 .0\"		\n\
+6       Su      US      \"Submittings task\" \".3 .09 .0\"		\n\
+6       MD      US      \"Decoding task for MPI\" \".5 .18 .2\"		\n\
+6       MPr     US      \"Preparing task for MPI\" \".4 .14 .2\"		\n\
+6       MPo     US      \"Post-processing task for MPI\" \".3 .09 .2\"		\n\
+6       W       US      \"Waiting task\" \".9 .1 .0\"		\n\
+6       WA      US      \"Waiting all tasks\" \".9 .1 .0\"		\n\
+6       No      US      Nothing \".0 .0 .0\"		\n\
+");
 	fprintf(file, "\
 6       P       CtS       Processing         \"0 0 0\"		\n\
 6       Sl       CtS      Sleeping         \".9 .1 .0\"		\n\
@@ -305,7 +344,10 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 6       SdS       CtS      SendSubmitted     \"1.0 .1 1.0\"	\n\
 6       RvS       CtS      RecieveSubmitted  \"0.1 1.0 1.0\"	\n\
 6       SdC       CtS      SendCompleted     \"1.0 .5 1.0\"	\n\
-6       RvC       CtS      RecieveCompleted  \"0.5 1.0 1.0\"	\n");
+6       RvC       CtS      ReceiveCompleted  \"0.5 1.0 1.0\"	\n\
+6       Bu      CtS      \"Building task\"   \".5 .18 .0\"		\n\
+6       Su      CtS      \"Submittings task\" \".3 .09 .0\"		\n\
+");
 	for (i=1; i<STARPU_NMAX_SCHED_CTXS; i++)
 		fprintf(file, "\
 6       I       Ctx%u      Idle         \".9 .1 .0\"		\n\

+ 4 - 1
src/util/starpu_task_insert_utils.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2011, 2013-2015   Université Bordeaux
+ * Copyright (C) 2011, 2013-2016   Université Bordeaux
  * Copyright (C) 2011-2015         CNRS
  * Copyright (C) 2011, 2014        INRIA
  *
@@ -234,6 +234,8 @@ void _starpu_task_insert_create(struct starpu_codelet *cl, struct starpu_task **
 	int nargs = 0;
 	int allocated_buffers = 0;
 
+	_STARPU_TRACE_TASK_BUILD_START();
+
 	struct _starpu_task_insert_cb_wrapper *cl_arg_wrapper = (struct _starpu_task_insert_cb_wrapper *) malloc(sizeof(struct _starpu_task_insert_cb_wrapper));
 	STARPU_ASSERT(cl_arg_wrapper);
 
@@ -502,4 +504,5 @@ void _starpu_task_insert_create(struct starpu_codelet *cl, struct starpu_task **
 	(*task)->prologue_callback_pop_func = _starpu_task_insert_callback_wrapper;
 	(*task)->prologue_callback_pop_arg = prologue_pop_cl_arg_wrapper;
 	(*task)->prologue_callback_pop_arg_free = 1;
+	_STARPU_TRACE_TASK_BUILD_END();
 }