Browse Source

Add starpu_set_iteration and starpu_set_subiteration to describe tasks, for better offline trace analysis

Samuel Thibault 8 years ago
parent
commit
7d765d4d6f

+ 3 - 1
ChangeLog

@@ -62,7 +62,9 @@ New features:
   * Add modular-heft-prio scheduler.
   * Add starpu_cublas_get_local_handle helper.
   * Add starpu_data_set_name, starpu_data_set_coordinates_array, and
-    starpu_data_set_coordinates to describe data.
+    starpu_data_set_coordinates to describe data, and starpu_set_iteration and
+    starpu_set_subiteration to describe tasks, for better offline trace
+    analysis.
 
 Changes:
   * Fix performance regression of lws for small tasks.

+ 4 - 0
doc/doxygen/chapters/380_offline_performance_tools.doxy

@@ -143,6 +143,10 @@ It can also set the starpu_task::name field of the task (or use \ref STARPU_NAME
 when using starpu_task_insert()), to replace in traces the name of the codelet
 with an arbitrarily chosen name.
 
+It can also set the iteration number, by calling starpu_set_iteration() at the
+beginning of each iteration of the main task submission loop. This iteration
+number will show up in traces for all tasks submitted from that point on.
+
 Coordinates can also be given to data with the starpu_data_set_coordinates() or
 starpu_data_set_coordinates_array() function. In the trace, tasks will then be
 assigned the coordinates of the first data they write to.

+ 12 - 0
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -951,6 +951,18 @@ codelet implementation to be executed when executing \p task.
 Return the codelet implementation to be executed
 when executing \p task.
 
+\fn void starpu_set_iteration(unsigned long iteration)
+\ingroup API_Codelet_And_Tasks
+Set the iteration number for all the tasks to be submitted after this
+call. This is typically called at the beginning of each iteration of the main
+task submission loop. This number will then show up in tracing tools.
+
+\fn void starpu_set_subiteration(unsigned long subiteration)
+\ingroup API_Codelet_And_Tasks
+Set the subiteration number for all the tasks to be submitted after this
+call. This is typically called at the beginning of each iteration of the
+second-nested task submission loop. This number will then show up in tracing tools.
+
 \fn void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t *deps, void (*callback)(void *), void *callback_arg)
 \ingroup API_Codelet_And_Tasks
 Create (and submit) an empty task that unlocks a tag once all its dependencies are fulfilled.

+ 3 - 1
examples/cg/cg.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012, 2014-2016  Université de Bordeaux
+ * Copyright (C) 2010-2012, 2014-2017  Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -308,6 +308,8 @@ static int cg(void)
 		double delta_old;
 		double alpha, beta;
 
+		starpu_set_iteration(i);
+
 		/* q <- A d */
 		gemv_kernel(q_handle, A_handle, d_handle, 0.0, 1.0, nblocks, use_reduction);
 

+ 2 - 1
examples/cholesky/cholesky_grain_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2017  CNRS
  *
@@ -190,6 +190,7 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 
 	for (k = 0; k < nbigblocks; k++)
 	{
+		starpu_set_iteration(k);
 		struct starpu_task *task = create_task_11(dataA, k, reclevel);
 		/* we defer the launch of the first task */
 		if (k == 0)

+ 1 - 0
examples/cholesky/cholesky_implicit.c

@@ -59,6 +59,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 	for (k = 0; k < nblocks; k++)
 	{
 		int ret;
+		starpu_set_iteration(k);
                 starpu_data_handle_t sdatakk = starpu_data_get_sub_data(dataA, 2, k, k);
 
                 ret = starpu_task_insert(&cl11,

+ 2 - 1
examples/cholesky/cholesky_tag.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2016  Université de Bordeaux
+ * Copyright (C) 2009-2017  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013, 2017  CNRS
  *
@@ -171,6 +171,7 @@ static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
 	for (k = 0; k < nblocks; k++)
 	{
+		starpu_set_iteration(k);
 		struct starpu_task *task = create_task_11(dataA, k);
 		/* we defer the launch of the first task */
 		if (k == 0)

+ 1 - 0
examples/cholesky/cholesky_tile_tag.c

@@ -167,6 +167,7 @@ static int cholesky_no_stride(void)
 
 	for (k = 0; k < nblocks_p; k++)
 	{
+		starpu_set_iteration(k);
 		struct starpu_task *task = create_task_11(k, nblocks_p);
 		/* we defer the launch of the first task */
 		if (k == 0)

+ 2 - 1
examples/lu/xlu.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010-2011, 2014-2015  Université de Bordeaux
+ * Copyright (C) 2009, 2010-2011, 2014-2015, 2017  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012  CNRS
  *
@@ -184,6 +184,7 @@ static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 
 	for (k = 0; k < nblocks; k++)
 	{
+		starpu_set_iteration(k);
 		struct starpu_task *task = create_task_11(dataA, k);
 
 		/* we defer the launch of the first task */

+ 3 - 1
examples/lu/xlu_implicit.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011, 2014-2015  Université de Bordeaux
+ * Copyright (C) 2010-2011, 2014-2015, 2017  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2015, 2016  CNRS
  *
@@ -127,6 +127,8 @@ static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 	{
 		int ret;
 
+		starpu_set_iteration(k);
+
 		ret = create_task_11(dataA, k);
 		if (ret == -ENODEV) return ret;
 

+ 3 - 1
examples/lu/xlu_implicit_pivot.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012, 2014-2015  Université de Bordeaux
+ * Copyright (C) 2010-2012, 2014-2015, 2017  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2016  CNRS
  *
@@ -173,6 +173,8 @@ static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp,
 	{
 		int ret;
 
+		starpu_set_iteration(k);
+
 		ret = create_task_11_pivot(dataAp, nblocks, k, piv_description, get_block);
 		if (ret == -ENODEV) return ret;
 

+ 2 - 1
examples/lu/xlu_pivot.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009-2012, 2014-2015  Université de Bordeaux
+ * Copyright (C) 2009-2012, 2014-2015, 2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -247,6 +247,7 @@ static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp,
 
 	for (k = 0; k < nblocks; k++)
 	{
+		starpu_set_iteration(k);
 		struct starpu_task *task = create_task_11_pivot(dataAp, nblocks, k, piv_description, get_block);
 
 		/* we defer the launch of the first task */

+ 3 - 1
examples/mandelbrot/mandelbrot.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2011, 2014-2015  Université de Bordeaux
+ * Copyright (C) 2010, 2011, 2014-2015, 2017  Université de Bordeaux
  * Copyright (C) 2010, 2011, 2012, 2013, 2016, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -544,6 +544,8 @@ int main(int argc, char **argv)
 		 * parallel task. */
 		int per_block_cnt[nblocks_p];
 
+		starpu_set_iteration(niter_p);
+
 		for (iby = 0; iby < nblocks_p; iby++)
 		{
 			per_block_cnt[iby] = 0;

+ 2 - 1
examples/ppm_downscaler/yuv_downscaler.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2011, 2013-2015  Université de Bordeaux
+ * Copyright (C) 2010-2011, 2013-2015, 2017  Université de Bordeaux
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013, 2016  CNRS
  *
@@ -218,6 +218,7 @@ int main(int argc, char **argv)
 	/* do the computation */
 	for (frame = 0; frame < nframes; frame++)
 	{
+		starpu_set_iteration(frame);
 		unsigned blocky;
 		for (blocky = 0; blocky < nblocks_y; blocky++)
 		{

+ 2 - 1
examples/stencil/stencil-tasks.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010, 2013-2015  Université de Bordeaux
+ * Copyright (C) 2010, 2013-2015, 2017  Université de Bordeaux
  * Copyright (C) 2012, 2013, 2015, 2017  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -296,6 +296,7 @@ void create_tasks(int rank)
 
 	for (iter = 0; iter <= niter; iter++)
 	{
+	     starpu_set_iteration(iter);
 	     for (bz = 0; bz < nbz; bz++)
 	     {
 		  if ((iter > 0) && (get_block_mpi_node(bz) == rank))

+ 3 - 0
include/starpu_task.h

@@ -317,6 +317,9 @@ int starpu_task_wait_for_no_ready(void);
 int starpu_task_nready(void);
 int starpu_task_nsubmitted(void);
 
+void starpu_set_iteration(unsigned long iteration);
+void starpu_set_subiteration(unsigned long subiteration);
+
 void starpu_do_schedule(void);
 
 void starpu_codelet_init(struct starpu_codelet *cl);

+ 3 - 3
src/common/fxt.h

@@ -650,8 +650,8 @@ do {										\
 #define _STARPU_TRACE_WORKER_SLEEP_END	\
 	FUT_DO_PROBE1(_STARPU_FUT_WORKER_SLEEP_END, _starpu_gettid());
 
-#define _STARPU_TRACE_TASK_SUBMIT(job)	\
-	FUT_DO_PROBE2(_STARPU_FUT_TASK_SUBMIT, (job)->job_id, _starpu_gettid());
+#define _STARPU_TRACE_TASK_SUBMIT(job, iter, subiter)	\
+	FUT_DO_PROBE4(_STARPU_FUT_TASK_SUBMIT, (job)->job_id, iter, subiter, _starpu_gettid());
 
 #define _STARPU_TRACE_TASK_SUBMIT_START()	\
 	FUT_DO_PROBE1(_STARPU_FUT_TASK_SUBMIT_START, _starpu_gettid());
@@ -1009,7 +1009,7 @@ do {										\
 #define _STARPU_TRACE_WORKER_SCHEDULING_POP		do {} while(0)
 #define _STARPU_TRACE_WORKER_SLEEP_START		do {} while(0)
 #define _STARPU_TRACE_WORKER_SLEEP_END			do {} while(0)
-#define _STARPU_TRACE_TASK_SUBMIT(job)			do {(void)(job);} while(0)
+#define _STARPU_TRACE_TASK_SUBMIT(job, a, b)			do {(void)(job); (void)(a);(void)(b);} while(0)
 #define _STARPU_TRACE_TASK_SUBMIT_START()		do {} while(0)
 #define _STARPU_TRACE_TASK_SUBMIT_END()			do {} while(0)
 #define _STARPU_TRACE_TASK_BUILD_START()		do {} while(0)

+ 2 - 0
src/core/sched_ctx.c

@@ -501,6 +501,8 @@ struct _starpu_sched_ctx* _starpu_create_sched_ctx(struct starpu_sched_policy *p
 	_starpu_barrier_counter_init(&sched_ctx->ready_tasks_barrier, 0);
 
 	sched_ctx->ready_flops = 0.0;
+	sched_ctx->iteration = 0;
+	sched_ctx->subiteration = 0;
 	sched_ctx->main_master = -1;
 	sched_ctx->perf_arch.devices = NULL;
 	sched_ctx->perf_arch.ndevices = 0;

+ 3 - 0
src/core/sched_ctx.h

@@ -69,6 +69,9 @@ struct _starpu_sched_ctx
 	/* amount of ready flops in a context */
 	double ready_flops;
 
+	/* Iteration number, as advertised by application */
+	unsigned long iteration, subiteration;
+
 	/* cond to block push when there are no workers in the ctx */
 	starpu_pthread_cond_t no_workers_cond;
 

+ 13 - 1
src/core/task.c

@@ -651,7 +651,9 @@ int starpu_task_submit(struct starpu_task *task)
 	}
 
 	if (!j->internal && !continuation)
-		_STARPU_TRACE_TASK_SUBMIT(j);
+		_STARPU_TRACE_TASK_SUBMIT(j,
+			_starpu_get_sched_ctx_struct(task->sched_ctx)->iteration,
+			_starpu_get_sched_ctx_struct(task->sched_ctx)->subiteration);
 
 	/* If this is a continuation, we don't modify the implicit data dependencies detected earlier. */
 	if (task->cl && !continuation)
@@ -989,6 +991,16 @@ int starpu_task_wait_for_no_ready(void)
 	return 0;
 }
 
+void starpu_set_iteration(unsigned long iteration)
+{
+	_starpu_get_sched_ctx_struct(_starpu_sched_ctx_get_current_context())->iteration = iteration;
+}
+
+void starpu_set_subiteration(unsigned long subiteration)
+{
+	_starpu_get_sched_ctx_struct(_starpu_sched_ctx_get_current_context())->subiteration = subiteration;
+}
+
 void starpu_do_schedule(void)
 {
 	struct _starpu_machine_config *config = _starpu_get_machine_config();

+ 18 - 6
src/debug/traces/starpu_fxt.c

@@ -101,6 +101,8 @@ struct task_info {
 	double end_time;
 	unsigned long footprint;
 	unsigned long kflops;
+	long iteration;
+	long subiteration;
 	char *parameters;
 	unsigned int ndeps;
 	unsigned long *dependencies;
@@ -131,6 +133,8 @@ static struct task_info *get_task(unsigned long job_id, int mpi_rank)
 		task->end_time = 0.;
 		task->footprint = 0;
 		task->kflops = 0.;
+		task->iteration = -1;
+		task->subiteration = -1;
 		task->parameters = NULL;
 		task->ndeps = 0;
 		task->dependencies = NULL;
@@ -187,6 +191,10 @@ static void task_dump(unsigned long job_id, int mpi_rank)
 	fprintf(tasks_file, "Footprint: %lx\n", task->footprint);
 	if (task->kflops != 0)
 		fprintf(tasks_file, "GFlop: %f\n", ((double) task->kflops) / 1000000);
+	if (task->iteration != -1)
+		fprintf(tasks_file, "Iteration: %ld\n", task->iteration);
+	if (task->subiteration != -1)
+		fprintf(tasks_file, "Subiteration: %ld\n", task->subiteration);
 	if (task->parameters)
 	{
 		fprintf(tasks_file, "Parameters: %s\n", task->parameters);
@@ -781,7 +789,7 @@ static void thread_pop_state(double time, const char *prefix, long unsigned int
 #endif
 }
 
-static void worker_set_detailed_state(double time, const char *prefix, long unsigned int workerid, const char *name, unsigned long size, const char *parameters, unsigned long footprint, unsigned long long tag, unsigned long job_id, double gflop, unsigned X, unsigned Y, unsigned Z)
+static void worker_set_detailed_state(double time, const char *prefix, long unsigned int workerid, const char *name, unsigned long size, const char *parameters, unsigned long footprint, unsigned long long tag, unsigned long job_id, double gflop, unsigned X, unsigned Y, unsigned Z, long iteration, long subiteration)
 {
 #ifdef STARPU_HAVE_POTI
 	char container[STARPU_POTI_STR_LEN];
@@ -789,7 +797,7 @@ static void worker_set_detailed_state(double time, const char *prefix, long unsi
 	/* TODO: set detailed state */
 	poti_SetState(time, container, "WS", name);
 #else
-	fprintf(out_paje_file, "20	%.9f	%sw%lu	WS	%s	%lu	%s	%08lx	%016llx	%lu	%f	%u	%u	%u\n", time, prefix, workerid, name, size, parameters, footprint, tag, job_id, gflop, X, Y, Z);
+	fprintf(out_paje_file, "20	%.9f	%sw%lu	WS	%s	%lu	%s	%08lx	%016llx	%lu	%f	%u	%u	%u	%ld	%ld\n", time, prefix, workerid, name, size, parameters, footprint, tag, job_id, gflop, X, Y, Z, iteration, subiteration);
 #endif
 }
 
@@ -1444,7 +1452,7 @@ static void handle_codelet_details(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 		char *prefix = options->file_prefix;
 		unsigned sched_ctx = ev->param[0];
 
-		worker_set_detailed_state(last_codelet_start[worker], prefix, worker, _starpu_last_codelet_symbol[worker], ev->param[1], parameters, ev->param[2], ev->param[4], job_id, ((double) task->kflops) / 1000000, X, Y, Z);
+		worker_set_detailed_state(last_codelet_start[worker], prefix, worker, _starpu_last_codelet_symbol[worker], ev->param[1], parameters, ev->param[2], ev->param[4], job_id, ((double) task->kflops) / 1000000, X, Y, Z, task->iteration, task->subiteration);
 		if (sched_ctx != 0)
 		{
 #ifdef STARPU_HAVE_POTI
@@ -2296,10 +2304,14 @@ static void handle_task_deps(struct fxt_ev_64 *ev, struct starpu_fxt_options *op
 
 static void handle_task_submit(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 {
-	unsigned long job_id;
-	job_id = ev->param[0];
+	unsigned long job_id = ev->param[0];
+	unsigned long iteration = ev->param[1];
+	unsigned long subiteration = ev->param[2];
 
-	get_task(job_id, options->file_rank)->submit_time = get_event_time_stamp(ev, options);
+	struct task_info *task = get_task(job_id, options->file_rank);
+	task->submit_time = get_event_time_stamp(ev, options);
+	task->iteration = iteration;
+	task->subiteration = subiteration;
 }
 
 static void handle_task_done(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)

+ 2 - 0
src/debug/traces/starpu_paje.c

@@ -144,6 +144,8 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED)
 	fprintf(file, "%%	X	string\n");
 	fprintf(file, "%%	Y	string\n");
 	fprintf(file, "%%	Z	string\n");
+	fprintf(file, "%%	Iteration	string\n");
+	fprintf(file, "%%	Subiteration	string\n");
 	fprintf(file, "%%EndEventDef\n");
 #endif