Przeglądaj źródła

Replace starpu_set_iteration/subiteration with starpu_iteration_push/pop, to make it nested

Samuel Thibault 8 lat temu
rodzic
commit
3ebbf095cc

+ 2 - 3
ChangeLog

@@ -62,9 +62,8 @@ New features:
   * Add modular-heft-prio scheduler.
   * Add starpu_cublas_get_local_handle helper.
   * Add starpu_data_set_name, starpu_data_set_coordinates_array, and
-    starpu_data_set_coordinates to describe data, and starpu_set_iteration and
-    starpu_set_subiteration to describe tasks, for better offline traces
-    analysis.
+    starpu_data_set_coordinates to describe data, and starpu_iteration_push and
+    starpu_iteration_pop to describe tasks, for better offline traces analysis.
 
 Changes:
   * Fix performance regression of lws for small tasks.

+ 4 - 3
doc/doxygen/chapters/380_offline_performance_tools.doxy

@@ -143,9 +143,10 @@ It can also set the starpu_task::name field of the task (or use \ref STARPU_NAME
 when using starpu_task_insert()), to replace in traces the name of the codelet
 with an arbitrarily chosen name.
 
-It can also set the iteration number, by just calling starpu_set_iteration() at
-the beginning of the first task submission loop. This iteration number will show
-up in traces for all tasks submitted from there.
+It can also set the iteration number, by just calling starpu_iteration_push()
+at the beginning of submission loops and starpu_iteration_pop() at the end of
+submission loops. These iteration numbers will show up in traces for all tasks
+submitted from there.
 
 Coordinates can also be given to data with the starpu_data_set_coordinates() or
 starpu_data_set_coordinates_array() function. In the trace, tasks will then be

+ 13 - 8
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -951,17 +951,22 @@ codelet implementation to be executed when executing \p task.
 Return the codelet implementation to be executed
 when executing \p task.
 
-\fn void starpu_set_iteration(unsigned long iteration)
+\fn void starpu_iteration_push(unsigned long iteration)
 \ingroup API_Codelet_And_Tasks
-Sets the iteration number for all the tasks to be submitted after this
-call. This is typically called at the beginning of the main task submission
-loop. This number will then show up in tracing tools.
+Sets the iteration number for all the tasks to be submitted after
+this call. This is typically called at the beginning of a task
+submission loop. This number will then show up in tracing tools. A
+corresponding starpu_iteration_pop() call must be made to match the call to
+starpu_iteration_push(), at the end of the same task submission loop, typically.
 
-\fn void starpu_set_subiteration(unsigned long subiteration)
+Nested calls to starpu_iteration_push and starpu_iteration_pop are allowed, to
+describe a loop nest for instance, provided that they match properly.
+
+\fn void starpu_iteration_pop(void)
 \ingroup API_Codelet_And_Tasks
-Sets the subiteration number for all the tasks to be submitted after this
-call. This is typically called at the beginning of the second-nested task
-submission loop. This number will then show up in tracing tools.
+Drops the iteration number for submitted tasks. This must match a previous
+call to starpu_iteration_push(), and is typically called at the end of a task
+submission loop.
 
 \fn void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t *deps, void (*callback)(void *), void *callback_arg)
 \ingroup API_Codelet_And_Tasks

+ 2 - 1
examples/cg/cg.c

@@ -308,7 +308,7 @@ static int cg(void)
 		double delta_old;
 		double alpha, beta;
 
-		starpu_set_iteration(i);
+		starpu_iteration_push(i);
 
 		/* q <- A d */
 		gemv_kernel(q_handle, A_handle, d_handle, 0.0, 1.0, nblocks, use_reduction);
@@ -358,6 +358,7 @@ static int cg(void)
 			FPRINTF(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
 		}
 
+		starpu_iteration_pop();
 		i++;
 	}
 

+ 2 - 1
examples/cholesky/cholesky_grain_tag.c

@@ -190,7 +190,7 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 
 	for (k = 0; k < nbigblocks; k++)
 	{
-		starpu_set_iteration(k);
+		starpu_iteration_push(k);
 		struct starpu_task *task = create_task_11(dataA, k, reclevel);
 		/* we defer the launch of the first task */
 		if (k == 0)
@@ -218,6 +218,7 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 				}
 			}
 		}
+		starpu_iteration_pop();
 	}
 
 	/* schedule the codelet */

+ 2 - 1
examples/cholesky/cholesky_implicit.c

@@ -59,7 +59,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 	for (k = 0; k < nblocks; k++)
 	{
 		int ret;
-		starpu_set_iteration(k);
+		starpu_iteration_push(k);
                 starpu_data_handle_t sdatakk = starpu_data_get_sub_data(dataA, 2, k, k);
 
                 ret = starpu_task_insert(&cl11,
@@ -112,6 +112,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 			}
 			starpu_data_wont_use(sdatakj);
 		}
+		starpu_iteration_pop();
 	}
 
 	starpu_task_wait_for_all();

+ 2 - 1
examples/cholesky/cholesky_tag.c

@@ -171,7 +171,7 @@ static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
 	for (k = 0; k < nblocks; k++)
 	{
-		starpu_set_iteration(k);
+		starpu_iteration_push(k);
 		struct starpu_task *task = create_task_11(dataA, k);
 		/* we defer the launch of the first task */
 		if (k == 0)
@@ -199,6 +199,7 @@ static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 					create_task_22(dataA, k, i, j);
 			}
 		}
+		starpu_iteration_pop();
 	}
 
 	/* schedule the codelet */

+ 2 - 1
examples/cholesky/cholesky_tile_tag.c

@@ -167,7 +167,7 @@ static int cholesky_no_stride(void)
 
 	for (k = 0; k < nblocks_p; k++)
 	{
-		starpu_set_iteration(k);
+		starpu_iteration_push(k);
 		struct starpu_task *task = create_task_11(k, nblocks_p);
 		/* we defer the launch of the first task */
 		if (k == 0)
@@ -194,6 +194,7 @@ static int cholesky_no_stride(void)
 				}
 			}
 		}
+		starpu_iteration_pop();
 	}
 
 	/* schedule the codelet */

+ 2 - 1
examples/lu/xlu.c

@@ -184,7 +184,7 @@ static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 
 	for (k = 0; k < nblocks; k++)
 	{
-		starpu_set_iteration(k);
+		starpu_iteration_push(k);
 		struct starpu_task *task = create_task_11(dataA, k);
 
 		/* we defer the launch of the first task */
@@ -215,6 +215,7 @@ static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 			     if (ret == -ENODEV) return ret;
 			}
 		}
+		starpu_iteration_pop();
 	}
 
 	/* schedule the codelet */

+ 2 - 1
examples/lu/xlu_implicit.c

@@ -127,7 +127,7 @@ static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 	{
 		int ret;
 
-		starpu_set_iteration(k);
+		starpu_iteration_push(k);
 
 		ret = create_task_11(dataA, k);
 		if (ret == -ENODEV) return ret;
@@ -152,6 +152,7 @@ static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 		    starpu_data_wont_use(starpu_data_get_sub_data(dataA, 2, k, i));
 		    starpu_data_wont_use(starpu_data_get_sub_data(dataA, 2, i, k));
 		}
+		starpu_iteration_pop();
 	}
 
 	/* stall the application until the end of computations */

+ 2 - 1
examples/lu/xlu_implicit_pivot.c

@@ -173,7 +173,7 @@ static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp,
 	{
 		int ret;
 
-		starpu_set_iteration(k);
+		starpu_iteration_push(k);
 
 		ret = create_task_11_pivot(dataAp, nblocks, k, piv_description, get_block);
 		if (ret == -ENODEV) return ret;
@@ -207,6 +207,7 @@ static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp,
 		    starpu_data_wont_use(get_block(dataAp, nblocks, k, i));
 		    starpu_data_wont_use(get_block(dataAp, nblocks, i, k));
 		}
+		starpu_iteration_pop();
 	}
 
 	/* stall the application until the end of computations */

+ 2 - 1
examples/lu/xlu_pivot.c

@@ -247,7 +247,7 @@ static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp,
 
 	for (k = 0; k < nblocks; k++)
 	{
-		starpu_set_iteration(k);
+		starpu_iteration_push(k);
 		struct starpu_task *task = create_task_11_pivot(dataAp, nblocks, k, piv_description, get_block);
 
 		/* we defer the launch of the first task */
@@ -287,6 +287,7 @@ static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp,
 			     if (ret == -ENODEV) return ret;
 			}
 		}
+		starpu_iteration_pop();
 	}
 
 	/* we wait the last task (TAG11(nblocks - 1)) and all the pivot tasks */

+ 2 - 1
examples/mandelbrot/mandelbrot.c

@@ -544,7 +544,7 @@ int main(int argc, char **argv)
 		 * parallel task. */
 		int per_block_cnt[nblocks_p];
 
-		starpu_set_iteration(niter_p);
+		starpu_iteration_push(niter_p);
 
 		for (iby = 0; iby < nblocks_p; iby++)
 		{
@@ -579,6 +579,7 @@ int main(int argc, char **argv)
 		}
 
 
+		starpu_iteration_pop();
 		if (demo_p)
 		{
 			/* Zoom in */

+ 2 - 1
examples/ppm_downscaler/yuv_downscaler.c

@@ -218,7 +218,7 @@ int main(int argc, char **argv)
 	/* do the computation */
 	for (frame = 0; frame < nframes; frame++)
 	{
-		starpu_set_iteration(frame);
+		starpu_iteration_push(frame);
 		unsigned blocky;
 		for (blocky = 0; blocky < nblocks_y; blocky++)
 		{
@@ -266,6 +266,7 @@ int main(int argc, char **argv)
 			ret = starpu_task_submit(task);
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		}
+		starpu_iteration_pop();
 	}
 
 	/* make sure all output buffers are sync'ed */

+ 2 - 1
examples/stencil/stencil-tasks.c

@@ -296,7 +296,7 @@ void create_tasks(int rank)
 
 	for (iter = 0; iter <= niter; iter++)
 	{
-	     starpu_set_iteration(iter);
+	     starpu_iteration_push(iter);
 	     for (bz = 0; bz < nbz; bz++)
 	     {
 		  if ((iter > 0) && (get_block_mpi_node(bz) == rank))
@@ -314,6 +314,7 @@ void create_tasks(int rank)
 				     create_task_save(iter, bz, -1, rank);
 		     }
 	     }
+	     starpu_iteration_pop();
 	}
 }
 

+ 2 - 1
examples/stencil/stencil.c

@@ -330,7 +330,7 @@ int main(int argc, char **argv)
 		int iter;
 		for (iter = 0; iter < who_runs_what_len; iter++)
 		{
-			starpu_set_iteration(iter);
+			starpu_iteration_push(iter);
 			unsigned last, bz;
 			last = 1;
 			for (bz = 0; bz < nbz; bz++)
@@ -351,6 +351,7 @@ int main(int argc, char **argv)
 			}
 			FPRINTF(stderr, "\n");
 
+			starpu_iteration_pop();
 			if (last)
 				break;
 		}

+ 2 - 2
include/starpu_task.h

@@ -317,8 +317,8 @@ int starpu_task_wait_for_no_ready(void);
 int starpu_task_nready(void);
 int starpu_task_nsubmitted(void);
 
-void starpu_set_iteration(unsigned long iteration);
-void starpu_set_subiteration(unsigned long subiteration);
+void starpu_iteration_push(unsigned long iteration);
+void starpu_iteration_pop(void);
 
 void starpu_do_schedule(void);
 

+ 2 - 1
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -112,7 +112,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 
 	for (k = 0; k < nblocks; k++)
 	{
-		starpu_set_iteration(k);
+		starpu_iteration_push(k);
 
 		int prio = STARPU_DEFAULT_PRIO;
 		if (!noprio) prio = STARPU_MAX_PRIO;
@@ -155,6 +155,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 			if (my_distrib(k, j, nodes) == rank)
 				starpu_data_wont_use(data_handles[k][j]);
 		}
+		starpu_iteration_pop();
 	}
 
 	starpu_task_wait_for_all();

+ 2 - 1
mpi/examples/mpi_lu/pxlu.c

@@ -839,7 +839,7 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 
 	for (k = 0; k < nblocks; k++)
 	{
-		starpu_set_iteration(k);
+		starpu_iteration_push(k);
 
 		create_task_11(k);
 
@@ -856,6 +856,7 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 				create_task_22(k, i, j);
 			}
 		}
+		starpu_iteration_pop();
 	}
 
 	int barrier_ret = starpu_mpi_barrier(MPI_COMM_WORLD);

+ 2 - 1
mpi/examples/mpi_lu/pxlu_implicit.c

@@ -135,7 +135,7 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 
 	for (k = 0; k < nblocks; k++)
 	{
-		starpu_set_iteration(k);
+		starpu_iteration_push(k);
 
 		create_task_11(k);
 
@@ -166,6 +166,7 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 			if (get_block_rank(i, k) == _rank)
 				starpu_data_wont_use(STARPU_PLU(get_block_handle)(i,k));
 		}
+		starpu_iteration_pop();
 	}
 
 	starpu_task_wait_for_all();

+ 4 - 2
mpi/examples/stencil/stencil5.c

@@ -175,7 +175,7 @@ int main(int argc, char **argv)
 	/* First computation with initial distribution */
 	for(loop=0 ; loop<niter; loop++)
 	{
-		starpu_set_iteration(loop);
+		starpu_iteration_push(loop);
 
 		for (x = 1; x < X-1; x++)
 		{
@@ -187,6 +187,7 @@ int main(int argc, char **argv)
 						       0);
 			}
 		}
+		starpu_iteration_pop();
 	}
 	FPRINTF(stderr, "Waiting ...\n");
 	starpu_task_wait_for_all();
@@ -216,7 +217,7 @@ int main(int argc, char **argv)
 	/* Second computation with new distribution */
 	for(loop=0 ; loop<niter; loop++)
 	{
-		starpu_set_iteration(niter + loop);
+		starpu_iteration_push(niter + loop);
 
 		for (x = 1; x < X-1; x++)
 		{
@@ -228,6 +229,7 @@ int main(int argc, char **argv)
 						       0);
 			}
 		}
+		starpu_iteration_pop();
 	}
 	FPRINTF(stderr, "Waiting ...\n");
 	starpu_task_wait_for_all();

+ 2 - 1
mpi/examples/stencil/stencil5_lb.c

@@ -236,7 +236,7 @@ int main(int argc, char **argv)
 	/* First computation with initial distribution */
 	for(loop=0 ; loop<niter; loop++)
 	{
-		starpu_set_iteration(loop);
+		starpu_iteration_push(loop);
 
 		for (x = 1; x < X-1; x++)
 		{
@@ -249,6 +249,7 @@ int main(int argc, char **argv)
 						       0);
 			}
 		}
+		starpu_iteration_pop();
 	}
 	FPRINTF(stderr, "Waiting ...\n");
 	starpu_task_wait_for_all();

+ 4 - 2
src/core/sched_ctx.c

@@ -468,6 +468,7 @@ struct _starpu_sched_ctx* _starpu_create_sched_ctx(struct starpu_sched_policy *p
 	STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx_manag);
 
 	int nworkers = config->topology.nworkers;
+	unsigned i;
 
 	STARPU_ASSERT(nworkers_ctx <= nworkers);
 
@@ -501,8 +502,9 @@ struct _starpu_sched_ctx* _starpu_create_sched_ctx(struct starpu_sched_policy *p
 	_starpu_barrier_counter_init(&sched_ctx->ready_tasks_barrier, 0);
 
 	sched_ctx->ready_flops = 0.0;
-	sched_ctx->iteration = -1;
-	sched_ctx->subiteration = -1;
+	for (i = 0; i < sizeof(sched_ctx->iterations)/sizeof(sched_ctx->iterations[0]); i++)
+		sched_ctx->iterations[i] = -1;
+	sched_ctx->iteration_level = 0;
 	sched_ctx->main_master = -1;
 	sched_ctx->perf_arch.devices = NULL;
 	sched_ctx->perf_arch.ndevices = 0;

+ 2 - 1
src/core/sched_ctx.h

@@ -70,7 +70,8 @@ struct _starpu_sched_ctx
 	double ready_flops;
 
 	/* Iteration number, as advertised by application */
-	long iteration, subiteration;
+	long iterations[2];
+	int iteration_level;
 
 	/* cond to block push when there are no workers in the ctx */
 	starpu_pthread_cond_t no_workers_cond;

+ 13 - 6
src/core/task.c

@@ -652,8 +652,8 @@ int starpu_task_submit(struct starpu_task *task)
 
 	if (!j->internal && !continuation)
 		_STARPU_TRACE_TASK_SUBMIT(j,
-			_starpu_get_sched_ctx_struct(task->sched_ctx)->iteration,
-			_starpu_get_sched_ctx_struct(task->sched_ctx)->subiteration);
+			_starpu_get_sched_ctx_struct(task->sched_ctx)->iterations[0],
+			_starpu_get_sched_ctx_struct(task->sched_ctx)->iterations[1]);
 
 	/* If this is a continuation, we don't modify the implicit data dependencies detected earlier. */
 	if (task->cl && !continuation)
@@ -991,14 +991,21 @@ int starpu_task_wait_for_no_ready(void)
 	return 0;
 }
 
-void starpu_set_iteration(unsigned long iteration)
+void starpu_iteration_push(unsigned long iteration) /* Enter one nesting level and record its iteration number in the current scheduling context. */
 {
-	_starpu_get_sched_ctx_struct(_starpu_sched_ctx_get_current_context())->iteration = iteration;
+	struct _starpu_sched_ctx *ctx = _starpu_get_sched_ctx_struct(_starpu_sched_ctx_get_current_context());
+	unsigned level = ctx->iteration_level++; /* slot index for this push; the depth is incremented even when no slot is available */
+	if (level < sizeof(ctx->iterations)/sizeof(ctx->iterations[0])) /* deeper levels than iterations[] can hold are counted but not recorded */
+		ctx->iterations[level] = iteration;
 }
 
-void starpu_set_subiteration(unsigned long subiteration)
+void starpu_iteration_pop(void) /* Leave the innermost nesting level opened by starpu_iteration_push() and reset its slot to -1. */
 {
-	_starpu_get_sched_ctx_struct(_starpu_sched_ctx_get_current_context())->subiteration = subiteration;
+	struct _starpu_sched_ctx *ctx = _starpu_get_sched_ctx_struct(_starpu_sched_ctx_get_current_context());
+	STARPU_ASSERT_MSG(ctx->iteration_level > 0, "calls to starpu_iteration_pop must match starpu_iteration_push calls");
+	unsigned level = --ctx->iteration_level; /* pre-decrement: push stored at the old depth minus one, so that is the slot to clear */
+	if (level < sizeof(ctx->iterations)/sizeof(ctx->iterations[0])) /* levels beyond the array were never recorded by push */
+		ctx->iterations[level] = -1;
 }
 
 void starpu_do_schedule(void)

+ 13 - 11
src/debug/traces/starpu_fxt.c

@@ -101,8 +101,7 @@ struct task_info {
 	double end_time;
 	unsigned long footprint;
 	unsigned long kflops;
-	long iteration;
-	long subiteration;
+	long iterations[2];
 	char *parameters;
 	unsigned int ndeps;
 	unsigned long *dependencies;
@@ -116,6 +115,7 @@ struct task_info *tasks_info;
 static struct task_info *get_task(unsigned long job_id, int mpi_rank)
 {
 	struct task_info *task;
+	unsigned i;
 
 	HASH_FIND(hh, tasks_info, &job_id, sizeof(job_id), task);
 	if (!task)
@@ -133,8 +133,8 @@ static struct task_info *get_task(unsigned long job_id, int mpi_rank)
 		task->end_time = 0.;
 		task->footprint = 0;
 		task->kflops = 0.;
-		task->iteration = -1;
-		task->subiteration = -1;
+		for (i = 0; i < sizeof(task->iterations)/sizeof(task->iterations[0]); i++)
+			task->iterations[i] = -1;
 		task->parameters = NULL;
 		task->ndeps = 0;
 		task->dependencies = NULL;
@@ -191,10 +191,12 @@ static void task_dump(unsigned long job_id, int mpi_rank)
 	fprintf(tasks_file, "Footprint: %lx\n", task->footprint);
 	if (task->kflops != 0)
 		fprintf(tasks_file, "GFlop: %f\n", ((double) task->kflops) / 1000000);
-	if (task->iteration != -1)
-		fprintf(tasks_file, "Iteration: %ld\n", task->iteration);
-	if (task->subiteration != -1)
-		fprintf(tasks_file, "Subiteration: %ld\n", task->subiteration);
+	if (task->iterations[0] != -1) {
+		fprintf(tasks_file, "Iteration:");
+		for (i = 0; i < sizeof(task->iterations)/sizeof(task->iterations[0]); i++)
+			fprintf(tasks_file, " %ld", task->iterations[i]);
+		fprintf(tasks_file, "\n");
+	}
 	if (task->parameters)
 	{
 		fprintf(tasks_file, "Parameters: %s\n", task->parameters);
@@ -1456,7 +1458,7 @@ static void handle_codelet_details(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 		char *prefix = options->file_prefix;
 		unsigned sched_ctx = ev->param[0];
 
-		worker_set_detailed_state(last_codelet_start[worker], prefix, worker, _starpu_last_codelet_symbol[worker], ev->param[1], parameters, ev->param[2], ev->param[4], job_id, ((double) task->kflops) / 1000000, X, Y, Z, task->iteration, task->subiteration);
+		worker_set_detailed_state(last_codelet_start[worker], prefix, worker, _starpu_last_codelet_symbol[worker], ev->param[1], parameters, ev->param[2], ev->param[4], job_id, ((double) task->kflops) / 1000000, X, Y, Z, task->iterations[0], task->iterations[1]);
 		if (sched_ctx != 0)
 		{
 #ifdef STARPU_HAVE_POTI
@@ -2323,8 +2325,8 @@ static void handle_task_submit(struct fxt_ev_64 *ev, struct starpu_fxt_options *
 
 	struct task_info *task = get_task(job_id, options->file_rank);
 	task->submit_time = get_event_time_stamp(ev, options);
-	task->iteration = iteration;
-	task->subiteration = subiteration;
+	task->iterations[0] = iteration;
+	task->iterations[1] = subiteration;
 }
 
 static void handle_task_done(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)