Browse Source

Replace starpu_set_iteration/subiteration with starpu_iteration_push/pop, to make it nested

Samuel Thibault 8 years ago
parent
commit
3ebbf095cc

+ 2 - 3
ChangeLog

@@ -62,9 +62,8 @@ New features:
   * Add modular-heft-prio scheduler.
   * Add modular-heft-prio scheduler.
   * Add starpu_cublas_get_local_handle helper.
   * Add starpu_cublas_get_local_handle helper.
   * Add starpu_data_set_name, starpu_data_set_coordinates_array, and
   * Add starpu_data_set_name, starpu_data_set_coordinates_array, and
-    starpu_data_set_coordinates to describe data, and starpu_set_iteration and
+    starpu_data_set_coordinates to describe data, and starpu_iteration_push and
-    starpu_set_subiteration to describe tasks, for better offline traces
+    starpu_iteration_pop to describe tasks, for better offline traces analysis.
-    analysis.
 
 
 Changes:
 Changes:
   * Fix performance regression of lws for small tasks.
   * Fix performance regression of lws for small tasks.

+ 4 - 3
doc/doxygen/chapters/380_offline_performance_tools.doxy

@@ -143,9 +143,10 @@ It can also set the starpu_task::name field of the task (or use \ref STARPU_NAME
 when using starpu_task_insert()), to replace in traces the name of the codelet
 when using starpu_task_insert()), to replace in traces the name of the codelet
 with an arbitrarily chosen name.
 with an arbitrarily chosen name.
 
 
-It can also set the iteration number, by just calling starpu_set_iteration() at
+It can also set the iteration number, by just calling starpu_iteration_push()
-the beginning of the first task submission loop. This iteration number will show
+at the beginning of submission loops and starpu_iteration_pop() at the end of
-up in traces for all tasks submitted from there.
+submission loops. These iteration numbers will show up in traces for all tasks
+submitted between the matching push and pop calls.
 
 
 Coordinates can also be given to data with the starpu_data_set_coordinates() or
 Coordinates can also be given to data with the starpu_data_set_coordinates() or
 starpu_data_set_coordinates_array() function. In the trace, tasks will then be
 starpu_data_set_coordinates_array() function. In the trace, tasks will then be

+ 13 - 8
doc/doxygen/chapters/api/codelet_and_tasks.doxy

@@ -951,17 +951,22 @@ codelet implementation to be executed when executing \p task.
 Return the codelet implementation to be executed
 Return the codelet implementation to be executed
 when executing \p task.
 when executing \p task.
 
 
-\fn void starpu_set_iteration(unsigned long iteration)
+\fn void starpu_iteration_push(unsigned long iteration)
 \ingroup API_Codelet_And_Tasks
 \ingroup API_Codelet_And_Tasks
-Sets the iteration number for all the tasks to be submitted after this
+Sets the iteration number for all the tasks to be submitted after
-call. This is typically called at the beginning of the main task submission
+this call. This is typically called at the beginning of a task
-loop. This number will then show up in tracing tools.
+submission loop. This number will then show up in tracing tools. A
+corresponding starpu_iteration_pop() call must be made to match the call to
+starpu_iteration_push(), at the end of the same task submission loop, typically.
 
 
-\fn void starpu_set_subiteration(unsigned long subiteration)
+Nested calls to starpu_iteration_push() and starpu_iteration_pop() are allowed, to
+describe a loop nest for instance, provided that they match properly. Only the
+two outermost nesting levels are recorded.
+
+\fn void starpu_iteration_pop(void)
 \ingroup API_Codelet_And_Tasks
 \ingroup API_Codelet_And_Tasks
-Sets the subiteration number for all the tasks to be submitted after this
+Drops the innermost iteration number, so that tasks submitted afterwards no longer carry it. This must match a previous
-call. This is typically called at the beginning of the second-nested task
+call to starpu_iteration_push(), and is typically called at the end of a task
-submission loop. This number will then show up in tracing tools.
+submission loop.
 
 
 \fn void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t *deps, void (*callback)(void *), void *callback_arg)
 \fn void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t *deps, void (*callback)(void *), void *callback_arg)
 \ingroup API_Codelet_And_Tasks
 \ingroup API_Codelet_And_Tasks

+ 2 - 1
examples/cg/cg.c

@@ -308,7 +308,7 @@ static int cg(void)
 		double delta_old;
 		double delta_old;
 		double alpha, beta;
 		double alpha, beta;
 
 
-		starpu_set_iteration(i);
+		starpu_iteration_push(i);
 
 
 		/* q <- A d */
 		/* q <- A d */
 		gemv_kernel(q_handle, A_handle, d_handle, 0.0, 1.0, nblocks, use_reduction);
 		gemv_kernel(q_handle, A_handle, d_handle, 0.0, 1.0, nblocks, use_reduction);
@@ -358,6 +358,7 @@ static int cg(void)
 			FPRINTF(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
 			FPRINTF(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
 		}
 		}
 
 
+		starpu_iteration_pop();
 		i++;
 		i++;
 	}
 	}
 
 

+ 2 - 1
examples/cholesky/cholesky_grain_tag.c

@@ -190,7 +190,7 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 
 
 	for (k = 0; k < nbigblocks; k++)
 	for (k = 0; k < nbigblocks; k++)
 	{
 	{
-		starpu_set_iteration(k);
+		starpu_iteration_push(k);
 		struct starpu_task *task = create_task_11(dataA, k, reclevel);
 		struct starpu_task *task = create_task_11(dataA, k, reclevel);
 		/* we defer the launch of the first task */
 		/* we defer the launch of the first task */
 		if (k == 0)
 		if (k == 0)
@@ -218,6 +218,7 @@ static int cholesky_grain_rec(float *matA, unsigned size, unsigned ld, unsigned
 				}
 				}
 			}
 			}
 		}
 		}
+		starpu_iteration_pop();
 	}
 	}
 
 
 	/* schedule the codelet */
 	/* schedule the codelet */

+ 2 - 1
examples/cholesky/cholesky_implicit.c

@@ -59,7 +59,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 	for (k = 0; k < nblocks; k++)
 	for (k = 0; k < nblocks; k++)
 	{
 	{
 		int ret;
 		int ret;
-		starpu_set_iteration(k);
+		starpu_iteration_push(k);
                 starpu_data_handle_t sdatakk = starpu_data_get_sub_data(dataA, 2, k, k);
                 starpu_data_handle_t sdatakk = starpu_data_get_sub_data(dataA, 2, k, k);
 
 
                 ret = starpu_task_insert(&cl11,
                 ret = starpu_task_insert(&cl11,
@@ -112,6 +112,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 			}
 			}
 			starpu_data_wont_use(sdatakj);
 			starpu_data_wont_use(sdatakj);
 		}
 		}
+		starpu_iteration_pop();
 	}
 	}
 
 
 	starpu_task_wait_for_all();
 	starpu_task_wait_for_all();

+ 2 - 1
examples/cholesky/cholesky_tag.c

@@ -171,7 +171,7 @@ static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
 
 	for (k = 0; k < nblocks; k++)
 	for (k = 0; k < nblocks; k++)
 	{
 	{
-		starpu_set_iteration(k);
+		starpu_iteration_push(k);
 		struct starpu_task *task = create_task_11(dataA, k);
 		struct starpu_task *task = create_task_11(dataA, k);
 		/* we defer the launch of the first task */
 		/* we defer the launch of the first task */
 		if (k == 0)
 		if (k == 0)
@@ -199,6 +199,7 @@ static void _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 					create_task_22(dataA, k, i, j);
 					create_task_22(dataA, k, i, j);
 			}
 			}
 		}
 		}
+		starpu_iteration_pop();
 	}
 	}
 
 
 	/* schedule the codelet */
 	/* schedule the codelet */

+ 2 - 1
examples/cholesky/cholesky_tile_tag.c

@@ -167,7 +167,7 @@ static int cholesky_no_stride(void)
 
 
 	for (k = 0; k < nblocks_p; k++)
 	for (k = 0; k < nblocks_p; k++)
 	{
 	{
-		starpu_set_iteration(k);
+		starpu_iteration_push(k);
 		struct starpu_task *task = create_task_11(k, nblocks_p);
 		struct starpu_task *task = create_task_11(k, nblocks_p);
 		/* we defer the launch of the first task */
 		/* we defer the launch of the first task */
 		if (k == 0)
 		if (k == 0)
@@ -194,6 +194,7 @@ static int cholesky_no_stride(void)
 				}
 				}
 			}
 			}
 		}
 		}
+		starpu_iteration_pop();
 	}
 	}
 
 
 	/* schedule the codelet */
 	/* schedule the codelet */

+ 2 - 1
examples/lu/xlu.c

@@ -184,7 +184,7 @@ static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 
 
 	for (k = 0; k < nblocks; k++)
 	for (k = 0; k < nblocks; k++)
 	{
 	{
-		starpu_set_iteration(k);
+		starpu_iteration_push(k);
 		struct starpu_task *task = create_task_11(dataA, k);
 		struct starpu_task *task = create_task_11(dataA, k);
 
 
 		/* we defer the launch of the first task */
 		/* we defer the launch of the first task */
@@ -215,6 +215,7 @@ static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 			     if (ret == -ENODEV) return ret;
 			     if (ret == -ENODEV) return ret;
 			}
 			}
 		}
 		}
+		starpu_iteration_pop();
 	}
 	}
 
 
 	/* schedule the codelet */
 	/* schedule the codelet */

+ 2 - 1
examples/lu/xlu_implicit.c

@@ -127,7 +127,7 @@ static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 	{
 	{
 		int ret;
 		int ret;
 
 
-		starpu_set_iteration(k);
+		starpu_iteration_push(k);
 
 
 		ret = create_task_11(dataA, k);
 		ret = create_task_11(dataA, k);
 		if (ret == -ENODEV) return ret;
 		if (ret == -ENODEV) return ret;
@@ -152,6 +152,7 @@ static int dw_codelet_facto_v3(starpu_data_handle_t dataA, unsigned nblocks)
 		    starpu_data_wont_use(starpu_data_get_sub_data(dataA, 2, k, i));
 		    starpu_data_wont_use(starpu_data_get_sub_data(dataA, 2, k, i));
 		    starpu_data_wont_use(starpu_data_get_sub_data(dataA, 2, i, k));
 		    starpu_data_wont_use(starpu_data_get_sub_data(dataA, 2, i, k));
 		}
 		}
+		starpu_iteration_pop();
 	}
 	}
 
 
 	/* stall the application until the end of computations */
 	/* stall the application until the end of computations */

+ 2 - 1
examples/lu/xlu_implicit_pivot.c

@@ -173,7 +173,7 @@ static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp,
 	{
 	{
 		int ret;
 		int ret;
 
 
-		starpu_set_iteration(k);
+		starpu_iteration_push(k);
 
 
 		ret = create_task_11_pivot(dataAp, nblocks, k, piv_description, get_block);
 		ret = create_task_11_pivot(dataAp, nblocks, k, piv_description, get_block);
 		if (ret == -ENODEV) return ret;
 		if (ret == -ENODEV) return ret;
@@ -207,6 +207,7 @@ static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp,
 		    starpu_data_wont_use(get_block(dataAp, nblocks, k, i));
 		    starpu_data_wont_use(get_block(dataAp, nblocks, k, i));
 		    starpu_data_wont_use(get_block(dataAp, nblocks, i, k));
 		    starpu_data_wont_use(get_block(dataAp, nblocks, i, k));
 		}
 		}
+		starpu_iteration_pop();
 	}
 	}
 
 
 	/* stall the application until the end of computations */
 	/* stall the application until the end of computations */

+ 2 - 1
examples/lu/xlu_pivot.c

@@ -247,7 +247,7 @@ static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp,
 
 
 	for (k = 0; k < nblocks; k++)
 	for (k = 0; k < nblocks; k++)
 	{
 	{
-		starpu_set_iteration(k);
+		starpu_iteration_push(k);
 		struct starpu_task *task = create_task_11_pivot(dataAp, nblocks, k, piv_description, get_block);
 		struct starpu_task *task = create_task_11_pivot(dataAp, nblocks, k, piv_description, get_block);
 
 
 		/* we defer the launch of the first task */
 		/* we defer the launch of the first task */
@@ -287,6 +287,7 @@ static int dw_codelet_facto_pivot(starpu_data_handle_t *dataAp,
 			     if (ret == -ENODEV) return ret;
 			     if (ret == -ENODEV) return ret;
 			}
 			}
 		}
 		}
+		starpu_iteration_pop();
 	}
 	}
 
 
 	/* we wait the last task (TAG11(nblocks - 1)) and all the pivot tasks */
 	/* we wait the last task (TAG11(nblocks - 1)) and all the pivot tasks */

+ 2 - 1
examples/mandelbrot/mandelbrot.c

@@ -544,7 +544,7 @@ int main(int argc, char **argv)
 		 * parallel task. */
 		 * parallel task. */
 		int per_block_cnt[nblocks_p];
 		int per_block_cnt[nblocks_p];
 
 
-		starpu_set_iteration(niter_p);
+		starpu_iteration_push(niter_p);
 
 
 		for (iby = 0; iby < nblocks_p; iby++)
 		for (iby = 0; iby < nblocks_p; iby++)
 		{
 		{
@@ -579,6 +579,7 @@ int main(int argc, char **argv)
 		}
 		}
 
 
 
 
+		starpu_iteration_pop();
 		if (demo_p)
 		if (demo_p)
 		{
 		{
 			/* Zoom in */
 			/* Zoom in */

+ 2 - 1
examples/ppm_downscaler/yuv_downscaler.c

@@ -218,7 +218,7 @@ int main(int argc, char **argv)
 	/* do the computation */
 	/* do the computation */
 	for (frame = 0; frame < nframes; frame++)
 	for (frame = 0; frame < nframes; frame++)
 	{
 	{
-		starpu_set_iteration(frame);
+		starpu_iteration_push(frame);
 		unsigned blocky;
 		unsigned blocky;
 		for (blocky = 0; blocky < nblocks_y; blocky++)
 		for (blocky = 0; blocky < nblocks_y; blocky++)
 		{
 		{
@@ -266,6 +266,7 @@ int main(int argc, char **argv)
 			ret = starpu_task_submit(task);
 			ret = starpu_task_submit(task);
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 			STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
 		}
 		}
+		starpu_iteration_pop();
 	}
 	}
 
 
 	/* make sure all output buffers are sync'ed */
 	/* make sure all output buffers are sync'ed */

+ 2 - 1
examples/stencil/stencil-tasks.c

@@ -296,7 +296,7 @@ void create_tasks(int rank)
 
 
 	for (iter = 0; iter <= niter; iter++)
 	for (iter = 0; iter <= niter; iter++)
 	{
 	{
-	     starpu_set_iteration(iter);
+	     starpu_iteration_push(iter);
 	     for (bz = 0; bz < nbz; bz++)
 	     for (bz = 0; bz < nbz; bz++)
 	     {
 	     {
 		  if ((iter > 0) && (get_block_mpi_node(bz) == rank))
 		  if ((iter > 0) && (get_block_mpi_node(bz) == rank))
@@ -314,6 +314,7 @@ void create_tasks(int rank)
 				     create_task_save(iter, bz, -1, rank);
 				     create_task_save(iter, bz, -1, rank);
 		     }
 		     }
 	     }
 	     }
+	     starpu_iteration_pop();
 	}
 	}
 }
 }
 
 

+ 2 - 1
examples/stencil/stencil.c

@@ -330,7 +330,7 @@ int main(int argc, char **argv)
 		int iter;
 		int iter;
 		for (iter = 0; iter < who_runs_what_len; iter++)
 		for (iter = 0; iter < who_runs_what_len; iter++)
 		{
 		{
-			starpu_set_iteration(iter);
+			starpu_iteration_push(iter);
 			unsigned last, bz;
 			unsigned last, bz;
 			last = 1;
 			last = 1;
 			for (bz = 0; bz < nbz; bz++)
 			for (bz = 0; bz < nbz; bz++)
@@ -351,6 +351,7 @@ int main(int argc, char **argv)
 			}
 			}
 			FPRINTF(stderr, "\n");
 			FPRINTF(stderr, "\n");
 
 
+			starpu_iteration_pop();
 			if (last)
 			if (last)
 				break;
 				break;
 		}
 		}

+ 2 - 2
include/starpu_task.h

@@ -317,8 +317,8 @@ int starpu_task_wait_for_no_ready(void);
 int starpu_task_nready(void);
 int starpu_task_nready(void);
 int starpu_task_nsubmitted(void);
 int starpu_task_nsubmitted(void);
 
 
-void starpu_set_iteration(unsigned long iteration);
+void starpu_iteration_push(unsigned long iteration);
-void starpu_set_subiteration(unsigned long subiteration);
+void starpu_iteration_pop(void);
 
 
 void starpu_do_schedule(void);
 void starpu_do_schedule(void);
 
 

+ 2 - 1
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -112,7 +112,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 
 
 	for (k = 0; k < nblocks; k++)
 	for (k = 0; k < nblocks; k++)
 	{
 	{
-		starpu_set_iteration(k);
+		starpu_iteration_push(k);
 
 
 		int prio = STARPU_DEFAULT_PRIO;
 		int prio = STARPU_DEFAULT_PRIO;
 		if (!noprio) prio = STARPU_MAX_PRIO;
 		if (!noprio) prio = STARPU_MAX_PRIO;
@@ -155,6 +155,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 			if (my_distrib(k, j, nodes) == rank)
 			if (my_distrib(k, j, nodes) == rank)
 				starpu_data_wont_use(data_handles[k][j]);
 				starpu_data_wont_use(data_handles[k][j]);
 		}
 		}
+		starpu_iteration_pop();
 	}
 	}
 
 
 	starpu_task_wait_for_all();
 	starpu_task_wait_for_all();

+ 2 - 1
mpi/examples/mpi_lu/pxlu.c

@@ -839,7 +839,7 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 
 
 	for (k = 0; k < nblocks; k++)
 	for (k = 0; k < nblocks; k++)
 	{
 	{
-		starpu_set_iteration(k);
+		starpu_iteration_push(k);
 
 
 		create_task_11(k);
 		create_task_11(k);
 
 
@@ -856,6 +856,7 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 				create_task_22(k, i, j);
 				create_task_22(k, i, j);
 			}
 			}
 		}
 		}
+		starpu_iteration_pop();
 	}
 	}
 
 
 	int barrier_ret = starpu_mpi_barrier(MPI_COMM_WORLD);
 	int barrier_ret = starpu_mpi_barrier(MPI_COMM_WORLD);

+ 2 - 1
mpi/examples/mpi_lu/pxlu_implicit.c

@@ -135,7 +135,7 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 
 
 	for (k = 0; k < nblocks; k++)
 	for (k = 0; k < nblocks; k++)
 	{
 	{
-		starpu_set_iteration(k);
+		starpu_iteration_push(k);
 
 
 		create_task_11(k);
 		create_task_11(k);
 
 
@@ -166,6 +166,7 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
 			if (get_block_rank(i, k) == _rank)
 			if (get_block_rank(i, k) == _rank)
 				starpu_data_wont_use(STARPU_PLU(get_block_handle)(i,k));
 				starpu_data_wont_use(STARPU_PLU(get_block_handle)(i,k));
 		}
 		}
+		starpu_iteration_pop();
 	}
 	}
 
 
 	starpu_task_wait_for_all();
 	starpu_task_wait_for_all();

+ 4 - 2
mpi/examples/stencil/stencil5.c

@@ -175,7 +175,7 @@ int main(int argc, char **argv)
 	/* First computation with initial distribution */
 	/* First computation with initial distribution */
 	for(loop=0 ; loop<niter; loop++)
 	for(loop=0 ; loop<niter; loop++)
 	{
 	{
-		starpu_set_iteration(loop);
+		starpu_iteration_push(loop);
 
 
 		for (x = 1; x < X-1; x++)
 		for (x = 1; x < X-1; x++)
 		{
 		{
@@ -187,6 +187,7 @@ int main(int argc, char **argv)
 						       0);
 						       0);
 			}
 			}
 		}
 		}
+		starpu_iteration_pop();
 	}
 	}
 	FPRINTF(stderr, "Waiting ...\n");
 	FPRINTF(stderr, "Waiting ...\n");
 	starpu_task_wait_for_all();
 	starpu_task_wait_for_all();
@@ -216,7 +217,7 @@ int main(int argc, char **argv)
 	/* Second computation with new distribution */
 	/* Second computation with new distribution */
 	for(loop=0 ; loop<niter; loop++)
 	for(loop=0 ; loop<niter; loop++)
 	{
 	{
-		starpu_set_iteration(niter + loop);
+		starpu_iteration_push(niter + loop);
 
 
 		for (x = 1; x < X-1; x++)
 		for (x = 1; x < X-1; x++)
 		{
 		{
@@ -228,6 +229,7 @@ int main(int argc, char **argv)
 						       0);
 						       0);
 			}
 			}
 		}
 		}
+		starpu_iteration_pop();
 	}
 	}
 	FPRINTF(stderr, "Waiting ...\n");
 	FPRINTF(stderr, "Waiting ...\n");
 	starpu_task_wait_for_all();
 	starpu_task_wait_for_all();

+ 2 - 1
mpi/examples/stencil/stencil5_lb.c

@@ -236,7 +236,7 @@ int main(int argc, char **argv)
 	/* First computation with initial distribution */
 	/* First computation with initial distribution */
 	for(loop=0 ; loop<niter; loop++)
 	for(loop=0 ; loop<niter; loop++)
 	{
 	{
-		starpu_set_iteration(loop);
+		starpu_iteration_push(loop);
 
 
 		for (x = 1; x < X-1; x++)
 		for (x = 1; x < X-1; x++)
 		{
 		{
@@ -249,6 +249,7 @@ int main(int argc, char **argv)
 						       0);
 						       0);
 			}
 			}
 		}
 		}
+		starpu_iteration_pop();
 	}
 	}
 	FPRINTF(stderr, "Waiting ...\n");
 	FPRINTF(stderr, "Waiting ...\n");
 	starpu_task_wait_for_all();
 	starpu_task_wait_for_all();

+ 4 - 2
src/core/sched_ctx.c

@@ -468,6 +468,7 @@ struct _starpu_sched_ctx* _starpu_create_sched_ctx(struct starpu_sched_policy *p
 	STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx_manag);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&sched_ctx_manag);
 
 
 	int nworkers = config->topology.nworkers;
 	int nworkers = config->topology.nworkers;
+	unsigned i;
 
 
 	STARPU_ASSERT(nworkers_ctx <= nworkers);
 	STARPU_ASSERT(nworkers_ctx <= nworkers);
 
 
@@ -501,8 +502,9 @@ struct _starpu_sched_ctx* _starpu_create_sched_ctx(struct starpu_sched_policy *p
 	_starpu_barrier_counter_init(&sched_ctx->ready_tasks_barrier, 0);
 	_starpu_barrier_counter_init(&sched_ctx->ready_tasks_barrier, 0);
 
 
 	sched_ctx->ready_flops = 0.0;
 	sched_ctx->ready_flops = 0.0;
-	sched_ctx->iteration = -1;
+	for (i = 0; i < sizeof(sched_ctx->iterations)/sizeof(sched_ctx->iterations[0]); i++)
-	sched_ctx->subiteration = -1;
+		sched_ctx->iterations[i] = -1;
+	sched_ctx->iteration_level = 0;
 	sched_ctx->main_master = -1;
 	sched_ctx->main_master = -1;
 	sched_ctx->perf_arch.devices = NULL;
 	sched_ctx->perf_arch.devices = NULL;
 	sched_ctx->perf_arch.ndevices = 0;
 	sched_ctx->perf_arch.ndevices = 0;

+ 2 - 1
src/core/sched_ctx.h

@@ -70,7 +70,8 @@ struct _starpu_sched_ctx
 	double ready_flops;
 	double ready_flops;
 
 
 	/* Iteration number, as advertised by application */
 	/* Iteration number, as advertised by application */
-	long iteration, subiteration;
+	long iterations[2];
+	int iteration_level;
 
 
 	/* cond to block push when there are no workers in the ctx */
 	/* cond to block push when there are no workers in the ctx */
 	starpu_pthread_cond_t no_workers_cond;
 	starpu_pthread_cond_t no_workers_cond;

+ 13 - 6
src/core/task.c

@@ -652,8 +652,8 @@ int starpu_task_submit(struct starpu_task *task)
 
 
 	if (!j->internal && !continuation)
 	if (!j->internal && !continuation)
 		_STARPU_TRACE_TASK_SUBMIT(j,
 		_STARPU_TRACE_TASK_SUBMIT(j,
-			_starpu_get_sched_ctx_struct(task->sched_ctx)->iteration,
+			_starpu_get_sched_ctx_struct(task->sched_ctx)->iterations[0],
-			_starpu_get_sched_ctx_struct(task->sched_ctx)->subiteration);
+			_starpu_get_sched_ctx_struct(task->sched_ctx)->iterations[1]);
 
 
 	/* If this is a continuation, we don't modify the implicit data dependencies detected earlier. */
 	/* If this is a continuation, we don't modify the implicit data dependencies detected earlier. */
 	if (task->cl && !continuation)
 	if (task->cl && !continuation)
@@ -991,14 +991,21 @@ int starpu_task_wait_for_no_ready(void)
 	return 0;
 	return 0;
 }
 }
 
 
-void starpu_set_iteration(unsigned long iteration)
+void starpu_iteration_push(unsigned long iteration)
 {
 {
-	_starpu_get_sched_ctx_struct(_starpu_sched_ctx_get_current_context())->iteration = iteration;
+	struct _starpu_sched_ctx *ctx = _starpu_get_sched_ctx_struct(_starpu_sched_ctx_get_current_context());
+	unsigned level = ctx->iteration_level++;
+	if (level < sizeof(ctx->iterations)/sizeof(ctx->iterations[0]))
+		ctx->iterations[level] = iteration;
 }
 }
 
 
-void starpu_set_subiteration(unsigned long subiteration)
+void starpu_iteration_pop(void)
 {
 {
-	_starpu_get_sched_ctx_struct(_starpu_sched_ctx_get_current_context())->subiteration = subiteration;
+	struct _starpu_sched_ctx *ctx = _starpu_get_sched_ctx_struct(_starpu_sched_ctx_get_current_context());
+	STARPU_ASSERT_MSG(ctx->iteration_level > 0, "calls to starpu_iteration_pop must match starpu_iteration_push calls");
+	unsigned level = --ctx->iteration_level;
+	if (level < sizeof(ctx->iterations)/sizeof(ctx->iterations[0]))
+		ctx->iterations[level] = -1;
 }
 }
 
 
 void starpu_do_schedule(void)
 void starpu_do_schedule(void)

+ 13 - 11
src/debug/traces/starpu_fxt.c

@@ -101,8 +101,7 @@ struct task_info {
 	double end_time;
 	double end_time;
 	unsigned long footprint;
 	unsigned long footprint;
 	unsigned long kflops;
 	unsigned long kflops;
-	long iteration;
+	long iterations[2];
-	long subiteration;
 	char *parameters;
 	char *parameters;
 	unsigned int ndeps;
 	unsigned int ndeps;
 	unsigned long *dependencies;
 	unsigned long *dependencies;
@@ -116,6 +115,7 @@ struct task_info *tasks_info;
 static struct task_info *get_task(unsigned long job_id, int mpi_rank)
 static struct task_info *get_task(unsigned long job_id, int mpi_rank)
 {
 {
 	struct task_info *task;
 	struct task_info *task;
+	unsigned i;
 
 
 	HASH_FIND(hh, tasks_info, &job_id, sizeof(job_id), task);
 	HASH_FIND(hh, tasks_info, &job_id, sizeof(job_id), task);
 	if (!task)
 	if (!task)
@@ -133,8 +133,8 @@ static struct task_info *get_task(unsigned long job_id, int mpi_rank)
 		task->end_time = 0.;
 		task->end_time = 0.;
 		task->footprint = 0;
 		task->footprint = 0;
 		task->kflops = 0.;
 		task->kflops = 0.;
-		task->iteration = -1;
+		for (i = 0; i < sizeof(task->iterations)/sizeof(task->iterations[0]); i++)
-		task->subiteration = -1;
+			task->iterations[i] = -1;
 		task->parameters = NULL;
 		task->parameters = NULL;
 		task->ndeps = 0;
 		task->ndeps = 0;
 		task->dependencies = NULL;
 		task->dependencies = NULL;
@@ -191,10 +191,12 @@ static void task_dump(unsigned long job_id, int mpi_rank)
 	fprintf(tasks_file, "Footprint: %lx\n", task->footprint);
 	fprintf(tasks_file, "Footprint: %lx\n", task->footprint);
 	if (task->kflops != 0)
 	if (task->kflops != 0)
 		fprintf(tasks_file, "GFlop: %f\n", ((double) task->kflops) / 1000000);
 		fprintf(tasks_file, "GFlop: %f\n", ((double) task->kflops) / 1000000);
-	if (task->iteration != -1)
+	if (task->iterations[0] != -1) {
-		fprintf(tasks_file, "Iteration: %ld\n", task->iteration);
+		fprintf(tasks_file, "Iteration:");
-	if (task->subiteration != -1)
+		for (i = 0; i < sizeof(task->iterations)/sizeof(task->iterations[0]); i++)
-		fprintf(tasks_file, "Subiteration: %ld\n", task->subiteration);
+			fprintf(tasks_file, " %ld", task->iterations[i]);
+		fprintf(tasks_file, "\n");
+	}
 	if (task->parameters)
 	if (task->parameters)
 	{
 	{
 		fprintf(tasks_file, "Parameters: %s\n", task->parameters);
 		fprintf(tasks_file, "Parameters: %s\n", task->parameters);
@@ -1456,7 +1458,7 @@ static void handle_codelet_details(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 		char *prefix = options->file_prefix;
 		char *prefix = options->file_prefix;
 		unsigned sched_ctx = ev->param[0];
 		unsigned sched_ctx = ev->param[0];
 
 
-		worker_set_detailed_state(last_codelet_start[worker], prefix, worker, _starpu_last_codelet_symbol[worker], ev->param[1], parameters, ev->param[2], ev->param[4], job_id, ((double) task->kflops) / 1000000, X, Y, Z, task->iteration, task->subiteration);
+		worker_set_detailed_state(last_codelet_start[worker], prefix, worker, _starpu_last_codelet_symbol[worker], ev->param[1], parameters, ev->param[2], ev->param[4], job_id, ((double) task->kflops) / 1000000, X, Y, Z, task->iterations[0], task->iterations[1]);
 		if (sched_ctx != 0)
 		if (sched_ctx != 0)
 		{
 		{
 #ifdef STARPU_HAVE_POTI
 #ifdef STARPU_HAVE_POTI
@@ -2323,8 +2325,8 @@ static void handle_task_submit(struct fxt_ev_64 *ev, struct starpu_fxt_options *
 
 
 	struct task_info *task = get_task(job_id, options->file_rank);
 	struct task_info *task = get_task(job_id, options->file_rank);
 	task->submit_time = get_event_time_stamp(ev, options);
 	task->submit_time = get_event_time_stamp(ev, options);
-	task->iteration = iteration;
+	task->iterations[0] = iteration;
-	task->subiteration = subiteration;
+	task->iterations[1] = subiteration;
 }
 }
 
 
 static void handle_task_done(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)
 static void handle_task_done(struct fxt_ev_64 *ev, struct starpu_fxt_options *options)