소스 검색

beginning of support for dependencies in theoretical bound; not working yet.

Samuel Thibault 14 년 전
부모
커밋
11eb858e43
9개의 변경된 파일, 384개의 추가 및 90개의 삭제
  1. 6 3
      doc/starpu.texi
  2. 12 1
      examples/lu/lu_example.c
  3. 3 2
      include/starpu_bound.h
  4. 3 0
      src/core/dependencies/tags.c
  5. 1 0
      src/core/dependencies/task_deps.c
  6. 2 0
      src/core/jobs.c
  7. 1 0
      src/core/jobs.h
  8. 350 84
      src/profiling/bound.c
  9. 6 0
      src/profiling/bound.h

+ 6 - 3
doc/starpu.texi

@@ -1126,9 +1126,12 @@ lower bound for the total execution time of your tasks. If StarPU was compiled
 with the glpk library installed, @code{starpu_bound_compute} can be used to solve it
 immediately and get the optimized minimum.
 
-Note that this is not taking into account task dependencies and data
-transfers. It only takes into account the actual computations on processing
-units. It however properly takes into account the varying performances of
+The @code{deps} parameter tells StarPU whether to take tasks and data
+dependencies into account. It must be understood that the linear programming
+problem size is quadratic in the number of tasks and thus the time to solve it
+will be very long, typically one minute for just 200 tasks. Setting @code{deps}
+to 0 will only take into account the actual computations on processing
+units. It however still properly takes into account the varying performances of
 kernels and processing units, which is quite more accurate than just comparing
 StarPU performances with the fastest of the kernels being used.
 

+ 12 - 1
examples/lu/lu_example.c

@@ -32,6 +32,7 @@ static unsigned pivot = 0;
 static unsigned no_stride = 0;
 static unsigned profile = 0;
 static unsigned bound = 0;
+static unsigned bounddeps = 0;
 
 TYPE *A, *A_saved;
 
@@ -71,6 +72,10 @@ static void parse_args(int argc, char **argv)
 		if (strcmp(argv[i], "-bound") == 0) {
 			bound = 1;
 		}
+		if (strcmp(argv[i], "-bounddeps") == 0) {
+			bound = 1;
+			bounddeps = 1;
+		}
 	}
 }
 
@@ -267,7 +272,7 @@ int main(int argc, char **argv)
 	display_matrix(A, size, size, "A");
 
 	if (bound)
-		starpu_bound_start();
+		starpu_bound_start(bounddeps);
 
 	if (profile)
 		starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
@@ -320,9 +325,15 @@ int main(int argc, char **argv)
 	if (bound) {
 		double min;
 		starpu_bound_stop();
+#if 0
+		FILE *f = fopen("lu.pl", "w");
+		starpu_bound_print_lp(f);
+		starpu_bound_print(stderr);
+#else
 		starpu_bound_compute(&min);
 		if (min != 0.)
 			fprintf(stderr, "theoretical min: %lf ms\n", min);
+#endif
 	}
 
 	if (check)

+ 3 - 2
include/starpu_bound.h

@@ -22,8 +22,9 @@
 #ifndef __STARPU_BOUND_H__
 #define __STARPU_BOUND_H__
 
-/* Start recording tasks (resets stats) */
-void starpu_bound_start(void);
+/* Start recording tasks (resets stats).  `deps' tells whether dependencies
+ * should be recorded too (this is quite expensive).  */
+void starpu_bound_start(int deps);
 /* Stop recording tasks */
 void starpu_bound_stop(void);
 

+ 3 - 0
src/core/dependencies/tags.c

@@ -22,6 +22,7 @@
 #include <core/jobs.h>
 #include <core/sched_policy.h>
 #include <core/dependencies/data_concurrency.h>
+#include <profiling/bound.h>
 
 static starpu_htbl_node_t *tag_htbl = NULL;
 static pthread_rwlock_t tag_global_rwlock = PTHREAD_RWLOCK_INITIALIZER;
@@ -228,6 +229,7 @@ void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t
 		/* id depends on dep_id
 		 * so cg should be among dep_id's successors*/
 		STARPU_TRACE_TAG_DEPS(id, dep_id);
+		_starpu_bound_tag_dep(id, dep_id);
 		struct starpu_tag_s *tag_dep = gettag_struct(dep_id);
 		STARPU_ASSERT(tag_dep != tag_child);
 		_starpu_spin_lock(&tag_dep->lock);
@@ -261,6 +263,7 @@ void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...)
 		/* id depends on dep_id
 		 * so cg should be among dep_id's successors*/
 		STARPU_TRACE_TAG_DEPS(id, dep_id);
+		_starpu_bound_tag_dep(id, dep_id);
 		struct starpu_tag_s *tag_dep = gettag_struct(dep_id);
 		STARPU_ASSERT(tag_dep != tag_child);
 		_starpu_spin_lock(&tag_dep->lock);

+ 1 - 0
src/core/dependencies/task_deps.c

@@ -81,6 +81,7 @@ void starpu_task_declare_deps_array(struct starpu_task *task, unsigned ndeps, st
 		STARPU_ASSERT(dep_job != job);
 
 		STARPU_TRACE_TASK_DEPS(dep_job, job);
+		_starpu_bound_task_dep(job, dep_job);
 
 		PTHREAD_MUTEX_LOCK(&dep_job->sync_mutex);
 		_starpu_task_add_succ(dep_job, cg);

+ 2 - 0
src/core/jobs.c

@@ -77,6 +77,8 @@ starpu_job_t __attribute__((malloc)) _starpu_job_create(struct starpu_task *task
 	PTHREAD_MUTEX_INIT(&job->sync_mutex, NULL);
 	PTHREAD_COND_INIT(&job->sync_cond, NULL);
 
+	job->bound_task = NULL;
+
 	if (task->use_tag)
 		_starpu_tag_declare(task->tag_id, job);
 

+ 1 - 0
src/core/jobs.h

@@ -76,6 +76,7 @@ LIST_TYPE(starpu_job,
         unsigned exclude_from_dag;
         const char *model_name;
 #endif
+	struct bound_task *bound_task;
 );
 
 starpu_job_t __attribute__((malloc)) _starpu_job_create(struct starpu_task *task);

+ 350 - 84
src/profiling/bound.c

@@ -30,52 +30,167 @@
 
 /* TODO: output duration between starpu_bound_start and starpu_bound_stop */
 
-struct task_pool {
+/*
+ * Record without dependencies: just count each kind of task
+ *
+ * The linear programming problem will just have as variables:
+ * - the number of tasks of kind `t' executed by worker `w'
+ * - the total duration
+ *
+ * and the constraints will be:
+ * - the time taken by each worker to complete its assigned tasks is lower than
+ *   the total duration.
+ * - the total number of tasks of a given kind is equal to the number run by the
+ *   application.
+ */
+struct bound_task_pool {
 	/* Which codelet has been executed */
 	struct starpu_codelet_t *cl;
 	/* Task footprint key */
 	uint32_t footprint;
 	/* Number of tasks of this kind */
 	unsigned long n;
+	/* Other task kinds */
+	struct bound_task_pool *next;
+};
+
+/*
+ * Record with dependencies: each task is recorded separately
+ *
+ * The linear programming problem will have as variables:
+ * - The start time of each task
+ * - The completion time of each tag
+ * - The total duration
+ * - For each task and for each worker, whether the task is executing on that worker.
+ * - For each pair of tasks, which task is scheduled first.
+ *
+ * and the constraints will be:
+ * - All task start time plus duration are less than total duration
+ * - Each task is executed on exactly one worker.
+ * - Each task starts after all its task dependencies finish.
+ * - Each task starts after all its tag dependencies finish.
+ * - For each task pair and each worker, if both tasks are executed by that worker,
+ *   one is started after the other's completion.
+ */
+/* Note: only task-task, implicit data dependencies or task-tag dependencies
+ * are taken into account. Tags released in a callback or by similar means
+ * are not taken into account; only tags associated with a task are. */
+struct bound_task {
+	/* Unique ID */
+	int id;
+	/* Tag ID, if any */
+	starpu_tag_t tag_id;
+	int use_tag;
+	/* Which codelet has been executed */
+	struct starpu_codelet_t *cl;
+	/* Task footprint key */
+	uint32_t footprint;
+	/* Task priority */
+	int priority;
+	/* Tasks this one depends on */
+	struct bound_task **deps;
+	int depsn;
+
+	/* Estimated duration */
+	double duration[STARPU_NARCH_VARIATIONS];
+
 	/* Other tasks */
-	struct task_pool *next;
+	struct bound_task *next;
+};
+
+struct bound_tag_dep {
+	starpu_tag_t tag;
+	starpu_tag_t dep_tag;
+	struct bound_tag_dep *next;
 };
 
-static struct task_pool *task_pools, *last;
+static struct bound_task_pool *task_pools, *last;
+static struct bound_task *tasks;
+static struct bound_tag_dep *tag_deps;
 static int recording;
+static int recorddeps;
 
 static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
 
-void starpu_bound_start(void)
+void starpu_bound_start(int deps)
 {
-	struct task_pool *tp;
+	struct bound_task_pool *tp;
+	struct bound_task *t;
+	struct bound_tag_dep *td;
 
 	PTHREAD_MUTEX_LOCK(&mutex);
+
 	tp = task_pools;
 	task_pools = NULL;
 	last = NULL;
+
+	t = tasks;
+	tasks = NULL;
+
+	td = tag_deps;
+	tag_deps = NULL;
+
 	recording = 1;
+	recorddeps = deps;
+
 	PTHREAD_MUTEX_UNLOCK(&mutex);
 
 	for ( ; tp; tp = tp->next)
 		free(tp);
-}
 
-void _starpu_bound_record(starpu_job_t j)
-{
-	struct task_pool *tp;
+	for ( ; t; t = t->next)
+		free(t);
 
-	if (!recording)
-		return;
+	for ( ; td; td = td->next)
+		free(td);
+}
 
+static int good_job(starpu_job_t j)
+{
 	/* No codelet, nothing to measure */
 	if (!j->task->cl)
-		return;
+		return 0;
 	/* No performance model, no time duration estimation */
 	if (!j->task->cl->model)
-		return;
+		return 0;
 	/* Only support history based */
 	if (j->task->cl->model->type != STARPU_HISTORY_BASED)
+		return 0;
+	return 1;
+}
+
+static void new_task(starpu_job_t j)
+{
+	struct bound_task *t;
+	static int task_ids;
+
+	if (j->bound_task)
+		return;
+
+	if (STARPU_UNLIKELY(!j->footprint_is_computed))
+		_starpu_compute_buffers_footprint(j);
+
+	t = malloc(sizeof(*t));
+	memset(t, 0, sizeof(*t));
+	t->id = task_ids++;
+	t->tag_id = j->task->tag_id;
+	t->use_tag = j->task->use_tag;
+	t->cl = j->task->cl;
+	t->footprint = j->footprint;
+	t->priority = j->task->priority;
+	t->deps = NULL;
+	t->depsn = 0;
+	t->next = tasks;
+	j->bound_task = t;
+	tasks = t;
+}
+
+void _starpu_bound_record(starpu_job_t j)
+{
+	if (!recording)
+		return;
+
+	if (!good_job(j))
 		return;
 
 	PTHREAD_MUTEX_LOCK(&mutex);
@@ -85,28 +200,81 @@ void _starpu_bound_record(starpu_job_t j)
 		return;
 	}
 
-	if (STARPU_UNLIKELY(!j->footprint_is_computed))
-		_starpu_compute_buffers_footprint(j);
+	if (recorddeps) {
+		new_task(j);
+	} else {
+		struct bound_task_pool *tp;
+
+		if (STARPU_UNLIKELY(!j->footprint_is_computed))
+			_starpu_compute_buffers_footprint(j);
+
+		if (last && last->cl == j->task->cl && last->footprint == j->footprint)
+			tp = last;
+		else
+			for (tp = task_pools; tp; tp = tp->next)
+				if (tp->cl == j->task->cl && tp->footprint == j->footprint)
+					break;
+
+		if (!tp) {
+			tp = malloc(sizeof(*tp));
+			tp->cl = j->task->cl;
+			tp->footprint = j->footprint;
+			tp->n = 0;
+			tp->next = task_pools;
+			task_pools = tp;
+		}
 
-	if (last && last->cl == j->task->cl && last->footprint == j->footprint)
-		tp = last;
-	else
-		for (tp = task_pools; tp; tp = tp->next)
-			if (tp->cl == j->task->cl && tp->footprint == j->footprint)
-				break;
-
-	if (!tp) {
-		tp = malloc(sizeof(*tp));
-		tp->cl = j->task->cl;
-		tp->footprint = j->footprint;
-		tp->n = 0;
-		tp->next = task_pools;
-		task_pools = tp;
+		/* One more task of this kind */
+		tp->n++;
+	}
+
+	PTHREAD_MUTEX_UNLOCK(&mutex);
+}
+
+void _starpu_bound_tag_dep(starpu_tag_t id, starpu_tag_t dep_id)
+{
+	struct bound_tag_dep *td;
+
+	if (!recording || !recorddeps)
+		return;
+
+	PTHREAD_MUTEX_LOCK(&mutex);
+	/* Re-check, this time with mutex held */
+	if (!recording || !recorddeps) {
+		PTHREAD_MUTEX_UNLOCK(&mutex);
+		return;
 	}
 
-	/* One more task of this kind */
-	tp->n++;
+	td = malloc(sizeof(*td));
+	td->tag = id;
+	td->dep_tag = dep_id;
+	td->next = tag_deps;
+	tag_deps = td;
+	PTHREAD_MUTEX_UNLOCK(&mutex);
+}
+
+void _starpu_bound_task_dep(starpu_job_t j, starpu_job_t dep_j)
+{
+	struct bound_task *t;
+
+	if (!recording || !recorddeps)
+		return;
 
+	if (!good_job(j) || !good_job(dep_j))
+		return;
+
+	PTHREAD_MUTEX_LOCK(&mutex);
+	/* Re-check, this time with mutex held */
+	if (!recording || !recorddeps) {
+		PTHREAD_MUTEX_UNLOCK(&mutex);
+		return;
+	}
+
+	new_task(j);
+	new_task(dep_j);
+	t = j->bound_task;
+	t->deps = realloc(t->deps, ++t->depsn * sizeof(t->deps[0]));
+	t->deps[t->depsn-1] = dep_j->bound_task;
 	PTHREAD_MUTEX_UNLOCK(&mutex);
 }
 
@@ -118,7 +286,7 @@ void starpu_bound_stop(void)
 }
 
 static void _starpu_get_tasks_times(int nw, int nt, double times[nw][nt]) {
-	struct task_pool *tp;
+	struct bound_task_pool *tp;
 	int w, t;
 	for (w = 0; w < nw; w++) {
 		for (t = 0, tp = task_pools; tp; t++, tp = tp->next) {
@@ -137,63 +305,145 @@ static void _starpu_get_tasks_times(int nw, int nt, double times[nw][nt]) {
  */
 void starpu_bound_print_lp(FILE *output)
 {
-	struct task_pool *tp;
 	int nt; /* Number of different kinds of tasks */
 	int nw; /* Number of different workers */
 	int t, w;
 
 	PTHREAD_MUTEX_LOCK(&mutex);
-
 	nw = starpu_worker_get_count();
-	nt = 0;
-	for (tp = task_pools; tp; tp = tp->next)
-		nt++;
 
-	{
-		double times[nw][nt];
-
-		_starpu_get_tasks_times(nw, nt, times);
+	if (recorddeps) {
+		struct bound_task *t, *t2;
+		struct bound_tag_dep *td;
+		int i;
 
+		nt = 0;
+		for (t = tasks; t; t = t->next) {
+			struct starpu_job_s j = {
+				.footprint = t->footprint,
+				.footprint_is_computed = 1,
+			};
+			for (w = 0; w < nw; w++) {
+				enum starpu_perf_archtype arch = starpu_worker_get_perf_archtype(w);
+				if (t->duration[arch] == 0.)
+					t->duration[arch] = _starpu_history_based_job_expected_length(t->cl->model, arch, &j) / 1000.;
+			}
+			nt++;
+		}
 		fprintf(output, "/* StarPU upper bound linear programming problem, to be run in lp_solve. */\n\n");
+		fprintf(output, "/* !! This is a big system, it will be long to solve !! */\n\n");
 		fprintf(output, "/* We want to minimize total execution time (ms) */\n");
 		fprintf(output, "min: tmax;\n\n");
 
-		fprintf(output, "/* Which is the maximum of all worker execution times (ms) */\n");
-		for (w = 0; w < nw; w++) {
-			char name[32];
-			starpu_worker_get_name(w, name, sizeof(name));
-			fprintf(output, "/* worker %s */\n", name);
-			for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
-				fprintf(output, "\t%+f * w%dt%dn", (float) times[w][t], w, t);
-			fprintf(output, " <= tmax;\n");
-		}
-		fprintf(output, "\n");
+		fprintf(output, "/* Which is the maximum of all task completion times (ms) */\n");
+		for (t = tasks; t; t = t->next)
+			fprintf(output, "c%u <= tmax;\n", t->id);
 
-		fprintf(output, "/* And we have to have computed exactly all tasks */\n");
-		for (t = 0, tp = task_pools; tp; t++, tp = tp->next) {
-			fprintf(output, "/* task %s key %x */\n", tp->cl->model->symbol, (unsigned) tp->footprint);
+		fprintf(output, "\n/* We have tasks executing on workers, exactly one worker executes each task */\n");
+		for (t = tasks; t; t = t->next) {
 			for (w = 0; w < nw; w++)
-				fprintf(output, "\t+w%dt%dn", w, t);
-			fprintf(output, " = %ld;\n", tp->n);
-			/* Show actual values */
-			fprintf(output, "/*");
-			for (w = 0; w < nw; w++)
-				fprintf(output, "\t+%ld", tp->cl->per_worker_stats[w]);
-			fprintf(output, "\t*/\n\n");
+				fprintf(output, " +t%uw%u", t->id, w);
+			fprintf(output, " = 1;\n");
 		}
 
-		fprintf(output, "/* Optionally tell that tasks can not be divided */\n");
-		fprintf(output, "/* int ");
-		int first = 1;
-		for (w = 0; w < nw; w++)
+		fprintf(output, "\n/* Completion time is start time plus computation time */\n");
+		fprintf(output, "/* According to where the task is indeed executed */\n");
+		for (t = tasks; t; t = t->next) {
+			fprintf(output, "c%u = s%u", t->id, t->id);
+			for (w = 0; w < nw; w++) {
+				enum starpu_perf_archtype arch = starpu_worker_get_perf_archtype(w);
+				fprintf(output, " + %f t%uw%u", t->duration[arch], t->id, w);
+			}
+			fprintf(output, ";\n");
+		}
+
+		fprintf(output, "\n/* Each task starts after all its task dependencies finish. */\n");
+		fprintf(output, "/* Note that the dependency finish time depends on the worker where it's working */\n");
+		for (t = tasks; t; t = t->next)
+			for (i = 0; i < t->depsn; i++)
+				fprintf(output, "s%u >= c%u;\n", t->id, t->deps[i]->id);
+
+		fprintf(output, "\n/* Each tag finishes when its corresponding task finishes */");
+		for (t = tasks; t; t = t->next)
+			if (t->use_tag) {
+				for (w = 0; w < nw; w++)
+					fprintf(output, "c%u = tag%lu;\n", t->id, (unsigned long) t->tag_id);
+			}
+
+		fprintf(output, "\n/* tags start after all their tag dependencies finish. */\n");
+		for (td = tag_deps; td; td = td->next)
+			fprintf(output, "tag%lu >= tag%lu;\n", (unsigned long) td->tag, (unsigned long) td->dep_tag);
+
+		fprintf(output, "\n/* For each task pair and each worker, if both tasks are executed by the same worker,\n");
+		fprintf(output, "   one is started after the other's completion */\n");
+		for (t = tasks; t; t = t->next)
+			for (t2 = t->next; t2; t2 = t2->next) {
+				for (w = 0; w < nw; w++) {
+					fprintf(output, "s%u - c%u >= -3e10 + 1e10 t%uw%u + 1e10 t%uw%u + 1e10 t%uafter%u;\n",
+							t->id, t2->id, t->id, w, t2->id, w, t->id, t2->id);
+					fprintf(output, "s%u - c%u >= -2e10 + 1e10 t%uw%u + 1e10 t%uw%u - 1e10 t%uafter%u;\n",
+							t2->id, t->id, t->id, w, t2->id, w, t->id, t2->id);
+				}
+			}
+
+		for (t = tasks; t; t = t->next)
+			for (w = 0; w < nw; w++)
+				fprintf(output, "bin t%uw%u;\n", t->id, w);
+		for (t = tasks; t; t = t->next)
+			for (t2 = t->next; t2; t2 = t2->next)
+				fprintf(output, "bin t%uafter%u;\n", t->id, t2->id);
+	} else {
+		struct bound_task_pool *tp;
+		nt = 0;
+		for (tp = task_pools; tp; tp = tp->next)
+			nt++;
+
+		{
+			double times[nw][nt];
+
+			_starpu_get_tasks_times(nw, nt, times);
+
+			fprintf(output, "/* StarPU upper bound linear programming problem, to be run in lp_solve. */\n\n");
+			fprintf(output, "/* We want to minimize total execution time (ms) */\n");
+			fprintf(output, "min: tmax;\n\n");
+
+			fprintf(output, "/* Which is the maximum of all worker execution times (ms) */\n");
+			for (w = 0; w < nw; w++) {
+				char name[32];
+				starpu_worker_get_name(w, name, sizeof(name));
+				fprintf(output, "/* worker %s */\n", name);
+				for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
+					fprintf(output, "\t%+f * w%ut%un", (float) times[w][t], w, t);
+				fprintf(output, " <= tmax;\n");
+			}
+			fprintf(output, "\n");
+
+			fprintf(output, "/* And we have to have computed exactly all tasks */\n");
 			for (t = 0, tp = task_pools; tp; t++, tp = tp->next) {
-				if (!first)
-					fprintf(output, ",");
-				else
-					first = 0;
-				fprintf(output, "w%dt%dn", w, t);
+				fprintf(output, "/* task %s key %x */\n", tp->cl->model->symbol, (unsigned) tp->footprint);
+				for (w = 0; w < nw; w++)
+					fprintf(output, "\t+w%ut%un", w, t);
+				fprintf(output, " = %ld;\n", tp->n);
+				/* Show actual values */
+				fprintf(output, "/*");
+				for (w = 0; w < nw; w++)
+					fprintf(output, "\t+%ld", tp->cl->per_worker_stats[w]);
+				fprintf(output, "\t*/\n\n");
 			}
-		fprintf(output, "; */\n");
+
+			fprintf(output, "/* Optionally tell that tasks can not be divided */\n");
+			fprintf(output, "/* int ");
+			int first = 1;
+			for (w = 0; w < nw; w++)
+				for (t = 0, tp = task_pools; tp; t++, tp = tp->next) {
+					if (!first)
+						fprintf(output, ",");
+					else
+						first = 0;
+					fprintf(output, "w%ut%un", w, t);
+				}
+			fprintf(output, "; */\n");
+		}
 	}
 
 	PTHREAD_MUTEX_UNLOCK(&mutex);
@@ -204,11 +454,16 @@ void starpu_bound_print_lp(FILE *output)
  */
 void starpu_bound_print_mps(FILE *output)
 {
-	struct task_pool * tp;
+	struct bound_task_pool * tp;
 	int nt; /* Number of different kinds of tasks */
 	int nw; /* Number of different workers */
 	int t, w;
 
+	if (recorddeps) {
+		fprintf(output, "Not supported\n");
+		return;
+	}
+
 	PTHREAD_MUTEX_LOCK(&mutex);
 
 	nw = starpu_worker_get_count();
@@ -233,13 +488,13 @@ void starpu_bound_print_mps(FILE *output)
 			char name[32];
 			starpu_worker_get_name(w, name, sizeof(name));
 			fprintf(output, "* worker %s\n", name);
-			fprintf(output, " L  W%d\n", w);
+			fprintf(output, " L  W%u\n", w);
 		}
 
 		fprintf(output, "\n* And we have to have computed exactly all tasks\n");
 		for (t = 0, tp = task_pools; tp; t++, tp = tp->next) {
 			fprintf(output, "* task %s key %x\n", tp->cl->model->symbol, (unsigned) tp->footprint);
-			fprintf(output, " E  T%d\n", t);
+			fprintf(output, " E  T%u\n", t);
 		}
 
 		fprintf(output, "\nCOLUMNS\n");
@@ -248,21 +503,21 @@ void starpu_bound_print_mps(FILE *output)
 		for (w = 0; w < nw; w++)
 			for (t = 0, tp = task_pools; tp; t++, tp = tp->next) {
 				char name[9];
-				snprintf(name, sizeof(name), "W%dT%d", w, t);
-				fprintf(stderr,"    %-8s  W%-7d  %12f\n", name, w, times[w][t]);
-				fprintf(stderr,"    %-8s  T%-7d  %12u\n", name, t, 1);
+				snprintf(name, sizeof(name), "W%uT%u", w, t);
+				fprintf(stderr,"    %-8s  W%-7u  %12f\n", name, w, times[w][t]);
+				fprintf(stderr,"    %-8s  T%-7u  %12u\n", name, t, 1);
 			}
 
 		fprintf(output, "\n* Total execution time\n");
 		for (w = 0; w < nw; w++)
-			fprintf(stderr,"    TMAX      W%-2d       %12d\n", w, -1);
-		fprintf(stderr,"    TMAX      TMAX      %12d\n", 1);
+			fprintf(stderr,"    TMAX      W%-2u       %12u\n", w, -1);
+		fprintf(stderr,"    TMAX      TMAX      %12u\n", 1);
 
 		fprintf(output, "\nRHS\n");
 
 		fprintf(output, "\n* Total number of tasks\n");
 		for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
-			fprintf(stderr,"    NT%-2d      T%-7d  %12lu\n", t, t, tp->n);
+			fprintf(stderr,"    NT%-2u      T%-7u  %12lu\n", t, t, tp->n);
 
 		fprintf(output, "ENDATA\n");
 	}
@@ -276,7 +531,7 @@ void starpu_bound_print_mps(FILE *output)
 #ifdef HAVE_GLPK_H
 static glp_prob *_starpu_bound_glp_resolve(void)
 {
-	struct task_pool * tp;
+	struct bound_task_pool * tp;
 	int nt; /* Number of different kinds of tasks */
 	int nw; /* Number of different workers */
 	int t, w;
@@ -313,7 +568,7 @@ static glp_prob *_starpu_bound_glp_resolve(void)
 		for (w = 0; w < nw; w++)
 			for (t = 0, tp = task_pools; tp; t++, tp = tp->next) {
 				char name[32];
-				snprintf(name, sizeof(name), "w%dt%dn", w, t);
+				snprintf(name, sizeof(name), "w%ut%un", w, t);
 				glp_set_col_name(lp, colnum(w, t), name);
 				glp_set_col_bnds(lp, colnum(w, t), GLP_LO, 0., 0.);
 			}
@@ -376,10 +631,15 @@ static glp_prob *_starpu_bound_glp_resolve(void)
 
 void starpu_bound_print(FILE *output) {
 #ifdef HAVE_GLPK_H
+	if (recorddeps) {
+		fprintf(output, "Not supported\n");
+		return;
+	}
+
 	PTHREAD_MUTEX_LOCK(&mutex);
 	glp_prob *lp = _starpu_bound_glp_resolve();
 	if (lp) {
-		struct task_pool * tp;
+		struct bound_task_pool * tp;
 		int t, w;
 		int nw; /* Number of different workers */
 		double tmax;
@@ -393,7 +653,7 @@ void starpu_bound_print(FILE *output) {
 		for (t = 0, tp = task_pools; tp; t++, tp = tp->next) {
 			fprintf(output, "%s key %x\n", tp->cl->model->symbol, (unsigned) tp->footprint);
 			for (w = 0; w < nw; w++)
-				fprintf(output, "\tw%dt%d %f", w, t, glp_get_col_prim(lp, colnum(w, t)));
+				fprintf(output, "\tw%ut%u %f", w, t, glp_get_col_prim(lp, colnum(w, t)));
 			fprintf(output, "\n");
 		}
 
@@ -410,6 +670,12 @@ void starpu_bound_print(FILE *output) {
 void starpu_bound_compute(double *res) {
 #ifdef HAVE_GLPK_H
 	double ret;
+
+	if (recorddeps) {
+		*res = 0.;
+		return;
+	}
+
 	PTHREAD_MUTEX_LOCK(&mutex);
 	glp_prob *lp = _starpu_bound_glp_resolve();
 	if (lp) {

+ 6 - 0
src/profiling/bound.h

@@ -24,4 +24,10 @@
 /* Record task for bound computation */
 extern void _starpu_bound_record(starpu_job_t j);
 
+/* Record tag dependency */
+extern void _starpu_bound_tag_dep(starpu_tag_t id, starpu_tag_t dep_id);
+
+/* Record task dependency */
+extern void _starpu_bound_task_dep(starpu_job_t j, starpu_job_t dep_j);
+
 #endif // __BOUND_H__