/* StarPU --- Runtime system for heterogeneous multicore architectures. * * Copyright (C) 2011,2012,2014 Inria * Copyright (C) 2010-2017 Université de Bordeaux * Copyright (C) 2010-2017 CNRS * Copyright (C) 2013 Thibaut Lambert * Copyright (C) 2011 Télécom-SudParis * * StarPU is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; either version 2.1 of the License, or (at * your option) any later version. * * StarPU is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * * See the GNU Lesser General Public License in COPYING.LGPL for more details. */ /* * Record which kinds of tasks have been executed, to later on compute an upper * bound of the performance that could have theoretically been achieved */ #include #include #include #include #include #ifdef STARPU_HAVE_GLPK_H #include #endif /* STARPU_HAVE_GLPK_H */ /* TODO: output duration between starpu_bound_start and starpu_bound_stop */ /* TODO: compute critical path and introduce it in the LP */ /* * Record without dependencies: just count each kind of task * * The linear programming problem will just have as variables: * - the number of tasks of kind `t' executed by worker `w' * - the total duration * * and the constraints will be: * - the time taken by each worker to complete its assigned tasks is lower than * the total duration. * - the total numer of tasks of a given kind is equal to the number run by the * application. */ struct bound_task_pool { /* Which codelet has been executed */ struct starpu_codelet *cl; /* Task footprint key (for history-based perfmodel) */ uint32_t footprint; /* Number of tasks of this kind */ unsigned long n; /* Other task kinds */ struct bound_task_pool *next; }; /* * Record with dependencies: each task is recorded separately * * The linear programming problem will have as variables: * - The start time of each task * - The completion time of each tag * - The total duration * - For each task and for each worker, whether the task is executing on that worker. * - For each pair of task, which task is scheduled first. * * and the constraints will be: * - All task start time plus duration are less than total duration * - Each task is executed on exactly one worker. * - Each task starts after all its task dependencies finish. * - Each task starts after all its tag dependencies finish. * - For each task pair and each worker, if both tasks are executed by that worker, * one is started after the other's completion. */ struct task_dep { /* Task this depends on */ struct bound_task *dep; /* Data transferred between tasks (i.e. implicit data dep size) */ size_t size; }; struct bound_task { /* Unique ID */ unsigned long id; /* Tag ID, if any */ starpu_tag_t tag_id; int use_tag; /* Which codelet has been executed */ struct starpu_codelet *cl; /* Task footprint key */ uint32_t footprint; /* Task priority */ int priority; /* Tasks this one depends on */ struct task_dep *deps; int depsn; /* Estimated duration */ double** duration[STARPU_NARCH]; /* Other tasks */ struct bound_task *next; }; struct bound_tag_dep { starpu_tag_t tag; starpu_tag_t dep_tag; struct bound_tag_dep *next; }; static struct bound_task_pool *task_pools, *last; static struct bound_task *tasks; static struct bound_tag_dep *tag_deps; int _starpu_bound_recording; static int recorddeps; static int recordprio; static starpu_pthread_mutex_t mutex = STARPU_PTHREAD_MUTEX_INITIALIZER; /* Initialization */ void starpu_bound_start(int deps, int prio) { struct bound_task_pool *tp; struct bound_task *t; struct bound_tag_dep *td; STARPU_PTHREAD_MUTEX_LOCK(&mutex); tp = task_pools; task_pools = NULL; last = NULL; t = tasks; tasks = NULL; td = tag_deps; tag_deps = NULL; _starpu_bound_recording = 1; recorddeps = deps; recordprio = prio; STARPU_PTHREAD_MUTEX_UNLOCK(&mutex); while (tp != NULL) { struct bound_task_pool *next = tp->next; free(tp); tp = next; } while (t != NULL) { struct bound_task *next = t->next; free(t); t = next; } while (td != NULL) { struct bound_tag_dep *next = td->next; free(td); td = next; } } /* Whether we will include it in the computation */ static int good_job(struct _starpu_job *j) { /* No codelet, nothing to measure */ if (j->exclude_from_dag) return 0; if (!j->task->cl) return 0; /* No performance model, no time duration estimation */ if (!j->task->cl->model) return 0; /* Only support history based */ if (j->task->cl->model->type != STARPU_HISTORY_BASED && j->task->cl->model->type != STARPU_NL_REGRESSION_BASED) return 0; return 1; } static double** initialize_arch_duration(int maxdevid, unsigned* maxncore_table) { int devid, maxncore; double ** arch_model; _STARPU_MALLOC(arch_model, sizeof(*arch_model)*(maxdevid+1)); arch_model[maxdevid] = NULL; for(devid=0; devidduration[STARPU_CPU_WORKER] = initialize_arch_duration(1,&conf->topology.nhwcpus); task->duration[STARPU_CUDA_WORKER] = initialize_arch_duration(conf->topology.nhwcudagpus,NULL); task->duration[STARPU_OPENCL_WORKER] = initialize_arch_duration(conf->topology.nhwopenclgpus,NULL); task->duration[STARPU_MIC_WORKER] = initialize_arch_duration(conf->topology.nhwmicdevices,conf->topology.nmiccores); task->duration[STARPU_SCC_WORKER] = initialize_arch_duration(conf->topology.nhwscc,NULL); } static struct starpu_perfmodel_device device = { .type = STARPU_CPU_WORKER, .devid = 0, .ncores = 1, }; static struct starpu_perfmodel_arch dumb_arch = { .ndevices = 1, .devices = &device, }; /* Create a new task (either because it has just been submitted, or a * dependency was added before submission) */ static void new_task(struct _starpu_job *j) { struct bound_task *t; if (j->bound_task) return; _STARPU_MALLOC(t, sizeof(*t)); memset(t, 0, sizeof(*t)); t->id = j->job_id; t->tag_id = j->task->tag_id; t->use_tag = j->task->use_tag; t->cl = j->task->cl; t->footprint = _starpu_compute_buffers_footprint(j->task->cl?j->task->cl->model:NULL, &dumb_arch, 0, j); t->priority = j->task->priority; t->deps = NULL; t->depsn = 0; initialize_duration(t); t->next = tasks; j->bound_task = t; tasks = t; } /* A new task was submitted, record it */ void _starpu_bound_record(struct _starpu_job *j) { if (!_starpu_bound_recording) return; if (!good_job(j)) return; STARPU_PTHREAD_MUTEX_LOCK(&mutex); /* Re-check, this time with mutex held */ if (!_starpu_bound_recording) { STARPU_PTHREAD_MUTEX_UNLOCK(&mutex); return; } if (recorddeps) { new_task(j); } else { struct bound_task_pool *tp; _starpu_compute_buffers_footprint(j->task->cl?j->task->cl->model:NULL, NULL, 0, j); if (last && last->cl == j->task->cl && last->footprint == j->footprint) tp = last; else for (tp = task_pools; tp; tp = tp->next) if (tp->cl == j->task->cl && tp->footprint == j->footprint) break; if (!tp) { _STARPU_MALLOC(tp, sizeof(*tp)); tp->cl = j->task->cl; tp->footprint = j->footprint; tp->n = 0; tp->next = task_pools; task_pools = tp; } /* One more task of this kind */ tp->n++; } STARPU_PTHREAD_MUTEX_UNLOCK(&mutex); } /* A tag dependency was emitted, record it */ void _starpu_bound_tag_dep(starpu_tag_t id, starpu_tag_t dep_id) { struct bound_tag_dep *td; if (!_starpu_bound_recording || !recorddeps) return; STARPU_PTHREAD_MUTEX_LOCK(&mutex); /* Re-check, this time with mutex held */ if (!_starpu_bound_recording || !recorddeps) { STARPU_PTHREAD_MUTEX_UNLOCK(&mutex); return; } _STARPU_MALLOC(td, sizeof(*td)); td->tag = id; td->dep_tag = dep_id; td->next = tag_deps; tag_deps = td; STARPU_PTHREAD_MUTEX_UNLOCK(&mutex); } /* A task dependency was emitted, record it */ void _starpu_bound_task_dep(struct _starpu_job *j, struct _starpu_job *dep_j) { struct bound_task *t; int i; if (!_starpu_bound_recording || !recorddeps) return; if (!good_job(j) || !good_job(dep_j)) return; STARPU_PTHREAD_MUTEX_LOCK(&mutex); /* Re-check, this time with mutex held */ if (!_starpu_bound_recording || !recorddeps) { STARPU_PTHREAD_MUTEX_UNLOCK(&mutex); return; } new_task(j); new_task(dep_j); t = j->bound_task; for (i = 0; i < t->depsn; i++) if (t->deps[i].dep == dep_j->bound_task) break; if (i == t->depsn) { /* Not already there, add */ _STARPU_REALLOC(t->deps, ++t->depsn * sizeof(t->deps[0])); t->deps[t->depsn-1].dep = dep_j->bound_task; t->deps[t->depsn-1].size = 0; /* We don't have data information in that case */ } STARPU_PTHREAD_MUTEX_UNLOCK(&mutex); } /* Look for job with id ID among our tasks */ static struct bound_task *find_job(unsigned long id) { struct bound_task *t; for (t = tasks; t; t = t->next) if (t->id == id) return t; return NULL; } /* Job J depends on previous job of id ID (which is already finished) */ void _starpu_bound_job_id_dep(starpu_data_handle_t handle, struct _starpu_job *j, unsigned long id) { struct bound_task *t, *dep_t; int i; if (!_starpu_bound_recording || !recorddeps) return; if (!good_job(j)) return; STARPU_PTHREAD_MUTEX_LOCK(&mutex); /* Re-check, this time with mutex held */ if (!_starpu_bound_recording || !recorddeps) { STARPU_PTHREAD_MUTEX_UNLOCK(&mutex); return; } new_task(j); dep_t = find_job(id); if (!dep_t) { _STARPU_MSG("dependency %lu not found !\n", id); STARPU_PTHREAD_MUTEX_UNLOCK(&mutex); return; } t = j->bound_task; for (i = 0; i < t->depsn; i++) if (t->deps[i].dep == dep_t) { /* Found, just add size */ t->deps[i].size += _starpu_data_get_size(handle); break; } if (i == t->depsn) { /* Not already there, add */ _STARPU_REALLOC(t->deps, ++t->depsn * sizeof(t->deps[0])); t->deps[t->depsn-1].dep = dep_t; t->deps[t->depsn-1].size = _starpu_data_get_size(handle); } STARPU_PTHREAD_MUTEX_UNLOCK(&mutex); } void starpu_bound_stop(void) { STARPU_PTHREAD_MUTEX_LOCK(&mutex); _starpu_bound_recording = 0; STARPU_PTHREAD_MUTEX_UNLOCK(&mutex); } /* Compute all tasks times on all workers */ static void _starpu_get_tasks_times(int nw, int nt, double *times) { struct bound_task_pool *tp; int w, t; for (w = 0; w < nw; w++) { for (t = 0, tp = task_pools; tp; t++, tp = tp->next) { struct _starpu_job j = { .footprint = tp->footprint, .footprint_is_computed = 1, }; struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS); double length = _starpu_history_based_job_expected_perf(tp->cl->model, arch, &j, j.nimpl); if (isnan(length)) times[w*nt+t] = NAN; else times[w*nt+t] = length / 1000.; } } } /* Return whether PARENT is an ancestor of CHILD */ static int ancestor(struct bound_task *child, struct bound_task *parent) { int i; for (i = 0; i < child->depsn; i++) { if (parent == child->deps[i].dep) return 1; if (ancestor(child->deps[i].dep, parent)) return -1; } return 0; } /* Print bound recording in .dot format */ void starpu_bound_print_dot(FILE *output) { struct bound_task *t; struct bound_tag_dep *td; int i; if (!recorddeps) { fprintf(output, "Not supported\n"); return; } fprintf(output, "strict digraph bounddeps {\n"); for (t = tasks; t; t = t->next) { fprintf(output, "\"t%lu\" [label=\"%lu: %s\"]\n", t->id, t->id, _starpu_codelet_get_model_name(t->cl)); for (i = 0; i < t->depsn; i++) fprintf(output, "\"t%lu\" -> \"t%lu\"\n", t->deps[i].dep->id, t->id); } for (td = tag_deps; td; td = td->next) fprintf(output, "\"tag%lu\" -> \"tag%lu\";\n", (unsigned long) td->dep_tag, (unsigned long) td->tag); fprintf(output, "}\n"); } /* * Print bound system in lp_solve format * * When dependencies are enabled, you can check the set of tasks and deps that * were recorded by using tools/lp2paje and vite. */ void starpu_bound_print_lp(FILE *output) { int nt; /* Number of different kinds of tasks */ int nw; /* Number of different workers */ int t; int w, w2; /* worker */ unsigned n, n2; STARPU_PTHREAD_MUTEX_LOCK(&mutex); nw = starpu_worker_get_count(); if (!nw) /* Make llvm happy about the VLA below */ return; if (recorddeps) { struct bound_task *t1, *t2; struct bound_tag_dep *td; int i; nt = 0; for (t1 = tasks; t1; t1 = t1->next) { if (t1->cl->model->type != STARPU_HISTORY_BASED && t1->cl->model->type != STARPU_NL_REGRESSION_BASED) /* TODO: */ _STARPU_MSG("Warning: task %s uses a perf model which is neither history nor non-linear regression-based, support for such model is not implemented yet, system will not be solvable.\n", _starpu_codelet_get_model_name(t1->cl)); struct _starpu_job j = { .footprint = t1->footprint, .footprint_is_computed = 1, }; for (w = 0; w < nw; w++) { struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS); if (_STARPU_IS_ZERO(t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores])) { double length = _starpu_history_based_job_expected_perf(t1->cl->model, arch, &j,j.nimpl); if (isnan(length)) /* Avoid problems with binary coding of doubles */ t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores] = NAN; else t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores] = length / 1000.; } } nt++; } if (!nt) return; fprintf(output, "/* StarPU upper bound linear programming problem, to be run in lp_solve. */\n\n"); fprintf(output, "/* !! This is a big system, it will be long to solve !! */\n\n"); fprintf(output, "/* We want to minimize total execution time (ms) */\n"); fprintf(output, "min: tmax;\n\n"); fprintf(output, "/* Number of tasks */\n"); fprintf(output, "nt = %d;\n", nt); fprintf(output, "/* Number of workers */\n"); fprintf(output, "nw = %d;\n", nw); fprintf(output, "/* The total execution time is the maximum of all task completion times (ms) */\n"); for (t1 = tasks; t1; t1 = t1->next) fprintf(output, "c%lu <= tmax;\n", t1->id); fprintf(output, "\n/* We have tasks executing on workers, exactly one worker executes each task */\n"); for (t1 = tasks; t1; t1 = t1->next) { for (w = 0; w < nw; w++) { struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS); if (!isnan(t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores])) fprintf(output, " +t%luw%d", t1->id, w); } fprintf(output, " = 1;\n"); } fprintf(output, "\n/* Completion time is start time plus computation time */\n"); fprintf(output, "/* According to where the task is indeed executed */\n"); for (t1 = tasks; t1; t1 = t1->next) { fprintf(output, "/* %s %x */\tc%lu = s%lu", _starpu_codelet_get_model_name(t1->cl), (unsigned) t1->footprint, t1->id, t1->id); for (w = 0; w < nw; w++) { struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS); if (!isnan(t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores])) fprintf(output, " + %f t%luw%d", t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores], t1->id, w); } fprintf(output, ";\n"); } fprintf(output, "\n/* Each task starts after all its task dependencies finish and data is transferred. */\n"); fprintf(output, "/* Note that the dependency finish time depends on the worker where it's working */\n"); for (t1 = tasks; t1; t1 = t1->next) for (i = 0; i < t1->depsn; i++) { fprintf(output, "/* %lu bytes transferred */\n", (unsigned long) t1->deps[i].size); fprintf(output, "s%lu >= c%lu", t1->id, t1->deps[i].dep->id); /* Transfer time: pick up one source node and a worker on it */ for (n = 0; n < starpu_memory_nodes_get_count(); n++) for (w = 0; w < nw; w++) if (starpu_worker_get_memory_node(w) == n) { /* pick up another destination node and a worker on it */ for (n2 = 0; n2 < starpu_memory_nodes_get_count(); n2++) if (n2 != n) { for (w2 = 0; w2 < nw; w2++) if (starpu_worker_get_memory_node(w2) == n2) { /* If predecessor is on worker w and successor * on worker w2 on different nodes, we need to * transfer the data. */ fprintf(output, " + d_t%luw%dt%luw%d", t1->deps[i].dep->id, w, t1->id, w2); } } } fprintf(output, ";\n"); /* Transfer time: pick up one source node and a worker on it */ for (n = 0; n < starpu_memory_nodes_get_count(); n++) for (w = 0; w < nw; w++) if (starpu_worker_get_memory_node(w) == n) { /* pick up another destination node and a worker on it */ for (n2 = 0; n2 < starpu_memory_nodes_get_count(); n2++) if (n2 != n) { for (w2 = 0; w2 < nw; w2++) if (starpu_worker_get_memory_node(w2) == n2) { /* The data transfer is at least 0ms */ fprintf(output, "d_t%luw%dt%luw%d >= 0;\n", t1->deps[i].dep->id, w, t1->id, w2); /* The data transfer from w to w2 only happens if tasks run there */ fprintf(output, "d_t%luw%dt%luw%d >= %f - 2e5 + 1e5 t%luw%d + 1e5 t%luw%d;\n", t1->deps[i].dep->id, w, t1->id, w2, starpu_transfer_predict(n, n2, t1->deps[i].size)/1000., t1->deps[i].dep->id, w, t1->id, w2); } } } } fprintf(output, "\n/* Each tag finishes when its corresponding task finishes */\n"); for (t1 = tasks; t1; t1 = t1->next) if (t1->use_tag) { for (w = 0; w < nw; w++) fprintf(output, "c%lu = tag%lu;\n", t1->id, (unsigned long) t1->tag_id); } fprintf(output, "\n/* tags start after all their tag dependencies finish. */\n"); for (td = tag_deps; td; td = td->next) fprintf(output, "tag%lu >= tag%lu;\n", (unsigned long) td->tag, (unsigned long) td->dep_tag); /* TODO: factorize ancestor calls */ fprintf(output, "\n/* For each task pair and each worker, if both tasks are executed by the same worker,\n"); fprintf(output, " one is started after the other's completion */\n"); for (t1 = tasks; t1; t1 = t1->next) { for (t2 = t1->next; t2; t2 = t2->next) { if (!ancestor(t1, t2) && !ancestor(t2, t1)) { for (w = 0; w < nw; w++) { struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS); if (!isnan(t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores])) { fprintf(output, "s%lu - c%lu >= -3e5 + 1e5 t%luw%d + 1e5 t%luw%d + 1e5 t%luafter%lu;\n", t1->id, t2->id, t1->id, w, t2->id, w, t1->id, t2->id); fprintf(output, "s%lu - c%lu >= -2e5 + 1e5 t%luw%d + 1e5 t%luw%d - 1e5 t%luafter%lu;\n", t2->id, t1->id, t1->id, w, t2->id, w, t1->id, t2->id); } } } } } #if 0 /* Doesn't help at all to actually express what "after" means */ for (t1 = tasks; t1; t1 = t1->next) for (t2 = t1->next; t2; t2 = t2->next) if (!ancestor(t1, t2) && !ancestor(t2, t1)) { fprintf(output, "s%lu - s%lu >= -1e5 + 1e5 t%luafter%lu;\n", t1->id, t2->id, t1->id, t2->id); fprintf(output, "s%lu - s%lu >= -1e5 t%luafter%lu;\n", t2->id, t1->id, t1->id, t2->id); } #endif if (recordprio) { fprintf(output, "\n/* For StarPU, a priority means given schedulable tasks it will consider the\n"); fprintf(output, " * more prioritized first */\n"); for (t1 = tasks; t1; t1 = t1->next) { for (t2 = t1->next; t2; t2 = t2->next) { if (!ancestor(t1, t2) && !ancestor(t2, t1) && t1->priority != t2->priority) { if (t1->priority > t2->priority) { /* Either t2 is scheduled before t1, but then it needs to be scheduled before some t dep finishes */ /* One of the t1 deps to give the maximum start time for t2 */ if (t1->depsn > 1) { for (i = 0; i < t1->depsn; i++) fprintf(output, " + t%lut%lud%d", t2->id, t1->id, i); fprintf(output, " = 1;\n"); } for (i = 0; i < t1->depsn; i++) { fprintf(output, "c%lu - s%lu >= ", t1->deps[i].dep->id, t2->id); if (t1->depsn > 1) /* Only checks this when it's this dependency that is chosen */ fprintf(output, "-2e5 + 1e5 t%lut%lud%d", t2->id, t1->id, i); else fprintf(output, "-1e5"); /* Only check this if t1 is after t2 */ fprintf(output, " + 1e5 t%luafter%lu", t1->id, t2->id); fprintf(output, ";\n"); } /* Or t2 is scheduled after t1 is. */ fprintf(output, "s%lu - s%lu >= -1e5 t%luafter%lu;\n", t2->id, t1->id, t1->id, t2->id); } else { /* Either t1 is scheduled before t2, but then it needs to be scheduled before some t2 dep finishes */ /* One of the t2 deps to give the maximum start time for t1 */ if (t2->depsn > 1) { for (i = 0; i < t2->depsn; i++) fprintf(output, " + t%lut%lud%d", t1->id, t2->id, i); fprintf(output, " = 1;\n"); } for (i = 0; i < t2->depsn; i++) { fprintf(output, "c%lu - s%lu >= ", t2->deps[i].dep->id, t1->id); if (t2->depsn > 1) /* Only checks this when it's this dependency that is chosen */ fprintf(output, "-1e5 + 1e5 t%lut%lud%d", t1->id, t2->id, i); /* Only check this if t2 is after t1 */ fprintf(output, " - 1e5 t%luafter%lu;\n", t1->id, t2->id); } /* Or t1 is scheduled after t2 is. */ fprintf(output, "s%lu - s%lu >= -1e5 + 1e5 t%luafter%lu;\n", t1->id, t2->id, t1->id, t2->id); } } } } } for (t1 = tasks; t1; t1 = t1->next) for (t2 = t1->next; t2; t2 = t2->next) if (!ancestor(t1, t2) && !ancestor(t2, t1)) { fprintf(output, "bin t%luafter%lu;\n", t1->id, t2->id); if (recordprio && t1->priority != t2->priority) { if (t1->priority > t2->priority) { if (t1->depsn > 1) for (i = 0; i < t1->depsn; i++) fprintf(output, "bin t%lut%lud%d;\n", t2->id, t1->id, i); } else { if (t2->depsn > 1) for (i = 0; i < t2->depsn; i++) fprintf(output, "bin t%lut%lud%d;\n", t1->id, t2->id, i); } } } for (t1 = tasks; t1; t1 = t1->next) for (w = 0; w < nw; w++) fprintf(output, "bin t%luw%d;\n", t1->id, w); } else { struct bound_task_pool *tp; nt = 0; for (tp = task_pools; tp; tp = tp->next) nt++; if (!nt) return; { double times[nw*nt]; _starpu_get_tasks_times(nw, nt, times); fprintf(output, "/* StarPU upper bound linear programming problem, to be run in lp_solve. */\n\n"); fprintf(output, "/* We want to minimize total execution time (ms) */\n"); fprintf(output, "min: tmax;\n\n"); fprintf(output, "/* Which is the maximum of all worker execution times (ms) */\n"); for (w = 0; w < nw; w++) { char name[32]; starpu_worker_get_name(w, name, sizeof(name)); fprintf(output, "/* worker %s */\n0", name); for (t = 0, tp = task_pools; tp; t++, tp = tp->next) { if (!isnan(times[w*nt+t])) fprintf(output, "\t%+f * w%dt%dn", (float) times[w*nt+t], w, t); } fprintf(output, " <= tmax;\n"); } fprintf(output, "\n"); fprintf(output, "/* And we have to have computed exactly all tasks */\n"); for (t = 0, tp = task_pools; tp; t++, tp = tp->next) { int got_one = 0; fprintf(output, "/* task %s key %x */\n0", _starpu_codelet_get_model_name(tp->cl), (unsigned) tp->footprint); for (w = 0; w < nw; w++) { if (isnan(times[w*nt+t])) _STARPU_MSG("Warning: task %s has no performance measurement for worker %d.\n", _starpu_codelet_get_model_name(tp->cl), w); else { got_one = 1; fprintf(output, "\t+w%dt%dn", w, t); } } fprintf(output, " = %lu;\n", tp->n); if (!got_one) _STARPU_MSG("Warning: task %s has no performance measurement for any worker, system will not be solvable!\n", _starpu_codelet_get_model_name(tp->cl)); /* Show actual values */ fprintf(output, "/*"); for (w = 0; w < nw; w++) fprintf(output, "\t+%lu", tp->cl->per_worker_stats[w]); fprintf(output, "\t*/\n\n"); } fprintf(output, "/* Optionally tell that tasks can not be divided */\n"); fprintf(output, "/* int "); int first = 1; for (w = 0; w < nw; w++) for (t = 0, tp = task_pools; tp; t++, tp = tp->next) { if (!first) fprintf(output, ","); else first = 0; fprintf(output, "w%dt%dn", w, t); } fprintf(output, "; */\n"); } } STARPU_PTHREAD_MUTEX_UNLOCK(&mutex); } /* * Print bound system in MPS output format */ void starpu_bound_print_mps(FILE *output) { struct bound_task_pool * tp; int nt; /* Number of different kinds of tasks */ int nw; /* Number of different workers */ int t, w; if (recorddeps) { fprintf(output, "Not supported\n"); return; } nw = starpu_worker_get_count(); if (!nw) /* Make llvm happy about the VLA below */ return; STARPU_PTHREAD_MUTEX_LOCK(&mutex); nt = 0; for (tp = task_pools; tp; tp = tp->next) nt++; if (!nt) { STARPU_PTHREAD_MUTEX_UNLOCK(&mutex); return; } { double times[nw*nt]; _starpu_get_tasks_times(nw, nt, times); fprintf(output, "NAME StarPU theoretical bound\n"); fprintf(output, "*\nROWS\n"); fprintf(output, "* We want to minimize total execution time (ms)\n"); fprintf(output, " N TMAX\n"); fprintf(output, "* Which is the maximum of all worker execution times (ms)\n"); for (w = 0; w < nw; w++) { char name[32]; starpu_worker_get_name(w, name, sizeof(name)); fprintf(output, "* worker %s\n", name); fprintf(output, " L W%d\n", w); } fprintf(output, "*\n* And we have to have computed exactly all tasks\n*\n"); for (t = 0, tp = task_pools; tp; t++, tp = tp->next) { fprintf(output, "* task %s key %x\n", _starpu_codelet_get_model_name(tp->cl), (unsigned) tp->footprint); fprintf(output, " E T%d\n", t); } fprintf(output, "*\nCOLUMNS\n*\n"); fprintf(output, "*\n* Execution times and completion of all tasks\n*\n"); for (w = 0; w < nw; w++) for (t = 0, tp = task_pools; tp; t++, tp = tp->next) if (!isnan(times[w*nt+t])) { char name[23]; snprintf(name, sizeof(name), "W%dT%d", w, t); fprintf(output," %-8s W%-7d %12f\n", name, w, times[w*nt+t]); fprintf(output," %-8s T%-7d %12d\n", name, t, 1); } fprintf(output, "*\n* Total execution time\n*\n"); for (w = 0; w < nw; w++) fprintf(output," TMAX W%-2d %12d\n", w, -1); fprintf(output," TMAX TMAX %12d\n", 1); fprintf(output, "*\nRHS\n*\n"); fprintf(output, "*\n* Total number of tasks\n*\n"); for (t = 0, tp = task_pools; tp; t++, tp = tp->next) fprintf(output," NT%-2d T%-7d %12lu\n", t, t, tp->n); fprintf(output, "ENDATA\n"); } STARPU_PTHREAD_MUTEX_UNLOCK(&mutex); } /* * Solve bound system thanks to GNU Linear Programming Kit backend */ #ifdef STARPU_HAVE_GLPK_H static glp_prob *_starpu_bound_glp_resolve(int integer) { struct bound_task_pool * tp; int nt; /* Number of different kinds of tasks */ int nw; /* Number of different workers */ int t, w; glp_prob *lp; int ret; nw = starpu_worker_get_count(); if (!nw) /* Make llvm happy about the VLA below */ return NULL; nt = 0; for (tp = task_pools; tp; tp = tp->next) nt++; if (!nt) return NULL; lp = glp_create_prob(); glp_set_prob_name(lp, "StarPU theoretical bound"); glp_set_obj_dir(lp, GLP_MIN); glp_set_obj_name(lp, "total execution time"); { double times[nw*nt]; int ne = nw * (nt+1) /* worker execution time */ + nt * nw + 1; /* glp dumbness */ int n = 1; int ia[ne], ja[ne]; double ar[ne]; _starpu_get_tasks_times(nw, nt, times); /* Variables: number of tasks i assigned to worker j, and tmax */ glp_add_cols(lp, nw*nt+1); #define colnum(w, t) ((t)*nw+(w)+1) glp_set_obj_coef(lp, nw*nt+1, 1.); for (w = 0; w < nw; w++) for (t = 0, tp = task_pools; tp; t++, tp = tp->next) { char name[32]; snprintf(name, sizeof(name), "w%dt%dn", w, t); glp_set_col_name(lp, colnum(w, t), name); if (integer) glp_set_col_kind(lp, colnum(w, t), GLP_IV); glp_set_col_bnds(lp, colnum(w, t), GLP_LO, 0., 0.); } glp_set_col_bnds(lp, nw*nt+1, GLP_LO, 0., 0.); /* Total worker execution time */ glp_add_rows(lp, nw); for (t = 0, tp = task_pools; tp; t++, tp = tp->next) { int someone = 0; for (w = 0; w < nw; w++) if (!isnan(times[w*nt+t])) someone = 1; if (!someone) { /* This task does not have any performance model at all, abort */ glp_delete_prob(lp); return NULL; } } for (w = 0; w < nw; w++) { char name[32], title[64]; starpu_worker_get_name(w, name, sizeof(name)); snprintf(title, sizeof(title), "worker %s", name); glp_set_row_name(lp, w+1, title); for (t = 0, tp = task_pools; tp; t++, tp = tp->next) { ia[n] = w+1; ja[n] = colnum(w, t); if (isnan(times[w*nt+t])) ar[n] = 1000000000.; else ar[n] = times[w*nt+t]; n++; } /* tmax */ ia[n] = w+1; ja[n] = nw*nt+1; ar[n] = -1; n++; glp_set_row_bnds(lp, w+1, GLP_UP, 0, 0); } /* Total task completion */ glp_add_rows(lp, nt); for (t = 0, tp = task_pools; tp; t++, tp = tp->next) { char name[32], title[64]; starpu_worker_get_name(w, name, sizeof(name)); snprintf(title, sizeof(title), "task %s key %x", _starpu_codelet_get_model_name(tp->cl), (unsigned) tp->footprint); glp_set_row_name(lp, nw+t+1, title); for (w = 0; w < nw; w++) { ia[n] = nw+t+1; ja[n] = colnum(w, t); ar[n] = 1; n++; } glp_set_row_bnds(lp, nw+t+1, GLP_FX, tp->n, tp->n); } STARPU_ASSERT(n == ne); glp_load_matrix(lp, ne-1, ia, ja, ar); } glp_smcp parm; glp_init_smcp(&parm); parm.msg_lev = GLP_MSG_OFF; ret = glp_simplex(lp, &parm); if (ret) { glp_delete_prob(lp); lp = NULL; return NULL; } if (integer) { glp_iocp iocp; glp_init_iocp(&iocp); iocp.msg_lev = GLP_MSG_OFF; glp_intopt(lp, &iocp); } return lp; } #endif /* STARPU_HAVE_GLPK_H */ /* Print the computed bound as well as the optimized distribution of tasks */ void starpu_bound_print(FILE *output, int integer) { #ifdef STARPU_HAVE_GLPK_H if (recorddeps) { fprintf(output, "Not supported\n"); return; } STARPU_PTHREAD_MUTEX_LOCK(&mutex); glp_prob *lp = _starpu_bound_glp_resolve(integer); if (lp) { struct bound_task_pool * tp; int t, w; int nw; /* Number of different workers */ double tmax; nw = starpu_worker_get_count(); if (integer) tmax = glp_mip_obj_val(lp); else tmax = glp_get_obj_val(lp); fprintf(output, "Theoretical minimum execution time: %f ms\n", tmax); for (t = 0, tp = task_pools; tp; t++, tp = tp->next) { fprintf(output, "%s key %x\n", _starpu_codelet_get_model_name(tp->cl), (unsigned) tp->footprint); for (w = 0; w < nw; w++) if (integer) fprintf(output, "\tw%dt%dn %f", w, t, glp_mip_col_val(lp, colnum(w, t))); else fprintf(output, "\tw%dt%dn %f", w, t, glp_get_col_prim(lp, colnum(w, t))); fprintf(output, "\n"); } glp_delete_prob(lp); } else { _STARPU_MSG("Simplex failed\n"); } STARPU_PTHREAD_MUTEX_UNLOCK(&mutex); #else /* STARPU_HAVE_GLPK_H */ (void) integer; fprintf(output, "Please rebuild StarPU with glpk installed.\n"); #endif /* STARPU_HAVE_GLPK_H */ } /* Compute and return the bound */ void starpu_bound_compute(double *res, double *integer_res, int integer) { #ifdef STARPU_HAVE_GLPK_H double ret; if (recorddeps) { *res = 0.; return; } STARPU_PTHREAD_MUTEX_LOCK(&mutex); glp_prob *lp = _starpu_bound_glp_resolve(integer); if (lp) { ret = glp_get_obj_val(lp); if (integer) *integer_res = glp_mip_obj_val(lp); glp_delete_prob(lp); } else ret = 0.; STARPU_PTHREAD_MUTEX_UNLOCK(&mutex); *res = ret; #else /* STARPU_HAVE_GLPK_H */ (void) integer_res; (void) integer; *res = 0.; #endif /* STARPU_HAVE_GLPK_H */ }