bound.c 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2010, 2011, 2012, 2013 CNRS
  4. * Copyright (C) 2010-2015 Université de Bordeaux
  5. * Copyright (C) 2011 Télécom-SudParis
  6. *
  7. * StarPU is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU Lesser General Public License as published by
  9. * the Free Software Foundation; either version 2.1 of the License, or (at
  10. * your option) any later version.
  11. *
  12. * StarPU is distributed in the hope that it will be useful, but
  13. * WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  15. *
  16. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  17. */
  18. /*
  19. * Record which kinds of tasks have been executed, to later on compute an upper
  20. * bound of the performance that could have theoretically been achieved
  21. */
  22. #include <starpu.h>
  23. #include <starpu_config.h>
  24. #include <profiling/bound.h>
  25. #include <core/jobs.h>
  26. #include <core/workers.h>
  27. #ifdef STARPU_HAVE_GLPK_H
  28. #include <glpk.h>
  29. #endif /* STARPU_HAVE_GLPK_H */
  30. /* TODO: output duration between starpu_bound_start and starpu_bound_stop */
  31. /* TODO: compute critical path and introduce it in the LP */
  32. /*
  33. * Record without dependencies: just count each kind of task
  34. *
  35. * The linear programming problem will just have as variables:
  36. * - the number of tasks of kind `t' executed by worker `w'
  37. * - the total duration
  38. *
  39. * and the constraints will be:
  40. * - the time taken by each worker to complete its assigned tasks is lower than
  41. * the total duration.
  42. * - the total number of tasks of a given kind is equal to the number run by the
  43. * application.
  44. */
  /*
   * One entry per kind of task seen while recording without dependencies.
   * A kind is identified by the (codelet, footprint) pair; entries are chained
   * in a singly-linked list.
   */
  struct bound_task_pool
  {
      struct starpu_codelet *cl;      /* codelet that was executed */
      uint32_t footprint;             /* task footprint key (history-based perfmodel) */
      unsigned long n;                /* how many tasks of this kind were recorded */
      struct bound_task_pool *next;   /* next kind of task, or NULL */
  };
  56. /*
  57. * Record with dependencies: each task is recorded separately
  58. *
  59. * The linear programming problem will have as variables:
  60. * - The start time of each task
  61. * - The completion time of each tag
  62. * - The total duration
  63. * - For each task and for each worker, whether the task is executing on that worker.
  64. * - For each pair of tasks, which task is scheduled first.
  65. *
  66. * and the constraints will be:
  67. * - All task start time plus duration are less than total duration
  68. * - Each task is executed on exactly one worker.
  69. * - Each task starts after all its task dependencies finish.
  70. * - Each task starts after all its tag dependencies finish.
  71. * - For each task pair and each worker, if both tasks are executed by that worker,
  72. * one is started after the other's completion.
  73. */
  /* One incoming dependency edge of a recorded task. */
  struct task_dep
  {
      struct bound_task *dep;   /* the task that must complete first */
      size_t size;              /* bytes transferred along this edge (implicit data
                                 * dependency size) */
  };
  81. struct bound_task
  82. {
  83. /* Unique ID */
  84. unsigned long id;
  85. /* Tag ID, if any */
  86. starpu_tag_t tag_id;
  87. int use_tag;
  88. /* Which codelet has been executed */
  89. struct starpu_codelet *cl;
  90. /* Task footprint key */
  91. uint32_t footprint;
  92. /* Task priority */
  93. int priority;
  94. /* Tasks this one depends on */
  95. struct task_dep *deps;
  96. int depsn;
  97. /* Estimated duration */
  98. double** duration[STARPU_NARCH];
  99. /* Other tasks */
  100. struct bound_task *next;
  101. };
  102. struct bound_tag_dep
  103. {
  104. starpu_tag_t tag;
  105. starpu_tag_t dep_tag;
  106. struct bound_tag_dep *next;
  107. };
  108. static struct bound_task_pool *task_pools, *last;
  109. static struct bound_task *tasks;
  110. static struct bound_tag_dep *tag_deps;
  111. int _starpu_bound_recording;
  112. static int recorddeps;
  113. static int recordprio;
  114. static starpu_pthread_mutex_t mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
  115. /* Initialization */
  116. void starpu_bound_start(int deps, int prio)
  117. {
  118. struct bound_task_pool *tp;
  119. struct bound_task *t;
  120. struct bound_tag_dep *td;
  121. STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  122. tp = task_pools;
  123. task_pools = NULL;
  124. last = NULL;
  125. t = tasks;
  126. tasks = NULL;
  127. td = tag_deps;
  128. tag_deps = NULL;
  129. _starpu_bound_recording = 1;
  130. recorddeps = deps;
  131. recordprio = prio;
  132. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  133. while (tp != NULL)
  134. {
  135. struct bound_task_pool *next = tp->next;
  136. free(tp);
  137. tp = next;
  138. }
  139. while (t != NULL)
  140. {
  141. struct bound_task *next = t->next;
  142. free(t);
  143. t = next;
  144. }
  145. while (td != NULL)
  146. {
  147. struct bound_tag_dep *next = td->next;
  148. free(td);
  149. td = next;
  150. }
  151. }
  152. /* Whether we will include it in the computation */
  153. static int good_job(struct _starpu_job *j)
  154. {
  155. /* No codelet, nothing to measure */
  156. if (j->exclude_from_dag)
  157. return 0;
  158. if (!j->task->cl)
  159. return 0;
  160. /* No performance model, no time duration estimation */
  161. if (!j->task->cl->model)
  162. return 0;
  163. /* Only support history based */
  164. if (j->task->cl->model->type != STARPU_HISTORY_BASED
  165. && j->task->cl->model->type != STARPU_NL_REGRESSION_BASED)
  166. return 0;
  167. return 1;
  168. }
  /*
   * Allocate the duration table of one architecture type.
   *
   * maxdevid:       number of devices of that type.
   * maxncore_table: per-device maximum number of cores, or NULL when every
   *                 device has a single core.
   *
   * Returns an array of maxdevid+1 pointers whose last entry is NULL (so the
   * array can be walked without knowing maxdevid).  Entry `devid' points to
   * maxncore+1 doubles, all initialized to zero.
   *
   * Running out of memory is fatal: callers store the result without checking
   * it, so the process is aborted rather than dereferencing NULL later.
   */
  static double **initialize_arch_duration(int maxdevid, unsigned *maxncore_table)
  {
      int devid;
      double **arch_model = malloc(sizeof(*arch_model) * ((size_t) maxdevid + 1));

      if (arch_model == NULL)
      {
          fprintf(stderr, "bound: out of memory while allocating the duration table\n");
          abort();
      }

      /* NULL terminator */
      arch_model[maxdevid] = NULL;

      for (devid = 0; devid < maxdevid; devid++)
      {
          unsigned maxncore = (maxncore_table != NULL) ? maxncore_table[devid] : 1;

          /* Index `ncores' goes from 0 to maxncore included; compute the count
           * in size_t so that maxncore+1 cannot wrap around */
          arch_model[devid] = calloc((size_t) maxncore + 1, sizeof(*arch_model[devid]));
          if (arch_model[devid] == NULL)
          {
              fprintf(stderr, "bound: out of memory while allocating the duration table\n");
              abort();
          }
      }

      return arch_model;
  }
  184. static void initialize_duration(struct bound_task *task)
  185. {
  186. struct _starpu_machine_config *conf = _starpu_get_machine_config();
  187. task->duration[STARPU_CPU_WORKER] = initialize_arch_duration(1,&conf->topology.ncpus);
  188. task->duration[STARPU_CUDA_WORKER] = initialize_arch_duration(conf->topology.ncudagpus,NULL);
  189. task->duration[STARPU_OPENCL_WORKER] = initialize_arch_duration(conf->topology.nopenclgpus,NULL);
  190. task->duration[STARPU_MIC_WORKER] = initialize_arch_duration(conf->topology.nmicdevices,conf->topology.nmiccores);
  191. task->duration[STARPU_SCC_WORKER] = initialize_arch_duration(conf->topology.nsccdevices,NULL);
  192. }
  193. /* Create a new task (either because it has just been submitted, or a
  194. * dependency was added before submission) */
  195. static void new_task(struct _starpu_job *j)
  196. {
  197. struct bound_task *t;
  198. if (j->bound_task)
  199. return;
  200. t = (struct bound_task *) malloc(sizeof(*t));
  201. memset(t, 0, sizeof(*t));
  202. t->id = j->job_id;
  203. t->tag_id = j->task->tag_id;
  204. t->use_tag = j->task->use_tag;
  205. t->cl = j->task->cl;
  206. t->footprint = _starpu_compute_buffers_footprint(j->task->cl?j->task->cl->model:NULL, STARPU_CPU_WORKER, 0, j);
  207. t->priority = j->task->priority;
  208. t->deps = NULL;
  209. t->depsn = 0;
  210. initialize_duration(t);
  211. t->next = tasks;
  212. j->bound_task = t;
  213. tasks = t;
  214. }
  215. /* A new task was submitted, record it */
  216. void _starpu_bound_record(struct _starpu_job *j)
  217. {
  218. if (!_starpu_bound_recording)
  219. return;
  220. if (!good_job(j))
  221. return;
  222. STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  223. /* Re-check, this time with mutex held */
  224. if (!_starpu_bound_recording)
  225. {
  226. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  227. return;
  228. }
  229. if (recorddeps)
  230. {
  231. new_task(j);
  232. }
  233. else
  234. {
  235. struct bound_task_pool *tp;
  236. _starpu_compute_buffers_footprint(j->task->cl?j->task->cl->model:NULL, STARPU_CPU_WORKER, 0, j);
  237. if (last && last->cl == j->task->cl && last->footprint == j->footprint)
  238. tp = last;
  239. else
  240. for (tp = task_pools; tp; tp = tp->next)
  241. if (tp->cl == j->task->cl && tp->footprint == j->footprint)
  242. break;
  243. if (!tp)
  244. {
  245. tp = (struct bound_task_pool *) malloc(sizeof(*tp));
  246. tp->cl = j->task->cl;
  247. tp->footprint = j->footprint;
  248. tp->n = 0;
  249. tp->next = task_pools;
  250. task_pools = tp;
  251. }
  252. /* One more task of this kind */
  253. tp->n++;
  254. }
  255. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  256. }
  257. /* A tag dependency was emitted, record it */
  258. void _starpu_bound_tag_dep(starpu_tag_t id, starpu_tag_t dep_id)
  259. {
  260. struct bound_tag_dep *td;
  261. if (!_starpu_bound_recording || !recorddeps)
  262. return;
  263. STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  264. /* Re-check, this time with mutex held */
  265. if (!_starpu_bound_recording || !recorddeps)
  266. {
  267. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  268. return;
  269. }
  270. td = (struct bound_tag_dep *) malloc(sizeof(*td));
  271. td->tag = id;
  272. td->dep_tag = dep_id;
  273. td->next = tag_deps;
  274. tag_deps = td;
  275. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  276. }
  277. /* A task dependency was emitted, record it */
  278. void _starpu_bound_task_dep(struct _starpu_job *j, struct _starpu_job *dep_j)
  279. {
  280. struct bound_task *t;
  281. int i;
  282. if (!_starpu_bound_recording || !recorddeps)
  283. return;
  284. if (!good_job(j) || !good_job(dep_j))
  285. return;
  286. STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  287. /* Re-check, this time with mutex held */
  288. if (!_starpu_bound_recording || !recorddeps)
  289. {
  290. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  291. return;
  292. }
  293. new_task(j);
  294. new_task(dep_j);
  295. t = j->bound_task;
  296. for (i = 0; i < t->depsn; i++)
  297. if (t->deps[i].dep == dep_j->bound_task)
  298. break;
  299. if (i == t->depsn)
  300. {
  301. /* Not already there, add */
  302. t->deps = (struct task_dep *) realloc(t->deps, ++t->depsn * sizeof(t->deps[0]));
  303. t->deps[t->depsn-1].dep = dep_j->bound_task;
  304. t->deps[t->depsn-1].size = 0; /* We don't have data information in that case */
  305. }
  306. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  307. }
  308. /* Look for job with id ID among our tasks */
  309. static struct bound_task *find_job(unsigned long id)
  310. {
  311. struct bound_task *t;
  312. for (t = tasks; t; t = t->next)
  313. if (t->id == id)
  314. return t;
  315. return NULL;
  316. }
  317. /* Job J depends on previous job of id ID (which is already finished) */
  318. void _starpu_bound_job_id_dep(starpu_data_handle_t handle, struct _starpu_job *j, unsigned long id)
  319. {
  320. struct bound_task *t, *dep_t;
  321. int i;
  322. if (!_starpu_bound_recording || !recorddeps)
  323. return;
  324. if (!good_job(j))
  325. return;
  326. STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  327. /* Re-check, this time with mutex held */
  328. if (!_starpu_bound_recording || !recorddeps)
  329. {
  330. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  331. return;
  332. }
  333. new_task(j);
  334. dep_t = find_job(id);
  335. if (!dep_t)
  336. {
  337. fprintf(stderr,"dependency %lu not found !\n", id);
  338. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  339. return;
  340. }
  341. t = j->bound_task;
  342. for (i = 0; i < t->depsn; i++)
  343. if (t->deps[i].dep == dep_t)
  344. {
  345. /* Found, just add size */
  346. t->deps[i].size += _starpu_data_get_size(handle);
  347. break;
  348. }
  349. if (i == t->depsn)
  350. {
  351. /* Not already there, add */
  352. t->deps = (struct task_dep *) realloc(t->deps, ++t->depsn * sizeof(t->deps[0]));
  353. t->deps[t->depsn-1].dep = dep_t;
  354. t->deps[t->depsn-1].size = _starpu_data_get_size(handle);
  355. }
  356. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  357. }
  358. void starpu_bound_stop(void)
  359. {
  360. STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  361. _starpu_bound_recording = 0;
  362. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  363. }
  364. /* Compute all tasks times on all workers */
  365. static void _starpu_get_tasks_times(int nw, int nt, double *times)
  366. {
  367. struct bound_task_pool *tp;
  368. int w, t;
  369. for (w = 0; w < nw; w++)
  370. {
  371. for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
  372. {
  373. struct _starpu_job j =
  374. {
  375. .footprint = tp->footprint,
  376. .footprint_is_computed = 1,
  377. };
  378. struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
  379. double length = _starpu_history_based_job_expected_perf(tp->cl->model, arch, &j, j.nimpl);
  380. if (isnan(length))
  381. times[w*nt+t] = NAN;
  382. else
  383. times[w*nt+t] = length / 1000.;
  384. }
  385. }
  386. }
  387. /* Return whether PARENT is an ancestor of CHILD */
  388. static int ancestor(struct bound_task *child, struct bound_task *parent)
  389. {
  390. int i;
  391. for (i = 0; i < child->depsn; i++)
  392. {
  393. if (parent == child->deps[i].dep)
  394. return 1;
  395. if (ancestor(child->deps[i].dep, parent))
  396. return -1;
  397. }
  398. return 0;
  399. }
  400. /* Print bound recording in .dot format */
  401. void starpu_bound_print_dot(FILE *output)
  402. {
  403. struct bound_task *t;
  404. struct bound_tag_dep *td;
  405. int i;
  406. if (!recorddeps)
  407. {
  408. fprintf(output, "Not supported\n");
  409. return;
  410. }
  411. fprintf(output, "strict digraph bounddeps {\n");
  412. for (t = tasks; t; t = t->next)
  413. {
  414. fprintf(output, "\"t%lu\" [label=\"%lu: %s\"]\n", t->id, t->id, _starpu_codelet_get_model_name(t->cl));
  415. for (i = 0; i < t->depsn; i++)
  416. fprintf(output, "\"t%lu\" -> \"t%lu\"\n", t->deps[i].dep->id, t->id);
  417. }
  418. for (td = tag_deps; td; td = td->next)
  419. fprintf(output, "\"tag%lu\" -> \"tag%lu\";\n", (unsigned long) td->dep_tag, (unsigned long) td->tag);
  420. fprintf(output, "}\n");
  421. }
  422. /*
  423. * Print bound system in lp_solve format
  424. *
  425. * When dependencies are enabled, you can check the set of tasks and deps that
  426. * were recorded by using tools/lp2paje and vite.
  427. */
  428. void starpu_bound_print_lp(FILE *output)
  429. {
  430. int nt; /* Number of different kinds of tasks */
  431. int nw; /* Number of different workers */
  432. int t;
  433. int w, w2; /* worker */
  434. unsigned n, n2;
  435. STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  436. nw = starpu_worker_get_count();
  437. if (!nw)
  438. /* Make llvm happy about the VLA below */
  439. return;
  440. if (recorddeps)
  441. {
  442. struct bound_task *t1, *t2;
  443. struct bound_tag_dep *td;
  444. int i;
  445. nt = 0;
  446. for (t1 = tasks; t1; t1 = t1->next)
  447. {
  448. if (t1->cl->model->type != STARPU_HISTORY_BASED &&
  449. t1->cl->model->type != STARPU_NL_REGRESSION_BASED)
  450. /* TODO: */
  451. fprintf(stderr, "Warning: task %s uses a perf model which is neither history nor non-linear regression-based, support for such model is not implemented yet, system will not be solvable.\n", _starpu_codelet_get_model_name(t1->cl));
  452. struct _starpu_job j =
  453. {
  454. .footprint = t1->footprint,
  455. .footprint_is_computed = 1,
  456. };
  457. for (w = 0; w < nw; w++)
  458. {
  459. struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
  460. if (_STARPU_IS_ZERO(t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores]))
  461. {
  462. double length = _starpu_history_based_job_expected_perf(t1->cl->model, arch, &j,j.nimpl);
  463. if (isnan(length))
  464. /* Avoid problems with binary coding of doubles */
  465. t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores] = NAN;
  466. else
  467. t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores] = length / 1000.;
  468. }
  469. }
  470. nt++;
  471. }
  472. if (!nt)
  473. return;
  474. fprintf(output, "/* StarPU upper bound linear programming problem, to be run in lp_solve. */\n\n");
  475. fprintf(output, "/* !! This is a big system, it will be long to solve !! */\n\n");
  476. fprintf(output, "/* We want to minimize total execution time (ms) */\n");
  477. fprintf(output, "min: tmax;\n\n");
  478. fprintf(output, "/* Number of tasks */\n");
  479. fprintf(output, "nt = %d;\n", nt);
  480. fprintf(output, "/* Number of workers */\n");
  481. fprintf(output, "nw = %d;\n", nw);
  482. fprintf(output, "/* The total execution time is the maximum of all task completion times (ms) */\n");
  483. for (t1 = tasks; t1; t1 = t1->next)
  484. fprintf(output, "c%lu <= tmax;\n", t1->id);
  485. fprintf(output, "\n/* We have tasks executing on workers, exactly one worker executes each task */\n");
  486. for (t1 = tasks; t1; t1 = t1->next)
  487. {
  488. for (w = 0; w < nw; w++)
  489. {
  490. struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
  491. if (!isnan(t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores]))
  492. fprintf(output, " +t%luw%d", t1->id, w);
  493. }
  494. fprintf(output, " = 1;\n");
  495. }
  496. fprintf(output, "\n/* Completion time is start time plus computation time */\n");
  497. fprintf(output, "/* According to where the task is indeed executed */\n");
  498. for (t1 = tasks; t1; t1 = t1->next)
  499. {
  500. fprintf(output, "/* %s %x */\tc%lu = s%lu", _starpu_codelet_get_model_name(t1->cl), (unsigned) t1->footprint, t1->id, t1->id);
  501. for (w = 0; w < nw; w++)
  502. {
  503. struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
  504. if (!isnan(t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores]))
  505. fprintf(output, " + %f t%luw%d", t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores], t1->id, w);
  506. }
  507. fprintf(output, ";\n");
  508. }
  509. fprintf(output, "\n/* Each task starts after all its task dependencies finish and data is transferred. */\n");
  510. fprintf(output, "/* Note that the dependency finish time depends on the worker where it's working */\n");
  511. for (t1 = tasks; t1; t1 = t1->next)
  512. for (i = 0; i < t1->depsn; i++)
  513. {
  514. fprintf(output, "/* %lu bytes transferred */\n", (unsigned long) t1->deps[i].size);
  515. fprintf(output, "s%lu >= c%lu", t1->id, t1->deps[i].dep->id);
  516. /* Transfer time: pick up one source node and a worker on it */
  517. for (n = 0; n < starpu_memory_nodes_get_count(); n++)
  518. for (w = 0; w < nw; w++)
  519. if (starpu_worker_get_memory_node(w) == n)
  520. {
  521. /* pick up another destination node and a worker on it */
  522. for (n2 = 0; n2 < starpu_memory_nodes_get_count(); n2++)
  523. if (n2 != n)
  524. {
  525. for (w2 = 0; w2 < nw; w2++)
  526. if (starpu_worker_get_memory_node(w2) == n2)
  527. {
  528. /* If predecessor is on worker w and successor
  529. * on worker w2 on different nodes, we need to
  530. * transfer the data. */
  531. fprintf(output, " + d_t%luw%ut%luw%u", t1->deps[i].dep->id, w, t1->id, w2);
  532. }
  533. }
  534. }
  535. fprintf(output, ";\n");
  536. /* Transfer time: pick up one source node and a worker on it */
  537. for (n = 0; n < starpu_memory_nodes_get_count(); n++)
  538. for (w = 0; w < nw; w++)
  539. if (starpu_worker_get_memory_node(w) == n)
  540. {
  541. /* pick up another destination node and a worker on it */
  542. for (n2 = 0; n2 < starpu_memory_nodes_get_count(); n2++)
  543. if (n2 != n)
  544. {
  545. for (w2 = 0; w2 < nw; w2++)
  546. if (starpu_worker_get_memory_node(w2) == n2)
  547. {
  548. /* The data transfer is at least 0ms */
  549. fprintf(output, "d_t%luw%ut%luw%u >= 0;\n", t1->deps[i].dep->id, w, t1->id, w2);
  550. /* The data transfer from w to w2 only happens if tasks run there */
  551. fprintf(output, "d_t%luw%ut%luw%u >= %f - 2e5 + 1e5 t%luw%u + 1e5 t%luw%u;\n",
  552. t1->deps[i].dep->id, w, t1->id, w2,
  553. starpu_transfer_predict(n, n2, t1->deps[i].size)/1000.,
  554. t1->deps[i].dep->id, w, t1->id, w2);
  555. }
  556. }
  557. }
  558. }
  559. fprintf(output, "\n/* Each tag finishes when its corresponding task finishes */\n");
  560. for (t1 = tasks; t1; t1 = t1->next)
  561. if (t1->use_tag)
  562. {
  563. for (w = 0; w < nw; w++)
  564. fprintf(output, "c%lu = tag%lu;\n", t1->id, (unsigned long) t1->tag_id);
  565. }
  566. fprintf(output, "\n/* tags start after all their tag dependencies finish. */\n");
  567. for (td = tag_deps; td; td = td->next)
  568. fprintf(output, "tag%lu >= tag%lu;\n", (unsigned long) td->tag, (unsigned long) td->dep_tag);
  569. /* TODO: factorize ancestor calls */
  570. fprintf(output, "\n/* For each task pair and each worker, if both tasks are executed by the same worker,\n");
  571. fprintf(output, " one is started after the other's completion */\n");
  572. for (t1 = tasks; t1; t1 = t1->next)
  573. {
  574. for (t2 = t1->next; t2; t2 = t2->next)
  575. {
  576. if (!ancestor(t1, t2) && !ancestor(t2, t1))
  577. {
  578. for (w = 0; w < nw; w++)
  579. {
  580. struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
  581. if (!isnan(t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores]))
  582. {
  583. fprintf(output, "s%lu - c%lu >= -3e5 + 1e5 t%luw%d + 1e5 t%luw%d + 1e5 t%luafter%lu;\n",
  584. t1->id, t2->id, t1->id, w, t2->id, w, t1->id, t2->id);
  585. fprintf(output, "s%lu - c%lu >= -2e5 + 1e5 t%luw%d + 1e5 t%luw%d - 1e5 t%luafter%lu;\n",
  586. t2->id, t1->id, t1->id, w, t2->id, w, t1->id, t2->id);
  587. }
  588. }
  589. }
  590. }
  591. }
  592. #if 0
  593. /* Doesn't help at all to actually express what "after" means */
  594. for (t1 = tasks; t1; t1 = t1->next)
  595. for (t2 = t1->next; t2; t2 = t2->next)
  596. if (!ancestor(t1, t2) && !ancestor(t2, t1))
  597. {
  598. fprintf(output, "s%lu - s%lu >= -1e5 + 1e5 t%luafter%lu;\n", t1->id, t2->id, t1->id, t2->id);
  599. fprintf(output, "s%lu - s%lu >= -1e5 t%luafter%lu;\n", t2->id, t1->id, t1->id, t2->id);
  600. }
  601. #endif
  602. if (recordprio)
  603. {
  604. fprintf(output, "\n/* For StarPU, a priority means given schedulable tasks it will consider the\n");
  605. fprintf(output, " * more prioritized first */\n");
  606. for (t1 = tasks; t1; t1 = t1->next)
  607. {
  608. for (t2 = t1->next; t2; t2 = t2->next)
  609. {
  610. if (!ancestor(t1, t2) && !ancestor(t2, t1)
  611. && t1->priority != t2->priority)
  612. {
  613. if (t1->priority > t2->priority)
  614. {
  615. /* Either t2 is scheduled before t1, but then it
  616. needs to be scheduled before some t dep finishes */
  617. /* One of the t1 deps to give the maximum start time for t2 */
  618. if (t1->depsn > 1)
  619. {
  620. for (i = 0; i < t1->depsn; i++)
  621. fprintf(output, " + t%lut%lud%d", t2->id, t1->id, i);
  622. fprintf(output, " = 1;\n");
  623. }
  624. for (i = 0; i < t1->depsn; i++)
  625. {
  626. fprintf(output, "c%lu - s%lu >= ", t1->deps[i].dep->id, t2->id);
  627. if (t1->depsn > 1)
  628. /* Only checks this when it's this dependency that is chosen */
  629. fprintf(output, "-2e5 + 1e5 t%lut%lud%d", t2->id, t1->id, i);
  630. else
  631. fprintf(output, "-1e5");
  632. /* Only check this if t1 is after t2 */
  633. fprintf(output, " + 1e5 t%luafter%lu", t1->id, t2->id);
  634. fprintf(output, ";\n");
  635. }
  636. /* Or t2 is scheduled after t1 is. */
  637. fprintf(output, "s%lu - s%lu >= -1e5 t%luafter%lu;\n", t2->id, t1->id, t1->id, t2->id);
  638. }
  639. else
  640. {
  641. /* Either t1 is scheduled before t2, but then it
  642. needs to be scheduled before some t2 dep finishes */
  643. /* One of the t2 deps to give the maximum start time for t1 */
  644. if (t2->depsn > 1)
  645. {
  646. for (i = 0; i < t2->depsn; i++)
  647. fprintf(output, " + t%lut%lud%d", t1->id, t2->id, i);
  648. fprintf(output, " = 1;\n");
  649. }
  650. for (i = 0; i < t2->depsn; i++)
  651. {
  652. fprintf(output, "c%lu - s%lu >= ", t2->deps[i].dep->id, t1->id);
  653. if (t2->depsn > 1)
  654. /* Only checks this when it's this dependency that is chosen */
  655. fprintf(output, "-1e5 + 1e5 t%lut%lud%d", t1->id, t2->id, i);
  656. /* Only check this if t2 is after t1 */
  657. fprintf(output, " - 1e5 t%luafter%lu;\n", t1->id, t2->id);
  658. }
  659. /* Or t1 is scheduled after t2 is. */
  660. fprintf(output, "s%lu - s%lu >= -1e5 + 1e5 t%luafter%lu;\n", t1->id, t2->id, t1->id, t2->id);
  661. }
  662. }
  663. }
  664. }
  665. }
  666. for (t1 = tasks; t1; t1 = t1->next)
  667. for (t2 = t1->next; t2; t2 = t2->next)
  668. if (!ancestor(t1, t2) && !ancestor(t2, t1))
  669. {
  670. fprintf(output, "bin t%luafter%lu;\n", t1->id, t2->id);
  671. if (recordprio && t1->priority != t2->priority)
  672. {
  673. if (t1->priority > t2->priority)
  674. {
  675. if (t1->depsn > 1)
  676. for (i = 0; i < t1->depsn; i++)
  677. fprintf(output, "bin t%lut%lud%d;\n", t2->id, t1->id, i);
  678. }
  679. else
  680. {
  681. if (t2->depsn > 1)
  682. for (i = 0; i < t2->depsn; i++)
  683. fprintf(output, "bin t%lut%lud%d;\n", t1->id, t2->id, i);
  684. }
  685. }
  686. }
  687. for (t1 = tasks; t1; t1 = t1->next)
  688. for (w = 0; w < nw; w++)
  689. fprintf(output, "bin t%luw%d;\n", t1->id, w);
  690. }
  691. else
  692. {
  693. struct bound_task_pool *tp;
  694. nt = 0;
  695. for (tp = task_pools; tp; tp = tp->next)
  696. nt++;
  697. if (!nt)
  698. return;
  699. {
  700. double times[nw*nt];
  701. _starpu_get_tasks_times(nw, nt, times);
  702. fprintf(output, "/* StarPU upper bound linear programming problem, to be run in lp_solve. */\n\n");
  703. fprintf(output, "/* We want to minimize total execution time (ms) */\n");
  704. fprintf(output, "min: tmax;\n\n");
  705. fprintf(output, "/* Which is the maximum of all worker execution times (ms) */\n");
  706. for (w = 0; w < nw; w++)
  707. {
  708. char name[32];
  709. starpu_worker_get_name(w, name, sizeof(name));
  710. fprintf(output, "/* worker %s */\n0", name);
  711. for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
  712. {
  713. if (!isnan(times[w*nt+t]))
  714. fprintf(output, "\t%+f * w%dt%dn", (float) times[w*nt+t], w, t);
  715. }
  716. fprintf(output, " <= tmax;\n");
  717. }
  718. fprintf(output, "\n");
  719. fprintf(output, "/* And we have to have computed exactly all tasks */\n");
  720. for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
  721. {
  722. int got_one = 0;
  723. fprintf(output, "/* task %s key %x */\n0", _starpu_codelet_get_model_name(tp->cl), (unsigned) tp->footprint);
  724. for (w = 0; w < nw; w++)
  725. {
  726. if (isnan(times[w*nt+t]))
  727. fprintf(stderr, "Warning: task %s has no performance measurement for worker %d.\n", _starpu_codelet_get_model_name(tp->cl), w);
  728. else
  729. {
  730. got_one = 1;
  731. fprintf(output, "\t+w%dt%dn", w, t);
  732. }
  733. }
  734. fprintf(output, " = %lu;\n", tp->n);
  735. if (!got_one)
  736. fprintf(stderr, "Warning: task %s has no performance measurement for any worker, system will not be solvable!\n", _starpu_codelet_get_model_name(tp->cl));
  737. /* Show actual values */
  738. fprintf(output, "/*");
  739. for (w = 0; w < nw; w++)
  740. fprintf(output, "\t+%lu", tp->cl->per_worker_stats[w]);
  741. fprintf(output, "\t*/\n\n");
  742. }
  743. fprintf(output, "/* Optionally tell that tasks can not be divided */\n");
  744. fprintf(output, "/* int ");
  745. int first = 1;
  746. for (w = 0; w < nw; w++)
  747. for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
  748. {
  749. if (!first)
  750. fprintf(output, ",");
  751. else
  752. first = 0;
  753. fprintf(output, "w%dt%dn", w, t);
  754. }
  755. fprintf(output, "; */\n");
  756. }
  757. }
  758. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  759. }
  760. /*
  761. * Print bound system in MPS output format
  762. */
  763. void starpu_bound_print_mps(FILE *output)
  764. {
  765. struct bound_task_pool * tp;
  766. int nt; /* Number of different kinds of tasks */
  767. int nw; /* Number of different workers */
  768. int t, w;
  769. if (recorddeps)
  770. {
  771. fprintf(output, "Not supported\n");
  772. return;
  773. }
  774. nw = starpu_worker_get_count();
  775. if (!nw)
  776. /* Make llvm happy about the VLA below */
  777. return;
  778. STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  779. nt = 0;
  780. for (tp = task_pools; tp; tp = tp->next)
  781. nt++;
  782. if (!nt)
  783. {
  784. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  785. return;
  786. }
  787. {
  788. double times[nw*nt];
  789. _starpu_get_tasks_times(nw, nt, times);
  790. fprintf(output, "NAME StarPU theoretical bound\n");
  791. fprintf(output, "*\nROWS\n");
  792. fprintf(output, "* We want to minimize total execution time (ms)\n");
  793. fprintf(output, " N TMAX\n");
  794. fprintf(output, "* Which is the maximum of all worker execution times (ms)\n");
  795. for (w = 0; w < nw; w++)
  796. {
  797. char name[32];
  798. starpu_worker_get_name(w, name, sizeof(name));
  799. fprintf(output, "* worker %s\n", name);
  800. fprintf(output, " L W%d\n", w);
  801. }
  802. fprintf(output, "*\n* And we have to have computed exactly all tasks\n*\n");
  803. for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
  804. {
  805. fprintf(output, "* task %s key %x\n", _starpu_codelet_get_model_name(tp->cl), (unsigned) tp->footprint);
  806. fprintf(output, " E T%d\n", t);
  807. }
  808. fprintf(output, "*\nCOLUMNS\n*\n");
  809. fprintf(output, "*\n* Execution times and completion of all tasks\n*\n");
  810. for (w = 0; w < nw; w++)
  811. for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
  812. if (!isnan(times[w*nt+t]))
  813. {
  814. char name[9];
  815. snprintf(name, sizeof(name), "W%dT%d", w, t);
  816. fprintf(output," %-8s W%-7d %12f\n", name, w, times[w*nt+t]);
  817. fprintf(output," %-8s T%-7d %12d\n", name, t, 1);
  818. }
  819. fprintf(output, "*\n* Total execution time\n*\n");
  820. for (w = 0; w < nw; w++)
  821. fprintf(output," TMAX W%-2d %12d\n", w, -1);
  822. fprintf(output," TMAX TMAX %12d\n", 1);
  823. fprintf(output, "*\nRHS\n*\n");
  824. fprintf(output, "*\n* Total number of tasks\n*\n");
  825. for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
  826. fprintf(output," NT%-2d T%-7d %12lu\n", t, t, tp->n);
  827. fprintf(output, "ENDATA\n");
  828. }
  829. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  830. }
  831. /*
  832. * Solve bound system thanks to GNU Linear Programming Kit backend
  833. */
  834. #ifdef STARPU_HAVE_GLPK_H
  835. static glp_prob *_starpu_bound_glp_resolve(int integer)
  836. {
  837. struct bound_task_pool * tp;
  838. int nt; /* Number of different kinds of tasks */
  839. int nw; /* Number of different workers */
  840. int t, w;
  841. glp_prob *lp;
  842. int ret;
  843. nw = starpu_worker_get_count();
  844. if (!nw)
  845. /* Make llvm happy about the VLA below */
  846. return NULL;
  847. nt = 0;
  848. for (tp = task_pools; tp; tp = tp->next)
  849. nt++;
  850. if (!nt)
  851. return NULL;
  852. lp = glp_create_prob();
  853. glp_set_prob_name(lp, "StarPU theoretical bound");
  854. glp_set_obj_dir(lp, GLP_MIN);
  855. glp_set_obj_name(lp, "total execution time");
  856. {
  857. double times[nw*nt];
  858. int ne =
  859. nw * (nt+1) /* worker execution time */
  860. + nt * nw
  861. + 1; /* glp dumbness */
  862. int n = 1;
  863. int ia[ne], ja[ne];
  864. double ar[ne];
  865. _starpu_get_tasks_times(nw, nt, times);
  866. /* Variables: number of tasks i assigned to worker j, and tmax */
  867. glp_add_cols(lp, nw*nt+1);
  868. #define colnum(w, t) ((t)*nw+(w)+1)
  869. glp_set_obj_coef(lp, nw*nt+1, 1.);
  870. for (w = 0; w < nw; w++)
  871. for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
  872. {
  873. char name[32];
  874. snprintf(name, sizeof(name), "w%dt%dn", w, t);
  875. glp_set_col_name(lp, colnum(w, t), name);
  876. if (integer)
  877. glp_set_col_kind(lp, colnum(w, t), GLP_IV);
  878. glp_set_col_bnds(lp, colnum(w, t), GLP_LO, 0., 0.);
  879. }
  880. glp_set_col_bnds(lp, nw*nt+1, GLP_LO, 0., 0.);
  881. /* Total worker execution time */
  882. glp_add_rows(lp, nw);
  883. for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
  884. {
  885. int someone = 0;
  886. for (w = 0; w < nw; w++)
  887. if (!isnan(times[w*nt+t]))
  888. someone = 1;
  889. if (!someone)
  890. {
  891. /* This task does not have any performance model at all, abort */
  892. glp_delete_prob(lp);
  893. return NULL;
  894. }
  895. }
  896. for (w = 0; w < nw; w++)
  897. {
  898. char name[32], title[64];
  899. starpu_worker_get_name(w, name, sizeof(name));
  900. snprintf(title, sizeof(title), "worker %s", name);
  901. glp_set_row_name(lp, w+1, title);
  902. for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
  903. {
  904. ia[n] = w+1;
  905. ja[n] = colnum(w, t);
  906. if (isnan(times[w*nt+t]))
  907. ar[n] = 1000000000.;
  908. else
  909. ar[n] = times[w*nt+t];
  910. n++;
  911. }
  912. /* tmax */
  913. ia[n] = w+1;
  914. ja[n] = nw*nt+1;
  915. ar[n] = -1;
  916. n++;
  917. glp_set_row_bnds(lp, w+1, GLP_UP, 0, 0);
  918. }
  919. /* Total task completion */
  920. glp_add_rows(lp, nt);
  921. for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
  922. {
  923. char name[32], title[64];
  924. starpu_worker_get_name(w, name, sizeof(name));
  925. snprintf(title, sizeof(title), "task %s key %x", _starpu_codelet_get_model_name(tp->cl), (unsigned) tp->footprint);
  926. glp_set_row_name(lp, nw+t+1, title);
  927. for (w = 0; w < nw; w++)
  928. {
  929. ia[n] = nw+t+1;
  930. ja[n] = colnum(w, t);
  931. ar[n] = 1;
  932. n++;
  933. }
  934. glp_set_row_bnds(lp, nw+t+1, GLP_FX, tp->n, tp->n);
  935. }
  936. STARPU_ASSERT(n == ne);
  937. glp_load_matrix(lp, ne-1, ia, ja, ar);
  938. }
  939. glp_smcp parm;
  940. glp_init_smcp(&parm);
  941. parm.msg_lev = GLP_MSG_OFF;
  942. ret = glp_simplex(lp, &parm);
  943. if (ret)
  944. {
  945. glp_delete_prob(lp);
  946. lp = NULL;
  947. return NULL;
  948. }
  949. if (integer)
  950. {
  951. glp_iocp iocp;
  952. glp_init_iocp(&iocp);
  953. iocp.msg_lev = GLP_MSG_OFF;
  954. glp_intopt(lp, &iocp);
  955. }
  956. return lp;
  957. }
  958. #endif /* STARPU_HAVE_GLPK_H */
  959. /* Print the computed bound as well as the optimized distribution of tasks */
  960. void starpu_bound_print(FILE *output, int integer STARPU_ATTRIBUTE_UNUSED)
  961. {
  962. #ifdef STARPU_HAVE_GLPK_H
  963. if (recorddeps)
  964. {
  965. fprintf(output, "Not supported\n");
  966. return;
  967. }
  968. STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  969. glp_prob *lp = _starpu_bound_glp_resolve(integer);
  970. if (lp)
  971. {
  972. struct bound_task_pool * tp;
  973. int t, w;
  974. int nw; /* Number of different workers */
  975. double tmax;
  976. nw = starpu_worker_get_count();
  977. if (integer)
  978. tmax = glp_mip_obj_val(lp);
  979. else
  980. tmax = glp_get_obj_val(lp);
  981. fprintf(output, "Theoretical minimum execution time: %f ms\n", tmax);
  982. for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
  983. {
  984. fprintf(output, "%s key %x\n", _starpu_codelet_get_model_name(tp->cl), (unsigned) tp->footprint);
  985. for (w = 0; w < nw; w++)
  986. if (integer)
  987. fprintf(output, "\tw%dt%dn %f", w, t, glp_mip_col_val(lp, colnum(w, t)));
  988. else
  989. fprintf(output, "\tw%dt%dn %f", w, t, glp_get_col_prim(lp, colnum(w, t)));
  990. fprintf(output, "\n");
  991. }
  992. glp_delete_prob(lp);
  993. }
  994. else
  995. {
  996. fprintf(stderr, "Simplex failed\n");
  997. }
  998. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  999. #else /* STARPU_HAVE_GLPK_H */
  1000. fprintf(output, "Please rebuild StarPU with glpk installed.\n");
  1001. #endif /* STARPU_HAVE_GLPK_H */
  1002. }
  1003. /* Compute and return the bound */
  1004. void starpu_bound_compute(double *res, double *integer_res STARPU_ATTRIBUTE_UNUSED, int integer STARPU_ATTRIBUTE_UNUSED)
  1005. {
  1006. #ifdef STARPU_HAVE_GLPK_H
  1007. double ret;
  1008. if (recorddeps)
  1009. {
  1010. *res = 0.;
  1011. return;
  1012. }
  1013. STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  1014. glp_prob *lp = _starpu_bound_glp_resolve(integer);
  1015. if (lp)
  1016. {
  1017. ret = glp_get_obj_val(lp);
  1018. if (integer)
  1019. *integer_res = glp_mip_obj_val(lp);
  1020. glp_delete_prob(lp);
  1021. }
  1022. else
  1023. ret = 0.;
  1024. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  1025. *res = ret;
  1026. #else /* STARPU_HAVE_GLPK_H */
  1027. *res = 0.;
  1028. #endif /* STARPU_HAVE_GLPK_H */
  1029. }