bound.c 33 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2010-2020 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  4. * Copyright (C) 2011 Télécom-SudParis
  5. * Copyright (C) 2013 Thibaut Lambert
  6. *
  7. * StarPU is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU Lesser General Public License as published by
  9. * the Free Software Foundation; either version 2.1 of the License, or (at
  10. * your option) any later version.
  11. *
  12. * StarPU is distributed in the hope that it will be useful, but
  13. * WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  15. *
  16. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  17. */
  18. /*
  19. * Record which kinds of tasks have been executed, to later on compute an upper
  20. * bound of the performance that could have theoretically been achieved
  21. */
  22. #include <starpu.h>
  23. #include <starpu_config.h>
  24. #include <profiling/bound.h>
  25. #include <core/jobs.h>
  26. #include <core/workers.h>
  27. #include <datawizard/memory_nodes.h>
  28. #ifdef STARPU_HAVE_GLPK_H
  29. #include <glpk.h>
  30. #endif /* STARPU_HAVE_GLPK_H */
  31. /* TODO: output duration between starpu_bound_start and starpu_bound_stop */
  32. /* TODO: compute critical path and introduce it in the LP */
  33. /*
  34. * Record without dependencies: just count each kind of task
  35. *
  36. * The linear programming problem will just have as variables:
  37. * - the number of tasks of kind `t' executed by worker `w'
  38. * - the total duration
  39. *
  40. * and the constraints will be:
  41. * - the time taken by each worker to complete its assigned tasks is lower than
  42. * the total duration.
  43. * - the total number of tasks of a given kind is equal to the number run by the
  44. * application.
  45. */
  /*
   * One kind of task recorded when dependencies are not tracked: a
   * (codelet, footprint) pair together with how many times it was seen.
   * Entries are chained into a singly-linked list.
   */
  struct bound_task_pool
  {
      struct starpu_codelet *cl;    /* codelet that has been executed */
      uint32_t footprint;           /* task footprint key (for history-based perfmodel) */
      unsigned long n;              /* number of tasks of this kind */
      struct bound_task_pool *next; /* next kind of task in the list */
  };
  57. /*
  58. * Record with dependencies: each task is recorded separately
  59. *
  60. * The linear programming problem will have as variables:
  61. * - The start time of each task
  62. * - The completion time of each tag
  63. * - The total duration
  64. * - For each task and for each worker, whether the task is executing on that worker.
  65. * - For each pair of task, which task is scheduled first.
  66. *
  67. * and the constraints will be:
  68. * - All task start time plus duration are less than total duration
  69. * - Each task is executed on exactly one worker.
  70. * - Each task starts after all its task dependencies finish.
  71. * - Each task starts after all its tag dependencies finish.
  72. * - For each task pair and each worker, if both tasks are executed by that worker,
  73. * one is started after the other's completion.
  74. */
  /* One dependency edge of a recorded task. */
  struct task_dep
  {
      struct bound_task *dep; /* the task this one depends on */
      size_t size;            /* data transferred between the tasks (i.e. implicit data dep size) */
  };
  82. struct bound_task
  83. {
  84. /* Unique ID */
  85. unsigned long id;
  86. /* Tag ID, if any */
  87. starpu_tag_t tag_id;
  88. int use_tag;
  89. /* Which codelet has been executed */
  90. struct starpu_codelet *cl;
  91. /* Task footprint key */
  92. uint32_t footprint;
  93. /* Task priority */
  94. int priority;
  95. /* Tasks this one depends on */
  96. struct task_dep *deps;
  97. int depsn;
  98. /* Estimated duration */
  99. double** duration[STARPU_NARCH];
  100. /* Other tasks */
  101. struct bound_task *next;
  102. };
  103. struct bound_tag_dep
  104. {
  105. starpu_tag_t tag;
  106. starpu_tag_t dep_tag;
  107. struct bound_tag_dep *next;
  108. };
  109. static struct bound_task_pool *task_pools, *last;
  110. static struct bound_task *tasks;
  111. static struct bound_tag_dep *tag_deps;
  112. int _starpu_bound_recording;
  113. static int recorddeps;
  114. static int recordprio;
  115. static starpu_pthread_mutex_t mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
  116. static void _starpu_bound_clear(int record, int deps, int prio)
  117. {
  118. struct bound_task_pool *tp;
  119. struct bound_task *t;
  120. struct bound_tag_dep *td;
  121. STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  122. tp = task_pools;
  123. task_pools = NULL;
  124. last = NULL;
  125. t = tasks;
  126. tasks = NULL;
  127. td = tag_deps;
  128. tag_deps = NULL;
  129. _starpu_bound_recording = record;
  130. recorddeps = deps;
  131. recordprio = prio;
  132. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  133. while (tp != NULL)
  134. {
  135. struct bound_task_pool *next = tp->next;
  136. free(tp);
  137. tp = next;
  138. }
  139. while (t != NULL)
  140. {
  141. struct bound_task *next = t->next;
  142. unsigned i,j;
  143. for (i = 0; i < STARPU_NARCH; i++)
  144. {
  145. if (t->duration[i])
  146. {
  147. for (j = 0; t->duration[i][j]; j++)
  148. free(t->duration[i][j]);
  149. free(t->duration[i]);
  150. }
  151. }
  152. free(t->deps);
  153. free(t);
  154. t = next;
  155. }
  156. while (td != NULL)
  157. {
  158. struct bound_tag_dep *next = td->next;
  159. free(td);
  160. td = next;
  161. }
  162. }
  /* Discard all recorded information and turn recording off */
  void starpu_bound_clear(void)
  {
      _starpu_bound_clear(0 /* record */, 0 /* deps */, 0 /* prio */);
  }
  /*
   * Initialization: discard any previous recording and start a new one.
   * DEPS selects dependency recording, PRIO selects priority handling.
   */
  void starpu_bound_start(int deps, int prio)
  {
      _starpu_bound_clear(1 /* record */, deps, prio);
  }
  172. /* Whether we will include it in the computation */
  173. static int good_job(struct _starpu_job *j)
  174. {
  175. /* No codelet, nothing to measure */
  176. if (j->exclude_from_dag)
  177. return 0;
  178. if (!j->task->cl)
  179. return 0;
  180. /* No performance model, no time duration estimation */
  181. if (!j->task->cl->model)
  182. return 0;
  183. /* Only support history based */
  184. if (j->task->cl->model->type != STARPU_HISTORY_BASED
  185. && j->task->cl->model->type != STARPU_NL_REGRESSION_BASED)
  186. return 0;
  187. return 1;
  188. }
  /*
   * Allocate the duration table of one architecture type.
   *
   * The result has MAXDEVID rows followed by a NULL terminator. Row DEVID
   * holds MAXNCORE_TABLE[DEVID]+1 zero-initialized doubles, or 2 doubles
   * when MAXNCORE_TABLE is NULL.
   */
  static double** initialize_arch_duration(int maxdevid, unsigned* maxncore_table)
  {
      double **table;
      int dev;

      _STARPU_MALLOC(table, sizeof(*table)*(maxdevid+1));
      /* NULL-terminate so that the table can be walked without knowing its size */
      table[maxdevid] = NULL;

      for (dev = 0; dev < maxdevid; dev++)
      {
          int ncores = 1;

          if (maxncore_table != NULL)
              ncores = maxncore_table[dev];
          _STARPU_CALLOC(table[dev], ncores+1,sizeof(*table[dev]));
      }
      return table;
  }
  205. static void initialize_duration(struct bound_task *task)
  206. {
  207. struct _starpu_machine_config *conf = _starpu_get_machine_config();
  208. task->duration[STARPU_CPU_WORKER] = initialize_arch_duration(1,&conf->topology.nhwcpus);
  209. task->duration[STARPU_CUDA_WORKER] = initialize_arch_duration(conf->topology.nhwcudagpus,NULL);
  210. task->duration[STARPU_OPENCL_WORKER] = initialize_arch_duration(conf->topology.nhwopenclgpus,NULL);
  211. task->duration[STARPU_MIC_WORKER] = initialize_arch_duration(conf->topology.nhwmicdevices,conf->topology.nmiccores);
  212. }
  213. static struct starpu_perfmodel_device device =
  214. {
  215. .type = STARPU_CPU_WORKER,
  216. .devid = 0,
  217. .ncores = 1,
  218. };
  219. static struct starpu_perfmodel_arch dumb_arch =
  220. {
  221. .ndevices = 1,
  222. .devices = &device,
  223. };
  224. /* Create a new task (either because it has just been submitted, or a
  225. * dependency was added before submission) */
  226. static void new_task(struct _starpu_job *j)
  227. {
  228. struct bound_task *t;
  229. if (j->bound_task)
  230. return;
  231. _STARPU_CALLOC(t, 1, sizeof(*t));
  232. t->id = j->job_id;
  233. t->tag_id = j->task->tag_id;
  234. t->use_tag = j->task->use_tag;
  235. t->cl = j->task->cl;
  236. t->footprint = _starpu_compute_buffers_footprint(j->task->cl?j->task->cl->model:NULL, &dumb_arch, 0, j);
  237. t->priority = j->task->priority;
  238. t->deps = NULL;
  239. t->depsn = 0;
  240. initialize_duration(t);
  241. t->next = tasks;
  242. j->bound_task = t;
  243. tasks = t;
  244. }
  245. /* A new task was submitted, record it */
  246. void _starpu_bound_record(struct _starpu_job *j)
  247. {
  248. if (STARPU_LIKELY(!_starpu_bound_recording))
  249. return;
  250. if (!good_job(j))
  251. return;
  252. STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  253. /* Re-check, this time with mutex held */
  254. if (!_starpu_bound_recording)
  255. {
  256. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  257. return;
  258. }
  259. if (recorddeps)
  260. {
  261. new_task(j);
  262. }
  263. else
  264. {
  265. struct bound_task_pool *tp;
  266. _starpu_compute_buffers_footprint(j->task->cl?j->task->cl->model:NULL, NULL, 0, j);
  267. if (last && last->cl == j->task->cl && last->footprint == j->footprint)
  268. tp = last;
  269. else
  270. for (tp = task_pools; tp; tp = tp->next)
  271. if (tp->cl == j->task->cl && tp->footprint == j->footprint)
  272. break;
  273. if (!tp)
  274. {
  275. _STARPU_MALLOC(tp, sizeof(*tp));
  276. tp->cl = j->task->cl;
  277. tp->footprint = j->footprint;
  278. tp->n = 0;
  279. tp->next = task_pools;
  280. task_pools = tp;
  281. }
  282. /* One more task of this kind */
  283. tp->n++;
  284. }
  285. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  286. }
  287. /* A tag dependency was emitted, record it */
  288. void _starpu_bound_tag_dep(starpu_tag_t id, starpu_tag_t dep_id)
  289. {
  290. struct bound_tag_dep *td;
  291. if (!_starpu_bound_recording || !recorddeps)
  292. return;
  293. STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  294. /* Re-check, this time with mutex held */
  295. if (!_starpu_bound_recording || !recorddeps)
  296. {
  297. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  298. return;
  299. }
  300. _STARPU_MALLOC(td, sizeof(*td));
  301. td->tag = id;
  302. td->dep_tag = dep_id;
  303. td->next = tag_deps;
  304. tag_deps = td;
  305. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  306. }
  307. /* A task dependency was emitted, record it */
  308. void _starpu_bound_task_dep(struct _starpu_job *j, struct _starpu_job *dep_j)
  309. {
  310. struct bound_task *t;
  311. int i;
  312. if (!_starpu_bound_recording || !recorddeps)
  313. return;
  314. if (!good_job(j) || !good_job(dep_j))
  315. return;
  316. STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  317. /* Re-check, this time with mutex held */
  318. if (!_starpu_bound_recording || !recorddeps)
  319. {
  320. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  321. return;
  322. }
  323. new_task(j);
  324. new_task(dep_j);
  325. t = j->bound_task;
  326. for (i = 0; i < t->depsn; i++)
  327. if (t->deps[i].dep == dep_j->bound_task)
  328. break;
  329. if (i == t->depsn)
  330. {
  331. /* Not already there, add */
  332. _STARPU_REALLOC(t->deps, ++t->depsn * sizeof(t->deps[0]));
  333. t->deps[t->depsn-1].dep = dep_j->bound_task;
  334. t->deps[t->depsn-1].size = 0; /* We don't have data information in that case */
  335. }
  336. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  337. }
  338. /* Look for job with id ID among our tasks */
  339. static struct bound_task *find_job(unsigned long id)
  340. {
  341. struct bound_task *t;
  342. for (t = tasks; t; t = t->next)
  343. if (t->id == id)
  344. return t;
  345. return NULL;
  346. }
  347. /* Job J depends on previous job of id ID (which is already finished) */
  348. void _starpu_bound_job_id_dep_size(size_t size, struct _starpu_job *j, unsigned long id)
  349. {
  350. struct bound_task *t, *dep_t;
  351. int i;
  352. if (!_starpu_bound_recording || !recorddeps)
  353. return;
  354. if (!good_job(j))
  355. return;
  356. STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  357. /* Re-check, this time with mutex held */
  358. if (!_starpu_bound_recording || !recorddeps)
  359. {
  360. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  361. return;
  362. }
  363. new_task(j);
  364. dep_t = find_job(id);
  365. if (!dep_t)
  366. {
  367. _STARPU_MSG("dependency %lu not found !\n", id);
  368. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  369. return;
  370. }
  371. t = j->bound_task;
  372. for (i = 0; i < t->depsn; i++)
  373. if (t->deps[i].dep == dep_t)
  374. {
  375. /* Found, just add size */
  376. t->deps[i].size += size;
  377. break;
  378. }
  379. if (i == t->depsn)
  380. {
  381. /* Not already there, add */
  382. _STARPU_REALLOC(t->deps, ++t->depsn * sizeof(t->deps[0]));
  383. t->deps[t->depsn-1].dep = dep_t;
  384. t->deps[t->depsn-1].size = size;
  385. }
  386. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  387. }
  388. void _starpu_bound_job_id_dep(starpu_data_handle_t handle, struct _starpu_job *j, unsigned long id)
  389. {
  390. if (!_starpu_bound_recording || !recorddeps)
  391. return;
  392. if (!good_job(j))
  393. return;
  394. _starpu_bound_job_id_dep_size(_starpu_data_get_size(handle), j, id);
  395. }
  396. void starpu_bound_stop(void)
  397. {
  398. STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  399. _starpu_bound_recording = 0;
  400. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  401. }
  402. /* Compute all tasks times on all workers */
  403. static void _starpu_get_tasks_times(int nw, int nt, double *times)
  404. {
  405. struct bound_task_pool *tp;
  406. int w, t;
  407. for (w = 0; w < nw; w++)
  408. {
  409. for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
  410. {
  411. struct _starpu_job j =
  412. {
  413. .footprint = tp->footprint,
  414. .footprint_is_computed = 1,
  415. };
  416. struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
  417. double length = _starpu_history_based_job_expected_perf(tp->cl->model, arch, &j, j.nimpl);
  418. if (isnan(length))
  419. times[w*nt+t] = NAN;
  420. else
  421. times[w*nt+t] = length / 1000.;
  422. }
  423. }
  424. }
  425. /* Return whether PARENT is an ancestor of CHILD */
  426. static int ancestor(struct bound_task *child, struct bound_task *parent)
  427. {
  428. int i;
  429. for (i = 0; i < child->depsn; i++)
  430. {
  431. if (parent == child->deps[i].dep)
  432. return 1;
  433. if (ancestor(child->deps[i].dep, parent))
  434. return -1;
  435. }
  436. return 0;
  437. }
  438. /* Print bound recording in .dot format */
  439. void starpu_bound_print_dot(FILE *output)
  440. {
  441. struct bound_task *t;
  442. struct bound_tag_dep *td;
  443. int i;
  444. if (!recorddeps)
  445. {
  446. fprintf(output, "Not supported\n");
  447. return;
  448. }
  449. fprintf(output, "strict digraph bounddeps {\n");
  450. for (t = tasks; t; t = t->next)
  451. {
  452. fprintf(output, "\"t%lu\" [label=\"%lu: %s\"]\n", t->id, t->id, _starpu_codelet_get_model_name(t->cl));
  453. for (i = 0; i < t->depsn; i++)
  454. fprintf(output, "\"t%lu\" -> \"t%lu\"\n", t->deps[i].dep->id, t->id);
  455. }
  456. for (td = tag_deps; td; td = td->next)
  457. fprintf(output, "\"tag%lu\" -> \"tag%lu\";\n", (unsigned long) td->dep_tag, (unsigned long) td->tag);
  458. fprintf(output, "}\n");
  459. }
  460. /*
  461. * Print bound system in lp_solve format
  462. *
  463. * When dependencies are enabled, you can check the set of tasks and deps that
  464. * were recorded by using tools/lp2paje and vite.
  465. */
  466. void starpu_bound_print_lp(FILE *output)
  467. {
  468. int nt; /* Number of different kinds of tasks */
  469. int nw; /* Number of different workers */
  470. int t;
  471. int w, w2; /* worker */
  472. unsigned n, n2;
  473. STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  474. nw = starpu_worker_get_count();
  475. if (!nw)
  476. /* Make llvm happy about the VLA below */
  477. return;
  478. if (recorddeps)
  479. {
  480. struct bound_task *t1, *t2;
  481. struct bound_tag_dep *td;
  482. int i;
  483. nt = 0;
  484. for (t1 = tasks; t1; t1 = t1->next)
  485. {
  486. if (t1->cl->model->type != STARPU_HISTORY_BASED &&
  487. t1->cl->model->type != STARPU_NL_REGRESSION_BASED)
  488. /* TODO: */
  489. _STARPU_MSG("Warning: task %s uses a perf model which is neither history nor non-linear regression-based, support for such model is not implemented yet, system will not be solvable.\n", _starpu_codelet_get_model_name(t1->cl));
  490. struct _starpu_job j =
  491. {
  492. .footprint = t1->footprint,
  493. .footprint_is_computed = 1,
  494. };
  495. for (w = 0; w < nw; w++)
  496. {
  497. struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
  498. if (_STARPU_IS_ZERO(t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores]))
  499. {
  500. double length = _starpu_history_based_job_expected_perf(t1->cl->model, arch, &j,j.nimpl);
  501. if (isnan(length))
  502. /* Avoid problems with binary coding of doubles */
  503. t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores] = NAN;
  504. else
  505. t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores] = length / 1000.;
  506. }
  507. }
  508. nt++;
  509. }
  510. if (!nt)
  511. return;
  512. fprintf(output, "/* StarPU upper bound linear programming problem, to be run in lp_solve. */\n\n");
  513. fprintf(output, "/* !! This is a big system, it will be long to solve !! */\n\n");
  514. fprintf(output, "/* We want to minimize total execution time (ms) */\n");
  515. fprintf(output, "min: tmax;\n\n");
  516. fprintf(output, "/* Number of tasks */\n");
  517. fprintf(output, "nt = %d;\n", nt);
  518. fprintf(output, "/* Number of workers */\n");
  519. fprintf(output, "nw = %d;\n", nw);
  520. fprintf(output, "/* The total execution time is the maximum of all task completion times (ms) */\n");
  521. for (t1 = tasks; t1; t1 = t1->next)
  522. fprintf(output, "c%lu <= tmax;\n", t1->id);
  523. fprintf(output, "\n/* We have tasks executing on workers, exactly one worker executes each task */\n");
  524. for (t1 = tasks; t1; t1 = t1->next)
  525. {
  526. for (w = 0; w < nw; w++)
  527. {
  528. struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
  529. if (!isnan(t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores]))
  530. fprintf(output, " +t%luw%d", t1->id, w);
  531. }
  532. fprintf(output, " = 1;\n");
  533. }
  534. fprintf(output, "\n/* Completion time is start time plus computation time */\n");
  535. fprintf(output, "/* According to where the task is indeed executed */\n");
  536. for (t1 = tasks; t1; t1 = t1->next)
  537. {
  538. fprintf(output, "/* %s %x */\tc%lu = s%lu", _starpu_codelet_get_model_name(t1->cl), (unsigned) t1->footprint, t1->id, t1->id);
  539. for (w = 0; w < nw; w++)
  540. {
  541. struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
  542. if (!isnan(t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores]))
  543. fprintf(output, " + %f t%luw%d", t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores], t1->id, w);
  544. }
  545. fprintf(output, ";\n");
  546. }
  547. fprintf(output, "\n/* Each task starts after all its task dependencies finish and data is transferred. */\n");
  548. fprintf(output, "/* Note that the dependency finish time depends on the worker where it's working */\n");
  549. for (t1 = tasks; t1; t1 = t1->next)
  550. for (i = 0; i < t1->depsn; i++)
  551. {
  552. fprintf(output, "/* %lu bytes transferred */\n", (unsigned long) t1->deps[i].size);
  553. fprintf(output, "s%lu >= c%lu", t1->id, t1->deps[i].dep->id);
  554. /* Transfer time: pick up one source node and a worker on it */
  555. for (n = 0; n < starpu_memory_nodes_get_count(); n++)
  556. for (w = 0; w < nw; w++)
  557. if (starpu_worker_get_memory_node(w) == n)
  558. {
  559. /* pick up another destination node and a worker on it */
  560. for (n2 = 0; n2 < starpu_memory_nodes_get_count(); n2++)
  561. if (n2 != n)
  562. {
  563. for (w2 = 0; w2 < nw; w2++)
  564. if (starpu_worker_get_memory_node(w2) == n2)
  565. {
  566. /* If predecessor is on worker w and successor
  567. * on worker w2 on different nodes, we need to
  568. * transfer the data. */
  569. fprintf(output, " + d_t%luw%dt%luw%d", t1->deps[i].dep->id, w, t1->id, w2);
  570. }
  571. }
  572. }
  573. fprintf(output, ";\n");
  574. /* Transfer time: pick up one source node and a worker on it */
  575. for (n = 0; n < starpu_memory_nodes_get_count(); n++)
  576. for (w = 0; w < nw; w++)
  577. if (starpu_worker_get_memory_node(w) == n)
  578. {
  579. /* pick up another destination node and a worker on it */
  580. for (n2 = 0; n2 < starpu_memory_nodes_get_count(); n2++)
  581. if (n2 != n)
  582. {
  583. for (w2 = 0; w2 < nw; w2++)
  584. if (starpu_worker_get_memory_node(w2) == n2)
  585. {
  586. /* The data transfer is at least 0ms */
  587. fprintf(output, "d_t%luw%dt%luw%d >= 0;\n", t1->deps[i].dep->id, w, t1->id, w2);
  588. /* The data transfer from w to w2 only happens if tasks run there */
  589. fprintf(output, "d_t%luw%dt%luw%d >= %f - 2e5 + 1e5 t%luw%d + 1e5 t%luw%d;\n",
  590. t1->deps[i].dep->id, w, t1->id, w2,
  591. starpu_transfer_predict(n, n2, t1->deps[i].size)/1000.,
  592. t1->deps[i].dep->id, w, t1->id, w2);
  593. }
  594. }
  595. }
  596. }
  597. fprintf(output, "\n/* Each tag finishes when its corresponding task finishes */\n");
  598. for (t1 = tasks; t1; t1 = t1->next)
  599. if (t1->use_tag)
  600. {
  601. for (w = 0; w < nw; w++)
  602. fprintf(output, "c%lu = tag%lu;\n", t1->id, (unsigned long) t1->tag_id);
  603. }
  604. fprintf(output, "\n/* tags start after all their tag dependencies finish. */\n");
  605. for (td = tag_deps; td; td = td->next)
  606. fprintf(output, "tag%lu >= tag%lu;\n", (unsigned long) td->tag, (unsigned long) td->dep_tag);
  607. /* TODO: factorize ancestor calls */
  608. fprintf(output, "\n/* For each task pair and each worker, if both tasks are executed by the same worker,\n");
  609. fprintf(output, " one is started after the other's completion */\n");
  610. for (t1 = tasks; t1; t1 = t1->next)
  611. {
  612. for (t2 = t1->next; t2; t2 = t2->next)
  613. {
  614. if (!ancestor(t1, t2) && !ancestor(t2, t1))
  615. {
  616. for (w = 0; w < nw; w++)
  617. {
  618. struct starpu_perfmodel_arch* arch = starpu_worker_get_perf_archtype(w, STARPU_NMAX_SCHED_CTXS);
  619. if (!isnan(t1->duration[arch->devices[0].type][arch->devices[0].devid][arch->devices[0].ncores]))
  620. {
  621. fprintf(output, "s%lu - c%lu >= -3e5 + 1e5 t%luw%d + 1e5 t%luw%d + 1e5 t%luafter%lu;\n",
  622. t1->id, t2->id, t1->id, w, t2->id, w, t1->id, t2->id);
  623. fprintf(output, "s%lu - c%lu >= -2e5 + 1e5 t%luw%d + 1e5 t%luw%d - 1e5 t%luafter%lu;\n",
  624. t2->id, t1->id, t1->id, w, t2->id, w, t1->id, t2->id);
  625. }
  626. }
  627. }
  628. }
  629. }
  630. #if 0
  631. /* Doesn't help at all to actually express what "after" means */
  632. for (t1 = tasks; t1; t1 = t1->next)
  633. for (t2 = t1->next; t2; t2 = t2->next)
  634. if (!ancestor(t1, t2) && !ancestor(t2, t1))
  635. {
  636. fprintf(output, "s%lu - s%lu >= -1e5 + 1e5 t%luafter%lu;\n", t1->id, t2->id, t1->id, t2->id);
  637. fprintf(output, "s%lu - s%lu >= -1e5 t%luafter%lu;\n", t2->id, t1->id, t1->id, t2->id);
  638. }
  639. #endif
  640. if (recordprio)
  641. {
  642. fprintf(output, "\n/* For StarPU, a priority means given schedulable tasks it will consider the\n");
  643. fprintf(output, " * more prioritized first */\n");
  644. for (t1 = tasks; t1; t1 = t1->next)
  645. {
  646. for (t2 = t1->next; t2; t2 = t2->next)
  647. {
  648. if (!ancestor(t1, t2) && !ancestor(t2, t1)
  649. && t1->priority != t2->priority)
  650. {
  651. if (t1->priority > t2->priority)
  652. {
  653. /* Either t2 is scheduled before t1, but then it
  654. needs to be scheduled before some t dep finishes */
  655. /* One of the t1 deps to give the maximum start time for t2 */
  656. if (t1->depsn > 1)
  657. {
  658. for (i = 0; i < t1->depsn; i++)
  659. fprintf(output, " + t%lut%lud%d", t2->id, t1->id, i);
  660. fprintf(output, " = 1;\n");
  661. }
  662. for (i = 0; i < t1->depsn; i++)
  663. {
  664. fprintf(output, "c%lu - s%lu >= ", t1->deps[i].dep->id, t2->id);
  665. if (t1->depsn > 1)
  666. /* Only checks this when it's this dependency that is chosen */
  667. fprintf(output, "-2e5 + 1e5 t%lut%lud%d", t2->id, t1->id, i);
  668. else
  669. fprintf(output, "-1e5");
  670. /* Only check this if t1 is after t2 */
  671. fprintf(output, " + 1e5 t%luafter%lu", t1->id, t2->id);
  672. fprintf(output, ";\n");
  673. }
  674. /* Or t2 is scheduled after t1 is. */
  675. fprintf(output, "s%lu - s%lu >= -1e5 t%luafter%lu;\n", t2->id, t1->id, t1->id, t2->id);
  676. }
  677. else
  678. {
  679. /* Either t1 is scheduled before t2, but then it
  680. needs to be scheduled before some t2 dep finishes */
  681. /* One of the t2 deps to give the maximum start time for t1 */
  682. if (t2->depsn > 1)
  683. {
  684. for (i = 0; i < t2->depsn; i++)
  685. fprintf(output, " + t%lut%lud%d", t1->id, t2->id, i);
  686. fprintf(output, " = 1;\n");
  687. }
  688. for (i = 0; i < t2->depsn; i++)
  689. {
  690. fprintf(output, "c%lu - s%lu >= ", t2->deps[i].dep->id, t1->id);
  691. if (t2->depsn > 1)
  692. /* Only checks this when it's this dependency that is chosen */
  693. fprintf(output, "-1e5 + 1e5 t%lut%lud%d", t1->id, t2->id, i);
  694. /* Only check this if t2 is after t1 */
  695. fprintf(output, " - 1e5 t%luafter%lu;\n", t1->id, t2->id);
  696. }
  697. /* Or t1 is scheduled after t2 is. */
  698. fprintf(output, "s%lu - s%lu >= -1e5 + 1e5 t%luafter%lu;\n", t1->id, t2->id, t1->id, t2->id);
  699. }
  700. }
  701. }
  702. }
  703. }
  704. for (t1 = tasks; t1; t1 = t1->next)
  705. for (t2 = t1->next; t2; t2 = t2->next)
  706. if (!ancestor(t1, t2) && !ancestor(t2, t1))
  707. {
  708. fprintf(output, "bin t%luafter%lu;\n", t1->id, t2->id);
  709. if (recordprio && t1->priority != t2->priority)
  710. {
  711. if (t1->priority > t2->priority)
  712. {
  713. if (t1->depsn > 1)
  714. for (i = 0; i < t1->depsn; i++)
  715. fprintf(output, "bin t%lut%lud%d;\n", t2->id, t1->id, i);
  716. }
  717. else
  718. {
  719. if (t2->depsn > 1)
  720. for (i = 0; i < t2->depsn; i++)
  721. fprintf(output, "bin t%lut%lud%d;\n", t1->id, t2->id, i);
  722. }
  723. }
  724. }
  725. for (t1 = tasks; t1; t1 = t1->next)
  726. for (w = 0; w < nw; w++)
  727. fprintf(output, "bin t%luw%d;\n", t1->id, w);
  728. }
  729. else
  730. {
  731. struct bound_task_pool *tp;
  732. nt = 0;
  733. for (tp = task_pools; tp; tp = tp->next)
  734. nt++;
  735. if (!nt)
  736. return;
  737. {
  738. double times[nw*nt];
  739. _starpu_get_tasks_times(nw, nt, times);
  740. fprintf(output, "/* StarPU upper bound linear programming problem, to be run in lp_solve. */\n\n");
  741. fprintf(output, "/* We want to minimize total execution time (ms) */\n");
  742. fprintf(output, "min: tmax;\n\n");
  743. fprintf(output, "/* Which is the maximum of all worker execution times (ms) */\n");
  744. for (w = 0; w < nw; w++)
  745. {
  746. char name[32];
  747. starpu_worker_get_name(w, name, sizeof(name));
  748. fprintf(output, "/* worker %s */\n0", name);
  749. for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
  750. {
  751. if (!isnan(times[w*nt+t]))
  752. fprintf(output, "\t%+f * w%dt%dn", (float) times[w*nt+t], w, t);
  753. }
  754. fprintf(output, " <= tmax;\n");
  755. }
  756. fprintf(output, "\n");
  757. fprintf(output, "/* And we have to have computed exactly all tasks */\n");
  758. for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
  759. {
  760. int got_one = 0;
  761. fprintf(output, "/* task %s key %x */\n0", _starpu_codelet_get_model_name(tp->cl), (unsigned) tp->footprint);
  762. for (w = 0; w < nw; w++)
  763. {
  764. if (isnan(times[w*nt+t]))
  765. _STARPU_MSG("Warning: task %s has no performance measurement for worker %d.\n", _starpu_codelet_get_model_name(tp->cl), w);
  766. else
  767. {
  768. got_one = 1;
  769. fprintf(output, "\t+w%dt%dn", w, t);
  770. }
  771. }
  772. fprintf(output, " = %lu;\n", tp->n);
  773. if (!got_one)
  774. _STARPU_MSG("Warning: task %s has no performance measurement for any worker, system will not be solvable!\n", _starpu_codelet_get_model_name(tp->cl));
  775. /* Show actual values */
  776. fprintf(output, "/*");
  777. for (w = 0; w < nw; w++)
  778. fprintf(output, "\t+%lu", tp->cl->per_worker_stats[w]);
  779. fprintf(output, "\t*/\n\n");
  780. }
  781. fprintf(output, "/* Optionally tell that tasks can not be divided */\n");
  782. fprintf(output, "/* int ");
  783. int first = 1;
  784. for (w = 0; w < nw; w++)
  785. for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
  786. {
  787. if (!first)
  788. fprintf(output, ",");
  789. else
  790. first = 0;
  791. fprintf(output, "w%dt%dn", w, t);
  792. }
  793. fprintf(output, "; */\n");
  794. }
  795. }
  796. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  797. }
  798. /*
  799. * Print bound system in MPS output format
  800. */
  801. void starpu_bound_print_mps(FILE *output)
  802. {
  803. struct bound_task_pool * tp;
  804. int nt; /* Number of different kinds of tasks */
  805. int nw; /* Number of different workers */
  806. int t, w;
  807. if (recorddeps)
  808. {
  809. fprintf(output, "Not supported\n");
  810. return;
  811. }
  812. nw = starpu_worker_get_count();
  813. if (!nw)
  814. /* Make llvm happy about the VLA below */
  815. return;
  816. STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  817. nt = 0;
  818. for (tp = task_pools; tp; tp = tp->next)
  819. nt++;
  820. if (!nt)
  821. {
  822. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  823. return;
  824. }
  825. {
  826. double times[nw*nt];
  827. _starpu_get_tasks_times(nw, nt, times);
  828. fprintf(output, "NAME StarPU theoretical bound\n");
  829. fprintf(output, "*\nROWS\n");
  830. fprintf(output, "* We want to minimize total execution time (ms)\n");
  831. fprintf(output, " N TMAX\n");
  832. fprintf(output, "* Which is the maximum of all worker execution times (ms)\n");
  833. for (w = 0; w < nw; w++)
  834. {
  835. char name[32];
  836. starpu_worker_get_name(w, name, sizeof(name));
  837. fprintf(output, "* worker %s\n", name);
  838. fprintf(output, " L W%d\n", w);
  839. }
  840. fprintf(output, "*\n* And we have to have computed exactly all tasks\n*\n");
  841. for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
  842. {
  843. fprintf(output, "* task %s key %x\n", _starpu_codelet_get_model_name(tp->cl), (unsigned) tp->footprint);
  844. fprintf(output, " E T%d\n", t);
  845. }
  846. fprintf(output, "*\nCOLUMNS\n*\n");
  847. fprintf(output, "*\n* Execution times and completion of all tasks\n*\n");
  848. for (w = 0; w < nw; w++)
  849. for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
  850. if (!isnan(times[w*nt+t]))
  851. {
  852. char name[23];
  853. snprintf(name, sizeof(name), "W%dT%d", w, t);
  854. fprintf(output," %-8s W%-7d %12f\n", name, w, times[w*nt+t]);
  855. fprintf(output," %-8s T%-7d %12d\n", name, t, 1);
  856. }
  857. fprintf(output, "*\n* Total execution time\n*\n");
  858. for (w = 0; w < nw; w++)
  859. fprintf(output," TMAX W%-2d %12d\n", w, -1);
  860. fprintf(output," TMAX TMAX %12d\n", 1);
  861. fprintf(output, "*\nRHS\n*\n");
  862. fprintf(output, "*\n* Total number of tasks\n*\n");
  863. for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
  864. fprintf(output," NT%-2d T%-7d %12lu\n", t, t, tp->n);
  865. fprintf(output, "ENDATA\n");
  866. }
  867. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  868. }
  869. /*
  870. * Solve bound system thanks to GNU Linear Programming Kit backend
  871. */
  872. #ifdef STARPU_HAVE_GLPK_H
  873. static glp_prob *_starpu_bound_glp_resolve(int integer)
  874. {
  875. struct bound_task_pool * tp;
  876. int nt; /* Number of different kinds of tasks */
  877. int nw; /* Number of different workers */
  878. int t, w;
  879. glp_prob *lp;
  880. int ret;
  881. nw = starpu_worker_get_count();
  882. if (!nw)
  883. /* Make llvm happy about the VLA below */
  884. return NULL;
  885. nt = 0;
  886. for (tp = task_pools; tp; tp = tp->next)
  887. nt++;
  888. if (!nt)
  889. return NULL;
  890. lp = glp_create_prob();
  891. glp_set_prob_name(lp, "StarPU theoretical bound");
  892. glp_set_obj_dir(lp, GLP_MIN);
  893. glp_set_obj_name(lp, "total execution time");
  894. {
  895. double times[nw*nt];
  896. int ne =
  897. nw * (nt+1) /* worker execution time */
  898. + nt * nw
  899. + 1; /* glp dumbness */
  900. int n = 1;
  901. int ia[ne], ja[ne];
  902. double ar[ne];
  903. _starpu_get_tasks_times(nw, nt, times);
  904. /* Variables: number of tasks i assigned to worker j, and tmax */
  905. glp_add_cols(lp, nw*nt+1);
  906. #define colnum(w, t) ((t)*nw+(w)+1)
  907. glp_set_obj_coef(lp, nw*nt+1, 1.);
  908. for (w = 0; w < nw; w++)
  909. for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
  910. {
  911. char name[32];
  912. snprintf(name, sizeof(name), "w%dt%dn", w, t);
  913. glp_set_col_name(lp, colnum(w, t), name);
  914. if (integer)
  915. glp_set_col_kind(lp, colnum(w, t), GLP_IV);
  916. glp_set_col_bnds(lp, colnum(w, t), GLP_LO, 0., 0.);
  917. }
  918. glp_set_col_bnds(lp, nw*nt+1, GLP_LO, 0., 0.);
  919. /* Total worker execution time */
  920. glp_add_rows(lp, nw);
  921. for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
  922. {
  923. int someone = 0;
  924. for (w = 0; w < nw; w++)
  925. if (!isnan(times[w*nt+t]))
  926. someone = 1;
  927. if (!someone)
  928. {
  929. /* This task does not have any performance model at all, abort */
  930. glp_delete_prob(lp);
  931. return NULL;
  932. }
  933. }
  934. for (w = 0; w < nw; w++)
  935. {
  936. char name[32], title[64];
  937. starpu_worker_get_name(w, name, sizeof(name));
  938. snprintf(title, sizeof(title), "worker %s", name);
  939. glp_set_row_name(lp, w+1, title);
  940. for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
  941. {
  942. ia[n] = w+1;
  943. ja[n] = colnum(w, t);
  944. if (isnan(times[w*nt+t]))
  945. ar[n] = 1000000000.;
  946. else
  947. ar[n] = times[w*nt+t];
  948. n++;
  949. }
  950. /* tmax */
  951. ia[n] = w+1;
  952. ja[n] = nw*nt+1;
  953. ar[n] = -1;
  954. n++;
  955. glp_set_row_bnds(lp, w+1, GLP_UP, 0, 0);
  956. }
  957. /* Total task completion */
  958. glp_add_rows(lp, nt);
  959. for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
  960. {
  961. char name[32], title[64];
  962. starpu_worker_get_name(w, name, sizeof(name));
  963. snprintf(title, sizeof(title), "task %s key %x", _starpu_codelet_get_model_name(tp->cl), (unsigned) tp->footprint);
  964. glp_set_row_name(lp, nw+t+1, title);
  965. for (w = 0; w < nw; w++)
  966. {
  967. ia[n] = nw+t+1;
  968. ja[n] = colnum(w, t);
  969. ar[n] = 1;
  970. n++;
  971. }
  972. glp_set_row_bnds(lp, nw+t+1, GLP_FX, tp->n, tp->n);
  973. }
  974. STARPU_ASSERT(n == ne);
  975. glp_load_matrix(lp, ne-1, ia, ja, ar);
  976. }
  977. glp_smcp parm;
  978. glp_init_smcp(&parm);
  979. parm.msg_lev = GLP_MSG_OFF;
  980. ret = glp_simplex(lp, &parm);
  981. if (ret)
  982. {
  983. glp_delete_prob(lp);
  984. lp = NULL;
  985. return NULL;
  986. }
  987. if (integer)
  988. {
  989. glp_iocp iocp;
  990. glp_init_iocp(&iocp);
  991. iocp.msg_lev = GLP_MSG_OFF;
  992. glp_intopt(lp, &iocp);
  993. }
  994. return lp;
  995. }
  996. #endif /* STARPU_HAVE_GLPK_H */
  997. /* Print the computed bound as well as the optimized distribution of tasks */
  998. void starpu_bound_print(FILE *output, int integer)
  999. {
  1000. #ifdef STARPU_HAVE_GLPK_H
  1001. if (recorddeps)
  1002. {
  1003. fprintf(output, "Not supported\n");
  1004. return;
  1005. }
  1006. STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  1007. glp_prob *lp = _starpu_bound_glp_resolve(integer);
  1008. if (lp)
  1009. {
  1010. struct bound_task_pool * tp;
  1011. int t, w;
  1012. int nw; /* Number of different workers */
  1013. double tmax;
  1014. nw = starpu_worker_get_count();
  1015. if (integer)
  1016. tmax = glp_mip_obj_val(lp);
  1017. else
  1018. tmax = glp_get_obj_val(lp);
  1019. fprintf(output, "Theoretical minimum execution time: %f ms\n", tmax);
  1020. for (t = 0, tp = task_pools; tp; t++, tp = tp->next)
  1021. {
  1022. fprintf(output, "%s key %x\n", _starpu_codelet_get_model_name(tp->cl), (unsigned) tp->footprint);
  1023. for (w = 0; w < nw; w++)
  1024. if (integer)
  1025. fprintf(output, "\tw%dt%dn %f", w, t, glp_mip_col_val(lp, colnum(w, t)));
  1026. else
  1027. fprintf(output, "\tw%dt%dn %f", w, t, glp_get_col_prim(lp, colnum(w, t)));
  1028. fprintf(output, "\n");
  1029. }
  1030. glp_delete_prob(lp);
  1031. }
  1032. else
  1033. {
  1034. _STARPU_MSG("Simplex failed\n");
  1035. }
  1036. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  1037. #else /* STARPU_HAVE_GLPK_H */
  1038. (void) integer;
  1039. fprintf(output, "Please rebuild StarPU with glpk installed.\n");
  1040. #endif /* STARPU_HAVE_GLPK_H */
  1041. }
  1042. /* Compute and return the bound */
  1043. void starpu_bound_compute(double *res, double *integer_res, int integer)
  1044. {
  1045. #ifdef STARPU_HAVE_GLPK_H
  1046. double ret;
  1047. if (recorddeps)
  1048. {
  1049. *res = 0.;
  1050. return;
  1051. }
  1052. STARPU_PTHREAD_MUTEX_LOCK(&mutex);
  1053. glp_prob *lp = _starpu_bound_glp_resolve(integer);
  1054. if (lp)
  1055. {
  1056. ret = glp_get_obj_val(lp);
  1057. if (integer)
  1058. *integer_res = glp_mip_obj_val(lp);
  1059. glp_delete_prob(lp);
  1060. }
  1061. else
  1062. ret = 0.;
  1063. STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
  1064. *res = ret;
  1065. #else /* STARPU_HAVE_GLPK_H */
  1066. (void) integer_res;
  1067. (void) integer;
  1068. *res = 0.;
  1069. #endif /* STARPU_HAVE_GLPK_H */
  1070. }