/* ispeed_lp_policy.c */
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2011, 2012 INRIA
  4. *
  5. * StarPU is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * StarPU is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. *
  14. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. */
  16. #include <starpu_config.h>
  17. #include "sc_hypervisor_lp.h"
  18. #include "sc_hypervisor_policy.h"
  19. #include <math.h>
  20. #include <sys/time.h>
/* Per-call data handed to the GLPK solver callback (_glp_resolve)
 * through the opaque void* of sc_hypervisor_lp_execute_dichotomy. */
struct ispeed_lp_data
{
	double **velocity;   /* velocity[s][w]: measured speed of worker w in context s */
	double *flops;       /* flops[s]: amount of flops context s must execute */
	double **flops_on_w; /* output: flops the LP assigns to worker w in context s */
	int *workers;        /* worker ids under consideration; NULL means all workers
			      * — presumably indexed [0..nw), TODO confirm against callers */
};
  28. /*
  29. * GNU Linear Programming Kit backend
  30. */
  31. #ifdef STARPU_HAVE_GLPK_H
  32. #include <glpk.h>
  33. static double _glp_resolve (int ns, int nw, double final_w_in_s[ns][nw],
  34. unsigned is_integer, double tmax, void *specific_data)
  35. {
  36. struct ispeed_lp_data *sd = (struct ispeed_lp_data *)specific_data;
  37. double **velocity = sd->velocity;
  38. double *flops = sd->flops;
  39. double **final_flops_on_w = sd->flops_on_w;
  40. int *workers = sd->workers;
  41. double w_in_s[ns][nw];
  42. double flops_on_w[ns][nw];
  43. int w, s;
  44. glp_prob *lp;
  45. // printf("try with tmax %lf\n", tmax);
  46. lp = glp_create_prob();
  47. glp_set_prob_name(lp, "StarPU theoretical bound");
  48. glp_set_obj_dir(lp, GLP_MAX);
  49. glp_set_obj_name(lp, "total execution time");
  50. {
  51. int ne = 5 * ns * nw /* worker execution time */
  52. + 1; /* glp dumbness */
  53. int n = 1;
  54. int ia[ne], ja[ne];
  55. double ar[ne];
  56. /* Variables: number of flops assigned to worker w in context s, and
  57. the acknwoledgment that the worker w belongs to the context s */
  58. glp_add_cols(lp, 2*nw*ns);
  59. #define colnum(w, s) ((s)*nw+(w)+1)
  60. for(s = 0; s < ns; s++)
  61. for(w = 0; w < nw; w++)
  62. glp_set_obj_coef(lp, nw*ns+colnum(w,s), 1.);
  63. for(s = 0; s < ns; s++)
  64. for(w = 0; w < nw; w++)
  65. {
  66. char name[32];
  67. snprintf(name, sizeof(name), "flopsw%ds%dn", w, s);
  68. glp_set_col_name(lp, colnum(w,s), name);
  69. glp_set_col_bnds(lp, colnum(w,s), GLP_LO, 0., 0.);
  70. snprintf(name, sizeof(name), "w%ds%dn", w, s);
  71. glp_set_col_name(lp, nw*ns+colnum(w,s), name);
  72. if (is_integer)
  73. {
  74. glp_set_col_kind(lp, nw*ns+colnum(w, s), GLP_IV);
  75. glp_set_col_bnds(lp, nw*ns+colnum(w,s), GLP_DB, 0, 1);
  76. }
  77. else
  78. glp_set_col_bnds(lp, nw*ns+colnum(w,s), GLP_DB, 0.0, 1.0);
  79. }
  80. int curr_row_idx = 0;
  81. /* Total worker execution time */
  82. glp_add_rows(lp, nw*ns);
  83. /*nflops[s][w]/v[s][w] < x[s][w]*tmax */
  84. for(s = 0; s < ns; s++)
  85. {
  86. for (w = 0; w < nw; w++)
  87. {
  88. char name[32], title[64];
  89. starpu_worker_get_name(w, name, sizeof(name));
  90. snprintf(title, sizeof(title), "worker %s", name);
  91. glp_set_row_name(lp, curr_row_idx+s*nw+w+1, title);
  92. /* nflosp[s][w] */
  93. ia[n] = curr_row_idx+s*nw+w+1;
  94. ja[n] = colnum(w, s);
  95. ar[n] = 1 / velocity[s][w];
  96. n++;
  97. /* x[s][w] = 1 | 0 */
  98. ia[n] = curr_row_idx+s*nw+w+1;
  99. ja[n] = nw*ns+colnum(w,s);
  100. ar[n] = (-1) * tmax;
  101. n++;
  102. glp_set_row_bnds(lp, curr_row_idx+s*nw+w+1, GLP_UP, 0.0, 0.0);
  103. }
  104. }
  105. curr_row_idx += nw*ns;
  106. /* sum(flops[s][w]) = flops[s] */
  107. glp_add_rows(lp, ns);
  108. for (s = 0; s < ns; s++)
  109. {
  110. char name[32], title[64];
  111. starpu_worker_get_name(w, name, sizeof(name));
  112. snprintf(title, sizeof(title), "flops %lf ctx%d", flops[s], s);
  113. glp_set_row_name(lp, curr_row_idx+s+1, title);
  114. for (w = 0; w < nw; w++)
  115. {
  116. ia[n] = curr_row_idx+s+1;
  117. ja[n] = colnum(w, s);
  118. ar[n] = 1;
  119. n++;
  120. }
  121. glp_set_row_bnds(lp, curr_row_idx+s+1, GLP_FX, flops[s], flops[s]);
  122. }
  123. curr_row_idx += ns;
  124. /* sum(x[s][w]) = 1 */
  125. glp_add_rows(lp, nw);
  126. for (w = 0; w < nw; w++)
  127. {
  128. char name[32], title[64];
  129. starpu_worker_get_name(w, name, sizeof(name));
  130. snprintf(title, sizeof(title), "w%x", w);
  131. glp_set_row_name(lp, curr_row_idx+w+1, title);
  132. for(s = 0; s < ns; s++)
  133. {
  134. ia[n] = curr_row_idx+w+1;
  135. ja[n] = nw*ns+colnum(w,s);
  136. ar[n] = 1;
  137. n++;
  138. }
  139. if(is_integer)
  140. glp_set_row_bnds(lp, curr_row_idx+w+1, GLP_FX, 1, 1);
  141. else
  142. glp_set_row_bnds(lp, curr_row_idx+w+1, GLP_FX, 1.0, 1.0);
  143. }
  144. curr_row_idx += nw;
  145. /* sum(nflops[s][w]) > 0*/
  146. glp_add_rows(lp, nw);
  147. for (w = 0; w < nw; w++)
  148. {
  149. char name[32], title[64];
  150. starpu_worker_get_name(w, name, sizeof(name));
  151. snprintf(title, sizeof(title), "flopsw%x", w);
  152. glp_set_row_name(lp, curr_row_idx+w+1, title);
  153. for(s = 0; s < ns; s++)
  154. {
  155. ia[n] = curr_row_idx+w+1;
  156. ja[n] = colnum(w,s);
  157. ar[n] = 1;
  158. n++;
  159. }
  160. glp_set_row_bnds(lp, curr_row_idx+w+1, GLP_LO, 0.1, 0.);
  161. }
  162. if(n != ne)
  163. printf("ns= %d nw = %d n = %d ne = %d\n", ns, nw, n, ne);
  164. STARPU_ASSERT(n == ne);
  165. glp_load_matrix(lp, ne-1, ia, ja, ar);
  166. }
  167. glp_smcp parm;
  168. glp_init_smcp(&parm);
  169. parm.msg_lev = GLP_MSG_OFF;
  170. int ret = glp_simplex(lp, &parm);
  171. if (ret)
  172. {
  173. glp_delete_prob(lp);
  174. lp = NULL;
  175. return 0.0;
  176. }
  177. if (is_integer)
  178. {
  179. glp_iocp iocp;
  180. glp_init_iocp(&iocp);
  181. iocp.msg_lev = GLP_MSG_OFF;
  182. glp_intopt(lp, &iocp);
  183. int stat = glp_mip_status(lp);
  184. /* if we don't have a solution return */
  185. if(stat == GLP_NOFEAS)
  186. {
  187. glp_delete_prob(lp);
  188. lp = NULL;
  189. return 0.0;
  190. }
  191. }
  192. int stat = glp_get_prim_stat(lp);
  193. /* if we don't have a solution return */
  194. if(stat == GLP_NOFEAS)
  195. {
  196. glp_delete_prob(lp);
  197. lp = NULL;
  198. return 0.0;
  199. }
  200. double res = glp_get_obj_val(lp);
  201. for(s = 0; s < ns; s++)
  202. for(w = 0; w < nw; w++)
  203. {
  204. flops_on_w[s][w] = glp_get_col_prim(lp, colnum(w, s));
  205. if (is_integer)
  206. w_in_s[s][w] = (double)glp_mip_col_val(lp, nw*ns+colnum(w, s));
  207. else
  208. w_in_s[s][w] = glp_get_col_prim(lp, nw*ns+colnum(w,s));
  209. // printf("w_in_s[s%d][w%d] = %lf flops[s%d][w%d] = %lf \n", s, w, w_in_s[s][w], s, w, flops_on_w[s][w]);
  210. }
  211. glp_delete_prob(lp);
  212. for(s = 0; s < ns; s++)
  213. for(w = 0; w < nw; w++)
  214. {
  215. final_w_in_s[s][w] = w_in_s[s][w];
  216. final_flops_on_w[s][w] = flops_on_w[s][w];
  217. }
  218. return res;
  219. }
  220. static unsigned _compute_flops_distribution_over_ctxs(int ns, int nw, double w_in_s[ns][nw], double **flops_on_w, int *in_sched_ctxs, int *workers)
  221. {
  222. // double flops[ns];
  223. // double velocity[ns][nw];
  224. double *flops = (double*)malloc(ns*sizeof(double));
  225. double **velocity = (double **)malloc(ns*sizeof(double*));
  226. int i;
  227. for(i = 0; i < ns; i++)
  228. velocity[i] = (double*)malloc(nw*sizeof(double));
  229. int *sched_ctxs = in_sched_ctxs == NULL ? sc_hypervisor_get_sched_ctxs() : in_sched_ctxs;
  230. int w,s;
  231. struct sc_hypervisor_wrapper* sc_w = NULL;
  232. double total_flops = 0.0;
  233. for(s = 0; s < ns; s++)
  234. {
  235. sc_w = sc_hypervisor_get_wrapper(sched_ctxs[s]);
  236. for(w = 0; w < nw; w++)
  237. {
  238. w_in_s[s][w] = 0.0;
  239. int worker = workers == NULL ? w : workers[w];
  240. velocity[s][w] = sc_hypervisor_get_velocity_per_worker(sc_w, worker);
  241. if(velocity[s][w] == -1.0)
  242. {
  243. enum starpu_worker_archtype arch = starpu_worker_get_type(worker);
  244. velocity[s][w] = sc_hypervisor_get_velocity(sc_w, arch);
  245. if(arch == STARPU_CUDA_WORKER)
  246. {
  247. unsigned worker_in_ctx = starpu_sched_ctx_contains_worker(worker, sc_w->sched_ctx);
  248. if(!worker_in_ctx)
  249. {
  250. double transfer_velocity = starpu_get_bandwidth_RAM_CUDA(worker) / 1000;
  251. velocity[s][w] = (velocity[s][w] * transfer_velocity) / (velocity[s][w] + transfer_velocity);
  252. }
  253. }
  254. }
  255. // printf("v[w%d][s%d] = %lf\n",w, s, velocity[s][w]);
  256. }
  257. struct sc_hypervisor_policy_config *config = sc_hypervisor_get_config(sched_ctxs[s]);
  258. flops[s] = config->ispeed_ctx_sample/1000000000; /* in gflops */
  259. }
  260. /* take the exec time of the slowest ctx
  261. as starting point and then try to minimize it
  262. as increasing it a little for the faster ctxs */
  263. double tmax = sc_hypervisor_get_slowest_ctx_exec_time();
  264. double smallest_tmax = sc_hypervisor_get_fastest_ctx_exec_time(); //tmax - 0.5*tmax;
  265. // printf("tmax %lf smallest %lf\n", tmax, smallest_tmax);
  266. double tmin = 0.0;
  267. struct ispeed_lp_data specific_data;
  268. specific_data.velocity = velocity;
  269. specific_data.flops = flops;
  270. specific_data.flops_on_w = flops_on_w;
  271. specific_data.workers = workers;
  272. unsigned found_sol = sc_hypervisor_lp_execute_dichotomy(ns, nw, w_in_s, 1, (void*)&specific_data,
  273. tmin, tmax, smallest_tmax, _glp_resolve);
  274. for(i = 0; i < ns; i++)
  275. free(velocity[i]);
  276. free(velocity);
  277. return found_sol;
  278. }
  279. static void _try_resizing(void)
  280. {
  281. int ns = sc_hypervisor_get_nsched_ctxs();
  282. int nw = starpu_worker_get_count(); /* Number of different workers */
  283. double w_in_s[ns][nw];
  284. // double flops_on_w[ns][nw];
  285. double **flops_on_w = (double**)malloc(ns*sizeof(double*));
  286. int i;
  287. for(i = 0; i < ns; i++)
  288. flops_on_w[i] = (double*)malloc(nw*sizeof(double));
  289. unsigned found_sol = _compute_flops_distribution_over_ctxs(ns, nw, w_in_s, flops_on_w, NULL, NULL);
  290. /* if we did find at least one solution redistribute the resources */
  291. if(found_sol)
  292. {
  293. int w, s;
  294. double nworkers[ns][2];
  295. int nworkers_rounded[ns][2];
  296. for(s = 0; s < ns; s++)
  297. {
  298. nworkers[s][0] = 0.0;
  299. nworkers[s][1] = 0.0;
  300. nworkers_rounded[s][0] = 0;
  301. nworkers_rounded[s][1] = 0;
  302. }
  303. for(s = 0; s < ns; s++)
  304. {
  305. for(w = 0; w < nw; w++)
  306. {
  307. enum starpu_worker_archtype arch = starpu_worker_get_type(w);
  308. if(arch == STARPU_CUDA_WORKER)
  309. {
  310. nworkers[s][0] += w_in_s[s][w];
  311. if(w_in_s[s][w] >= 0.3)
  312. nworkers_rounded[s][0]++;
  313. }
  314. else
  315. {
  316. nworkers[s][1] += w_in_s[s][w];
  317. if(w_in_s[s][w] > 0.5)
  318. nworkers_rounded[s][1]++;
  319. }
  320. }
  321. }
  322. /* for(s = 0; s < ns; s++) */
  323. /* printf("%d: cpus = %lf gpus = %lf cpus_round = %d gpus_round = %d\n", s, nworkers[s][1], nworkers[s][0], */
  324. /* nworkers_rounded[s][1], nworkers_rounded[s][0]); */
  325. sc_hypervisor_lp_redistribute_resources_in_ctxs(ns, 2, nworkers_rounded, nworkers);
  326. }
  327. for(i = 0; i < ns; i++)
  328. free(flops_on_w[i]);
  329. free(flops_on_w);
  330. }
  331. static void ispeed_lp_handle_poped_task(unsigned sched_ctx, int worker, struct starpu_task *task, uint32_t footprint)
  332. {
  333. int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
  334. if(ret != EBUSY)
  335. {
  336. unsigned criteria = sc_hypervisor_get_resize_criteria();
  337. if(criteria != SC_NOTHING && criteria == SC_VELOCITY)
  338. {
  339. if(sc_hypervisor_check_velocity_gap_btw_ctxs())
  340. {
  341. _try_resizing();
  342. }
  343. }
  344. starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
  345. }
  346. }
  347. static ispeed_lp_handle_idle_cycle(unsigned sched_ctx, int worker)
  348. {
  349. int ret = starpu_pthread_mutex_trylock(&act_hypervisor_mutex);
  350. if(ret != EBUSY)
  351. {
  352. unsigned criteria = sc_hypervisor_get_resize_criteria();
  353. if(criteria != SC_NOTHING && criteria == SC_IDLE)
  354. {
  355. if(sc_hypervisor_check_idle(sched_ctx, worker))
  356. {
  357. _try_resizing();
  358. // sc_hypervisor_move_workers(sched_ctx, 3 - sched_ctx, &worker, 1, 1);
  359. }
  360. }
  361. starpu_pthread_mutex_unlock(&act_hypervisor_mutex);
  362. }
  363. }
  364. static void ispeed_lp_end_ctx(unsigned sched_ctx)
  365. {
  366. struct sc_hypervisor_wrapper* sc_w = sc_hypervisor_get_wrapper(sched_ctx);
  367. int worker;
  368. /* for(worker = 0; worker < 12; worker++) */
  369. /* printf("%d/%d: speed %lf\n", worker, sched_ctx, sc_w->ref_velocity[worker]); */
  370. return;
  371. }
/* Policy descriptor: resizing is driven from the poped-task and idle-cycle
 * hooks; all other hooks are unused by this policy. */
struct sc_hypervisor_policy ispeed_lp_policy = {
	.size_ctxs = NULL,
	.handle_poped_task = ispeed_lp_handle_poped_task,   /* velocity-gap driven resize */
	.handle_pushed_task = NULL,
	.handle_idle_cycle = ispeed_lp_handle_idle_cycle,   /* idleness driven resize */
	.handle_idle_end = NULL,
	.handle_post_exec_hook = NULL,
	.handle_submitted_job = NULL,
	.end_ctx = ispeed_lp_end_ctx,
	.custom = 0,
	.name = "ispeed_lp"
};
  384. #endif /* STARPU_HAVE_GLPK_H */