dw_factolu.c 20 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009-2013 Université de Bordeaux 1
  4. * Copyright (C) 2010 Mehdi Juhoor <mjuhoor@gmail.com>
  5. * Copyright (C) 2010, 2011, 2012 Centre National de la Recherche Scientifique
  6. *
  7. * StarPU is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU Lesser General Public License as published by
  9. * the Free Software Foundation; either version 2.1 of the License, or (at
  10. * your option) any later version.
  11. *
  12. * StarPU is distributed in the hope that it will be useful, but
  13. * WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  15. *
  16. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  17. */
  18. #include "dw_factolu.h"
  19. #include <sys/time.h>
  /* Debug tracing helper.  With "#if 0" (as written) debug() expands to
   * nothing; switching the condition on makes it forward its arguments to
   * fprintf(stderr, ...). */
  20. #if 0
  21. #define debug(fmt, ...) fprintf(stderr, fmt, ## __VA_ARGS__)
  22. #else
  23. #define debug(fmt, ...)
  24. #endif
  /* Progress tables used by the dw_callback_v2_* callbacks.  They are
   * allocated (zero-filled) and released by dw_codelet_facto_v2(); entries
   * hold the STARTED / DONE values defined below. */
  25. unsigned *advance_11; /* size nblocks, whether the 11 task is done */
  26. unsigned *advance_12_21; /* size nblocks*nblocks */
  27. unsigned *advance_22; /* array of nblocks *nblocks*nblocks */
  /* Wall-clock timestamps taken with gettimeofday() around the
   * factorization in dw_codelet_facto() / dw_codelet_facto_v2(). */
  28. struct timeval start;
  29. struct timeval end;
  /* Copy of dw_factoLU()'s _no_prio argument; when non-zero the v2 callbacks
   * never raise a task to STARPU_MAX_PRIO. */
  30. static unsigned no_prio = 0;
  /* Codelet of the "11" task: one buffer, accessed read-write.  A CUDA
   * implementation is registered only when STARPU_USE_CUDA is defined. */
  31. static struct starpu_codelet cl11 =
  32. {
  33. .cpu_funcs = {dw_cpu_codelet_update_u11, NULL},
  34. #ifdef STARPU_USE_CUDA
  35. .cuda_funcs = {dw_cublas_codelet_update_u11, NULL},
  36. #endif
  37. .nbuffers = 1,
  38. .modes = {STARPU_RW},
  39. .model = &model_11
  40. };
  /* Codelet of the "12" task: two buffers, the first read-only and the
   * second read-write.  A CUDA implementation is registered only when
   * STARPU_USE_CUDA is defined. */
  41. static struct starpu_codelet cl12 =
  42. {
  43. .cpu_funcs = {dw_cpu_codelet_update_u12, NULL},
  44. #ifdef STARPU_USE_CUDA
  45. .cuda_funcs = {dw_cublas_codelet_update_u12, NULL},
  46. #endif
  47. .nbuffers = 2,
  48. .modes = {STARPU_R, STARPU_RW},
  49. .model = &model_12
  50. };
  /* Codelet of the "21" task: two buffers, the first read-only and the
   * second read-write.  A CUDA implementation is registered only when
   * STARPU_USE_CUDA is defined. */
  51. static struct starpu_codelet cl21 =
  52. {
  53. .cpu_funcs = {dw_cpu_codelet_update_u21, NULL},
  54. #ifdef STARPU_USE_CUDA
  55. .cuda_funcs = {dw_cublas_codelet_update_u21, NULL},
  56. #endif
  57. .nbuffers = 2,
  58. .modes = {STARPU_R, STARPU_RW},
  59. .model = &model_21
  60. };
  /* Codelet of the "22" task: three buffers, two read-only inputs followed
   * by the read-write block being updated.  A CUDA implementation is
   * registered only when STARPU_USE_CUDA is defined. */
  61. static struct starpu_codelet cl22 =
  62. {
  63. .cpu_funcs = {dw_cpu_codelet_update_u22, NULL},
  64. #ifdef STARPU_USE_CUDA
  65. .cuda_funcs = {dw_cublas_codelet_update_u22, NULL},
  66. #endif
  67. .nbuffers = 3,
  68. .modes = {STARPU_R, STARPU_R, STARPU_RW},
  69. .model = &model_22
  70. };
  /* Values stored in the advance_* tables.  STARTED is OR-ed in atomically
   * by the callback that claims the submission of a task; DONE is written
   * by the task's completion callback.  DONE (0x11) contains the STARTED
   * bit (0x01), so a finished entry still reads as started.
   * NOTE(review): for the same reason `x & DONE` is also non-zero for an
   * entry that is only STARTED -- confirm that the `dep & DONE` tests in
   * the v2 callbacks are meant to be that permissive. */
  71. #define STARTED 0x01
  72. #define DONE 0x11
  73. /*
  74. * Upgraded Callbacks : break the pipeline design !
  75. */
  76. void dw_callback_v2_codelet_update_u22(void *argcb)
  77. {
  78. int ret;
  79. cl_args *args = argcb;
  80. unsigned k = args->k;
  81. unsigned i = args->i;
  82. unsigned j = args->j;
  83. unsigned nblocks = args->nblocks;
  84. debug("u22 %d %d %d\n", k, i, j);
  85. /* we did task 22k,i,j */
  86. advance_22[k*nblocks*nblocks + i + j*nblocks] = DONE;
  87. if ( (i == j) && (i == k+1))
  88. {
  89. /* we now reduce the LU22 part (recursion appears there) */
  90. cl_args *u11arg = malloc(sizeof(cl_args));
  91. struct starpu_task *task = starpu_task_create();
  92. task->callback_func = dw_callback_v2_codelet_update_u11;
  93. task->callback_arg = u11arg;
  94. task->cl = &cl11;
  95. task->cl_arg = u11arg;
  96. task->handles[0] = starpu_data_get_sub_data(args->dataA, 2, k+1, k+1);
  97. u11arg->dataA = args->dataA;
  98. u11arg->i = k + 1;
  99. u11arg->nblocks = args->nblocks;
  100. /* schedule the codelet */
  101. if (!no_prio)
  102. task->priority = STARPU_MAX_PRIO;
  103. debug( "u22 %d %d %d start u11 %d\n", k, i, j, k + 1);
  104. ret = starpu_task_submit(task);
  105. STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
  106. }
  107. /* 11k+1 + 22k,k+1,j => 21 k+1,j */
  108. if ( i == k + 1 && j > k + 1)
  109. {
  110. uint8_t dep;
  111. /* 11 k+1*/
  112. dep = advance_11[(k+1)];
  113. if (dep & DONE)
  114. {
  115. /* try to push the task */
  116. uint8_t u = STARPU_ATOMIC_OR(&advance_12_21[(k+1) + j*nblocks], STARTED);
  117. if ((u & STARTED) == 0)
  118. {
  119. /* we are the only one that should launch that task */
  120. cl_args *u21a = malloc(sizeof(cl_args));
  121. struct starpu_task *task21 = starpu_task_create();
  122. task21->callback_func = dw_callback_v2_codelet_update_u21;
  123. task21->callback_arg = u21a;
  124. task21->cl = &cl21;
  125. task21->cl_arg = u21a;
  126. u21a->i = k+1;
  127. u21a->k = j;
  128. u21a->nblocks = args->nblocks;
  129. u21a->dataA = args->dataA;
  130. task21->handles[0] = starpu_data_get_sub_data(args->dataA, 2, u21a->i, u21a->i);
  131. task21->handles[1] = starpu_data_get_sub_data(args->dataA, 2, u21a->i, u21a->k);
  132. debug( "u22 %d %d %d start u21 %d %d\n", k, i, j, k+1, j);
  133. ret = starpu_task_submit(task21);
  134. STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
  135. }
  136. }
  137. }
  138. /* 11k + 22k-1,i,k => 12 k,i */
  139. if (j == k + 1 && i > k + 1)
  140. {
  141. uint8_t dep;
  142. /* 11 k+1*/
  143. dep = advance_11[(k+1)];
  144. if (dep & DONE)
  145. {
  146. /* try to push the task */
  147. uint8_t u = STARPU_ATOMIC_OR(&advance_12_21[(k+1)*nblocks + i], STARTED);
  148. if ((u & STARTED) == 0)
  149. {
  150. /* we are the only one that should launch that task */
  151. cl_args *u12a = malloc(sizeof(cl_args));
  152. struct starpu_task *task12 = starpu_task_create();
  153. task12->callback_func = dw_callback_v2_codelet_update_u12;
  154. task12->callback_arg = u12a;
  155. task12->cl = &cl12;
  156. task12->cl_arg = u12a;
  157. u12a->i = k+1;
  158. u12a->k = i;
  159. u12a->nblocks = args->nblocks;
  160. u12a->dataA = args->dataA;
  161. task12->handles[0] = starpu_data_get_sub_data(args->dataA, 2, u12a->i, u12a->i);
  162. task12->handles[1] = starpu_data_get_sub_data(args->dataA, 2, u12a->k, u12a->i);
  163. debug( "u22 %d %d %d start u12 %d %d\n", k, i, j, k+1, i);
  164. ret = starpu_task_submit(task12);
  165. STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
  166. }
  167. }
  168. }
  169. free(args);
  170. }
  171. void dw_callback_v2_codelet_update_u12(void *argcb)
  172. {
  173. int ret;
  174. cl_args *args = argcb;
  175. /* now launch the update of LU22 */
  176. unsigned i = args->i;
  177. unsigned k = args->k;
  178. unsigned nblocks = args->nblocks;
  179. debug( "u12 %d %d\n", i, k);
  180. /* we did task 21i,k */
  181. advance_12_21[i*nblocks + k] = DONE;
  182. unsigned slicey;
  183. for (slicey = i+1; slicey < nblocks; slicey++)
  184. {
  185. /* can we launch 22 i,args->k,slicey ? */
  186. /* deps : 21 args->k, slicey */
  187. uint8_t dep;
  188. dep = advance_12_21[i + slicey*nblocks];
  189. if (dep & DONE)
  190. {
  191. /* perhaps we may schedule the 22 i,args->k,slicey task */
  192. uint8_t u = STARPU_ATOMIC_OR(&advance_22[i*nblocks*nblocks + slicey*nblocks + k], STARTED);
  193. if ((u & STARTED) == 0)
  194. {
  195. /* update that square matrix */
  196. cl_args *u22a = malloc(sizeof(cl_args));
  197. struct starpu_task *task22 = starpu_task_create();
  198. task22->callback_func = dw_callback_v2_codelet_update_u22;
  199. task22->callback_arg = u22a;
  200. task22->cl = &cl22;
  201. task22->cl_arg = u22a;
  202. u22a->k = i;
  203. u22a->i = k;
  204. u22a->j = slicey;
  205. u22a->dataA = args->dataA;
  206. u22a->nblocks = nblocks;
  207. task22->handles[0] = starpu_data_get_sub_data(args->dataA, 2, u22a->i, u22a->k);
  208. task22->handles[1] = starpu_data_get_sub_data(args->dataA, 2, u22a->k, u22a->j);
  209. task22->handles[2] = starpu_data_get_sub_data(args->dataA, 2, u22a->i, u22a->j);
  210. /* schedule that codelet */
  211. if (!no_prio && (slicey == i+1))
  212. task22->priority = STARPU_MAX_PRIO;
  213. debug( "u12 %d %d start u22 %d %d %d\n", i, k, i, k, slicey);
  214. ret = starpu_task_submit(task22);
  215. STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
  216. }
  217. }
  218. }
  219. free(argcb);
  220. }
  221. void dw_callback_v2_codelet_update_u21(void *argcb)
  222. {
  223. int ret;
  224. cl_args *args = argcb;
  225. /* now launch the update of LU22 */
  226. unsigned i = args->i;
  227. unsigned k = args->k;
  228. unsigned nblocks = args->nblocks;
  229. /* we did task 21i,k */
  230. advance_12_21[i + k*nblocks] = DONE;
  231. debug("u21 %d %d\n", i, k);
  232. unsigned slicex;
  233. for (slicex = i+1; slicex < nblocks; slicex++)
  234. {
  235. /* can we launch 22 i,slicex,k ? */
  236. /* deps : 12 slicex k */
  237. uint8_t dep;
  238. dep = advance_12_21[i*nblocks + slicex];
  239. if (dep & DONE)
  240. {
  241. /* perhaps we may schedule the 22 i,args->k,slicey task */
  242. uint8_t u = STARPU_ATOMIC_OR(&advance_22[i*nblocks*nblocks + k*nblocks + slicex], STARTED);
  243. if ((u & STARTED) == 0)
  244. {
  245. /* update that square matrix */
  246. cl_args *u22a = malloc(sizeof(cl_args));
  247. struct starpu_task *task22 = starpu_task_create();
  248. task22->callback_func = dw_callback_v2_codelet_update_u22;
  249. task22->callback_arg = u22a;
  250. task22->cl = &cl22;
  251. task22->cl_arg = u22a;
  252. u22a->k = i;
  253. u22a->i = slicex;
  254. u22a->j = k;
  255. u22a->dataA = args->dataA;
  256. u22a->nblocks = nblocks;
  257. task22->handles[0] = starpu_data_get_sub_data(args->dataA, 2, u22a->i, u22a->k);
  258. task22->handles[1] = starpu_data_get_sub_data(args->dataA, 2, u22a->k, u22a->j);
  259. task22->handles[2] = starpu_data_get_sub_data(args->dataA, 2, u22a->i, u22a->j);
  260. /* schedule that codelet */
  261. if (!no_prio && (slicex == i+1))
  262. task22->priority = STARPU_MAX_PRIO;
  263. debug( "u21 %d %d start u22 %d %d %d\n", i, k, i, slicex, k);
  264. ret = starpu_task_submit(task22);
  265. STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
  266. }
  267. }
  268. }
  269. free(argcb);
  270. }
  271. void dw_callback_v2_codelet_update_u11(void *argcb)
  272. {
  273. int ret;
  274. /* in case there remains work, go on */
  275. cl_args *args = argcb;
  276. unsigned nblocks = args->nblocks;
  277. unsigned i = args->i;
  278. debug("u11 %d\n", i);
  279. /* we did task 11k */
  280. advance_11[i] = DONE;
  281. if (i == nblocks - 1)
  282. {
  283. /* we are done */
  284. free(argcb);
  285. return;
  286. }
  287. else
  288. {
  289. /* put new tasks */
  290. unsigned slice;
  291. for (slice = i + 1; slice < nblocks; slice++)
  292. {
  293. /* can we launch 12i,slice ? */
  294. uint8_t deps12;
  295. if (i == 0)
  296. {
  297. deps12 = DONE;
  298. }
  299. else
  300. {
  301. deps12 = advance_22[(i-1)*nblocks*nblocks + slice + i*nblocks];
  302. }
  303. if (deps12 & DONE)
  304. {
  305. /* we may perhaps launch the task 12i,slice */
  306. uint8_t u = STARPU_ATOMIC_OR(&advance_12_21[i*nblocks + slice], STARTED);
  307. if ((u & STARTED) == 0)
  308. {
  309. /* we are the only one that should launch that task */
  310. cl_args *u12a = malloc(sizeof(cl_args));
  311. struct starpu_task *task12 = starpu_task_create();
  312. task12->callback_func = dw_callback_v2_codelet_update_u12;
  313. task12->callback_arg = u12a;
  314. task12->cl = &cl12;
  315. task12->cl_arg = u12a;
  316. u12a->i = i;
  317. u12a->k = slice;
  318. u12a->nblocks = args->nblocks;
  319. u12a->dataA = args->dataA;
  320. task12->handles[0] = starpu_data_get_sub_data(args->dataA, 2, u12a->i, u12a->i);
  321. task12->handles[1] = starpu_data_get_sub_data(args->dataA, 2, u12a->k, u12a->i);
  322. if (!no_prio && (slice == i +1))
  323. task12->priority = STARPU_MAX_PRIO;
  324. debug( "u11 %d start u12 %d %d\n", i, i, slice);
  325. ret = starpu_task_submit(task12);
  326. STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
  327. }
  328. }
  329. /* can we launch 21i,slice ? */
  330. if (i == 0)
  331. {
  332. deps12 = DONE;
  333. }
  334. else
  335. {
  336. deps12 = advance_22[(i-1)*nblocks*nblocks + slice*nblocks + i];
  337. }
  338. if (deps12 & DONE)
  339. {
  340. /* we may perhaps launch the task 12i,slice */
  341. uint8_t u = STARPU_ATOMIC_OR(&advance_12_21[i + slice*nblocks], STARTED);
  342. if ((u & STARTED) == 0)
  343. {
  344. /* we are the only one that should launch that task */
  345. cl_args *u21a = malloc(sizeof(cl_args));
  346. struct starpu_task *task21 = starpu_task_create();
  347. task21->callback_func = dw_callback_v2_codelet_update_u21;
  348. task21->callback_arg = u21a;
  349. task21->cl = &cl21;
  350. task21->cl_arg = u21a;
  351. u21a->i = i;
  352. u21a->k = slice;
  353. u21a->nblocks = args->nblocks;
  354. u21a->dataA = args->dataA;
  355. task21->handles[0] = starpu_data_get_sub_data(args->dataA, 2, u21a->i, u21a->i);
  356. task21->handles[1] = starpu_data_get_sub_data(args->dataA, 2, u21a->i, u21a->k);
  357. if (!no_prio && (slice == i +1))
  358. task21->priority = STARPU_MAX_PRIO;
  359. debug( "u11 %d start u21 %d %d\n", i, i, slice);
  360. ret = starpu_task_submit(task21);
  361. STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
  362. }
  363. }
  364. }
  365. }
  366. free(argcb);
  367. }
  368. /*
  369. * Callbacks
  370. */
  371. void dw_callback_codelet_update_u11(void *argcb)
  372. {
  373. int ret;
  374. /* in case there remains work, go on */
  375. cl_args *args = argcb;
  376. if (args->i == args->nblocks - 1)
  377. {
  378. /* we are done */
  379. free(argcb);
  380. return;
  381. }
  382. else
  383. {
  384. /* put new tasks */
  385. unsigned nslices;
  386. nslices = args->nblocks - 1 - args->i;
  387. unsigned *remaining = malloc(sizeof(unsigned));
  388. *remaining = 2*nslices;
  389. unsigned slice;
  390. for (slice = args->i + 1; slice < args->nblocks; slice++)
  391. {
  392. /* update slice from u12 */
  393. cl_args *u12a = malloc(sizeof(cl_args));
  394. /* update slice from u21 */
  395. cl_args *u21a = malloc(sizeof(cl_args));
  396. struct starpu_task *task12 = starpu_task_create();
  397. task12->callback_func = dw_callback_codelet_update_u12_21;
  398. task12->callback_arg = u12a;
  399. task12->cl = &cl12;
  400. task12->cl_arg = u12a;
  401. struct starpu_task *task21 = starpu_task_create();
  402. task21->callback_func = dw_callback_codelet_update_u12_21;
  403. task21->callback_arg = u21a;
  404. task21->cl = &cl21;
  405. task21->cl_arg = u21a;
  406. u12a->i = args->i;
  407. u12a->k = slice;
  408. u12a->nblocks = args->nblocks;
  409. u12a->dataA = args->dataA;
  410. u12a->remaining = remaining;
  411. u21a->i = args->i;
  412. u21a->k = slice;
  413. u21a->nblocks = args->nblocks;
  414. u21a->dataA = args->dataA;
  415. u21a->remaining = remaining;
  416. task12->handles[0] = starpu_data_get_sub_data(args->dataA, 2, u12a->i, u12a->i);
  417. task12->handles[1] = starpu_data_get_sub_data(args->dataA, 2, u12a->k, u12a->i);
  418. task21->handles[0] = starpu_data_get_sub_data(args->dataA, 2, u21a->i, u21a->i);
  419. task21->handles[1] = starpu_data_get_sub_data(args->dataA, 2, u21a->i, u21a->k);
  420. ret = starpu_task_submit(task12);
  421. STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
  422. ret = starpu_task_submit(task21);
  423. STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
  424. }
  425. }
  426. }
  427. void dw_callback_codelet_update_u22(void *argcb)
  428. {
  429. int ret;
  430. cl_args *args = argcb;
  431. if (STARPU_ATOMIC_ADD(args->remaining, (-1)) == 0)
  432. {
  433. /* all worker already used the counter */
  434. free(args->remaining);
  435. /* we now reduce the LU22 part (recursion appears there) */
  436. cl_args *u11arg = malloc(sizeof(cl_args));
  437. struct starpu_task *task = starpu_task_create();
  438. task->callback_func = dw_callback_codelet_update_u11;
  439. task->callback_arg = u11arg;
  440. task->cl = &cl11;
  441. task->cl_arg = u11arg;
  442. task->handles[0] = starpu_data_get_sub_data(args->dataA, 2, args->k + 1, args->k + 1);
  443. u11arg->dataA = args->dataA;
  444. u11arg->i = args->k + 1;
  445. u11arg->nblocks = args->nblocks;
  446. /* schedule the codelet */
  447. ret = starpu_task_submit(task);
  448. STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
  449. }
  450. free(args);
  451. }
  452. void dw_callback_codelet_update_u12_21(void *argcb)
  453. {
  454. int ret;
  455. cl_args *args = argcb;
  456. if (STARPU_ATOMIC_ADD(args->remaining, -1) == 0)
  457. {
  458. /* now launch the update of LU22 */
  459. unsigned i = args->i;
  460. unsigned nblocks = args->nblocks;
  461. /* the number of tasks to be done */
  462. unsigned *remaining = malloc(sizeof(unsigned));
  463. *remaining = (nblocks - 1 - i)*(nblocks - 1 - i);
  464. unsigned slicey, slicex;
  465. for (slicey = i+1; slicey < nblocks; slicey++)
  466. {
  467. for (slicex = i+1; slicex < nblocks; slicex++)
  468. {
  469. /* update that square matrix */
  470. cl_args *u22a = malloc(sizeof(cl_args));
  471. struct starpu_task *task22 = starpu_task_create();
  472. task22->callback_func = dw_callback_codelet_update_u22;
  473. task22->callback_arg = u22a;
  474. task22->cl = &cl22;
  475. task22->cl_arg = u22a;
  476. u22a->k = i;
  477. u22a->i = slicex;
  478. u22a->j = slicey;
  479. u22a->dataA = args->dataA;
  480. u22a->nblocks = nblocks;
  481. u22a->remaining = remaining;
  482. task22->handles[0] = starpu_data_get_sub_data(args->dataA, 2, u22a->i, u22a->k);
  483. task22->handles[1] = starpu_data_get_sub_data(args->dataA, 2, u22a->k, u22a->j);
  484. task22->handles[2] = starpu_data_get_sub_data(args->dataA, 2, u22a->i, u22a->j);
  485. /* schedule that codelet */
  486. ret = starpu_task_submit(task22);
  487. STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
  488. }
  489. }
  490. }
  491. }
  492. /*
  493. * code to bootstrap the factorization
  494. */
  495. void dw_codelet_facto(starpu_data_handle_t dataA, unsigned nblocks)
  496. {
  497. int ret;
  498. cl_args *args = malloc(sizeof(cl_args));
  499. args->i = 0;
  500. args->nblocks = nblocks;
  501. args->dataA = dataA;
  502. gettimeofday(&start, NULL);
  503. /* inject a new task with this codelet into the system */
  504. struct starpu_task *task = starpu_task_create();
  505. task->callback_func = dw_callback_codelet_update_u11;
  506. task->callback_arg = args;
  507. task->cl = &cl11;
  508. task->cl_arg = args;
  509. task->handles[0] = starpu_data_get_sub_data(dataA, 2, 0, 0);
  510. /* schedule the codelet */
  511. ret = starpu_task_submit(task);
  512. STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
  513. starpu_task_wait_for_all();
  514. gettimeofday(&end, NULL);
  515. double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  516. FPRINTF(stderr, "Computation took (in ms)\n");
  517. FPRINTF(stdout, "%2.2f\n", timing/1000);
  518. unsigned n = starpu_matrix_get_nx(dataA);
  519. double flop = (2.0f*n*n*n)/3.0f;
  520. FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
  521. }
  522. void dw_codelet_facto_v2(starpu_data_handle_t dataA, unsigned nblocks)
  523. {
  524. advance_11 = calloc(nblocks, sizeof(*advance_11));
  525. STARPU_ASSERT(advance_11);
  526. advance_12_21 = calloc(nblocks*nblocks, sizeof(*advance_12_21));
  527. STARPU_ASSERT(advance_12_21);
  528. advance_22 = calloc(nblocks*nblocks*nblocks, sizeof(*advance_22));
  529. STARPU_ASSERT(advance_22);
  530. cl_args *args = malloc(sizeof(cl_args));
  531. args->i = 0;
  532. args->nblocks = nblocks;
  533. args->dataA = dataA;
  534. gettimeofday(&start, NULL);
  535. /* inject a new task with this codelet into the system */
  536. struct starpu_task *task = starpu_task_create();
  537. task->callback_func = dw_callback_v2_codelet_update_u11;
  538. task->callback_arg = args;
  539. task->cl = &cl11;
  540. task->cl_arg = args;
  541. task->handles[0] = starpu_data_get_sub_data(dataA, 2, 0, 0);
  542. /* schedule the codelet */
  543. int ret = starpu_task_submit(task);
  544. if (STARPU_UNLIKELY(ret == -ENODEV))
  545. {
  546. FPRINTF(stderr, "No worker may execute this task\n");
  547. exit(0);
  548. }
  549. starpu_task_wait_for_all();
  550. gettimeofday(&end, NULL);
  551. double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  552. FPRINTF(stderr, "Computation took (in ms)\n");
  553. FPRINTF(stdout, "%2.2f\n", timing/1000);
  554. unsigned n = starpu_matrix_get_nx(dataA);
  555. double flop = (2.0f*n*n*n)/3.0f;
  556. FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
  557. free(advance_11);
  558. free(advance_12_21);
  559. free(advance_22);
  560. }
  /*
   * Initialises StarPU (and its CUBLAS support) and allocates the problem
   * buffers.
   *
   * A:      receives a buffer of dim*dim floats.
   * B:      receives a buffer of dim floats.
   * dim:    matrix dimension.
   * pinned: when non-zero the buffers come from starpu_malloc(), otherwise
   *         from malloc(); pass the same value to free_system().
   *
   * Exits the process with status 77 when starpu_init() reports -ENODEV.
   * Allocation failures abort through STARPU_CHECK_RETURN_VALUE /
   * STARPU_ASSERT.
   */
  void initialize_system(float **A, float **B, unsigned dim, unsigned pinned)
  {
      int ret;

      ret = starpu_init(NULL);
      if (ret == -ENODEV)
          exit(77);
      STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

      starpu_cublas_init();

      if (pinned)
      {
          /* starpu_malloc() reports failure through its return value */
          ret = starpu_malloc((void **)A, (size_t)dim*dim*sizeof(float));
          STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc");
          ret = starpu_malloc((void **)B, (size_t)dim*sizeof(float));
          STARPU_CHECK_RETURN_VALUE(ret, "starpu_malloc");
      }
      else
      {
          *A = malloc((size_t)dim*dim*sizeof(float));
          STARPU_ASSERT(*A);
          *B = malloc((size_t)dim*sizeof(float));
          STARPU_ASSERT(*B);
      }
  }
  /*
   * Releases the two buffers obtained from initialize_system().
   *
   * A, B:   buffers to release.
   * dim:    not used by this function.
   * pinned: non-zero when the buffers must be released with starpu_free(),
   *         zero when they must be released with free().
   */
  void free_system(float *A, float *B, unsigned dim, unsigned pinned)
  {
      (void)dim;

      if (!pinned)
      {
          /* plain heap buffers */
          free(A);
          free(B);
          return;
      }

      /* buffers handed out by StarPU */
      starpu_free(A);
      starpu_free(B);
  }
  595. void dw_factoLU(float *matA, unsigned size,
  596. unsigned ld, unsigned nblocks,
  597. unsigned version, unsigned _no_prio)
  598. {
  599. #ifdef CHECK_RESULTS
  600. FPRINTF(stderr, "Checking results ...\n");
  601. float *Asaved;
  602. Asaved = malloc((size_t)ld*ld*sizeof(float));
  603. memcpy(Asaved, matA, (size_t)ld*ld*sizeof(float));
  604. #endif
  605. no_prio = _no_prio;
  606. starpu_data_handle_t dataA;
  607. /* monitor and partition the A matrix into blocks :
  608. * one block is now determined by 2 unsigned (i,j) */
  609. starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld,
  610. size, size, sizeof(float));
  611. struct starpu_data_filter f =
  612. {
  613. .filter_func = starpu_matrix_filter_vertical_block,
  614. .nchildren = nblocks
  615. };
  616. struct starpu_data_filter f2 =
  617. {
  618. .filter_func = starpu_matrix_filter_block,
  619. .nchildren = nblocks
  620. };
  621. starpu_data_map_filters(dataA, 2, &f, &f2);
  622. switch (version)
  623. {
  624. case 1:
  625. dw_codelet_facto(dataA, nblocks);
  626. break;
  627. default:
  628. case 2:
  629. dw_codelet_facto_v2(dataA, nblocks);
  630. break;
  631. }
  632. /* gather all the data */
  633. starpu_data_unpartition(dataA, 0);
  634. starpu_data_unregister(dataA);
  635. #ifdef CHECK_RESULTS
  636. compare_A_LU(Asaved, matA, size, ld);
  637. #endif
  638. }