dw_factolu.c 19 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009-2011 Université de Bordeaux 1
  4. * Copyright (C) 2010 Mehdi Juhoor <mjuhoor@gmail.com>
  5. * Copyright (C) 2010, 2011, 2012 Centre National de la Recherche Scientifique
  6. *
  7. * StarPU is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU Lesser General Public License as published by
  9. * the Free Software Foundation; either version 2.1 of the License, or (at
  10. * your option) any later version.
  11. *
  12. * StarPU is distributed in the hope that it will be useful, but
  13. * WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  15. *
  16. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  17. */
  18. #include "dw_factolu.h"
  19. #include <sys/time.h>
  /*
   * Progress flags used by the version-2 callbacks.  Every entry holds the
   * STARTED / DONE state of one task; the three arrays are allocated
   * (zero-filled) in dw_codelet_facto_v2().
   */
  uint8_t *advance_11;    /* nblocks entries: one per 11 task */
  uint8_t *advance_12_21; /* nblocks*nblocks entries: 12 and 21 tasks */
  uint8_t *advance_22;    /* nblocks*nblocks*nblocks entries: 22 tasks */

  /* Timestamps taken right before the first submission and after completion. */
  struct timeval start;
  struct timeval end;

  /*
   * Completion notification: the callback of the last 11 task sets `finished`
   * and signals `cond` while holding `mutex`; the submitting thread waits on it.
   */
  static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
  static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
  static unsigned finished;

  /* Copy of the _no_prio argument of dw_factoLU(); when non-zero no task is
   * given STARPU_MAX_PRIO. */
  static unsigned no_prio;
  29. static struct starpu_codelet cl11 =
  30. {
  31. .where = STARPU_CPU|STARPU_CUDA,
  32. .cpu_funcs = {dw_cpu_codelet_update_u11, NULL},
  33. #ifdef STARPU_USE_CUDA
  34. .cuda_funcs = {dw_cublas_codelet_update_u11, NULL},
  35. #endif
  36. .nbuffers = 1,
  37. .modes = {STARPU_RW},
  38. .model = &model_11
  39. };
  40. static struct starpu_codelet cl12 =
  41. {
  42. .where = STARPU_CPU|STARPU_CUDA,
  43. .cpu_funcs = {dw_cpu_codelet_update_u12, NULL},
  44. #ifdef STARPU_USE_CUDA
  45. .cuda_funcs = {dw_cublas_codelet_update_u12, NULL},
  46. #endif
  47. .nbuffers = 2,
  48. .modes = {STARPU_R, STARPU_RW},
  49. .model = &model_12
  50. };
  51. static struct starpu_codelet cl21 =
  52. {
  53. .where = STARPU_CPU|STARPU_CUDA,
  54. .cpu_funcs = {dw_cpu_codelet_update_u21, NULL},
  55. #ifdef STARPU_USE_CUDA
  56. .cuda_funcs = {dw_cublas_codelet_update_u21, NULL},
  57. #endif
  58. .nbuffers = 2,
  59. .modes = {STARPU_R, STARPU_RW},
  60. .model = &model_21
  61. };
  62. static struct starpu_codelet cl22 =
  63. {
  64. .where = STARPU_CPU|STARPU_CUDA,
  65. .cpu_funcs = {dw_cpu_codelet_update_u22, NULL},
  66. #ifdef STARPU_USE_CUDA
  67. .cuda_funcs = {dw_cublas_codelet_update_u22, NULL},
  68. #endif
  69. .nbuffers = 3,
  70. .modes = {STARPU_R, STARPU_R, STARPU_RW},
  71. .model = &model_22
  72. };
  /* State flags stored in the advance_11 / advance_12_21 / advance_22 arrays. */
  #define STARTED 0x01 /* OR-ed in atomically by the callback that claims the submission */
  #define DONE    0x10 /* stored by the callback of the task once it has run */
  /*
   * Version 2 ("upgraded") callbacks: they break the pipeline design by
   * submitting each task as soon as the tasks it depends on are flagged DONE.
   */
  78. void dw_callback_v2_codelet_update_u22(void *argcb)
  79. {
  80. cl_args *args = argcb;
  81. unsigned k = args->k;
  82. unsigned i = args->i;
  83. unsigned j = args->j;
  84. unsigned nblocks = args->nblocks;
  85. /* we did task 22k,i,j */
  86. advance_22[k*nblocks*nblocks + i + j*nblocks] = DONE;
  87. if ( (i == j) && (i == k+1))
  88. {
  89. /* we now reduce the LU22 part (recursion appears there) */
  90. cl_args *u11arg = malloc(sizeof(cl_args));
  91. struct starpu_task *task = starpu_task_create();
  92. task->callback_func = dw_callback_v2_codelet_update_u11;
  93. task->callback_arg = u11arg;
  94. task->cl = &cl11;
  95. task->cl_arg = u11arg;
  96. task->handles[0] = starpu_data_get_sub_data(args->dataA, 2, k+1, k+1);
  97. u11arg->dataA = args->dataA;
  98. u11arg->i = k + 1;
  99. u11arg->nblocks = args->nblocks;
  100. /* schedule the codelet */
  101. if (!no_prio)
  102. task->priority = STARPU_MAX_PRIO;
  103. starpu_task_submit(task);
  104. }
  105. /* 11k+1 + 22k,k+1,j => 21 k+1,j */
  106. if ( i == k + 1)
  107. {
  108. uint8_t dep;
  109. /* 11 k+1*/
  110. dep = advance_11[(k+1)];
  111. if (dep & DONE)
  112. {
  113. /* try to push the task */
  114. uint8_t u = STARPU_ATOMIC_OR(&advance_12_21[(k+1) + j*nblocks], STARTED);
  115. if ((u & STARTED) == 0)
  116. {
  117. /* we are the only one that should launch that task */
  118. cl_args *u21a = malloc(sizeof(cl_args));
  119. struct starpu_task *task21 = starpu_task_create();
  120. task21->callback_func = dw_callback_v2_codelet_update_u21;
  121. task21->callback_arg = u21a;
  122. task21->cl = &cl21;
  123. task21->cl_arg = u21a;
  124. u21a->i = k+1;
  125. u21a->k = j;
  126. u21a->nblocks = args->nblocks;
  127. u21a->dataA = args->dataA;
  128. task21->handles[0] = starpu_data_get_sub_data(args->dataA, 2, u21a->i, u21a->i);
  129. task21->handles[1] = starpu_data_get_sub_data(args->dataA, 2, u21a->i, u21a->k);
  130. starpu_task_submit(task21);
  131. }
  132. }
  133. }
  134. /* 11k + 22k-1,i,k => 12 k,i */
  135. if (j == k + 1)
  136. {
  137. uint8_t dep;
  138. /* 11 k+1*/
  139. dep = advance_11[(k+1)];
  140. if (dep & DONE)
  141. {
  142. /* try to push the task */
  143. uint8_t u = STARPU_ATOMIC_OR(&advance_12_21[(k+1)*nblocks + i], STARTED);
  144. if ((u & STARTED) == 0)
  145. {
  146. /* we are the only one that should launch that task */
  147. cl_args *u12a = malloc(sizeof(cl_args));
  148. struct starpu_task *task12 = starpu_task_create();
  149. task12->callback_func = dw_callback_v2_codelet_update_u12;
  150. task12->callback_arg = u12a;
  151. task12->cl = &cl12;
  152. task12->cl_arg = u12a;
  153. u12a->i = k+1;
  154. u12a->k = i;
  155. u12a->nblocks = args->nblocks;
  156. u12a->dataA = args->dataA;
  157. task12->handles[0] = starpu_data_get_sub_data(args->dataA, 2, u12a->i, u12a->i);
  158. task12->handles[1] = starpu_data_get_sub_data(args->dataA, 2, u12a->k, u12a->i);
  159. starpu_task_submit(task12);
  160. }
  161. }
  162. }
  163. free(args);
  164. }
  165. void dw_callback_v2_codelet_update_u12(void *argcb)
  166. {
  167. cl_args *args = argcb;
  168. /* now launch the update of LU22 */
  169. unsigned i = args->i;
  170. unsigned k = args->k;
  171. unsigned nblocks = args->nblocks;
  172. /* we did task 21i,k */
  173. advance_12_21[i*nblocks + k] = DONE;
  174. unsigned slicey;
  175. for (slicey = i+1; slicey < nblocks; slicey++)
  176. {
  177. /* can we launch 22 i,args->k,slicey ? */
  178. /* deps : 21 args->k, slicey */
  179. uint8_t dep;
  180. dep = advance_12_21[i + slicey*nblocks];
  181. if (dep & DONE)
  182. {
  183. /* perhaps we may schedule the 22 i,args->k,slicey task */
  184. uint8_t u = STARPU_ATOMIC_OR(&advance_22[i*nblocks*nblocks + slicey*nblocks + k], STARTED);
  185. if ((u & STARTED) == 0)
  186. {
  187. /* update that square matrix */
  188. cl_args *u22a = malloc(sizeof(cl_args));
  189. struct starpu_task *task22 = starpu_task_create();
  190. task22->callback_func = dw_callback_v2_codelet_update_u22;
  191. task22->callback_arg = u22a;
  192. task22->cl = &cl22;
  193. task22->cl_arg = u22a;
  194. u22a->k = i;
  195. u22a->i = k;
  196. u22a->j = slicey;
  197. u22a->dataA = args->dataA;
  198. u22a->nblocks = nblocks;
  199. task22->handles[0] = starpu_data_get_sub_data(args->dataA, 2, u22a->i, u22a->k);
  200. task22->handles[1] = starpu_data_get_sub_data(args->dataA, 2, u22a->k, u22a->j);
  201. task22->handles[2] = starpu_data_get_sub_data(args->dataA, 2, u22a->i, u22a->j);
  202. /* schedule that codelet */
  203. if (!no_prio && (slicey == i+1))
  204. task22->priority = STARPU_MAX_PRIO;
  205. starpu_task_submit(task22);
  206. }
  207. }
  208. }
  209. }
  210. void dw_callback_v2_codelet_update_u21(void *argcb)
  211. {
  212. cl_args *args = argcb;
  213. /* now launch the update of LU22 */
  214. unsigned i = args->i;
  215. unsigned k = args->k;
  216. unsigned nblocks = args->nblocks;
  217. /* we did task 21i,k */
  218. advance_12_21[i + k*nblocks] = DONE;
  219. unsigned slicex;
  220. for (slicex = i+1; slicex < nblocks; slicex++)
  221. {
  222. /* can we launch 22 i,slicex,k ? */
  223. /* deps : 12 slicex k */
  224. uint8_t dep;
  225. dep = advance_12_21[i*nblocks + slicex];
  226. if (dep & DONE)
  227. {
  228. /* perhaps we may schedule the 22 i,args->k,slicey task */
  229. uint8_t u = STARPU_ATOMIC_OR(&advance_22[i*nblocks*nblocks + k*nblocks + slicex], STARTED);
  230. if ((u & STARTED) == 0)
  231. {
  232. /* update that square matrix */
  233. cl_args *u22a = malloc(sizeof(cl_args));
  234. struct starpu_task *task22 = starpu_task_create();
  235. task22->callback_func = dw_callback_v2_codelet_update_u22;
  236. task22->callback_arg = u22a;
  237. task22->cl = &cl22;
  238. task22->cl_arg = u22a;
  239. u22a->k = i;
  240. u22a->i = slicex;
  241. u22a->j = k;
  242. u22a->dataA = args->dataA;
  243. u22a->nblocks = nblocks;
  244. task22->handles[0] = starpu_data_get_sub_data(args->dataA, 2, u22a->i, u22a->k);
  245. task22->handles[1] = starpu_data_get_sub_data(args->dataA, 2, u22a->k, u22a->j);
  246. task22->handles[2] = starpu_data_get_sub_data(args->dataA, 2, u22a->i, u22a->j);
  247. /* schedule that codelet */
  248. if (!no_prio && (slicex == i+1))
  249. task22->priority = STARPU_MAX_PRIO;
  250. starpu_task_submit(task22);
  251. }
  252. }
  253. }
  254. }
  255. void dw_callback_v2_codelet_update_u11(void *argcb)
  256. {
  257. /* in case there remains work, go on */
  258. cl_args *args = argcb;
  259. unsigned nblocks = args->nblocks;
  260. unsigned i = args->i;
  261. /* we did task 11k */
  262. advance_11[i] = DONE;
  263. if (i == nblocks - 1)
  264. {
  265. /* we are done : wake the application up */
  266. pthread_mutex_lock(&mutex);
  267. finished = 1;
  268. pthread_cond_signal(&cond);
  269. pthread_mutex_unlock(&mutex);
  270. return;
  271. }
  272. else
  273. {
  274. /* put new tasks */
  275. unsigned slice;
  276. for (slice = i + 1; slice < nblocks; slice++)
  277. {
  278. /* can we launch 12i,slice ? */
  279. uint8_t deps12;
  280. if (i == 0)
  281. {
  282. deps12 = DONE;
  283. }
  284. else
  285. {
  286. deps12 = advance_22[(i-1)*nblocks*nblocks + slice + i*nblocks];
  287. }
  288. if (deps12 & DONE)
  289. {
  290. /* we may perhaps launch the task 12i,slice */
  291. uint8_t u = STARPU_ATOMIC_OR(&advance_12_21[i*nblocks + slice], STARTED);
  292. if ((u & STARTED) == 0)
  293. {
  294. /* we are the only one that should launch that task */
  295. cl_args *u12a = malloc(sizeof(cl_args));
  296. struct starpu_task *task12 = starpu_task_create();
  297. task12->callback_func = dw_callback_v2_codelet_update_u12;
  298. task12->callback_arg = u12a;
  299. task12->cl = &cl12;
  300. task12->cl_arg = u12a;
  301. u12a->i = i;
  302. u12a->k = slice;
  303. u12a->nblocks = args->nblocks;
  304. u12a->dataA = args->dataA;
  305. task12->handles[0] = starpu_data_get_sub_data(args->dataA, 2, u12a->i, u12a->i);
  306. task12->handles[1] = starpu_data_get_sub_data(args->dataA, 2, u12a->k, u12a->i);
  307. if (!no_prio && (slice == i +1))
  308. task12->priority = STARPU_MAX_PRIO;
  309. starpu_task_submit(task12);
  310. }
  311. }
  312. /* can we launch 21i,slice ? */
  313. if (i == 0)
  314. {
  315. deps12 = DONE;
  316. }
  317. else
  318. {
  319. deps12 = advance_22[(i-1)*nblocks*nblocks + slice*nblocks + i];
  320. }
  321. if (deps12 & DONE)
  322. {
  323. /* we may perhaps launch the task 12i,slice */
  324. uint8_t u = STARPU_ATOMIC_OR(&advance_12_21[i + slice*nblocks], STARTED);
  325. if ((u & STARTED) == 0)
  326. {
  327. /* we are the only one that should launch that task */
  328. cl_args *u21a = malloc(sizeof(cl_args));
  329. struct starpu_task *task21 = starpu_task_create();
  330. task21->callback_func = dw_callback_v2_codelet_update_u21;
  331. task21->callback_arg = u21a;
  332. task21->cl = &cl21;
  333. task21->cl_arg = u21a;
  334. u21a->i = i;
  335. u21a->k = slice;
  336. u21a->nblocks = args->nblocks;
  337. u21a->dataA = args->dataA;
  338. task21->handles[0] = starpu_data_get_sub_data(args->dataA, 2, u21a->i, u21a->i);
  339. task21->handles[1] = starpu_data_get_sub_data(args->dataA, 2, u21a->i, u21a->k);
  340. if (!no_prio && (slice == i +1))
  341. task21->priority = STARPU_MAX_PRIO;
  342. starpu_task_submit(task21);
  343. }
  344. }
  345. }
  346. }
  347. }
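  /*
   * Version 1 callbacks: the tasks of one stage share a counter, and the
   * callback that brings it down to zero submits the whole next stage.
   */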
  351. void dw_callback_codelet_update_u11(void *argcb)
  352. {
  353. /* in case there remains work, go on */
  354. cl_args *args = argcb;
  355. if (args->i == args->nblocks - 1)
  356. {
  357. /* we are done : wake the application up */
  358. pthread_mutex_lock(&mutex);
  359. finished = 1;
  360. pthread_cond_signal(&cond);
  361. pthread_mutex_unlock(&mutex);
  362. return;
  363. }
  364. else
  365. {
  366. /* put new tasks */
  367. unsigned nslices;
  368. nslices = args->nblocks - 1 - args->i;
  369. unsigned *remaining = malloc(sizeof(unsigned));
  370. *remaining = 2*nslices;
  371. unsigned slice;
  372. for (slice = args->i + 1; slice < args->nblocks; slice++)
  373. {
  374. /* update slice from u12 */
  375. cl_args *u12a = malloc(sizeof(cl_args));
  376. /* update slice from u21 */
  377. cl_args *u21a = malloc(sizeof(cl_args));
  378. struct starpu_task *task12 = starpu_task_create();
  379. task12->callback_func = dw_callback_codelet_update_u12_21;
  380. task12->callback_arg = u12a;
  381. task12->cl = &cl12;
  382. task12->cl_arg = u12a;
  383. struct starpu_task *task21 = starpu_task_create();
  384. task21->callback_func = dw_callback_codelet_update_u12_21;
  385. task21->callback_arg = u21a;
  386. task21->cl = &cl21;
  387. task21->cl_arg = u21a;
  388. u12a->i = args->i;
  389. u12a->k = slice;
  390. u12a->nblocks = args->nblocks;
  391. u12a->dataA = args->dataA;
  392. u12a->remaining = remaining;
  393. u21a->i = args->i;
  394. u21a->k = slice;
  395. u21a->nblocks = args->nblocks;
  396. u21a->dataA = args->dataA;
  397. u21a->remaining = remaining;
  398. task12->handles[0] = starpu_data_get_sub_data(args->dataA, 2, u12a->i, u12a->i);
  399. task12->handles[1] = starpu_data_get_sub_data(args->dataA, 2, u12a->k, u12a->i);
  400. task21->handles[0] = starpu_data_get_sub_data(args->dataA, 2, u21a->i, u21a->i);
  401. task21->handles[1] = starpu_data_get_sub_data(args->dataA, 2, u21a->i, u21a->k);
  402. starpu_task_submit(task12);
  403. starpu_task_submit(task21);
  404. }
  405. }
  406. }
  407. void dw_callback_codelet_update_u22(void *argcb)
  408. {
  409. cl_args *args = argcb;
  410. if (STARPU_ATOMIC_ADD(args->remaining, (-1)) == 0)
  411. {
  412. /* all worker already used the counter */
  413. free(args->remaining);
  414. /* we now reduce the LU22 part (recursion appears there) */
  415. cl_args *u11arg = malloc(sizeof(cl_args));
  416. struct starpu_task *task = starpu_task_create();
  417. task->callback_func = dw_callback_codelet_update_u11;
  418. task->callback_arg = u11arg;
  419. task->cl = &cl11;
  420. task->cl_arg = u11arg;
  421. task->handles[0] = starpu_data_get_sub_data(args->dataA, 2, args->k + 1, args->k + 1);
  422. u11arg->dataA = args->dataA;
  423. u11arg->i = args->k + 1;
  424. u11arg->nblocks = args->nblocks;
  425. /* schedule the codelet */
  426. starpu_task_submit(task);
  427. }
  428. free(args);
  429. }
  430. void dw_callback_codelet_update_u12_21(void *argcb)
  431. {
  432. cl_args *args = argcb;
  433. if (STARPU_ATOMIC_ADD(args->remaining, -1) == 0)
  434. {
  435. /* now launch the update of LU22 */
  436. unsigned i = args->i;
  437. unsigned nblocks = args->nblocks;
  438. /* the number of tasks to be done */
  439. unsigned *remaining = malloc(sizeof(unsigned));
  440. *remaining = (nblocks - 1 - i)*(nblocks - 1 - i);
  441. unsigned slicey, slicex;
  442. for (slicey = i+1; slicey < nblocks; slicey++)
  443. {
  444. for (slicex = i+1; slicex < nblocks; slicex++)
  445. {
  446. /* update that square matrix */
  447. cl_args *u22a = malloc(sizeof(cl_args));
  448. struct starpu_task *task22 = starpu_task_create();
  449. task22->callback_func = dw_callback_codelet_update_u22;
  450. task22->callback_arg = u22a;
  451. task22->cl = &cl22;
  452. task22->cl_arg = u22a;
  453. u22a->k = i;
  454. u22a->i = slicex;
  455. u22a->j = slicey;
  456. u22a->dataA = args->dataA;
  457. u22a->nblocks = nblocks;
  458. u22a->remaining = remaining;
  459. task22->handles[0] = starpu_data_get_sub_data(args->dataA, 2, u22a->i, u22a->k);
  460. task22->handles[1] = starpu_data_get_sub_data(args->dataA, 2, u22a->k, u22a->j);
  461. task22->handles[2] = starpu_data_get_sub_data(args->dataA, 2, u22a->i, u22a->j);
  462. /* schedule that codelet */
  463. starpu_task_submit(task22);
  464. }
  465. }
  466. }
  467. }
  468. /*
  469. * code to bootstrap the factorization
  470. */
  471. void dw_codelet_facto(starpu_data_handle_t dataA, unsigned nblocks)
  472. {
  473. cl_args *args = malloc(sizeof(cl_args));
  474. args->i = 0;
  475. args->nblocks = nblocks;
  476. args->dataA = dataA;
  477. gettimeofday(&start, NULL);
  478. /* inject a new task with this codelet into the system */
  479. struct starpu_task *task = starpu_task_create();
  480. task->callback_func = dw_callback_codelet_update_u11;
  481. task->callback_arg = args;
  482. task->cl = &cl11;
  483. task->cl_arg = args;
  484. task->handles[0] = starpu_data_get_sub_data(dataA, 2, 0, 0);
  485. /* schedule the codelet */
  486. starpu_task_submit(task);
  487. /* stall the application until the end of computations */
  488. pthread_mutex_lock(&mutex);
  489. if (!finished)
  490. pthread_cond_wait(&cond, &mutex);
  491. pthread_mutex_unlock(&mutex);
  492. gettimeofday(&end, NULL);
  493. double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  494. FPRINTF(stderr, "Computation took (in ms)\n");
  495. FPRINTF(stdout, "%2.2f\n", timing/1000);
  496. unsigned n = starpu_matrix_get_nx(dataA);
  497. double flop = (2.0f*n*n*n)/3.0f;
  498. FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
  499. }
  500. void dw_codelet_facto_v2(starpu_data_handle_t dataA, unsigned nblocks)
  501. {
  502. advance_11 = calloc(nblocks, sizeof(uint8_t));
  503. STARPU_ASSERT(advance_11);
  504. advance_12_21 = calloc(nblocks*nblocks, sizeof(uint8_t));
  505. STARPU_ASSERT(advance_12_21);
  506. advance_22 = calloc(nblocks*nblocks*nblocks, sizeof(uint8_t));
  507. STARPU_ASSERT(advance_22);
  508. cl_args *args = malloc(sizeof(cl_args));
  509. args->i = 0;
  510. args->nblocks = nblocks;
  511. args->dataA = dataA;
  512. gettimeofday(&start, NULL);
  513. /* inject a new task with this codelet into the system */
  514. struct starpu_task *task = starpu_task_create();
  515. task->callback_func = dw_callback_v2_codelet_update_u11;
  516. task->callback_arg = args;
  517. task->cl = &cl11;
  518. task->cl_arg = args;
  519. task->handles[0] = starpu_data_get_sub_data(dataA, 2, 0, 0);
  520. /* schedule the codelet */
  521. int ret = starpu_task_submit(task);
  522. if (STARPU_UNLIKELY(ret == -ENODEV))
  523. {
  524. FPRINTF(stderr, "No worker may execute this task\n");
  525. exit(0);
  526. }
  527. /* stall the application until the end of computations */
  528. pthread_mutex_lock(&mutex);
  529. if (!finished)
  530. pthread_cond_wait(&cond, &mutex);
  531. pthread_mutex_unlock(&mutex);
  532. gettimeofday(&end, NULL);
  533. double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  534. FPRINTF(stderr, "Computation took (in ms)\n");
  535. FPRINTF(stdout, "%2.2f\n", timing/1000);
  536. unsigned n = starpu_matrix_get_nx(dataA);
  537. double flop = (2.0f*n*n*n)/3.0f;
  538. FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
  539. }
  /*
   * Starts StarPU and the cublas helper, then allocates the problem data:
   * *A receives dim*dim floats and *B dim floats.
   *
   * When `pinned` is non-zero both buffers come from starpu_malloc();
   * otherwise they come from malloc() and the allocations are asserted.
   */
  void initialize_system(float **A, float **B, unsigned dim, unsigned pinned)
  {
      const size_t matrix_bytes = (size_t)dim*dim*sizeof(float);
      const size_t vector_bytes = (size_t)dim*sizeof(float);

      starpu_init(NULL);
      starpu_helper_cublas_init();

      if (!pinned)
      {
          *A = malloc(matrix_bytes);
          STARPU_ASSERT(*A);

          *B = malloc(vector_bytes);
          STARPU_ASSERT(*B);
          return;
      }

      starpu_malloc((void **)A, matrix_bytes);
      starpu_malloc((void **)B, vector_bytes);
  }
  557. void dw_factoLU(float *matA, unsigned size,
  558. unsigned ld, unsigned nblocks,
  559. unsigned version, unsigned _no_prio)
  560. {
  561. #ifdef CHECK_RESULTS
  562. FPRINTF(stderr, "Checking results ...\n");
  563. float *Asaved;
  564. Asaved = malloc((size_t)ld*ld*sizeof(float));
  565. memcpy(Asaved, matA, (size_t)ld*ld*sizeof(float));
  566. #endif
  567. no_prio = _no_prio;
  568. starpu_data_handle_t dataA;
  569. /* monitor and partition the A matrix into blocks :
  570. * one block is now determined by 2 unsigned (i,j) */
  571. starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld,
  572. size, size, sizeof(float));
  573. struct starpu_data_filter f =
  574. {
  575. .filter_func = starpu_vertical_block_filter_func,
  576. .nchildren = nblocks
  577. };
  578. struct starpu_data_filter f2 =
  579. {
  580. .filter_func = starpu_block_filter_func,
  581. .nchildren = nblocks
  582. };
  583. starpu_data_map_filters(dataA, 2, &f, &f2);
  584. switch (version)
  585. {
  586. case 1:
  587. dw_codelet_facto(dataA, nblocks);
  588. break;
  589. default:
  590. case 2:
  591. dw_codelet_facto_v2(dataA, nblocks);
  592. break;
  593. }
  594. /* gather all the data */
  595. starpu_data_unpartition(dataA, 0);
  596. starpu_data_unregister(dataA);
  597. #ifdef CHECK_RESULTS
  598. compare_A_LU(Asaved, matA, size, ld);
  599. #endif
  600. }