dw_factolu.c 19 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752
  1. /*
  2. * StarPU
  3. * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. *
  14. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. */
  16. #include "dw_factolu.h"
  17. #include <sys/time.h>
  /*
   * Progress maps used by the v2 callbacks; they are allocated (zeroed) in
   * dw_codelet_facto_v2().  Each byte carries the STARTED / DONE bits of one
   * task.
   */
  uint8_t *advance_11;    /* nblocks entries, indexed by the step of the 11 task */
  uint8_t *advance_12_21; /* nblocks*nblocks entries, shared by the 12 and 21 tasks */
  uint8_t *advance_22;    /* nblocks*nblocks*nblocks entries, indexed by (k, i, j) */

  /* wall-clock timestamps taken around the factorization */
  struct timeval start, end;

  /* completion flag, together with the mutex / condition pair guarding it */
  static unsigned finished = 0;
  static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
  static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
  26. static starpu_codelet cl11 =
  27. {
  28. .where = CORE|CUBLAS,
  29. .core_func = dw_core_codelet_update_u11,
  30. #ifdef USE_CUDA
  31. .cublas_func = dw_cublas_codelet_update_u11,
  32. #endif
  33. .nbuffers = 1,
  34. .model = &model_11
  35. };
  36. static starpu_codelet cl12 =
  37. {
  38. .where = CORE|CUBLAS,
  39. .core_func = dw_core_codelet_update_u12,
  40. #ifdef USE_CUDA
  41. .cublas_func = dw_cublas_codelet_update_u12,
  42. #endif
  43. .nbuffers = 2,
  44. .model = &model_12
  45. };
  46. static starpu_codelet cl21 =
  47. {
  48. .where = CORE|CUBLAS,
  49. .core_func = dw_core_codelet_update_u21,
  50. #ifdef USE_CUDA
  51. .cublas_func = dw_cublas_codelet_update_u21,
  52. #endif
  53. .nbuffers = 2,
  54. .model = &model_21
  55. };
  56. static starpu_codelet cl22 =
  57. {
  58. .where = CORE|CUBLAS,
  59. .core_func = dw_core_codelet_update_u22,
  60. #ifdef USE_CUDA
  61. .cublas_func = dw_cublas_codelet_update_u22,
  62. #endif
  63. .nbuffers = 3,
  64. .model = &model_22
  65. };
  /* Bits stored in the advance_* progress maps. */
  #define STARTED 0x01 /* the task has been claimed for submission */
  #define DONE 0x10 /* the callback of the task has run */
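  /*
   * Upgraded (v2) callbacks: every task is submitted as soon as its own
   * dependencies are done, which breaks the phase-by-phase pipeline of the
   * original callbacks.
   */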
  71. void dw_callback_v2_codelet_update_u22(void *argcb)
  72. {
  73. cl_args *args = argcb;
  74. unsigned k = args->k;
  75. unsigned i = args->i;
  76. unsigned j = args->j;
  77. unsigned nblocks = args->nblocks;
  78. /* we did task 22k,i,j */
  79. advance_22[k*nblocks*nblocks + i + j*nblocks] = DONE;
  80. if ( (i == j) && (i == k+1)) {
  81. /* we now reduce the LU22 part (recursion appears there) */
  82. cl_args *u11arg = malloc(sizeof(cl_args));
  83. struct starpu_task *task = starpu_task_create();
  84. task->callback_func = dw_callback_v2_codelet_update_u11;
  85. task->callback_arg = u11arg;
  86. task->cl = &cl11;
  87. task->cl_arg = u11arg;
  88. task->buffers[0].handle =
  89. get_sub_data(args->dataA, 2, k+1, k+1);
  90. task->buffers[0].mode = STARPU_RW;
  91. u11arg->dataA = args->dataA;
  92. u11arg->i = k + 1;
  93. u11arg->nblocks = args->nblocks;
  94. /* schedule the codelet */
  95. task->priority = MAX_PRIO;
  96. starpu_submit_task(task);
  97. }
  98. /* 11k+1 + 22k,k+1,j => 21 k+1,j */
  99. if ( i == k + 1) {
  100. uint8_t dep;
  101. /* 11 k+1*/
  102. dep = advance_11[(k+1)];
  103. if (dep & DONE) {
  104. /* try to push the task */
  105. uint8_t u = STARPU_ATOMIC_OR(&advance_12_21[(k+1) + j*nblocks], STARTED);
  106. if ((u & STARTED) == 0) {
  107. /* we are the only one that should
  108. * launch that task */
  109. cl_args *u21a = malloc(sizeof(cl_args));
  110. struct starpu_task *task21 = starpu_task_create();
  111. task21->callback_func = dw_callback_v2_codelet_update_u21;
  112. task21->callback_arg = u21a;
  113. task21->cl = &cl21;
  114. task21->cl_arg = u21a;
  115. u21a->i = k+1;
  116. u21a->k = j;
  117. u21a->nblocks = args->nblocks;
  118. u21a->dataA = args->dataA;
  119. task21->buffers[0].handle =
  120. get_sub_data(args->dataA, 2, u21a->i, u21a->i);
  121. task21->buffers[0].mode = STARPU_R;
  122. task21->buffers[1].handle =
  123. get_sub_data(args->dataA, 2, u21a->i, u21a->k);
  124. task21->buffers[1].mode = STARPU_RW;
  125. starpu_submit_task(task21);
  126. }
  127. }
  128. }
  129. /* 11k + 22k-1,i,k => 12 k,i */
  130. if (j == k + 1) {
  131. uint8_t dep;
  132. /* 11 k+1*/
  133. dep = advance_11[(k+1)];
  134. if (dep & DONE) {
  135. /* try to push the task */
  136. uint8_t u = STARPU_ATOMIC_OR(&advance_12_21[(k+1)*nblocks + i], STARTED);
  137. if ((u & STARTED) == 0) {
  138. /* we are the only one that should launch that task */
  139. cl_args *u12a = malloc(sizeof(cl_args));
  140. struct starpu_task *task12 = starpu_task_create();
  141. task12->callback_func = dw_callback_v2_codelet_update_u12;
  142. task12->callback_arg = u12a;
  143. task12->cl = &cl12;
  144. task12->cl_arg = u12a;
  145. u12a->i = k+1;
  146. u12a->k = i;
  147. u12a->nblocks = args->nblocks;
  148. u12a->dataA = args->dataA;
  149. task12->buffers[0].handle = get_sub_data(args->dataA, 2, u12a->i, u12a->i);
  150. task12->buffers[0].mode = STARPU_R;
  151. task12->buffers[1].handle = get_sub_data(args->dataA, 2, u12a->k, u12a->i);
  152. task12->buffers[1].mode = STARPU_RW;
  153. starpu_submit_task(task12);
  154. }
  155. }
  156. }
  157. free(args);
  158. }
  159. void dw_callback_v2_codelet_update_u12(void *argcb)
  160. {
  161. cl_args *args = argcb;
  162. /* now launch the update of LU22 */
  163. unsigned i = args->i;
  164. unsigned k = args->k;
  165. unsigned nblocks = args->nblocks;
  166. /* we did task 21i,k */
  167. advance_12_21[i*nblocks + k] = DONE;
  168. unsigned slicey;
  169. for (slicey = i+1; slicey < nblocks; slicey++)
  170. {
  171. /* can we launch 22 i,args->k,slicey ? */
  172. /* deps : 21 args->k, slicey */
  173. uint8_t dep;
  174. dep = advance_12_21[i + slicey*nblocks];
  175. if (dep & DONE)
  176. {
  177. /* perhaps we may schedule the 22 i,args->k,slicey task */
  178. uint8_t u = STARPU_ATOMIC_OR(&advance_22[i*nblocks*nblocks + slicey*nblocks + k], STARTED);
  179. if ((u & STARTED) == 0) {
  180. /* update that square matrix */
  181. cl_args *u22a = malloc(sizeof(cl_args));
  182. struct starpu_task *task22 = starpu_task_create();
  183. task22->callback_func = dw_callback_v2_codelet_update_u22;
  184. task22->callback_arg = u22a;
  185. task22->cl = &cl22;
  186. task22->cl_arg = u22a;
  187. u22a->k = i;
  188. u22a->i = k;
  189. u22a->j = slicey;
  190. u22a->dataA = args->dataA;
  191. u22a->nblocks = nblocks;
  192. task22->buffers[0].handle = get_sub_data(args->dataA, 2, u22a->i, u22a->k);
  193. task22->buffers[0].mode = STARPU_R;
  194. task22->buffers[1].handle = get_sub_data(args->dataA, 2, u22a->k, u22a->j);
  195. task22->buffers[1].mode = STARPU_R;
  196. task22->buffers[2].handle = get_sub_data(args->dataA, 2, u22a->i, u22a->j);
  197. task22->buffers[2].mode = STARPU_RW;
  198. /* schedule that codelet */
  199. if (slicey == i+1)
  200. task22->priority = MAX_PRIO;
  201. starpu_submit_task(task22);
  202. }
  203. }
  204. }
  205. }
  206. void dw_callback_v2_codelet_update_u21(void *argcb)
  207. {
  208. cl_args *args = argcb;
  209. /* now launch the update of LU22 */
  210. unsigned i = args->i;
  211. unsigned k = args->k;
  212. unsigned nblocks = args->nblocks;
  213. /* we did task 21i,k */
  214. advance_12_21[i + k*nblocks] = DONE;
  215. unsigned slicex;
  216. for (slicex = i+1; slicex < nblocks; slicex++)
  217. {
  218. /* can we launch 22 i,slicex,k ? */
  219. /* deps : 12 slicex k */
  220. uint8_t dep;
  221. dep = advance_12_21[i*nblocks + slicex];
  222. if (dep & DONE)
  223. {
  224. /* perhaps we may schedule the 22 i,args->k,slicey task */
  225. uint8_t u = STARPU_ATOMIC_OR(&advance_22[i*nblocks*nblocks + k*nblocks + slicex], STARTED);
  226. if ((u & STARTED) == 0) {
  227. /* update that square matrix */
  228. cl_args *u22a = malloc(sizeof(cl_args));
  229. struct starpu_task *task22 = starpu_task_create();
  230. task22->callback_func = dw_callback_v2_codelet_update_u22;
  231. task22->callback_arg = u22a;
  232. task22->cl = &cl22;
  233. task22->cl_arg = u22a;
  234. u22a->k = i;
  235. u22a->i = slicex;
  236. u22a->j = k;
  237. u22a->dataA = args->dataA;
  238. u22a->nblocks = nblocks;
  239. task22->buffers[0].handle = get_sub_data(args->dataA, 2, u22a->i, u22a->k);
  240. task22->buffers[0].mode = STARPU_R;
  241. task22->buffers[1].handle = get_sub_data(args->dataA, 2, u22a->k, u22a->j);
  242. task22->buffers[1].mode = STARPU_R;
  243. task22->buffers[2].handle = get_sub_data(args->dataA, 2, u22a->i, u22a->j);
  244. task22->buffers[2].mode = STARPU_RW;
  245. /* schedule that codelet */
  246. if (slicex == i+1)
  247. task22->priority = MAX_PRIO;
  248. starpu_submit_task(task22);
  249. }
  250. }
  251. }
  252. }
  253. void dw_callback_v2_codelet_update_u11(void *argcb)
  254. {
  255. /* in case there remains work, go on */
  256. cl_args *args = argcb;
  257. unsigned nblocks = args->nblocks;
  258. unsigned i = args->i;
  259. /* we did task 11k */
  260. advance_11[i] = DONE;
  261. if (i == nblocks - 1)
  262. {
  263. /* we are done : wake the application up */
  264. pthread_mutex_lock(&mutex);
  265. finished = 1;
  266. pthread_cond_signal(&cond);
  267. pthread_mutex_unlock(&mutex);
  268. return;
  269. }
  270. else
  271. {
  272. /* put new tasks */
  273. unsigned slice;
  274. for (slice = i + 1; slice < nblocks; slice++)
  275. {
  276. /* can we launch 12i,slice ? */
  277. uint8_t deps12;
  278. if (i == 0) {
  279. deps12 = DONE;
  280. }
  281. else {
  282. deps12 = advance_22[(i-1)*nblocks*nblocks + slice + i*nblocks];
  283. }
  284. if (deps12 & DONE) {
  285. /* we may perhaps launch the task 12i,slice */
  286. uint8_t u = STARPU_ATOMIC_OR(&advance_12_21[i*nblocks + slice], STARTED);
  287. if ((u & STARTED) == 0) {
  288. /* we are the only one that should launch that task */
  289. cl_args *u12a = malloc(sizeof(cl_args));
  290. struct starpu_task *task12 = starpu_task_create();
  291. task12->callback_func = dw_callback_v2_codelet_update_u12;
  292. task12->callback_arg = u12a;
  293. task12->cl = &cl12;
  294. task12->cl_arg = u12a;
  295. u12a->i = i;
  296. u12a->k = slice;
  297. u12a->nblocks = args->nblocks;
  298. u12a->dataA = args->dataA;
  299. task12->buffers[0].handle = get_sub_data(args->dataA, 2, u12a->i, u12a->i);
  300. task12->buffers[0].mode = STARPU_R;
  301. task12->buffers[1].handle = get_sub_data(args->dataA, 2, u12a->k, u12a->i);
  302. task12->buffers[1].mode = STARPU_RW;
  303. if (slice == i +1)
  304. task12->priority = MAX_PRIO;
  305. starpu_submit_task(task12);
  306. }
  307. }
  308. /* can we launch 21i,slice ? */
  309. if (i == 0) {
  310. deps12 = DONE;
  311. }
  312. else {
  313. deps12 = advance_22[(i-1)*nblocks*nblocks + slice*nblocks + i];
  314. }
  315. if (deps12 & DONE) {
  316. /* we may perhaps launch the task 12i,slice */
  317. uint8_t u = STARPU_ATOMIC_OR(&advance_12_21[i + slice*nblocks], STARTED);
  318. if ((u & STARTED) == 0) {
  319. /* we are the only one that should launch that task */
  320. cl_args *u21a = malloc(sizeof(cl_args));
  321. struct starpu_task *task21 = starpu_task_create();
  322. task21->callback_func = dw_callback_v2_codelet_update_u21;
  323. task21->callback_arg = u21a;
  324. task21->cl = &cl21;
  325. task21->cl_arg = u21a;
  326. u21a->i = i;
  327. u21a->k = slice;
  328. u21a->nblocks = args->nblocks;
  329. u21a->dataA = args->dataA;
  330. task21->buffers[0].handle = get_sub_data(args->dataA, 2, u21a->i, u21a->i);
  331. task21->buffers[0].mode = STARPU_R;
  332. task21->buffers[1].handle = get_sub_data(args->dataA, 2, u21a->i, u21a->k);
  333. task21->buffers[1].mode = STARPU_RW;
  334. if (slice == i +1)
  335. task21->priority = MAX_PRIO;
  336. starpu_submit_task(task21);
  337. }
  338. }
  339. }
  340. }
  341. }
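  /*
   * Original (v1) callbacks: each phase is started once a shared counter
   * shows that every task of the previous phase has completed.
   */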
  345. void dw_callback_codelet_update_u11(void *argcb)
  346. {
  347. /* in case there remains work, go on */
  348. cl_args *args = argcb;
  349. if (args->i == args->nblocks - 1)
  350. {
  351. /* we are done : wake the application up */
  352. pthread_mutex_lock(&mutex);
  353. finished = 1;
  354. pthread_cond_signal(&cond);
  355. pthread_mutex_unlock(&mutex);
  356. return;
  357. }
  358. else
  359. {
  360. /* put new tasks */
  361. unsigned nslices;
  362. nslices = args->nblocks - 1 - args->i;
  363. unsigned *remaining = malloc(sizeof(unsigned));
  364. *remaining = 2*nslices;
  365. unsigned slice;
  366. for (slice = args->i + 1; slice < args->nblocks; slice++)
  367. {
  368. /* update slice from u12 */
  369. cl_args *u12a = malloc(sizeof(cl_args));
  370. /* update slice from u21 */
  371. cl_args *u21a = malloc(sizeof(cl_args));
  372. struct starpu_task *task12 = starpu_task_create();
  373. task12->callback_func = dw_callback_codelet_update_u12_21;
  374. task12->callback_arg = u12a;
  375. task12->cl = &cl12;
  376. task12->cl_arg = u12a;
  377. struct starpu_task *task21 = starpu_task_create();
  378. task21->callback_func = dw_callback_codelet_update_u12_21;
  379. task21->callback_arg = u21a;
  380. task21->cl = &cl21;
  381. task21->cl_arg = u21a;
  382. u12a->i = args->i;
  383. u12a->k = slice;
  384. u12a->nblocks = args->nblocks;
  385. u12a->dataA = args->dataA;
  386. u12a->remaining = remaining;
  387. u21a->i = args->i;
  388. u21a->k = slice;
  389. u21a->nblocks = args->nblocks;
  390. u21a->dataA = args->dataA;
  391. u21a->remaining = remaining;
  392. task12->buffers[0].handle =
  393. get_sub_data(args->dataA, 2, u12a->i, u12a->i);
  394. task12->buffers[0].mode = STARPU_R;
  395. task12->buffers[1].handle =
  396. get_sub_data(args->dataA, 2, u12a->k, u12a->i);
  397. task12->buffers[1].mode = STARPU_RW;
  398. task21->buffers[0].handle =
  399. get_sub_data(args->dataA, 2, u21a->i, u21a->i);
  400. task21->buffers[0].mode = STARPU_R;
  401. task21->buffers[1].handle =
  402. get_sub_data(args->dataA, 2, u21a->i, u21a->k);
  403. task21->buffers[1].mode = STARPU_RW;
  404. starpu_submit_task(task12);
  405. starpu_submit_task(task21);
  406. }
  407. }
  408. }
  409. void dw_callback_codelet_update_u22(void *argcb)
  410. {
  411. cl_args *args = argcb;
  412. if (STARPU_ATOMIC_ADD(args->remaining, (-1)) == 0)
  413. {
  414. /* all worker already used the counter */
  415. free(args->remaining);
  416. /* we now reduce the LU22 part (recursion appears there) */
  417. cl_args *u11arg = malloc(sizeof(cl_args));
  418. struct starpu_task *task = starpu_task_create();
  419. task->callback_func = dw_callback_codelet_update_u11;
  420. task->callback_arg = u11arg;
  421. task->cl = &cl11;
  422. task->cl_arg = u11arg;
  423. task->buffers[0].handle = get_sub_data(args->dataA, 2, args->k + 1, args->k + 1);
  424. task->buffers[0].mode = STARPU_RW;
  425. u11arg->dataA = args->dataA;
  426. u11arg->i = args->k + 1;
  427. u11arg->nblocks = args->nblocks;
  428. /* schedule the codelet */
  429. starpu_submit_task(task);
  430. }
  431. free(args);
  432. }
  433. void dw_callback_codelet_update_u12_21(void *argcb)
  434. {
  435. cl_args *args = argcb;
  436. if (STARPU_ATOMIC_ADD(args->remaining, -1) == 0)
  437. {
  438. /* now launch the update of LU22 */
  439. unsigned i = args->i;
  440. unsigned nblocks = args->nblocks;
  441. /* the number of tasks to be done */
  442. unsigned *remaining = malloc(sizeof(unsigned));
  443. *remaining = (nblocks - 1 - i)*(nblocks - 1 - i);
  444. unsigned slicey, slicex;
  445. for (slicey = i+1; slicey < nblocks; slicey++)
  446. {
  447. for (slicex = i+1; slicex < nblocks; slicex++)
  448. {
  449. /* update that square matrix */
  450. cl_args *u22a = malloc(sizeof(cl_args));
  451. struct starpu_task *task22 = starpu_task_create();
  452. task22->callback_func = dw_callback_codelet_update_u22;
  453. task22->callback_arg = u22a;
  454. task22->cl = &cl22;
  455. task22->cl_arg = u22a;
  456. u22a->k = i;
  457. u22a->i = slicex;
  458. u22a->j = slicey;
  459. u22a->dataA = args->dataA;
  460. u22a->nblocks = nblocks;
  461. u22a->remaining = remaining;
  462. task22->buffers[0].handle = get_sub_data(args->dataA, 2, u22a->i, u22a->k);
  463. task22->buffers[0].mode = STARPU_R;
  464. task22->buffers[1].handle = get_sub_data(args->dataA, 2, u22a->k, u22a->j);
  465. task22->buffers[1].mode = STARPU_R;
  466. task22->buffers[2].handle = get_sub_data(args->dataA, 2, u22a->i, u22a->j);
  467. task22->buffers[2].mode = STARPU_RW;
  468. /* schedule that codelet */
  469. starpu_submit_task(task22);
  470. }
  471. }
  472. }
  473. }
  474. /*
  475. * code to bootstrap the factorization
  476. */
  477. void dw_codelet_facto(starpu_data_handle dataA, unsigned nblocks)
  478. {
  479. cl_args *args = malloc(sizeof(cl_args));
  480. args->i = 0;
  481. args->nblocks = nblocks;
  482. args->dataA = dataA;
  483. gettimeofday(&start, NULL);
  484. /* inject a new task with this codelet into the system */
  485. struct starpu_task *task = starpu_task_create();
  486. task->callback_func = dw_callback_codelet_update_u11;
  487. task->callback_arg = args;
  488. task->cl = &cl11;
  489. task->cl_arg = args;
  490. task->buffers[0].handle = get_sub_data(dataA, 2, 0, 0);
  491. task->buffers[0].mode = STARPU_RW;
  492. /* schedule the codelet */
  493. starpu_submit_task(task);
  494. /* stall the application until the end of computations */
  495. pthread_mutex_lock(&mutex);
  496. if (!finished)
  497. pthread_cond_wait(&cond, &mutex);
  498. pthread_mutex_unlock(&mutex);
  499. gettimeofday(&end, NULL);
  500. double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  501. fprintf(stderr, "Computation took (in ms)\n");
  502. printf("%2.2f\n", timing/1000);
  503. unsigned n = starpu_get_blas_nx(dataA);
  504. double flop = (2.0f*n*n*n)/3.0f;
  505. fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
  506. }
  507. void dw_codelet_facto_v2(starpu_data_handle dataA, unsigned nblocks)
  508. {
  509. advance_11 = calloc(nblocks, sizeof(uint8_t));
  510. STARPU_ASSERT(advance_11);
  511. advance_12_21 = calloc(nblocks*nblocks, sizeof(uint8_t));
  512. STARPU_ASSERT(advance_12_21);
  513. advance_22 = calloc(nblocks*nblocks*nblocks, sizeof(uint8_t));
  514. STARPU_ASSERT(advance_22);
  515. cl_args *args = malloc(sizeof(cl_args));
  516. args->i = 0;
  517. args->nblocks = nblocks;
  518. args->dataA = dataA;
  519. gettimeofday(&start, NULL);
  520. /* inject a new task with this codelet into the system */
  521. struct starpu_task *task = starpu_task_create();
  522. task->callback_func = dw_callback_v2_codelet_update_u11;
  523. task->callback_arg = args;
  524. task->cl = &cl11;
  525. task->cl_arg = args;
  526. task->buffers[0].handle = get_sub_data(dataA, 2, 0, 0);
  527. task->buffers[0].mode = STARPU_RW;
  528. /* schedule the codelet */
  529. int ret = starpu_submit_task(task);
  530. if (STARPU_UNLIKELY(ret == -ENODEV))
  531. {
  532. fprintf(stderr, "No worker may execute this task\n");
  533. exit(0);
  534. }
  535. /* stall the application until the end of computations */
  536. pthread_mutex_lock(&mutex);
  537. if (!finished)
  538. pthread_cond_wait(&cond, &mutex);
  539. pthread_mutex_unlock(&mutex);
  540. gettimeofday(&end, NULL);
  541. double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  542. fprintf(stderr, "Computation took (in ms)\n");
  543. printf("%2.2f\n", timing/1000);
  544. unsigned n = starpu_get_blas_nx(dataA);
  545. double flop = (2.0f*n*n*n)/3.0f;
  546. fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
  547. }
  /*
   * Start StarPU and the timing facility, then allocate the problem data.
   *
   * A:      receives a buffer of dim*dim floats
   * B:      receives a buffer of dim floats
   * dim:    dimension of the problem
   * pinned: when non-zero the buffers come from
   *         starpu_malloc_pinned_if_possible(), otherwise from malloc()
   *
   * The malloc() results are stored unchecked, as before; callers must test
   * *A and *B.
   */
  void initialize_system(float **A, float **B, unsigned dim, unsigned pinned)
  {
      /* sizes are computed in size_t: dim*dim evaluated as unsigned wraps
       * around as soon as dim reaches 65536 */
      size_t bytes_A = (size_t)dim*dim*sizeof(float);
      size_t bytes_B = (size_t)dim*sizeof(float);

      starpu_init(NULL);

      timing_init();

      if (pinned)
      {
          starpu_malloc_pinned_if_possible((void **)A, bytes_A);
          starpu_malloc_pinned_if_possible((void **)B, bytes_B);
      }
      else {
          *A = malloc(bytes_A);
          *B = malloc(bytes_B);
      }
  }
  562. void dw_factoLU(float *matA, unsigned size,
  563. unsigned ld, unsigned nblocks,
  564. unsigned version)
  565. {
  566. #ifdef CHECK_RESULTS
  567. fprintf(stderr, "Checking results ...\n");
  568. float *Asaved;
  569. Asaved = malloc(ld*ld*sizeof(float));
  570. memcpy(Asaved, matA, ld*ld*sizeof(float));
  571. #endif
  572. starpu_data_handle dataA;
  573. /* monitor and partition the A matrix into blocks :
  574. * one block is now determined by 2 unsigned (i,j) */
  575. starpu_register_blas_data(&dataA, 0, (uintptr_t)matA, ld,
  576. size, size, sizeof(float));
  577. starpu_filter f;
  578. f.filter_func = starpu_vertical_block_filter_func;
  579. f.filter_arg = nblocks;
  580. starpu_filter f2;
  581. f2.filter_func = starpu_block_filter_func;
  582. f2.filter_arg = nblocks;
  583. starpu_map_filters(dataA, 2, &f, &f2);
  584. switch (version) {
  585. case 1:
  586. dw_codelet_facto(dataA, nblocks);
  587. break;
  588. default:
  589. case 2:
  590. dw_codelet_facto_v2(dataA, nblocks);
  591. break;
  592. }
  593. /* gather all the data */
  594. starpu_unpartition_data(dataA, 0);
  595. starpu_delete_data(dataA);
  596. #ifdef CHECK_RESULTS
  597. compare_A_LU(Asaved, matA, size, ld);
  598. #endif
  599. }