dw_factolu.c 19 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764
  1. /*
  2. * StarPU
  3. * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. *
  14. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. */
  16. #include "dw_factolu.h"
  17. #include <sys/time.h>
  /*
   * Progress flags of the v2 scheme.  Every entry is a combination of the
   * STARTED and DONE bits; the arrays are allocated (zero-filled) by
   * dw_codelet_facto_v2.
   */
  uint8_t *advance_11;    /* "11" tasks: nblocks entries */
  uint8_t *advance_12_21; /* "12" and "21" tasks: nblocks*nblocks entries */
  uint8_t *advance_22;    /* "22" tasks: nblocks*nblocks*nblocks entries */

  /* Timestamps taken right before submission and right after completion. */
  struct timeval start;
  struct timeval end;

  /*
   * Completion notification: the callback of the last "11" task sets
   * `finished` and signals `cond` while holding `mutex`.
   */
  static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
  static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
  static unsigned finished = 0;

  /* When non-zero, no task is given STARPU_MAX_PRIO (set by dw_factoLU). */
  static unsigned no_prio = 0;
  27. static starpu_codelet cl11 =
  28. {
  29. .where = STARPU_CPU|STARPU_CUDA,
  30. .cpu_func = dw_cpu_codelet_update_u11,
  31. #ifdef STARPU_USE_CUDA
  32. .cuda_func = dw_cublas_codelet_update_u11,
  33. #endif
  34. .nbuffers = 1,
  35. .model = &model_11
  36. };
  37. static starpu_codelet cl12 =
  38. {
  39. .where = STARPU_CPU|STARPU_CUDA,
  40. .cpu_func = dw_cpu_codelet_update_u12,
  41. #ifdef STARPU_USE_CUDA
  42. .cuda_func = dw_cublas_codelet_update_u12,
  43. #endif
  44. .nbuffers = 2,
  45. .model = &model_12
  46. };
  47. static starpu_codelet cl21 =
  48. {
  49. .where = STARPU_CPU|STARPU_CUDA,
  50. .cpu_func = dw_cpu_codelet_update_u21,
  51. #ifdef STARPU_USE_CUDA
  52. .cuda_func = dw_cublas_codelet_update_u21,
  53. #endif
  54. .nbuffers = 2,
  55. .model = &model_21
  56. };
  57. static starpu_codelet cl22 =
  58. {
  59. .where = STARPU_CPU|STARPU_CUDA,
  60. .cpu_func = dw_cpu_codelet_update_u22,
  61. #ifdef STARPU_USE_CUDA
  62. .cuda_func = dw_cublas_codelet_update_u22,
  63. #endif
  64. .nbuffers = 3,
  65. .model = &model_22
  66. };
  /* Bit flags stored in the advance_11 / advance_12_21 / advance_22 arrays. */
  enum {
      STARTED = 0x01, /* a callback has claimed the task for submission */
      DONE    = 0x10  /* the callback of the task has run */
  };
  /*
   * Upgraded callbacks (v2): these break the strict pipeline design by
   * submitting each task as soon as its own dependencies are marked DONE.
   */
  72. void dw_callback_v2_codelet_update_u22(void *argcb)
  73. {
  74. cl_args *args = argcb;
  75. unsigned k = args->k;
  76. unsigned i = args->i;
  77. unsigned j = args->j;
  78. unsigned nblocks = args->nblocks;
  79. /* we did task 22k,i,j */
  80. advance_22[k*nblocks*nblocks + i + j*nblocks] = DONE;
  81. if ( (i == j) && (i == k+1)) {
  82. /* we now reduce the LU22 part (recursion appears there) */
  83. cl_args *u11arg = malloc(sizeof(cl_args));
  84. struct starpu_task *task = starpu_task_create();
  85. task->callback_func = dw_callback_v2_codelet_update_u11;
  86. task->callback_arg = u11arg;
  87. task->cl = &cl11;
  88. task->cl_arg = u11arg;
  89. task->buffers[0].handle =
  90. starpu_data_get_sub_data(args->dataA, 2, k+1, k+1);
  91. task->buffers[0].mode = STARPU_RW;
  92. u11arg->dataA = args->dataA;
  93. u11arg->i = k + 1;
  94. u11arg->nblocks = args->nblocks;
  95. /* schedule the codelet */
  96. if (!no_prio)
  97. task->priority = STARPU_MAX_PRIO;
  98. starpu_task_submit(task);
  99. }
  100. /* 11k+1 + 22k,k+1,j => 21 k+1,j */
  101. if ( i == k + 1) {
  102. uint8_t dep;
  103. /* 11 k+1*/
  104. dep = advance_11[(k+1)];
  105. if (dep & DONE) {
  106. /* try to push the task */
  107. uint8_t u = STARPU_ATOMIC_OR(&advance_12_21[(k+1) + j*nblocks], STARTED);
  108. if ((u & STARTED) == 0) {
  109. /* we are the only one that should
  110. * launch that task */
  111. cl_args *u21a = malloc(sizeof(cl_args));
  112. struct starpu_task *task21 = starpu_task_create();
  113. task21->callback_func = dw_callback_v2_codelet_update_u21;
  114. task21->callback_arg = u21a;
  115. task21->cl = &cl21;
  116. task21->cl_arg = u21a;
  117. u21a->i = k+1;
  118. u21a->k = j;
  119. u21a->nblocks = args->nblocks;
  120. u21a->dataA = args->dataA;
  121. task21->buffers[0].handle =
  122. starpu_data_get_sub_data(args->dataA, 2, u21a->i, u21a->i);
  123. task21->buffers[0].mode = STARPU_R;
  124. task21->buffers[1].handle =
  125. starpu_data_get_sub_data(args->dataA, 2, u21a->i, u21a->k);
  126. task21->buffers[1].mode = STARPU_RW;
  127. starpu_task_submit(task21);
  128. }
  129. }
  130. }
  131. /* 11k + 22k-1,i,k => 12 k,i */
  132. if (j == k + 1) {
  133. uint8_t dep;
  134. /* 11 k+1*/
  135. dep = advance_11[(k+1)];
  136. if (dep & DONE) {
  137. /* try to push the task */
  138. uint8_t u = STARPU_ATOMIC_OR(&advance_12_21[(k+1)*nblocks + i], STARTED);
  139. if ((u & STARTED) == 0) {
  140. /* we are the only one that should launch that task */
  141. cl_args *u12a = malloc(sizeof(cl_args));
  142. struct starpu_task *task12 = starpu_task_create();
  143. task12->callback_func = dw_callback_v2_codelet_update_u12;
  144. task12->callback_arg = u12a;
  145. task12->cl = &cl12;
  146. task12->cl_arg = u12a;
  147. u12a->i = k+1;
  148. u12a->k = i;
  149. u12a->nblocks = args->nblocks;
  150. u12a->dataA = args->dataA;
  151. task12->buffers[0].handle = starpu_data_get_sub_data(args->dataA, 2, u12a->i, u12a->i);
  152. task12->buffers[0].mode = STARPU_R;
  153. task12->buffers[1].handle = starpu_data_get_sub_data(args->dataA, 2, u12a->k, u12a->i);
  154. task12->buffers[1].mode = STARPU_RW;
  155. starpu_task_submit(task12);
  156. }
  157. }
  158. }
  159. free(args);
  160. }
  161. void dw_callback_v2_codelet_update_u12(void *argcb)
  162. {
  163. cl_args *args = argcb;
  164. /* now launch the update of LU22 */
  165. unsigned i = args->i;
  166. unsigned k = args->k;
  167. unsigned nblocks = args->nblocks;
  168. /* we did task 21i,k */
  169. advance_12_21[i*nblocks + k] = DONE;
  170. unsigned slicey;
  171. for (slicey = i+1; slicey < nblocks; slicey++)
  172. {
  173. /* can we launch 22 i,args->k,slicey ? */
  174. /* deps : 21 args->k, slicey */
  175. uint8_t dep;
  176. dep = advance_12_21[i + slicey*nblocks];
  177. if (dep & DONE)
  178. {
  179. /* perhaps we may schedule the 22 i,args->k,slicey task */
  180. uint8_t u = STARPU_ATOMIC_OR(&advance_22[i*nblocks*nblocks + slicey*nblocks + k], STARTED);
  181. if ((u & STARTED) == 0) {
  182. /* update that square matrix */
  183. cl_args *u22a = malloc(sizeof(cl_args));
  184. struct starpu_task *task22 = starpu_task_create();
  185. task22->callback_func = dw_callback_v2_codelet_update_u22;
  186. task22->callback_arg = u22a;
  187. task22->cl = &cl22;
  188. task22->cl_arg = u22a;
  189. u22a->k = i;
  190. u22a->i = k;
  191. u22a->j = slicey;
  192. u22a->dataA = args->dataA;
  193. u22a->nblocks = nblocks;
  194. task22->buffers[0].handle = starpu_data_get_sub_data(args->dataA, 2, u22a->i, u22a->k);
  195. task22->buffers[0].mode = STARPU_R;
  196. task22->buffers[1].handle = starpu_data_get_sub_data(args->dataA, 2, u22a->k, u22a->j);
  197. task22->buffers[1].mode = STARPU_R;
  198. task22->buffers[2].handle = starpu_data_get_sub_data(args->dataA, 2, u22a->i, u22a->j);
  199. task22->buffers[2].mode = STARPU_RW;
  200. /* schedule that codelet */
  201. if (!no_prio && (slicey == i+1))
  202. task22->priority = STARPU_MAX_PRIO;
  203. starpu_task_submit(task22);
  204. }
  205. }
  206. }
  207. }
  208. void dw_callback_v2_codelet_update_u21(void *argcb)
  209. {
  210. cl_args *args = argcb;
  211. /* now launch the update of LU22 */
  212. unsigned i = args->i;
  213. unsigned k = args->k;
  214. unsigned nblocks = args->nblocks;
  215. /* we did task 21i,k */
  216. advance_12_21[i + k*nblocks] = DONE;
  217. unsigned slicex;
  218. for (slicex = i+1; slicex < nblocks; slicex++)
  219. {
  220. /* can we launch 22 i,slicex,k ? */
  221. /* deps : 12 slicex k */
  222. uint8_t dep;
  223. dep = advance_12_21[i*nblocks + slicex];
  224. if (dep & DONE)
  225. {
  226. /* perhaps we may schedule the 22 i,args->k,slicey task */
  227. uint8_t u = STARPU_ATOMIC_OR(&advance_22[i*nblocks*nblocks + k*nblocks + slicex], STARTED);
  228. if ((u & STARTED) == 0) {
  229. /* update that square matrix */
  230. cl_args *u22a = malloc(sizeof(cl_args));
  231. struct starpu_task *task22 = starpu_task_create();
  232. task22->callback_func = dw_callback_v2_codelet_update_u22;
  233. task22->callback_arg = u22a;
  234. task22->cl = &cl22;
  235. task22->cl_arg = u22a;
  236. u22a->k = i;
  237. u22a->i = slicex;
  238. u22a->j = k;
  239. u22a->dataA = args->dataA;
  240. u22a->nblocks = nblocks;
  241. task22->buffers[0].handle = starpu_data_get_sub_data(args->dataA, 2, u22a->i, u22a->k);
  242. task22->buffers[0].mode = STARPU_R;
  243. task22->buffers[1].handle = starpu_data_get_sub_data(args->dataA, 2, u22a->k, u22a->j);
  244. task22->buffers[1].mode = STARPU_R;
  245. task22->buffers[2].handle = starpu_data_get_sub_data(args->dataA, 2, u22a->i, u22a->j);
  246. task22->buffers[2].mode = STARPU_RW;
  247. /* schedule that codelet */
  248. if (!no_prio && (slicex == i+1))
  249. task22->priority = STARPU_MAX_PRIO;
  250. starpu_task_submit(task22);
  251. }
  252. }
  253. }
  254. }
  255. void dw_callback_v2_codelet_update_u11(void *argcb)
  256. {
  257. /* in case there remains work, go on */
  258. cl_args *args = argcb;
  259. unsigned nblocks = args->nblocks;
  260. unsigned i = args->i;
  261. /* we did task 11k */
  262. advance_11[i] = DONE;
  263. if (i == nblocks - 1)
  264. {
  265. /* we are done : wake the application up */
  266. pthread_mutex_lock(&mutex);
  267. finished = 1;
  268. pthread_cond_signal(&cond);
  269. pthread_mutex_unlock(&mutex);
  270. return;
  271. }
  272. else
  273. {
  274. /* put new tasks */
  275. unsigned slice;
  276. for (slice = i + 1; slice < nblocks; slice++)
  277. {
  278. /* can we launch 12i,slice ? */
  279. uint8_t deps12;
  280. if (i == 0) {
  281. deps12 = DONE;
  282. }
  283. else {
  284. deps12 = advance_22[(i-1)*nblocks*nblocks + slice + i*nblocks];
  285. }
  286. if (deps12 & DONE) {
  287. /* we may perhaps launch the task 12i,slice */
  288. uint8_t u = STARPU_ATOMIC_OR(&advance_12_21[i*nblocks + slice], STARTED);
  289. if ((u & STARTED) == 0) {
  290. /* we are the only one that should launch that task */
  291. cl_args *u12a = malloc(sizeof(cl_args));
  292. struct starpu_task *task12 = starpu_task_create();
  293. task12->callback_func = dw_callback_v2_codelet_update_u12;
  294. task12->callback_arg = u12a;
  295. task12->cl = &cl12;
  296. task12->cl_arg = u12a;
  297. u12a->i = i;
  298. u12a->k = slice;
  299. u12a->nblocks = args->nblocks;
  300. u12a->dataA = args->dataA;
  301. task12->buffers[0].handle = starpu_data_get_sub_data(args->dataA, 2, u12a->i, u12a->i);
  302. task12->buffers[0].mode = STARPU_R;
  303. task12->buffers[1].handle = starpu_data_get_sub_data(args->dataA, 2, u12a->k, u12a->i);
  304. task12->buffers[1].mode = STARPU_RW;
  305. if (!no_prio && (slice == i +1))
  306. task12->priority = STARPU_MAX_PRIO;
  307. starpu_task_submit(task12);
  308. }
  309. }
  310. /* can we launch 21i,slice ? */
  311. if (i == 0) {
  312. deps12 = DONE;
  313. }
  314. else {
  315. deps12 = advance_22[(i-1)*nblocks*nblocks + slice*nblocks + i];
  316. }
  317. if (deps12 & DONE) {
  318. /* we may perhaps launch the task 12i,slice */
  319. uint8_t u = STARPU_ATOMIC_OR(&advance_12_21[i + slice*nblocks], STARTED);
  320. if ((u & STARTED) == 0) {
  321. /* we are the only one that should launch that task */
  322. cl_args *u21a = malloc(sizeof(cl_args));
  323. struct starpu_task *task21 = starpu_task_create();
  324. task21->callback_func = dw_callback_v2_codelet_update_u21;
  325. task21->callback_arg = u21a;
  326. task21->cl = &cl21;
  327. task21->cl_arg = u21a;
  328. u21a->i = i;
  329. u21a->k = slice;
  330. u21a->nblocks = args->nblocks;
  331. u21a->dataA = args->dataA;
  332. task21->buffers[0].handle = starpu_data_get_sub_data(args->dataA, 2, u21a->i, u21a->i);
  333. task21->buffers[0].mode = STARPU_R;
  334. task21->buffers[1].handle = starpu_data_get_sub_data(args->dataA, 2, u21a->i, u21a->k);
  335. task21->buffers[1].mode = STARPU_RW;
  336. if (!no_prio && (slice == i +1))
  337. task21->priority = STARPU_MAX_PRIO;
  338. starpu_task_submit(task21);
  339. }
  340. }
  341. }
  342. }
  343. }
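  /*
   * Callbacks (v1): each phase is submitted as a whole once the shared
   * `remaining` counter shows that the previous phase has completed.
   */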
  347. void dw_callback_codelet_update_u11(void *argcb)
  348. {
  349. /* in case there remains work, go on */
  350. cl_args *args = argcb;
  351. if (args->i == args->nblocks - 1)
  352. {
  353. /* we are done : wake the application up */
  354. pthread_mutex_lock(&mutex);
  355. finished = 1;
  356. pthread_cond_signal(&cond);
  357. pthread_mutex_unlock(&mutex);
  358. return;
  359. }
  360. else
  361. {
  362. /* put new tasks */
  363. unsigned nslices;
  364. nslices = args->nblocks - 1 - args->i;
  365. unsigned *remaining = malloc(sizeof(unsigned));
  366. *remaining = 2*nslices;
  367. unsigned slice;
  368. for (slice = args->i + 1; slice < args->nblocks; slice++)
  369. {
  370. /* update slice from u12 */
  371. cl_args *u12a = malloc(sizeof(cl_args));
  372. /* update slice from u21 */
  373. cl_args *u21a = malloc(sizeof(cl_args));
  374. struct starpu_task *task12 = starpu_task_create();
  375. task12->callback_func = dw_callback_codelet_update_u12_21;
  376. task12->callback_arg = u12a;
  377. task12->cl = &cl12;
  378. task12->cl_arg = u12a;
  379. struct starpu_task *task21 = starpu_task_create();
  380. task21->callback_func = dw_callback_codelet_update_u12_21;
  381. task21->callback_arg = u21a;
  382. task21->cl = &cl21;
  383. task21->cl_arg = u21a;
  384. u12a->i = args->i;
  385. u12a->k = slice;
  386. u12a->nblocks = args->nblocks;
  387. u12a->dataA = args->dataA;
  388. u12a->remaining = remaining;
  389. u21a->i = args->i;
  390. u21a->k = slice;
  391. u21a->nblocks = args->nblocks;
  392. u21a->dataA = args->dataA;
  393. u21a->remaining = remaining;
  394. task12->buffers[0].handle =
  395. starpu_data_get_sub_data(args->dataA, 2, u12a->i, u12a->i);
  396. task12->buffers[0].mode = STARPU_R;
  397. task12->buffers[1].handle =
  398. starpu_data_get_sub_data(args->dataA, 2, u12a->k, u12a->i);
  399. task12->buffers[1].mode = STARPU_RW;
  400. task21->buffers[0].handle =
  401. starpu_data_get_sub_data(args->dataA, 2, u21a->i, u21a->i);
  402. task21->buffers[0].mode = STARPU_R;
  403. task21->buffers[1].handle =
  404. starpu_data_get_sub_data(args->dataA, 2, u21a->i, u21a->k);
  405. task21->buffers[1].mode = STARPU_RW;
  406. starpu_task_submit(task12);
  407. starpu_task_submit(task21);
  408. }
  409. }
  410. }
  411. void dw_callback_codelet_update_u22(void *argcb)
  412. {
  413. cl_args *args = argcb;
  414. if (STARPU_ATOMIC_ADD(args->remaining, (-1)) == 0)
  415. {
  416. /* all worker already used the counter */
  417. free(args->remaining);
  418. /* we now reduce the LU22 part (recursion appears there) */
  419. cl_args *u11arg = malloc(sizeof(cl_args));
  420. struct starpu_task *task = starpu_task_create();
  421. task->callback_func = dw_callback_codelet_update_u11;
  422. task->callback_arg = u11arg;
  423. task->cl = &cl11;
  424. task->cl_arg = u11arg;
  425. task->buffers[0].handle = starpu_data_get_sub_data(args->dataA, 2, args->k + 1, args->k + 1);
  426. task->buffers[0].mode = STARPU_RW;
  427. u11arg->dataA = args->dataA;
  428. u11arg->i = args->k + 1;
  429. u11arg->nblocks = args->nblocks;
  430. /* schedule the codelet */
  431. starpu_task_submit(task);
  432. }
  433. free(args);
  434. }
  435. void dw_callback_codelet_update_u12_21(void *argcb)
  436. {
  437. cl_args *args = argcb;
  438. if (STARPU_ATOMIC_ADD(args->remaining, -1) == 0)
  439. {
  440. /* now launch the update of LU22 */
  441. unsigned i = args->i;
  442. unsigned nblocks = args->nblocks;
  443. /* the number of tasks to be done */
  444. unsigned *remaining = malloc(sizeof(unsigned));
  445. *remaining = (nblocks - 1 - i)*(nblocks - 1 - i);
  446. unsigned slicey, slicex;
  447. for (slicey = i+1; slicey < nblocks; slicey++)
  448. {
  449. for (slicex = i+1; slicex < nblocks; slicex++)
  450. {
  451. /* update that square matrix */
  452. cl_args *u22a = malloc(sizeof(cl_args));
  453. struct starpu_task *task22 = starpu_task_create();
  454. task22->callback_func = dw_callback_codelet_update_u22;
  455. task22->callback_arg = u22a;
  456. task22->cl = &cl22;
  457. task22->cl_arg = u22a;
  458. u22a->k = i;
  459. u22a->i = slicex;
  460. u22a->j = slicey;
  461. u22a->dataA = args->dataA;
  462. u22a->nblocks = nblocks;
  463. u22a->remaining = remaining;
  464. task22->buffers[0].handle = starpu_data_get_sub_data(args->dataA, 2, u22a->i, u22a->k);
  465. task22->buffers[0].mode = STARPU_R;
  466. task22->buffers[1].handle = starpu_data_get_sub_data(args->dataA, 2, u22a->k, u22a->j);
  467. task22->buffers[1].mode = STARPU_R;
  468. task22->buffers[2].handle = starpu_data_get_sub_data(args->dataA, 2, u22a->i, u22a->j);
  469. task22->buffers[2].mode = STARPU_RW;
  470. /* schedule that codelet */
  471. starpu_task_submit(task22);
  472. }
  473. }
  474. }
  475. }
  476. /*
  477. * code to bootstrap the factorization
  478. */
  479. void dw_codelet_facto(starpu_data_handle dataA, unsigned nblocks)
  480. {
  481. cl_args *args = malloc(sizeof(cl_args));
  482. args->i = 0;
  483. args->nblocks = nblocks;
  484. args->dataA = dataA;
  485. gettimeofday(&start, NULL);
  486. /* inject a new task with this codelet into the system */
  487. struct starpu_task *task = starpu_task_create();
  488. task->callback_func = dw_callback_codelet_update_u11;
  489. task->callback_arg = args;
  490. task->cl = &cl11;
  491. task->cl_arg = args;
  492. task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, 0, 0);
  493. task->buffers[0].mode = STARPU_RW;
  494. /* schedule the codelet */
  495. starpu_task_submit(task);
  496. /* stall the application until the end of computations */
  497. pthread_mutex_lock(&mutex);
  498. if (!finished)
  499. pthread_cond_wait(&cond, &mutex);
  500. pthread_mutex_unlock(&mutex);
  501. gettimeofday(&end, NULL);
  502. double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  503. fprintf(stderr, "Computation took (in ms)\n");
  504. printf("%2.2f\n", timing/1000);
  505. unsigned n = starpu_matrix_get_nx(dataA);
  506. double flop = (2.0f*n*n*n)/3.0f;
  507. fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
  508. }
  509. void dw_codelet_facto_v2(starpu_data_handle dataA, unsigned nblocks)
  510. {
  511. advance_11 = calloc(nblocks, sizeof(uint8_t));
  512. STARPU_ASSERT(advance_11);
  513. advance_12_21 = calloc(nblocks*nblocks, sizeof(uint8_t));
  514. STARPU_ASSERT(advance_12_21);
  515. advance_22 = calloc(nblocks*nblocks*nblocks, sizeof(uint8_t));
  516. STARPU_ASSERT(advance_22);
  517. cl_args *args = malloc(sizeof(cl_args));
  518. args->i = 0;
  519. args->nblocks = nblocks;
  520. args->dataA = dataA;
  521. gettimeofday(&start, NULL);
  522. /* inject a new task with this codelet into the system */
  523. struct starpu_task *task = starpu_task_create();
  524. task->callback_func = dw_callback_v2_codelet_update_u11;
  525. task->callback_arg = args;
  526. task->cl = &cl11;
  527. task->cl_arg = args;
  528. task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, 0, 0);
  529. task->buffers[0].mode = STARPU_RW;
  530. /* schedule the codelet */
  531. int ret = starpu_task_submit(task);
  532. if (STARPU_UNLIKELY(ret == -ENODEV))
  533. {
  534. fprintf(stderr, "No worker may execute this task\n");
  535. exit(0);
  536. }
  537. /* stall the application until the end of computations */
  538. pthread_mutex_lock(&mutex);
  539. if (!finished)
  540. pthread_cond_wait(&cond, &mutex);
  541. pthread_mutex_unlock(&mutex);
  542. gettimeofday(&end, NULL);
  543. double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  544. fprintf(stderr, "Computation took (in ms)\n");
  545. printf("%2.2f\n", timing/1000);
  546. unsigned n = starpu_matrix_get_nx(dataA);
  547. double flop = (2.0f*n*n*n)/3.0f;
  548. fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
  549. }
  /*
   * Starts StarPU and the CUBLAS helper, then allocates the problem data.
   *
   * A:      receives a buffer of dim*dim floats
   * B:      receives a buffer of dim floats
   * dim:    problem dimension
   * pinned: non-zero to allocate through
   *         starpu_data_malloc_pinned_if_possible, zero to use malloc
   */
  void initialize_system(float **A, float **B, unsigned dim, unsigned pinned)
  {
      const size_t bytes_A = (size_t)dim*dim*sizeof(float);
      const size_t bytes_B = (size_t)dim*sizeof(float);

      starpu_init(NULL);
      starpu_helper_cublas_init();

      if (!pinned)
      {
          /* plain heap allocation */
          *A = malloc(bytes_A);
          STARPU_ASSERT(*A);
          *B = malloc(bytes_B);
          STARPU_ASSERT(*B);
          return;
      }

      starpu_data_malloc_pinned_if_possible((void **)A, bytes_A);
      starpu_data_malloc_pinned_if_possible((void **)B, bytes_B);
  }
  566. void dw_factoLU(float *matA, unsigned size,
  567. unsigned ld, unsigned nblocks,
  568. unsigned version, unsigned _no_prio)
  569. {
  570. #ifdef CHECK_RESULTS
  571. fprintf(stderr, "Checking results ...\n");
  572. float *Asaved;
  573. Asaved = malloc((size_t)ld*ld*sizeof(float));
  574. memcpy(Asaved, matA, (size_t)ld*ld*sizeof(float));
  575. #endif
  576. no_prio = _no_prio;
  577. starpu_data_handle dataA;
  578. /* monitor and partition the A matrix into blocks :
  579. * one block is now determined by 2 unsigned (i,j) */
  580. starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld,
  581. size, size, sizeof(float));
  582. struct starpu_data_filter f;
  583. f.filter_func = starpu_vertical_block_filter_func;
  584. f.nchildren = nblocks;
  585. f.get_nchildren = NULL;
  586. f.get_child_ops = NULL;
  587. struct starpu_data_filter f2;
  588. f2.filter_func = starpu_block_filter_func;
  589. f2.nchildren = nblocks;
  590. f2.get_nchildren = NULL;
  591. f2.get_child_ops = NULL;
  592. starpu_data_map_filters(dataA, 2, &f, &f2);
  593. switch (version) {
  594. case 1:
  595. dw_codelet_facto(dataA, nblocks);
  596. break;
  597. default:
  598. case 2:
  599. dw_codelet_facto_v2(dataA, nblocks);
  600. break;
  601. }
  602. /* gather all the data */
  603. starpu_data_unpartition(dataA, 0);
  604. starpu_data_unregister(dataA);
  605. #ifdef CHECK_RESULTS
  606. compare_A_LU(Asaved, matA, size, ld);
  607. #endif
  608. }