dw_factolu.c 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009-2011 Université de Bordeaux 1
  4. * Copyright (C) 2010 Mehdi Juhoor <mjuhoor@gmail.com>
  5. * Copyright (C) 2010, 2011 Centre National de la Recherche Scientifique
  6. *
  7. * StarPU is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU Lesser General Public License as published by
  9. * the Free Software Foundation; either version 2.1 of the License, or (at
  10. * your option) any later version.
  11. *
  12. * StarPU is distributed in the hope that it will be useful, but
  13. * WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  15. *
  16. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  17. */
  18. #include "dw_factolu.h"
  19. #include <sys/time.h>
  20. uint8_t *advance_12_21; /* size nblocks*nblocks */
  21. uint8_t *advance_11; /* size nblocks*nblocks */
  22. uint8_t *advance_22; /* array of nblocks *nblocks*nblocks */
  23. struct timeval start;
  24. struct timeval end;
  25. static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
  26. static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
  27. static unsigned finished = 0;
  28. static unsigned no_prio = 0;
  29. static starpu_codelet cl11 =
  30. {
  31. .where = STARPU_CPU|STARPU_CUDA,
  32. .cpu_func = dw_cpu_codelet_update_u11,
  33. #ifdef STARPU_USE_CUDA
  34. .cuda_func = dw_cublas_codelet_update_u11,
  35. #endif
  36. .nbuffers = 1,
  37. .model = &model_11
  38. };
  39. static starpu_codelet cl12 =
  40. {
  41. .where = STARPU_CPU|STARPU_CUDA,
  42. .cpu_func = dw_cpu_codelet_update_u12,
  43. #ifdef STARPU_USE_CUDA
  44. .cuda_func = dw_cublas_codelet_update_u12,
  45. #endif
  46. .nbuffers = 2,
  47. .model = &model_12
  48. };
  49. static starpu_codelet cl21 =
  50. {
  51. .where = STARPU_CPU|STARPU_CUDA,
  52. .cpu_func = dw_cpu_codelet_update_u21,
  53. #ifdef STARPU_USE_CUDA
  54. .cuda_func = dw_cublas_codelet_update_u21,
  55. #endif
  56. .nbuffers = 2,
  57. .model = &model_21
  58. };
  59. static starpu_codelet cl22 =
  60. {
  61. .where = STARPU_CPU|STARPU_CUDA,
  62. .cpu_func = dw_cpu_codelet_update_u22,
  63. #ifdef STARPU_USE_CUDA
  64. .cuda_func = dw_cublas_codelet_update_u22,
  65. #endif
  66. .nbuffers = 3,
  67. .model = &model_22
  68. };
  69. #define STARTED 0x01
  70. #define DONE 0x10
  71. /*
  72. * Upgraded Callbacks : break the pipeline design !
  73. */
  74. void dw_callback_v2_codelet_update_u22(void *argcb)
  75. {
  76. cl_args *args = argcb;
  77. unsigned k = args->k;
  78. unsigned i = args->i;
  79. unsigned j = args->j;
  80. unsigned nblocks = args->nblocks;
  81. /* we did task 22k,i,j */
  82. advance_22[k*nblocks*nblocks + i + j*nblocks] = DONE;
  83. if ( (i == j) && (i == k+1)) {
  84. /* we now reduce the LU22 part (recursion appears there) */
  85. cl_args *u11arg = malloc(sizeof(cl_args));
  86. struct starpu_task *task = starpu_task_create();
  87. task->callback_func = dw_callback_v2_codelet_update_u11;
  88. task->callback_arg = u11arg;
  89. task->cl = &cl11;
  90. task->cl_arg = u11arg;
  91. task->buffers[0].handle =
  92. starpu_data_get_sub_data(args->dataA, 2, k+1, k+1);
  93. task->buffers[0].mode = STARPU_RW;
  94. u11arg->dataA = args->dataA;
  95. u11arg->i = k + 1;
  96. u11arg->nblocks = args->nblocks;
  97. /* schedule the codelet */
  98. if (!no_prio)
  99. task->priority = STARPU_MAX_PRIO;
  100. starpu_task_submit(task);
  101. }
  102. /* 11k+1 + 22k,k+1,j => 21 k+1,j */
  103. if ( i == k + 1) {
  104. uint8_t dep;
  105. /* 11 k+1*/
  106. dep = advance_11[(k+1)];
  107. if (dep & DONE) {
  108. /* try to push the task */
  109. uint8_t u = STARPU_ATOMIC_OR(&advance_12_21[(k+1) + j*nblocks], STARTED);
  110. if ((u & STARTED) == 0) {
  111. /* we are the only one that should
  112. * launch that task */
  113. cl_args *u21a = malloc(sizeof(cl_args));
  114. struct starpu_task *task21 = starpu_task_create();
  115. task21->callback_func = dw_callback_v2_codelet_update_u21;
  116. task21->callback_arg = u21a;
  117. task21->cl = &cl21;
  118. task21->cl_arg = u21a;
  119. u21a->i = k+1;
  120. u21a->k = j;
  121. u21a->nblocks = args->nblocks;
  122. u21a->dataA = args->dataA;
  123. task21->buffers[0].handle =
  124. starpu_data_get_sub_data(args->dataA, 2, u21a->i, u21a->i);
  125. task21->buffers[0].mode = STARPU_R;
  126. task21->buffers[1].handle =
  127. starpu_data_get_sub_data(args->dataA, 2, u21a->i, u21a->k);
  128. task21->buffers[1].mode = STARPU_RW;
  129. starpu_task_submit(task21);
  130. }
  131. }
  132. }
  133. /* 11k + 22k-1,i,k => 12 k,i */
  134. if (j == k + 1) {
  135. uint8_t dep;
  136. /* 11 k+1*/
  137. dep = advance_11[(k+1)];
  138. if (dep & DONE) {
  139. /* try to push the task */
  140. uint8_t u = STARPU_ATOMIC_OR(&advance_12_21[(k+1)*nblocks + i], STARTED);
  141. if ((u & STARTED) == 0) {
  142. /* we are the only one that should launch that task */
  143. cl_args *u12a = malloc(sizeof(cl_args));
  144. struct starpu_task *task12 = starpu_task_create();
  145. task12->callback_func = dw_callback_v2_codelet_update_u12;
  146. task12->callback_arg = u12a;
  147. task12->cl = &cl12;
  148. task12->cl_arg = u12a;
  149. u12a->i = k+1;
  150. u12a->k = i;
  151. u12a->nblocks = args->nblocks;
  152. u12a->dataA = args->dataA;
  153. task12->buffers[0].handle = starpu_data_get_sub_data(args->dataA, 2, u12a->i, u12a->i);
  154. task12->buffers[0].mode = STARPU_R;
  155. task12->buffers[1].handle = starpu_data_get_sub_data(args->dataA, 2, u12a->k, u12a->i);
  156. task12->buffers[1].mode = STARPU_RW;
  157. starpu_task_submit(task12);
  158. }
  159. }
  160. }
  161. free(args);
  162. }
  163. void dw_callback_v2_codelet_update_u12(void *argcb)
  164. {
  165. cl_args *args = argcb;
  166. /* now launch the update of LU22 */
  167. unsigned i = args->i;
  168. unsigned k = args->k;
  169. unsigned nblocks = args->nblocks;
  170. /* we did task 21i,k */
  171. advance_12_21[i*nblocks + k] = DONE;
  172. unsigned slicey;
  173. for (slicey = i+1; slicey < nblocks; slicey++)
  174. {
  175. /* can we launch 22 i,args->k,slicey ? */
  176. /* deps : 21 args->k, slicey */
  177. uint8_t dep;
  178. dep = advance_12_21[i + slicey*nblocks];
  179. if (dep & DONE)
  180. {
  181. /* perhaps we may schedule the 22 i,args->k,slicey task */
  182. uint8_t u = STARPU_ATOMIC_OR(&advance_22[i*nblocks*nblocks + slicey*nblocks + k], STARTED);
  183. if ((u & STARTED) == 0) {
  184. /* update that square matrix */
  185. cl_args *u22a = malloc(sizeof(cl_args));
  186. struct starpu_task *task22 = starpu_task_create();
  187. task22->callback_func = dw_callback_v2_codelet_update_u22;
  188. task22->callback_arg = u22a;
  189. task22->cl = &cl22;
  190. task22->cl_arg = u22a;
  191. u22a->k = i;
  192. u22a->i = k;
  193. u22a->j = slicey;
  194. u22a->dataA = args->dataA;
  195. u22a->nblocks = nblocks;
  196. task22->buffers[0].handle = starpu_data_get_sub_data(args->dataA, 2, u22a->i, u22a->k);
  197. task22->buffers[0].mode = STARPU_R;
  198. task22->buffers[1].handle = starpu_data_get_sub_data(args->dataA, 2, u22a->k, u22a->j);
  199. task22->buffers[1].mode = STARPU_R;
  200. task22->buffers[2].handle = starpu_data_get_sub_data(args->dataA, 2, u22a->i, u22a->j);
  201. task22->buffers[2].mode = STARPU_RW;
  202. /* schedule that codelet */
  203. if (!no_prio && (slicey == i+1))
  204. task22->priority = STARPU_MAX_PRIO;
  205. starpu_task_submit(task22);
  206. }
  207. }
  208. }
  209. }
  210. void dw_callback_v2_codelet_update_u21(void *argcb)
  211. {
  212. cl_args *args = argcb;
  213. /* now launch the update of LU22 */
  214. unsigned i = args->i;
  215. unsigned k = args->k;
  216. unsigned nblocks = args->nblocks;
  217. /* we did task 21i,k */
  218. advance_12_21[i + k*nblocks] = DONE;
  219. unsigned slicex;
  220. for (slicex = i+1; slicex < nblocks; slicex++)
  221. {
  222. /* can we launch 22 i,slicex,k ? */
  223. /* deps : 12 slicex k */
  224. uint8_t dep;
  225. dep = advance_12_21[i*nblocks + slicex];
  226. if (dep & DONE)
  227. {
  228. /* perhaps we may schedule the 22 i,args->k,slicey task */
  229. uint8_t u = STARPU_ATOMIC_OR(&advance_22[i*nblocks*nblocks + k*nblocks + slicex], STARTED);
  230. if ((u & STARTED) == 0) {
  231. /* update that square matrix */
  232. cl_args *u22a = malloc(sizeof(cl_args));
  233. struct starpu_task *task22 = starpu_task_create();
  234. task22->callback_func = dw_callback_v2_codelet_update_u22;
  235. task22->callback_arg = u22a;
  236. task22->cl = &cl22;
  237. task22->cl_arg = u22a;
  238. u22a->k = i;
  239. u22a->i = slicex;
  240. u22a->j = k;
  241. u22a->dataA = args->dataA;
  242. u22a->nblocks = nblocks;
  243. task22->buffers[0].handle = starpu_data_get_sub_data(args->dataA, 2, u22a->i, u22a->k);
  244. task22->buffers[0].mode = STARPU_R;
  245. task22->buffers[1].handle = starpu_data_get_sub_data(args->dataA, 2, u22a->k, u22a->j);
  246. task22->buffers[1].mode = STARPU_R;
  247. task22->buffers[2].handle = starpu_data_get_sub_data(args->dataA, 2, u22a->i, u22a->j);
  248. task22->buffers[2].mode = STARPU_RW;
  249. /* schedule that codelet */
  250. if (!no_prio && (slicex == i+1))
  251. task22->priority = STARPU_MAX_PRIO;
  252. starpu_task_submit(task22);
  253. }
  254. }
  255. }
  256. }
  257. void dw_callback_v2_codelet_update_u11(void *argcb)
  258. {
  259. /* in case there remains work, go on */
  260. cl_args *args = argcb;
  261. unsigned nblocks = args->nblocks;
  262. unsigned i = args->i;
  263. /* we did task 11k */
  264. advance_11[i] = DONE;
  265. if (i == nblocks - 1)
  266. {
  267. /* we are done : wake the application up */
  268. pthread_mutex_lock(&mutex);
  269. finished = 1;
  270. pthread_cond_signal(&cond);
  271. pthread_mutex_unlock(&mutex);
  272. return;
  273. }
  274. else
  275. {
  276. /* put new tasks */
  277. unsigned slice;
  278. for (slice = i + 1; slice < nblocks; slice++)
  279. {
  280. /* can we launch 12i,slice ? */
  281. uint8_t deps12;
  282. if (i == 0) {
  283. deps12 = DONE;
  284. }
  285. else {
  286. deps12 = advance_22[(i-1)*nblocks*nblocks + slice + i*nblocks];
  287. }
  288. if (deps12 & DONE) {
  289. /* we may perhaps launch the task 12i,slice */
  290. uint8_t u = STARPU_ATOMIC_OR(&advance_12_21[i*nblocks + slice], STARTED);
  291. if ((u & STARTED) == 0) {
  292. /* we are the only one that should launch that task */
  293. cl_args *u12a = malloc(sizeof(cl_args));
  294. struct starpu_task *task12 = starpu_task_create();
  295. task12->callback_func = dw_callback_v2_codelet_update_u12;
  296. task12->callback_arg = u12a;
  297. task12->cl = &cl12;
  298. task12->cl_arg = u12a;
  299. u12a->i = i;
  300. u12a->k = slice;
  301. u12a->nblocks = args->nblocks;
  302. u12a->dataA = args->dataA;
  303. task12->buffers[0].handle = starpu_data_get_sub_data(args->dataA, 2, u12a->i, u12a->i);
  304. task12->buffers[0].mode = STARPU_R;
  305. task12->buffers[1].handle = starpu_data_get_sub_data(args->dataA, 2, u12a->k, u12a->i);
  306. task12->buffers[1].mode = STARPU_RW;
  307. if (!no_prio && (slice == i +1))
  308. task12->priority = STARPU_MAX_PRIO;
  309. starpu_task_submit(task12);
  310. }
  311. }
  312. /* can we launch 21i,slice ? */
  313. if (i == 0) {
  314. deps12 = DONE;
  315. }
  316. else {
  317. deps12 = advance_22[(i-1)*nblocks*nblocks + slice*nblocks + i];
  318. }
  319. if (deps12 & DONE) {
  320. /* we may perhaps launch the task 12i,slice */
  321. uint8_t u = STARPU_ATOMIC_OR(&advance_12_21[i + slice*nblocks], STARTED);
  322. if ((u & STARTED) == 0) {
  323. /* we are the only one that should launch that task */
  324. cl_args *u21a = malloc(sizeof(cl_args));
  325. struct starpu_task *task21 = starpu_task_create();
  326. task21->callback_func = dw_callback_v2_codelet_update_u21;
  327. task21->callback_arg = u21a;
  328. task21->cl = &cl21;
  329. task21->cl_arg = u21a;
  330. u21a->i = i;
  331. u21a->k = slice;
  332. u21a->nblocks = args->nblocks;
  333. u21a->dataA = args->dataA;
  334. task21->buffers[0].handle = starpu_data_get_sub_data(args->dataA, 2, u21a->i, u21a->i);
  335. task21->buffers[0].mode = STARPU_R;
  336. task21->buffers[1].handle = starpu_data_get_sub_data(args->dataA, 2, u21a->i, u21a->k);
  337. task21->buffers[1].mode = STARPU_RW;
  338. if (!no_prio && (slice == i +1))
  339. task21->priority = STARPU_MAX_PRIO;
  340. starpu_task_submit(task21);
  341. }
  342. }
  343. }
  344. }
  345. }
  346. /*
  347. * Callbacks
  348. */
  349. void dw_callback_codelet_update_u11(void *argcb)
  350. {
  351. /* in case there remains work, go on */
  352. cl_args *args = argcb;
  353. if (args->i == args->nblocks - 1)
  354. {
  355. /* we are done : wake the application up */
  356. pthread_mutex_lock(&mutex);
  357. finished = 1;
  358. pthread_cond_signal(&cond);
  359. pthread_mutex_unlock(&mutex);
  360. return;
  361. }
  362. else
  363. {
  364. /* put new tasks */
  365. unsigned nslices;
  366. nslices = args->nblocks - 1 - args->i;
  367. unsigned *remaining = malloc(sizeof(unsigned));
  368. *remaining = 2*nslices;
  369. unsigned slice;
  370. for (slice = args->i + 1; slice < args->nblocks; slice++)
  371. {
  372. /* update slice from u12 */
  373. cl_args *u12a = malloc(sizeof(cl_args));
  374. /* update slice from u21 */
  375. cl_args *u21a = malloc(sizeof(cl_args));
  376. struct starpu_task *task12 = starpu_task_create();
  377. task12->callback_func = dw_callback_codelet_update_u12_21;
  378. task12->callback_arg = u12a;
  379. task12->cl = &cl12;
  380. task12->cl_arg = u12a;
  381. struct starpu_task *task21 = starpu_task_create();
  382. task21->callback_func = dw_callback_codelet_update_u12_21;
  383. task21->callback_arg = u21a;
  384. task21->cl = &cl21;
  385. task21->cl_arg = u21a;
  386. u12a->i = args->i;
  387. u12a->k = slice;
  388. u12a->nblocks = args->nblocks;
  389. u12a->dataA = args->dataA;
  390. u12a->remaining = remaining;
  391. u21a->i = args->i;
  392. u21a->k = slice;
  393. u21a->nblocks = args->nblocks;
  394. u21a->dataA = args->dataA;
  395. u21a->remaining = remaining;
  396. task12->buffers[0].handle =
  397. starpu_data_get_sub_data(args->dataA, 2, u12a->i, u12a->i);
  398. task12->buffers[0].mode = STARPU_R;
  399. task12->buffers[1].handle =
  400. starpu_data_get_sub_data(args->dataA, 2, u12a->k, u12a->i);
  401. task12->buffers[1].mode = STARPU_RW;
  402. task21->buffers[0].handle =
  403. starpu_data_get_sub_data(args->dataA, 2, u21a->i, u21a->i);
  404. task21->buffers[0].mode = STARPU_R;
  405. task21->buffers[1].handle =
  406. starpu_data_get_sub_data(args->dataA, 2, u21a->i, u21a->k);
  407. task21->buffers[1].mode = STARPU_RW;
  408. starpu_task_submit(task12);
  409. starpu_task_submit(task21);
  410. }
  411. }
  412. }
  413. void dw_callback_codelet_update_u22(void *argcb)
  414. {
  415. cl_args *args = argcb;
  416. if (STARPU_ATOMIC_ADD(args->remaining, (-1)) == 0)
  417. {
  418. /* all worker already used the counter */
  419. free(args->remaining);
  420. /* we now reduce the LU22 part (recursion appears there) */
  421. cl_args *u11arg = malloc(sizeof(cl_args));
  422. struct starpu_task *task = starpu_task_create();
  423. task->callback_func = dw_callback_codelet_update_u11;
  424. task->callback_arg = u11arg;
  425. task->cl = &cl11;
  426. task->cl_arg = u11arg;
  427. task->buffers[0].handle = starpu_data_get_sub_data(args->dataA, 2, args->k + 1, args->k + 1);
  428. task->buffers[0].mode = STARPU_RW;
  429. u11arg->dataA = args->dataA;
  430. u11arg->i = args->k + 1;
  431. u11arg->nblocks = args->nblocks;
  432. /* schedule the codelet */
  433. starpu_task_submit(task);
  434. }
  435. free(args);
  436. }
  437. void dw_callback_codelet_update_u12_21(void *argcb)
  438. {
  439. cl_args *args = argcb;
  440. if (STARPU_ATOMIC_ADD(args->remaining, -1) == 0)
  441. {
  442. /* now launch the update of LU22 */
  443. unsigned i = args->i;
  444. unsigned nblocks = args->nblocks;
  445. /* the number of tasks to be done */
  446. unsigned *remaining = malloc(sizeof(unsigned));
  447. *remaining = (nblocks - 1 - i)*(nblocks - 1 - i);
  448. unsigned slicey, slicex;
  449. for (slicey = i+1; slicey < nblocks; slicey++)
  450. {
  451. for (slicex = i+1; slicex < nblocks; slicex++)
  452. {
  453. /* update that square matrix */
  454. cl_args *u22a = malloc(sizeof(cl_args));
  455. struct starpu_task *task22 = starpu_task_create();
  456. task22->callback_func = dw_callback_codelet_update_u22;
  457. task22->callback_arg = u22a;
  458. task22->cl = &cl22;
  459. task22->cl_arg = u22a;
  460. u22a->k = i;
  461. u22a->i = slicex;
  462. u22a->j = slicey;
  463. u22a->dataA = args->dataA;
  464. u22a->nblocks = nblocks;
  465. u22a->remaining = remaining;
  466. task22->buffers[0].handle = starpu_data_get_sub_data(args->dataA, 2, u22a->i, u22a->k);
  467. task22->buffers[0].mode = STARPU_R;
  468. task22->buffers[1].handle = starpu_data_get_sub_data(args->dataA, 2, u22a->k, u22a->j);
  469. task22->buffers[1].mode = STARPU_R;
  470. task22->buffers[2].handle = starpu_data_get_sub_data(args->dataA, 2, u22a->i, u22a->j);
  471. task22->buffers[2].mode = STARPU_RW;
  472. /* schedule that codelet */
  473. starpu_task_submit(task22);
  474. }
  475. }
  476. }
  477. }
  478. /*
  479. * code to bootstrap the factorization
  480. */
  481. void dw_codelet_facto(starpu_data_handle dataA, unsigned nblocks)
  482. {
  483. cl_args *args = malloc(sizeof(cl_args));
  484. args->i = 0;
  485. args->nblocks = nblocks;
  486. args->dataA = dataA;
  487. gettimeofday(&start, NULL);
  488. /* inject a new task with this codelet into the system */
  489. struct starpu_task *task = starpu_task_create();
  490. task->callback_func = dw_callback_codelet_update_u11;
  491. task->callback_arg = args;
  492. task->cl = &cl11;
  493. task->cl_arg = args;
  494. task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, 0, 0);
  495. task->buffers[0].mode = STARPU_RW;
  496. /* schedule the codelet */
  497. starpu_task_submit(task);
  498. /* stall the application until the end of computations */
  499. pthread_mutex_lock(&mutex);
  500. if (!finished)
  501. pthread_cond_wait(&cond, &mutex);
  502. pthread_mutex_unlock(&mutex);
  503. gettimeofday(&end, NULL);
  504. double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  505. FPRINTF(stderr, "Computation took (in ms)\n");
  506. FPRINTF(stdout, "%2.2f\n", timing/1000);
  507. unsigned n = starpu_matrix_get_nx(dataA);
  508. double flop = (2.0f*n*n*n)/3.0f;
  509. FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
  510. }
  511. void dw_codelet_facto_v2(starpu_data_handle dataA, unsigned nblocks)
  512. {
  513. advance_11 = calloc(nblocks, sizeof(uint8_t));
  514. STARPU_ASSERT(advance_11);
  515. advance_12_21 = calloc(nblocks*nblocks, sizeof(uint8_t));
  516. STARPU_ASSERT(advance_12_21);
  517. advance_22 = calloc(nblocks*nblocks*nblocks, sizeof(uint8_t));
  518. STARPU_ASSERT(advance_22);
  519. cl_args *args = malloc(sizeof(cl_args));
  520. args->i = 0;
  521. args->nblocks = nblocks;
  522. args->dataA = dataA;
  523. gettimeofday(&start, NULL);
  524. /* inject a new task with this codelet into the system */
  525. struct starpu_task *task = starpu_task_create();
  526. task->callback_func = dw_callback_v2_codelet_update_u11;
  527. task->callback_arg = args;
  528. task->cl = &cl11;
  529. task->cl_arg = args;
  530. task->buffers[0].handle = starpu_data_get_sub_data(dataA, 2, 0, 0);
  531. task->buffers[0].mode = STARPU_RW;
  532. /* schedule the codelet */
  533. int ret = starpu_task_submit(task);
  534. if (STARPU_UNLIKELY(ret == -ENODEV))
  535. {
  536. FPRINTF(stderr, "No worker may execute this task\n");
  537. exit(0);
  538. }
  539. /* stall the application until the end of computations */
  540. pthread_mutex_lock(&mutex);
  541. if (!finished)
  542. pthread_cond_wait(&cond, &mutex);
  543. pthread_mutex_unlock(&mutex);
  544. gettimeofday(&end, NULL);
  545. double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  546. FPRINTF(stderr, "Computation took (in ms)\n");
  547. FPRINTF(stdout, "%2.2f\n", timing/1000);
  548. unsigned n = starpu_matrix_get_nx(dataA);
  549. double flop = (2.0f*n*n*n)/3.0f;
  550. FPRINTF(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
  551. }
  552. void initialize_system(float **A, float **B, unsigned dim, unsigned pinned)
  553. {
  554. starpu_init(NULL);
  555. starpu_helper_cublas_init();
  556. if (pinned)
  557. {
  558. starpu_malloc((void **)A, (size_t)dim*dim*sizeof(float));
  559. starpu_malloc((void **)B, (size_t)dim*sizeof(float));
  560. }
  561. else {
  562. *A = malloc((size_t)dim*dim*sizeof(float));
  563. STARPU_ASSERT(*A);
  564. *B = malloc((size_t)dim*sizeof(float));
  565. STARPU_ASSERT(*B);
  566. }
  567. }
  568. void dw_factoLU(float *matA, unsigned size,
  569. unsigned ld, unsigned nblocks,
  570. unsigned version, unsigned _no_prio)
  571. {
  572. #ifdef CHECK_RESULTS
  573. FPRINTF(stderr, "Checking results ...\n");
  574. float *Asaved;
  575. Asaved = malloc((size_t)ld*ld*sizeof(float));
  576. memcpy(Asaved, matA, (size_t)ld*ld*sizeof(float));
  577. #endif
  578. no_prio = _no_prio;
  579. starpu_data_handle dataA;
  580. /* monitor and partition the A matrix into blocks :
  581. * one block is now determined by 2 unsigned (i,j) */
  582. starpu_matrix_data_register(&dataA, 0, (uintptr_t)matA, ld,
  583. size, size, sizeof(float));
  584. struct starpu_data_filter f = {
  585. .filter_func = starpu_vertical_block_filter_func,
  586. .nchildren = nblocks
  587. };
  588. struct starpu_data_filter f2 = {
  589. .filter_func = starpu_block_filter_func,
  590. .nchildren = nblocks
  591. };
  592. starpu_data_map_filters(dataA, 2, &f, &f2);
  593. switch (version) {
  594. case 1:
  595. dw_codelet_facto(dataA, nblocks);
  596. break;
  597. default:
  598. case 2:
  599. dw_codelet_facto_v2(dataA, nblocks);
  600. break;
  601. }
  602. /* gather all the data */
  603. starpu_data_unpartition(dataA, 0);
  604. starpu_data_unregister(dataA);
  605. #ifdef CHECK_RESULTS
  606. compare_A_LU(Asaved, matA, size, ld);
  607. #endif
  608. }