mpi_cholesky_codelets.c 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009-2020 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  4. *
  5. * StarPU is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * StarPU is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. *
  14. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. */
  16. #include "mpi_cholesky.h"
  17. #include <common/blas.h>
  18. #include <sys/time.h>
  19. #include <limits.h>
  20. #include <math.h>
  21. /* This is from magma
  22. -- Innovative Computing Laboratory
  23. -- Electrical Engineering and Computer Science Department
  24. -- University of Tennessee
  25. -- (C) Copyright 2009
  26. Redistribution and use in source and binary forms, with or without
  27. modification, are permitted provided that the following conditions
  28. are met:
  29. * Redistributions of source code must retain the above copyright
  30. notice, this list of conditions and the following disclaimer.
  31. * Redistributions in binary form must reproduce the above copyright
  32. notice, this list of conditions and the following disclaimer in the
  33. documentation and/or other materials provided with the distribution.
  34. * Neither the name of the University of Tennessee, Knoxville nor the
  35. names of its contributors may be used to endorse or promote products
  36. derived from this software without specific prior written permission.
  37. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  38. ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  39. LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  40. A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  41. HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  42. SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  43. LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  44. DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  45. THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  46. (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  47. OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  48. */
  49. #define FMULS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n) + 0.5) * (double)(__n) + (1. / 3.)))
  50. #define FADDS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n) ) * (double)(__n) - (1. / 6.)))
  51. #define FLOPS_SPOTRF(__n) ( FMULS_POTRF((__n)) + FADDS_POTRF((__n)) )
  52. #define FMULS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)+1.))
  53. #define FADDS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)-1.))
  54. #define FMULS_TRMM(__m, __n) ( /*( (__side) == PlasmaLeft ) ? FMULS_TRMM_2((__m), (__n)) :*/ FMULS_TRMM_2((__n), (__m)) )
  55. #define FADDS_TRMM(__m, __n) ( /*( (__side) == PlasmaLeft ) ? FADDS_TRMM_2((__m), (__n)) :*/ FADDS_TRMM_2((__n), (__m)) )
  56. #define FMULS_TRSM FMULS_TRMM
  57. #define FADDS_TRSM FMULS_TRMM
  58. #define FLOPS_STRSM(__m, __n) ( FMULS_TRSM((__m), (__n)) + FADDS_TRSM((__m), (__n)) )
  59. #define FMULS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
  60. #define FADDS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
  61. #define FLOPS_SGEMM(__m, __n, __k) ( FMULS_GEMM((__m), (__n), (__k)) + FADDS_GEMM((__m), (__n), (__k)) )
  62. /* End of magma code */
  63. int _nodes;
  64. starpu_mpi_checkpoint_template_t* checkpoint_p;
  65. int backup_function(int rank)
  66. {
  67. return (rank/dblockx)*dblockx +(rank+1)%dblockx;
  68. // return (rank+1)%_nodes;
  69. }
  70. /*
  71. * Create the codelets
  72. */
  73. static struct starpu_codelet cl11 =
  74. {
  75. .cpu_funcs = {chol_cpu_codelet_update_u11},
  76. #ifdef STARPU_USE_CUDA
  77. .cuda_funcs = {chol_cublas_codelet_update_u11},
  78. #elif defined(STARPU_SIMGRID)
  79. .cuda_funcs = {(void*)1},
  80. #endif
  81. .nbuffers = 1,
  82. .modes = {STARPU_RW},
  83. .model = &chol_model_11,
  84. .color = 0xffff00,
  85. };
  86. static struct starpu_codelet cl21 =
  87. {
  88. .cpu_funcs = {chol_cpu_codelet_update_u21},
  89. #ifdef STARPU_USE_CUDA
  90. .cuda_funcs = {chol_cublas_codelet_update_u21},
  91. #elif defined(STARPU_SIMGRID)
  92. .cuda_funcs = {(void*)1},
  93. #endif
  94. .cuda_flags = {STARPU_CUDA_ASYNC},
  95. .nbuffers = 2,
  96. .modes = {STARPU_R, STARPU_RW},
  97. .model = &chol_model_21,
  98. .color = 0x8080ff,
  99. };
  100. static struct starpu_codelet cl22 =
  101. {
  102. .cpu_funcs = {chol_cpu_codelet_update_u22},
  103. #ifdef STARPU_USE_CUDA
  104. .cuda_funcs = {chol_cublas_codelet_update_u22},
  105. #elif defined(STARPU_SIMGRID)
  106. .cuda_funcs = {(void*)1},
  107. #endif
  108. .cuda_flags = {STARPU_CUDA_ASYNC},
  109. .nbuffers = 3,
  110. .modes = {STARPU_R, STARPU_R, STARPU_RW | STARPU_COMMUTE},
  111. .model = &chol_model_22,
  112. .color = 0x00ff00,
  113. };
  114. static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int nodes)
  115. {
  116. unsigned k, m, n;
  117. unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
  118. unsigned nn = size/nblocks;
  119. starpu_mpi_checkpoint_template_add_entry(checkpoint_p, STARPU_VALUE, &k, sizeof(k), nblocks*nblocks+10, backup_function);
  120. starpu_mpi_checkpoint_template_freeze(checkpoint_p);
  121. for (k = 0; k < nblocks; k++)
  122. {
  123. starpu_iteration_push(k);
  124. starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
  125. STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
  126. STARPU_RW, data_handles[k][k],
  127. STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
  128. 0);
  129. for (m = k+1; m<nblocks; m++)
  130. {
  131. starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
  132. STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
  133. STARPU_R, data_handles[k][k],
  134. STARPU_RW, data_handles[m][k],
  135. STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
  136. 0);
  137. starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
  138. if (my_distrib(k, k, nodes) == rank)
  139. starpu_data_wont_use(data_handles[k][k]);
  140. }
  141. for (n = k+1; n<nblocks; n++)
  142. {
  143. for (m = n; m<nblocks; m++)
  144. {
  145. starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
  146. STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
  147. STARPU_R, data_handles[n][k],
  148. STARPU_R, data_handles[m][k],
  149. STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
  150. STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
  151. 0);
  152. }
  153. starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
  154. if (my_distrib(n, k, nodes) == rank)
  155. starpu_data_wont_use(data_handles[n][k]);
  156. }
  157. if (k%checkpoint_period==checkpoint_period-1)
  158. starpu_mpi_submit_checkpoint_template(*checkpoint_p, -2*k);
  159. starpu_iteration_pop();
  160. }
  161. }
  162. /* TODO: generate from compiler polyhedral analysis of classical algorithm */
  163. static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, int nodes)
  164. {
  165. unsigned k, m, n;
  166. unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
  167. unsigned nn = size/nblocks;
  168. starpu_mpi_checkpoint_template_add_entry(checkpoint_p, STARPU_VALUE, &n, sizeof(n), nblocks*nblocks+10, backup_function);
  169. starpu_mpi_checkpoint_template_freeze(checkpoint_p);
  170. /* Column */
  171. for (n = 0; n<nblocks; n++)
  172. {
  173. starpu_iteration_push(n);
  174. /* First handle the diagonal block */
  175. /* Row */
  176. m = n;
  177. for (k = 0; k < n; k++)
  178. {
  179. /* Accumulate updates from TRSMs */
  180. starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
  181. STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
  182. STARPU_R, data_handles[n][k],
  183. STARPU_R, data_handles[m][k],
  184. STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
  185. STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
  186. 0);
  187. /* Nobody else will need it */
  188. starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
  189. starpu_data_wont_use(data_handles[m][k]);
  190. }
  191. k = n;
  192. /* Factorize */
  193. starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
  194. STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
  195. STARPU_RW, data_handles[k][k],
  196. STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
  197. 0);
  198. /* Row */
  199. for (m = n + 1; m<nblocks; m++)
  200. {
  201. for (k = 0; k < n; k++)
  202. {
  203. /* Accumulate updates from TRSMs */
  204. starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
  205. STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
  206. STARPU_R, data_handles[n][k],
  207. STARPU_R, data_handles[m][k],
  208. STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
  209. STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
  210. 0);
  211. }
  212. k = n;
  213. /* Solve */
  214. starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
  215. STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
  216. STARPU_R, data_handles[k][k],
  217. STARPU_RW, data_handles[m][k],
  218. STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
  219. 0);
  220. }
  221. /* We won't need it any more */
  222. starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
  223. starpu_data_wont_use(data_handles[n][n]);
  224. if (n%checkpoint_period==checkpoint_period-1)
  225. starpu_mpi_submit_checkpoint_template(*checkpoint_p, (int)(nblocks - 2*n));
  226. starpu_iteration_pop();
  227. }
  228. }
  229. /* TODO: generate from compiler polyhedral analysis of classical algorithm */
  230. static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int rank, int nodes)
  231. {
  232. unsigned a;
  233. unsigned k, m, n;
  234. unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
  235. unsigned nn = size/nblocks;
  236. starpu_mpi_checkpoint_template_add_entry(checkpoint_p, STARPU_VALUE, &a, sizeof(a), nblocks*nblocks+10, backup_function);
  237. starpu_mpi_checkpoint_template_freeze(checkpoint_p);
  238. /* double-antidiagonal number:
  239. * - a=0 contains (0,0) plus (1,0)
  240. * - a=1 contains (2,0), (1,1) plus (3,0), (2, 1)
  241. * - etc.
  242. */
  243. for (a = 0; a < nblocks; a++)
  244. {
  245. starpu_iteration_push(a);
  246. unsigned nfirst;
  247. if (2*a < nblocks)
  248. nfirst = 0;
  249. else
  250. nfirst = 2*a - (nblocks-1);
  251. /* column within first antidiagonal for a */
  252. for (n = nfirst; n <= a; n++)
  253. {
  254. /* row */
  255. m = 2*a-n;
  256. /* Accumulate updates from TRSMs */
  257. for (k = 0; k < n; k++)
  258. {
  259. starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
  260. STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
  261. STARPU_R, data_handles[n][k],
  262. STARPU_R, data_handles[m][k],
  263. STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
  264. STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
  265. 0);
  266. if (m == nblocks-1)
  267. {
  268. /* Nobody else will need it */
  269. starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
  270. starpu_data_wont_use(data_handles[n][k]);
  271. }
  272. }
  273. /* k = n */
  274. if (n < a)
  275. {
  276. /* non-diagonal block, solve */
  277. starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
  278. STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
  279. STARPU_R, data_handles[k][k],
  280. STARPU_RW, data_handles[m][k],
  281. STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
  282. 0);
  283. }
  284. else
  285. {
  286. /* diagonal block, factorize */
  287. starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
  288. STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
  289. STARPU_RW, data_handles[k][k],
  290. STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
  291. 0);
  292. }
  293. if (m == nblocks - 1)
  294. {
  295. /* We do not need the potrf result any more */
  296. starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
  297. starpu_data_wont_use(data_handles[n][n]);
  298. }
  299. }
  300. /* column within second antidiagonal for a */
  301. for (n = nfirst; n <= a; n++)
  302. {
  303. /* row */
  304. m = 2*a-n + 1;
  305. if (m >= nblocks)
  306. /* Skip first item when even number of tiles */
  307. continue;
  308. /* Accumulate updates from TRSMs */
  309. for (k = 0; k < n; k++)
  310. {
  311. starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
  312. STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
  313. STARPU_R, data_handles[n][k],
  314. STARPU_R, data_handles[m][k],
  315. STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
  316. STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
  317. 0);
  318. if (m == nblocks-1)
  319. {
  320. /* Nobody else will need it */
  321. starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
  322. starpu_data_wont_use(data_handles[n][k]);
  323. }
  324. }
  325. /* non-diagonal block, solve */
  326. k = n;
  327. starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
  328. STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
  329. STARPU_R, data_handles[k][k],
  330. STARPU_RW, data_handles[m][k],
  331. STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
  332. 0);
  333. if (m == nblocks - 1)
  334. {
  335. /* We do not need the potrf result any more */
  336. starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
  337. starpu_data_wont_use(data_handles[n][n]);
  338. }
  339. }
  340. if (a%checkpoint_period==checkpoint_period-1)
  341. starpu_mpi_submit_checkpoint_template(*checkpoint_p, (int)(2*nblocks -4*a));
  342. starpu_iteration_pop();
  343. }
  344. }
  345. /* TODO: generate from compiler polyhedral analysis of classical algorithm */
  346. static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int nodes)
  347. {
  348. unsigned a;
  349. int k, m, n;
  350. unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
  351. unsigned nn = size/nblocks;
  352. /*
  353. * This is basically similar to above, except that we shift k according to the priorities set in the algorithm, so that gemm prio ~= 2*nblocks - a
  354. * double-antidiagonal number:
  355. * - a=0 contains (0,0) plus (1,0)
  356. * - a=1 contains (2,0), (1,1) plus (3,0), (2, 1)
  357. * - etc.
  358. */
  359. starpu_mpi_checkpoint_template_add_entry(checkpoint_p, STARPU_VALUE, &a, sizeof(a), nblocks*nblocks+10, backup_function);
  360. starpu_mpi_checkpoint_template_freeze(checkpoint_p);
  361. for (a = 0; a < 4*nblocks; a++)
  362. {
  363. starpu_iteration_push(a);
  364. for (k = 0; k < (int) nblocks; k++)
  365. {
  366. n = k;
  367. /* Should be m = a-k-n; for potrf and trsm to respect
  368. priorities, but needs to be this for dependencies */
  369. m = a-2*k-n;
  370. if (m == n)
  371. {
  372. /* diagonal block, factorize */
  373. starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
  374. STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
  375. STARPU_RW, data_handles[k][k],
  376. STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
  377. 0);
  378. }
  379. else if (m >= n && m < (int) nblocks)
  380. {
  381. /* non-diagonal block, solve */
  382. starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
  383. STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
  384. STARPU_R, data_handles[k][k],
  385. STARPU_RW, data_handles[m][k],
  386. STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
  387. 0);
  388. }
  389. if (m == (int) nblocks - 1)
  390. {
  391. /* We do not need the potrf result any more */
  392. starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
  393. starpu_data_wont_use(data_handles[n][n]);
  394. }
  395. /* column within antidiagonal for a */
  396. for (n = k + 1; n < (int) nblocks; n++)
  397. {
  398. /* row */
  399. m = a-2*k-n;
  400. if (m >= n && m < (int) nblocks)
  401. {
  402. /* Update */
  403. starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
  404. STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
  405. STARPU_R, data_handles[n][k],
  406. STARPU_R, data_handles[m][k],
  407. STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
  408. STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
  409. 0);
  410. if (m == (int) nblocks - 1)
  411. {
  412. /* Nobody else will need it */
  413. starpu_data_wont_use(data_handles[n][k]);
  414. starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
  415. }
  416. }
  417. }
  418. }
  419. if (a%(4*checkpoint_period)==(4*checkpoint_period)-1)
  420. starpu_mpi_submit_checkpoint_template(*checkpoint_p, (int)(2*nblocks - a));
  421. starpu_iteration_pop();
  422. }
  423. }
  424. /*
  425. * code to bootstrap the factorization
  426. * and construct the DAG
  427. */
  428. void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing, double *flops)
  429. {
  430. double start;
  431. double end;
  432. starpu_data_handle_t **data_handles;
  433. unsigned k, m, n;
  434. /* create all the DAG nodes */
  435. _nodes = nodes;
  436. starpu_malloc((void**)&checkpoint_p, sizeof(starpu_mpi_checkpoint_template_t));
  437. starpu_mpi_checkpoint_template_create(checkpoint_p, 13, 0);
  438. data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
  439. for(m=0 ; m<nblocks ; m++) data_handles[m] = malloc(nblocks*sizeof(starpu_data_handle_t));
  440. for (m = 0; m < nblocks; m++)
  441. {
  442. for(n = 0; n < nblocks ; n++)
  443. {
  444. int mpi_rank = my_distrib(m, n, nodes);
  445. if (mpi_rank == rank || (check && rank == 0))
  446. {
  447. //fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, n, m);
  448. starpu_matrix_data_register(&data_handles[m][n], STARPU_MAIN_RAM, (uintptr_t)matA[m][n],
  449. ld, size/nblocks, size/nblocks, sizeof(float));
  450. }
  451. #ifdef STARPU_DEVEL
  452. #warning TODO: make better test to only register what is needed
  453. #endif
  454. else
  455. {
  456. /* I don't own this index, but will need it for my computations */
  457. //fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", rank, n, m);
  458. starpu_matrix_data_register(&data_handles[m][n], -1, (uintptr_t)NULL,
  459. ld, size/nblocks, size/nblocks, sizeof(float));
  460. }
  461. if (data_handles[m][n])
  462. {
  463. starpu_data_set_coordinates(data_handles[m][n], 2, n, m);
  464. starpu_mpi_data_register(data_handles[m][n], (m*nblocks)+n, mpi_rank);
  465. if (m>=n)
  466. starpu_mpi_checkpoint_template_add_entry(checkpoint_p, STARPU_R, data_handles[m][n], backup_function(mpi_rank));
  467. }
  468. }
  469. }
  470. starpu_mpi_wait_for_all(MPI_COMM_WORLD);
  471. starpu_mpi_barrier(MPI_COMM_WORLD);
  472. start = starpu_timing_now();
  473. switch (submission)
  474. {
  475. case TRIANGLES: run_cholesky(data_handles, rank, nodes); break;
  476. case COLUMNS: run_cholesky_column(data_handles, rank, nodes); break;
  477. case ANTIDIAGONALS: run_cholesky_antidiagonal(data_handles, rank, nodes); break;
  478. case PRIOS: run_cholesky_prio(data_handles, rank, nodes); break;
  479. default: STARPU_ABORT();
  480. }
  481. starpu_mpi_wait_for_all(MPI_COMM_WORLD);
  482. starpu_mpi_barrier(MPI_COMM_WORLD);
  483. end = starpu_timing_now();
  484. for (m = 0; m < nblocks; m++)
  485. {
  486. for(n = 0; n < nblocks ; n++)
  487. {
  488. /* Get back data on node 0 for the check */
  489. if (check && data_handles[m][n])
  490. starpu_mpi_get_data_on_node(MPI_COMM_WORLD, data_handles[m][n], 0);
  491. if (data_handles[m][n])
  492. starpu_data_unregister(data_handles[m][n]);
  493. }
  494. free(data_handles[m]);
  495. }
  496. free(data_handles);
  497. if (rank == 0)
  498. {
  499. *timing = end - start;
  500. *flops = FLOPS_SPOTRF(size);
  501. }
  502. }
  503. void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *correctness, double *flops, double epsilon)
  504. {
  505. unsigned nn,mm,n,m;
  506. float *rmat = malloc(size*size*sizeof(float));
  507. for(n=0 ; n<nblocks ; n++)
  508. {
  509. for(m=0 ; m<nblocks ; m++)
  510. {
  511. for (nn = 0; nn < BLOCKSIZE; nn++)
  512. {
  513. for (mm = 0; mm < BLOCKSIZE; mm++)
  514. {
  515. rmat[mm+(m*BLOCKSIZE)+(nn+(n*BLOCKSIZE))*size] = matA[m][n][mm +nn*BLOCKSIZE];
  516. }
  517. }
  518. }
  519. }
  520. FPRINTF(stderr, "[%d] compute explicit LLt ...\n", rank);
  521. for (mm = 0; mm < size; mm++)
  522. {
  523. for (nn = 0; nn < size; nn++)
  524. {
  525. if (nn > mm)
  526. {
  527. rmat[mm+nn*size] = 0.0f; // debug
  528. }
  529. }
  530. }
  531. float *test_mat = malloc(size*size*sizeof(float));
  532. STARPU_ASSERT(test_mat);
  533. STARPU_SSYRK("L", "N", size, size, 1.0f,
  534. rmat, size, 0.0f, test_mat, size);
  535. FPRINTF(stderr, "[%d] comparing results ...\n", rank);
  536. if (display)
  537. {
  538. for (mm = 0; mm < size; mm++)
  539. {
  540. for (nn = 0; nn < size; nn++)
  541. {
  542. if (nn <= mm)
  543. {
  544. printf("%2.2f\t", test_mat[mm +nn*size]);
  545. }
  546. else
  547. {
  548. printf(".\t");
  549. }
  550. }
  551. printf("\n");
  552. }
  553. }
  554. *correctness = 1;
  555. for(n = 0; n < nblocks ; n++)
  556. {
  557. for (m = 0; m < nblocks; m++)
  558. {
  559. for (nn = BLOCKSIZE*n ; nn < BLOCKSIZE*(n+1); nn++)
  560. {
  561. for (mm = BLOCKSIZE*m ; mm < BLOCKSIZE*(m+1); mm++)
  562. {
  563. if (nn <= mm)
  564. {
  565. float orig = (1.0f/(1.0f+nn+mm)) + ((nn == mm)?1.0f*size:0.0f);
  566. float err = fabsf(test_mat[mm +nn*size] - orig) / orig;
  567. if (err > epsilon)
  568. {
  569. FPRINTF(stderr, "[%d] Error[%u, %u] --> %2.20f != %2.20f (err %2.20f)\n", rank, nn, mm, test_mat[mm +nn*size], orig, err);
  570. *correctness = 0;
  571. *flops = 0;
  572. break;
  573. }
  574. }
  575. }
  576. }
  577. }
  578. }
  579. free(rmat);
  580. free(test_mat);
  581. }