heat.c 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2008-2021 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  4. *
  5. * StarPU is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * StarPU is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. *
  14. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. */
  16. /*
  17. * heat propagation simulation through either direct LU factorization or
  18. * iterative conjugate gradient.
  19. */
  20. #include "heat.h"
  21. /* default values */
  22. #ifdef STARPU_QUICK_CHECK
  23. static unsigned ntheta = 8+2;
  24. static unsigned nthick = 8+2;
  25. #else
  26. static unsigned ntheta = 32+2;
  27. static unsigned nthick = 32+2;
  28. #endif
  29. static unsigned nblocks = 16;
  30. static unsigned nbigblocks = 8;
  31. static unsigned shape = 0;
  32. static unsigned pinned = 0;
  33. static unsigned check = 0;
  34. static unsigned version = 2;
  35. static unsigned use_cg = 0; /* use a LU decomposition of CG ? */
  36. static unsigned no_prio = 0;
  37. extern void do_conjugate_gradient(float *nzvalA, float *vecb, float *vecx, uint32_t nnz,
  38. unsigned nrow, uint32_t *colind, uint32_t *rowptr);
  39. static void parse_args(int argc, char **argv)
  40. {
  41. int i;
  42. for (i = 1; i < argc; i++)
  43. {
  44. if (strcmp(argv[i], "-cg") == 0)
  45. {
  46. use_cg = 1;
  47. }
  48. if (strcmp(argv[i], "-shape") == 0)
  49. {
  50. char *argptr;
  51. shape = strtol(argv[++i], &argptr, 10);
  52. }
  53. if (strcmp(argv[i], "-nthick") == 0)
  54. {
  55. char *argptr;
  56. nthick = strtol(argv[++i], &argptr, 10);
  57. }
  58. if (strcmp(argv[i], "-ntheta") == 0)
  59. {
  60. char *argptr;
  61. ntheta = strtol(argv[++i], &argptr, 10);
  62. }
  63. if (strcmp(argv[i], "-nblocks") == 0)
  64. {
  65. char *argptr;
  66. nblocks = strtol(argv[++i], &argptr, 10);
  67. }
  68. if (strcmp(argv[i], "-nbigblocks") == 0)
  69. {
  70. char *argptr;
  71. nbigblocks = strtol(argv[++i], &argptr, 10);
  72. }
  73. if (strcmp(argv[i], "-v1") == 0)
  74. {
  75. version = 1;
  76. }
  77. if (strcmp(argv[i], "-v2") == 0)
  78. {
  79. version = 2;
  80. }
  81. if (strcmp(argv[i], "-v3") == 0)
  82. {
  83. version = 3;
  84. }
  85. if (strcmp(argv[i], "-v4") == 0)
  86. {
  87. version = 4;
  88. }
  89. if (strcmp(argv[i], "-pin") == 0)
  90. {
  91. pinned = 1;
  92. }
  93. if (strcmp(argv[i], "-check") == 0)
  94. {
  95. check = 1;
  96. }
  97. if (strcmp(argv[i], "-no-prio") == 0)
  98. {
  99. no_prio = 1;
  100. }
  101. if (strcmp(argv[i], "-size") == 0)
  102. {
  103. char *argptr;
  104. unsigned size = strtol(argv[++i], &argptr, 10);
  105. nthick = 130;
  106. ntheta = (size/128) + 2;
  107. STARPU_ASSERT((nthick - 2)*(ntheta - 2) == size);
  108. }
  109. if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-help") == 0)
  110. {
  111. printf("usage : %s [-v1|-v2|-v3|-v4] [-pin] [-nthick number] [-ntheta number] [-shape [0|1|2]] [-cg] [-size number] [-no-prio]\n", argv[0]);
  112. }
  113. }
  114. }
  115. /*
  116. * The Finite element method code
  117. *
  118. * B C
  119. * **********
  120. * * 0 * *
  121. * * * *
  122. * * * 1 *
  123. * **********
  124. * A D
  125. */
  126. static inline float diff_psi(unsigned theta_tr, unsigned thick_tr, unsigned side_tr,
  127. unsigned theta_psi, unsigned thick_psi, unsigned xy, point *pmesh)
  128. {
  129. float xa,ya,xb,yb,xc,yc;
  130. float tmp;
  131. assert(theta_tr + 2 <= ntheta);
  132. assert(thick_tr + 2 <= nthick);
  133. /* A */
  134. xa = pmesh[NODE_NUMBER(theta_tr, thick_tr)].x;
  135. ya = pmesh[NODE_NUMBER(theta_tr, thick_tr)].y;
  136. /* B */
  137. if (side_tr)
  138. {
  139. /* lower D is actually B here */
  140. xb = pmesh[NODE_NUMBER(theta_tr+1, thick_tr)].x;
  141. yb = pmesh[NODE_NUMBER(theta_tr+1, thick_tr)].y;
  142. }
  143. else
  144. {
  145. /* upper */
  146. xb = pmesh[NODE_NUMBER(theta_tr, thick_tr+1)].x;
  147. yb = pmesh[NODE_NUMBER(theta_tr, thick_tr+1)].y;
  148. }
  149. xc = pmesh[NODE_NUMBER(theta_tr+1, thick_tr+1)].x;
  150. yc = pmesh[NODE_NUMBER(theta_tr+1, thick_tr+1)].y;
  151. /* now look for the actual psi node */
  152. if (NODE_NUMBER(theta_tr, thick_tr) == NODE_NUMBER(theta_psi, thick_psi))
  153. {
  154. /* A nothing to do */
  155. }
  156. else if (NODE_NUMBER(theta_tr+1, thick_tr+1) == NODE_NUMBER(theta_psi, thick_psi))
  157. {
  158. /* psi matches C */
  159. /* swap A and C coordinates */
  160. tmp = xa; xa = xc; xc = tmp;
  161. tmp = ya; ya = yc; yc = tmp;
  162. }
  163. else if (side_tr && (NODE_NUMBER(theta_tr+1, thick_tr) == NODE_NUMBER(theta_psi, thick_psi)))
  164. {
  165. /* psi is D (that was stored in C) XXX */
  166. tmp = xa; xa = xb; xb = tmp;
  167. tmp = ya; ya = yb; yb = tmp;
  168. }
  169. else if (!side_tr && (NODE_NUMBER(theta_tr, thick_tr+1) == NODE_NUMBER(theta_psi, thick_psi)))
  170. {
  171. /* psi is C */
  172. tmp = xa; xa = xb; xb = tmp;
  173. tmp = ya; ya = yb; yb = tmp;
  174. }
  175. else
  176. {
  177. /* the psi node is not a node of the current triangle */
  178. return 0.0f;
  179. }
  180. /* now the triangle should have A as the psi node */
  181. float denom;
  182. float value;
  183. denom = (xa - xb)*(yc - ya) - (xc - xb)*(ya - yb);
  184. switch (xy)
  185. {
  186. case X:
  187. value = (yc - yb)/denom;
  188. break;
  189. case Y:
  190. value = -(xc - xb)/denom;
  191. break;
  192. default:
  193. assert(0);
  194. }
  195. return value;
  196. }
  197. static inline float diff_y_psi(unsigned theta_tr, unsigned thick_tr, unsigned side_tr,
  198. unsigned theta_psi, unsigned thick_psi, point *pmesh)
  199. {
  200. return diff_psi(theta_tr, thick_tr, side_tr, theta_psi, thick_psi, Y, pmesh);
  201. }
  202. static inline float diff_x_psi(unsigned theta_tr, unsigned thick_tr, unsigned side_tr,
  203. unsigned theta_psi, unsigned thick_psi, point *pmesh)
  204. {
  205. return diff_psi(theta_tr, thick_tr, side_tr, theta_psi, thick_psi, X, pmesh);
  206. }
  207. static inline float surface_triangle(unsigned theta_tr, unsigned thick_tr, unsigned side_tr, point *pmesh)
  208. {
  209. float surface;
  210. float tmp;
  211. float xi, xj, xk, yi, yj, yk;
  212. STARPU_ASSERT(theta_tr + 2 <= ntheta);
  213. STARPU_ASSERT(thick_tr + 2 <= nthick);
  214. xi = pmesh[NODE_NUMBER(theta_tr, thick_tr)].x;
  215. yi = pmesh[NODE_NUMBER(theta_tr, thick_tr)].y;
  216. xj = pmesh[NODE_NUMBER(theta_tr+1, thick_tr+1)].x;
  217. yj = pmesh[NODE_NUMBER(theta_tr+1, thick_tr+1)].y;
  218. if (side_tr)
  219. {
  220. /* lower */
  221. xk = pmesh[NODE_NUMBER(theta_tr+1, thick_tr)].x;
  222. yk = pmesh[NODE_NUMBER(theta_tr+1, thick_tr)].y;
  223. }
  224. else
  225. {
  226. xk = pmesh[NODE_NUMBER(theta_tr, thick_tr+1)].x;
  227. yk = pmesh[NODE_NUMBER(theta_tr, thick_tr+1)].y;
  228. }
  229. tmp = (xi - xj)*(yk -yj) - (xk - xj)*(yi -yj);
  230. surface = 0.5*fabs(tmp);
  231. return surface;
  232. }
  233. static inline float integral_triangle(int theta_tr, int thick_tr, unsigned side_tr,
  234. unsigned theta_i, unsigned thick_i, unsigned theta_j, unsigned thick_j, point *pmesh)
  235. {
  236. float surface;
  237. float value;
  238. float dxi, dxj, dyi, dyj;
  239. if (theta_tr < 0) return 0.0f;
  240. if (theta_tr + 2 > (int)ntheta) return 0.0f;
  241. if (thick_tr < 0) return 0.0f;
  242. if (thick_tr + 2 > (int)nthick) return 0.0f;
  243. dxi = diff_x_psi(theta_tr, thick_tr, side_tr, theta_i, thick_i, pmesh);
  244. dyi = diff_y_psi(theta_tr, thick_tr, side_tr, theta_i, thick_i, pmesh);
  245. dxj = diff_x_psi(theta_tr, thick_tr, side_tr, theta_j, thick_j, pmesh);
  246. dyj = diff_y_psi(theta_tr, thick_tr, side_tr, theta_j, thick_j, pmesh);
  247. surface = surface_triangle(theta_tr, thick_tr, side_tr, pmesh);
  248. value = (dxi*dxj + dyi*dyj)*surface;
  249. return value;
  250. }
  251. static inline float integrale_sum(unsigned theta_i, unsigned thick_i, unsigned theta_j, unsigned thick_j, point *pmesh)
  252. {
  253. float integral = 0.0f;
  254. integral += integral_triangle(theta_i - 1, thick_i - 1, 1, theta_i, thick_i, theta_j, thick_j, pmesh);
  255. integral += integral_triangle(theta_i - 1, thick_i - 1, 0, theta_i, thick_i, theta_j, thick_j, pmesh);
  256. integral += integral_triangle(theta_i - 1, thick_i, 1, theta_i, thick_i, theta_j, thick_j, pmesh);
  257. integral += integral_triangle(theta_i, thick_i, 0, theta_i, thick_i, theta_j, thick_j, pmesh);
  258. integral += integral_triangle(theta_i, thick_i, 1, theta_i, thick_i, theta_j, thick_j, pmesh);
  259. integral += integral_triangle(theta_i, thick_i - 1, 0, theta_i, thick_i, theta_j, thick_j, pmesh);
  260. return integral;
  261. }
  262. static float compute_A_value(unsigned i, unsigned j, point *pmesh)
  263. {
  264. float value = 0.0f;
  265. unsigned thick_i, thick_j;
  266. unsigned theta_i, theta_j;
  267. /* add all contributions from all connex triangles */
  268. thick_i = NODE_TO_THICK(i);
  269. thick_j = NODE_TO_THICK(j);
  270. theta_i = NODE_TO_THETA(i);
  271. theta_j = NODE_TO_THETA(j);
  272. /* Compute the Sum of all the integral over all triangles */
  273. if ( (abs((int)thick_i - (int)thick_j) <= 1) && (abs((int)theta_i - (int)theta_j) <= 1) )
  274. {
  275. if ( (theta_j == theta_i -1) && (thick_j == thick_i +1))
  276. goto done;
  277. if ( (theta_j == theta_i + 1) && (thick_j == thick_i - 1))
  278. goto done;
  279. /* this may not be a null entry */
  280. value += integrale_sum(theta_i, thick_i, theta_j, thick_j, pmesh);
  281. }
  282. done:
  283. return value;
  284. }
  285. #define TRANSLATE(k) (RefArray[(k)])
  286. #define TRANSLATEBACK(k) (RefArrayBack[(k)])
  287. static void solve_system(unsigned size, unsigned subsize, float *result, int *RefArray, float *Bformer, float *A, float *B)
  288. {
  289. unsigned i;
  290. /* solve the actual problem LU X = B */
  291. /* solve LX' = Y with X' = UX */
  292. /* solve UX = X' */
  293. FPRINTF(stderr, "Solving the problem ...\n");
  294. float *savedB = NULL;
  295. float *LUB = NULL;
  296. if (check)
  297. {
  298. savedB = malloc(subsize*sizeof(float));
  299. memcpy(savedB, B, subsize*sizeof(float));
  300. LUB = malloc(subsize*sizeof(float));
  301. }
  302. /* L */
  303. STARPU_STRSV("L", "N", "N", subsize, A, subsize, B, 1);
  304. /* U */
  305. STARPU_STRSV("U", "N", "U", subsize, A, subsize, B, 1);
  306. STARPU_ASSERT(DIM == size);
  307. if (check)
  308. {
  309. /* compute the error on (LUB - savedB) which should be 0 */
  310. /* LUB = B */
  311. memcpy(LUB, B, subsize*sizeof(float));
  312. /* LUB = U * LUB */
  313. STARPU_STRMV("U", "N", "U", subsize, A, subsize, LUB, 1);
  314. /* LUB = L * LUB */
  315. STARPU_STRMV("L", "N", "N", subsize, A, subsize, LUB, 1);
  316. /* LUB -= B */
  317. STARPU_SAXPY(subsize, -1.0f, savedB, 1, LUB, 1);
  318. /* check if LUB is close to the 0 vector */
  319. int maxind = STARPU_ISAMAX(subsize, LUB, 1);
  320. FPRINTF(stderr, "max error (LUX - B) = %e\n",LUB[maxind - 1]);
  321. float sum = STARPU_SASUM(subsize, LUB, 1);
  322. FPRINTF(stderr,"avg. error %e\n", sum/subsize);
  323. free(LUB);
  324. free(savedB);
  325. }
  326. /* now display back the ACTUAL result */
  327. for (i = 0; i < subsize; i++)
  328. {
  329. result[TRANSLATE(i)] = B[i];
  330. }
  331. for (i = subsize ; i < size; i++)
  332. {
  333. result[TRANSLATE(i)] = Bformer[TRANSLATE(i)];
  334. }
  335. }
  336. unsigned compute_pivot_array(int *RefArray, int *RefArrayBack, unsigned size)
  337. {
  338. unsigned k;
  339. unsigned index = 0;
  340. unsigned theta, thick;
  341. unsigned newsize;
  342. for (k = 0; k < size; k++)
  343. {
  344. RefArray[k] = k;
  345. RefArrayBack[k] = k;
  346. }
  347. /* first inner nodes */
  348. for (theta = 1; theta < ntheta - 1 ; theta++)
  349. {
  350. for (thick = 1; thick < nthick - 1; thick++)
  351. {
  352. /* inner nodes are unknown */
  353. RefArrayBack[NODE_NUMBER(theta, thick)] = index;
  354. RefArray[index] = NODE_NUMBER(theta, thick);
  355. index++;
  356. }
  357. }
  358. newsize = index;
  359. for (theta=0; theta < ntheta; theta++)
  360. {
  361. /* Lower boundary "South" */
  362. RefArrayBack[NODE_NUMBER(theta, 0)] = index;
  363. RefArray[index++] = NODE_NUMBER(theta, 0);
  364. /* Upper boundary "North" */
  365. RefArrayBack[NODE_NUMBER(theta, nthick-1)] = index;
  366. RefArray[index++] = NODE_NUMBER(theta, nthick-1);
  367. }
  368. for (thick = 1; thick < nthick -1; thick++)
  369. {
  370. /* "West "*/
  371. RefArrayBack[NODE_NUMBER(0, thick)] = index;
  372. RefArray[index++] = NODE_NUMBER(0, thick);
  373. /* "East" */
  374. RefArrayBack[NODE_NUMBER(ntheta-1, thick)] = index;
  375. RefArray[index++] = NODE_NUMBER(ntheta-1, thick);
  376. }
  377. assert(index == size);
  378. return newsize;
  379. }
  380. void build_mesh(point *mesh)
  381. {
  382. unsigned theta, thick;
  383. /* first build the mesh by determining all points positions */
  384. for (theta = 0; theta < ntheta; theta++)
  385. {
  386. float angle;
  387. angle = (ntheta - 1 - theta) * Pi/(ntheta-1);
  388. for (thick = 0; thick < nthick; thick++)
  389. {
  390. float r;
  391. r = thick * (RMAX - RMIN)/(nthick - 1) + RMIN;
  392. switch (shape)
  393. {
  394. default:
  395. case 0:
  396. mesh[NODE_NUMBER(theta,thick)].x = r*cosf(angle);
  397. mesh[NODE_NUMBER(theta,thick)].y = r*sinf(angle);
  398. break;
  399. case 1:
  400. mesh[NODE_NUMBER(theta,thick)].x =
  401. -100 + RMIN+((RMAX-RMIN)*theta)/(ntheta - 1);
  402. mesh[NODE_NUMBER(theta,thick)].y =
  403. RMIN+((RMAX-RMIN)*thick)/(nthick - 1);
  404. break;
  405. case 2:
  406. mesh[NODE_NUMBER(theta,thick)].x = r*(2.0f*theta/(ntheta - 1)- 1.0f);
  407. mesh[NODE_NUMBER(theta,thick)].y = r*(2.0f*thick/(nthick - 1)- 1.0f);
  408. break;
  409. }
  410. }
  411. }
  412. }
  413. static unsigned long build_neighbour_vector(unsigned long*neighbours, unsigned node, int *RefArray, int *RefArrayBack)
  414. {
  415. /* where is that point in the former space ? */
  416. int former = TRANSLATE(node);
  417. int former_thick, former_theta;
  418. former_thick= (int)NODE_TO_THICK(former);
  419. former_theta = (int)NODE_TO_THETA(former);
  420. /* do a list of all the possible neighbours */
  421. unsigned nneighbours = 0;
  422. int dtheta, dthick;
  423. for (dthick = -1; dthick <= 1; dthick++)
  424. {
  425. if ((former_thick + dthick) >= 0 && (former_thick + dthick) <= (int)nthick )
  426. {
  427. for (dtheta = -1; dtheta <= 1; dtheta++)
  428. {
  429. if ((former_theta + dtheta) >= 0 && (former_theta + dtheta) <= (int)ntheta )
  430. {
  431. /* we got a possible neighbour */
  432. unsigned pnode =
  433. NODE_NUMBER((former_theta + dtheta), (former_thick + dthick));
  434. neighbours[nneighbours++] = TRANSLATEBACK(pnode);
  435. }
  436. }
  437. }
  438. }
  439. unsigned i;
  440. /* order that list */
  441. for (i = 0; i < nneighbours; i++)
  442. {
  443. /* find the i^th smallest entry for position i */
  444. unsigned index;
  445. unsigned min , min_index;
  446. min = neighbours[i];
  447. min_index = i;
  448. for (index = i+1; index < nneighbours; index++)
  449. {
  450. STARPU_ASSERT(neighbours[i] != neighbours[index]);
  451. if (neighbours[index] < min)
  452. {
  453. min = neighbours[index];
  454. min_index = index;
  455. }
  456. }
  457. /* swap values */
  458. neighbours[min_index] = neighbours[i];
  459. neighbours[i] = min;
  460. }
  461. return nneighbours;
  462. }
  463. static void build_sparse_stiffness_matrix_B(point *pmesh, float *B, float *Bformer, unsigned size, unsigned newsize, int *RefArray, int *RefArrayBack)
  464. {
  465. unsigned i,j;
  466. /* first give the value of known nodes (at boundaries) */
  467. for (i = 0; i < size; i++)
  468. {
  469. Bformer[i] = 0.0f;
  470. }
  471. for (i = 0; i < nthick; i++)
  472. {
  473. Bformer[i] = 200.0f;
  474. Bformer[size-1-i] = 200.0f;
  475. }
  476. for (i = 1; i < ntheta-1; i++)
  477. {
  478. Bformer[i*nthick] = 200.0f;
  479. Bformer[(i+1)*nthick-1] = 100.0f;
  480. }
  481. /* now the actual stiffness (reordered) matrix*/
  482. for (j = 0 ; j < newsize ; j++)
  483. {
  484. unsigned long neighbour;
  485. unsigned long nneighbours;
  486. unsigned long neighbours[9];
  487. nneighbours = build_neighbour_vector(&neighbours[0], j, RefArray, RefArrayBack);
  488. B[j] = Bformer[TRANSLATE(j)];
  489. for (neighbour = 0; neighbour < nneighbours; neighbour++)
  490. {
  491. unsigned n = neighbours[neighbour];
  492. if (n >= newsize)
  493. {
  494. B[j] -= compute_A_value(TRANSLATE(n), TRANSLATE(j), pmesh)*Bformer[TRANSLATE(n)];
  495. }
  496. }
  497. }
  498. }
  499. static unsigned build_sparse_stiffness_matrix_A(point *pmesh, float **nzval, uint32_t **colind,
  500. uint32_t *rowptr, unsigned newsize, int *RefArray, int *RefArrayBack)
  501. {
  502. unsigned j;
  503. unsigned pos = 0;
  504. *nzval = NULL;
  505. *colind = NULL;
  506. /* now the actual stiffness (reordered) matrix*/
  507. for (j = 0 ; j < newsize ; j++)
  508. {
  509. rowptr[j] = pos;
  510. unsigned long neighbour;
  511. unsigned long nneighbours;
  512. unsigned long neighbours[9];
  513. nneighbours = build_neighbour_vector(&neighbours[0], j, RefArray, RefArrayBack);
  514. for (neighbour = 0; neighbour < nneighbours; neighbour++)
  515. {
  516. unsigned nodeneighbour = neighbours[neighbour];
  517. if (nodeneighbour < newsize)
  518. {
  519. float val;
  520. val = compute_A_value(TRANSLATE(j), TRANSLATE(nodeneighbour), pmesh);
  521. if (val != 0.0f)
  522. {
  523. *nzval = realloc(*nzval, (pos+1)*sizeof(float));
  524. *colind = realloc(*colind, (pos+1)*sizeof(uint32_t));
  525. (*nzval)[pos] = val;
  526. (*colind)[pos] = nodeneighbour;
  527. pos++;
  528. }
  529. }
  530. }
  531. }
  532. rowptr[newsize] = pos;
  533. return pos;
  534. }
  535. static void build_dense_stiffness_matrix_A(point *pmesh, float *A, unsigned newsize, int *RefArray, int *RefArrayBack)
  536. {
  537. unsigned long j;
  538. /* touch all the memory */
  539. memset(A, 0, newsize*newsize*sizeof(float));
  540. /* now the actual stiffness (reordered) matrix*/
  541. for (j = 0 ; j < newsize ; j++)
  542. {
  543. unsigned long neighbour;
  544. unsigned long nneighbours;
  545. unsigned long neighbours[9];
  546. nneighbours = build_neighbour_vector(&neighbours[0], j, RefArray, RefArrayBack);
  547. for (neighbour = 0; neighbour < nneighbours; neighbour++)
  548. {
  549. unsigned long nodeneighbour = neighbours[neighbour];
  550. if (nodeneighbour < newsize)
  551. {
  552. float val;
  553. val = compute_A_value(TRANSLATE(j), TRANSLATE(nodeneighbour), pmesh);
  554. A[j+ (unsigned long)newsize*nodeneighbour] = val;
  555. }
  556. }
  557. }
  558. }
  559. int main(int argc, char **argv)
  560. {
  561. float *A;
  562. float *B;
  563. unsigned newsize;
  564. float *result;
  565. int *RefArray, *RefArrayBack;
  566. point *pmesh;
  567. float *Bformer;
  568. parse_args(argc, argv);
  569. pmesh = malloc(DIM*sizeof(point));
  570. RefArray = malloc(DIM*sizeof(int));
  571. RefArrayBack = malloc(DIM*sizeof(int));
  572. Bformer = malloc(DIM*sizeof(float));
  573. result = calloc(DIM, sizeof(float));
  574. build_mesh(pmesh);
  575. /* now simplify that problem given the boundary conditions
  576. * to do so, we remove the already known variables from the system
  577. * by pivoting the various know variable, RefArray keep track of that
  578. * pivoting */
  579. newsize = compute_pivot_array(RefArray, RefArrayBack, DIM);
  580. /* we can either use a direct method (LU decomposition here) or an
  581. * iterative method (conjugate gradient here) */
  582. if (use_cg)
  583. {
  584. unsigned nnz;
  585. float *nzval;
  586. uint32_t *colind;
  587. uint32_t *rowptr;
  588. rowptr = malloc((newsize+1)*sizeof(uint32_t));
  589. B = malloc(newsize*sizeof(float));
  590. build_sparse_stiffness_matrix_B(pmesh, B, Bformer, DIM, newsize, RefArray, RefArrayBack);
  591. nnz = build_sparse_stiffness_matrix_A(pmesh, &nzval, &colind, rowptr, newsize, RefArray, RefArrayBack);
  592. do_conjugate_gradient(nzval, B, result, nnz, newsize, colind, rowptr);
  593. /* XXX */
  594. memcpy(B, result, newsize*sizeof(float));
  595. /* now display back the ACTUAL result */
  596. unsigned i;
  597. for (i = 0; i < newsize; i++)
  598. {
  599. result[TRANSLATE(i)] = B[i];
  600. }
  601. for (i = newsize ; i < DIM; i++)
  602. {
  603. result[TRANSLATE(i)] = Bformer[TRANSLATE(i)];
  604. }
  605. free(nzval);
  606. free(colind);
  607. free(rowptr);
  608. free(B);
  609. }
  610. else
  611. {
  612. /* unfortunately CUDA does not allow late memory registration,
  613. * we need to do the malloc using CUDA itself ... */
  614. initialize_system(&A, &B, newsize, pinned);
  615. /* then build the stiffness matrix A */
  616. build_sparse_stiffness_matrix_B(pmesh, B, Bformer, DIM, newsize, RefArray, RefArrayBack);
  617. build_dense_stiffness_matrix_A(pmesh, A, newsize, RefArray, RefArrayBack);
  618. FPRINTF(stderr, "Problem size : %ux%u (%ux%u) (%lu MB)\n", newsize, newsize, DIM, DIM, ((unsigned long)newsize*newsize*4UL)/(1024*1024));
  619. STARPU_ASSERT(newsize % nblocks == 0);
  620. switch (version)
  621. {
  622. case 1:
  623. case 2:
  624. dw_factoLU(A, newsize, newsize, nblocks, version, no_prio);
  625. break;
  626. case 3:
  627. dw_factoLU_tag(A, newsize, newsize, nblocks, no_prio);
  628. break;
  629. case 4:
  630. dw_factoLU_grain(A, newsize, newsize, nblocks, nbigblocks);
  631. break;
  632. default:
  633. STARPU_ABORT();
  634. }
  635. display_stat_heat();
  636. if (check)
  637. solve_system(DIM, newsize, result, RefArray, Bformer, A, B);
  638. starpu_cublas_shutdown();
  639. starpu_shutdown();
  640. free_system(A, B, newsize, pinned);
  641. }
  642. #ifdef STARPU_OPENGL_RENDER
  643. const char *display = getenv("DISPLAY");
  644. if (display && display[0])
  645. opengl_render(ntheta, nthick, result, pmesh, argc, argv);
  646. #endif
  647. free(pmesh);
  648. free(RefArray);
  649. free(RefArrayBack);
  650. free(Bformer);
  651. free(result);
  652. return 0;
  653. }