xlu_kernels.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009, 2010 Université de Bordeaux 1
  4. * Copyright (C) 2010 Centre National de la Recherche Scientifique
  5. *
  6. * StarPU is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU Lesser General Public License as published by
  8. * the Free Software Foundation; either version 2.1 of the License, or (at
  9. * your option) any later version.
  10. *
  11. * StarPU is distributed in the hope that it will be useful, but
  12. * WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14. *
  15. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  16. */
  17. #include "xlu.h"
  18. #include <math.h>
  19. #define str(s) #s
  20. #define xstr(s) str(s)
  21. #define STARPU_LU_STR(name) xstr(STARPU_LU(name))
  22. /*
  23. * U22
  24. */
  25. static inline void STARPU_LU(common_u22)(void *descr[],
  26. int s, __attribute__((unused)) void *_args)
  27. {
  28. TYPE *right = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
  29. TYPE *left = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
  30. TYPE *center = (TYPE *)STARPU_MATRIX_GET_PTR(descr[2]);
  31. unsigned dx = STARPU_MATRIX_GET_NX(descr[2]);
  32. unsigned dy = STARPU_MATRIX_GET_NY(descr[2]);
  33. unsigned dz = STARPU_MATRIX_GET_NY(descr[0]);
  34. unsigned ld12 = STARPU_MATRIX_GET_LD(descr[0]);
  35. unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);
  36. unsigned ld22 = STARPU_MATRIX_GET_LD(descr[2]);
  37. #ifdef STARPU_USE_CUDA
  38. cublasStatus status;
  39. cudaError_t cures;
  40. #endif
  41. switch (s) {
  42. case 0:
  43. CPU_GEMM("N", "N", dy, dx, dz,
  44. (TYPE)-1.0, right, ld21, left, ld12,
  45. (TYPE)1.0, center, ld22);
  46. break;
  47. #ifdef STARPU_USE_CUDA
  48. case 1:
  49. status = cublasGetError();
  50. if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
  51. STARPU_ABORT();
  52. if (STARPU_UNLIKELY((cures = cudaThreadSynchronize()) != cudaSuccess))
  53. STARPU_CUDA_REPORT_ERROR(cures);
  54. // printf("dx = %d dy = %d dz = %d ld21 = %d ld12= %d ld22 = %d\n");
  55. CUBLAS_GEMM('n', 'n', dx, dy, dz,
  56. (TYPE)-1.0, right, ld21, left, ld12,
  57. (TYPE)1.0f, center, ld22);
  58. status = cublasGetError();
  59. if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
  60. STARPU_ABORT();
  61. if (STARPU_UNLIKELY((cures = cudaThreadSynchronize()) != cudaSuccess))
  62. STARPU_CUDA_REPORT_ERROR(cures);
  63. break;
  64. #endif
  65. default:
  66. STARPU_ABORT();
  67. break;
  68. }
  69. }
  70. void STARPU_LU(cpu_u22)(void *descr[], void *_args)
  71. {
  72. STARPU_LU(common_u22)(descr, 0, _args);
  73. }
  74. #ifdef STARPU_USE_CUDA
  75. void STARPU_LU(cublas_u22)(void *descr[], void *_args)
  76. {
  77. STARPU_LU(common_u22)(descr, 1, _args);
  78. }
  79. #endif// STARPU_USE_CUDA
  80. static struct starpu_perfmodel_t STARPU_LU(model_22) = {
  81. .type = STARPU_HISTORY_BASED,
  82. #ifdef STARPU_ATLAS
  83. .symbol = STARPU_LU_STR(lu_model_22_atlas)
  84. #elif defined(STARPU_GOTO)
  85. .symbol = STARPU_LU_STR(lu_model_22_goto)
  86. #else
  87. .symbol = STARPU_LU_STR(lu_model_22)
  88. #endif
  89. };
  90. starpu_codelet cl22 = {
  91. .where = STARPU_CPU|STARPU_CUDA,
  92. .cpu_func = STARPU_LU(cpu_u22),
  93. #ifdef STARPU_USE_CUDA
  94. .cuda_func = STARPU_LU(cublas_u22),
  95. #endif
  96. .nbuffers = 3,
  97. .model = &STARPU_LU(model_22)
  98. };
  99. /*
  100. * U12
  101. */
  102. static inline void STARPU_LU(common_u12)(void *descr[],
  103. int s, __attribute__((unused)) void *_args)
  104. {
  105. TYPE *sub11;
  106. TYPE *sub12;
  107. sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
  108. sub12 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
  109. unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
  110. unsigned ld12 = STARPU_MATRIX_GET_LD(descr[1]);
  111. unsigned nx12 = STARPU_MATRIX_GET_NX(descr[1]);
  112. unsigned ny12 = STARPU_MATRIX_GET_NY(descr[1]);
  113. #ifdef STARPU_USE_CUDA
  114. cublasStatus status;
  115. cudaError_t cures;
  116. #endif
  117. /* solve L11 U12 = A12 (find U12) */
  118. switch (s) {
  119. case 0:
  120. CPU_TRSM("L", "L", "N", "N", nx12, ny12,
  121. (TYPE)1.0, sub11, ld11, sub12, ld12);
  122. break;
  123. #ifdef STARPU_USE_CUDA
  124. case 1:
  125. CUBLAS_TRSM('L', 'L', 'N', 'N', ny12, nx12,
  126. (TYPE)1.0, sub11, ld11, sub12, ld12);
  127. status = cublasGetError();
  128. if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
  129. STARPU_ABORT();
  130. if (STARPU_UNLIKELY((cures = cudaThreadSynchronize()) != cudaSuccess))
  131. STARPU_CUDA_REPORT_ERROR(cures);
  132. break;
  133. #endif
  134. default:
  135. STARPU_ABORT();
  136. break;
  137. }
  138. }
  139. void STARPU_LU(cpu_u12)(void *descr[], void *_args)
  140. {
  141. STARPU_LU(common_u12)(descr, 0, _args);
  142. }
  143. #ifdef STARPU_USE_CUDA
  144. void STARPU_LU(cublas_u12)(void *descr[], void *_args)
  145. {
  146. STARPU_LU(common_u12)(descr, 1, _args);
  147. }
  148. #endif // STARPU_USE_CUDA
  149. static struct starpu_perfmodel_t STARPU_LU(model_12) = {
  150. .type = STARPU_HISTORY_BASED,
  151. #ifdef STARPU_ATLAS
  152. .symbol = STARPU_LU_STR(lu_model_12_atlas)
  153. #elif defined(STARPU_GOTO)
  154. .symbol = STARPU_LU_STR(lu_model_12_goto)
  155. #else
  156. .symbol = STARPU_LU_STR(lu_model_12)
  157. #endif
  158. };
  159. starpu_codelet cl12 = {
  160. .where = STARPU_CPU|STARPU_CUDA,
  161. .cpu_func = STARPU_LU(cpu_u12),
  162. #ifdef STARPU_USE_CUDA
  163. .cuda_func = STARPU_LU(cublas_u12),
  164. #endif
  165. .nbuffers = 2,
  166. .model = &STARPU_LU(model_12)
  167. };
  168. /*
  169. * U21
  170. */
  171. static inline void STARPU_LU(common_u21)(void *descr[],
  172. int s, __attribute__((unused)) void *_args)
  173. {
  174. TYPE *sub11;
  175. TYPE *sub21;
  176. sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
  177. sub21 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
  178. unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
  179. unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);
  180. unsigned nx21 = STARPU_MATRIX_GET_NX(descr[1]);
  181. unsigned ny21 = STARPU_MATRIX_GET_NY(descr[1]);
  182. #ifdef STARPU_USE_CUDA
  183. cublasStatus status;
  184. cudaError_t cures;
  185. #endif
  186. switch (s) {
  187. case 0:
  188. CPU_TRSM("R", "U", "N", "U", nx21, ny21,
  189. (TYPE)1.0, sub11, ld11, sub21, ld21);
  190. break;
  191. #ifdef STARPU_USE_CUDA
  192. case 1:
  193. CUBLAS_TRSM('R', 'U', 'N', 'U', ny21, nx21,
  194. (TYPE)1.0, sub11, ld11, sub21, ld21);
  195. status = cublasGetError();
  196. if (status != CUBLAS_STATUS_SUCCESS)
  197. STARPU_ABORT();
  198. cudaThreadSynchronize();
  199. break;
  200. #endif
  201. default:
  202. STARPU_ABORT();
  203. break;
  204. }
  205. }
  206. void STARPU_LU(cpu_u21)(void *descr[], void *_args)
  207. {
  208. STARPU_LU(common_u21)(descr, 0, _args);
  209. }
  210. #ifdef STARPU_USE_CUDA
  211. void STARPU_LU(cublas_u21)(void *descr[], void *_args)
  212. {
  213. STARPU_LU(common_u21)(descr, 1, _args);
  214. }
  215. #endif
  216. static struct starpu_perfmodel_t STARPU_LU(model_21) = {
  217. .type = STARPU_HISTORY_BASED,
  218. #ifdef STARPU_ATLAS
  219. .symbol = STARPU_LU_STR(lu_model_21_atlas)
  220. #elif defined(STARPU_GOTO)
  221. .symbol = STARPU_LU_STR(lu_model_21_goto)
  222. #else
  223. .symbol = STARPU_LU_STR(lu_model_21)
  224. #endif
  225. };
  226. starpu_codelet cl21 = {
  227. .where = STARPU_CPU|STARPU_CUDA,
  228. .cpu_func = STARPU_LU(cpu_u21),
  229. #ifdef STARPU_USE_CUDA
  230. .cuda_func = STARPU_LU(cublas_u21),
  231. #endif
  232. .nbuffers = 2,
  233. .model = &STARPU_LU(model_21)
  234. };
  235. /*
  236. * U11
  237. */
  238. static inline void STARPU_LU(common_u11)(void *descr[],
  239. int s, __attribute__((unused)) void *_args)
  240. {
  241. TYPE *sub11;
  242. sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
  243. unsigned long nx = STARPU_MATRIX_GET_NX(descr[0]);
  244. unsigned long ld = STARPU_MATRIX_GET_LD(descr[0]);
  245. unsigned long z;
  246. #ifdef STARPU_USE_CUDA
  247. cublasStatus status;
  248. cudaError_t cures;
  249. #endif
  250. switch (s) {
  251. case 0:
  252. for (z = 0; z < nx; z++)
  253. {
  254. TYPE pivot;
  255. pivot = sub11[z+z*ld];
  256. STARPU_ASSERT(pivot != 0.0);
  257. CPU_SCAL(nx - z - 1, (1.0/pivot), &sub11[z+(z+1)*ld], ld);
  258. CPU_GER(nx - z - 1, nx - z - 1, -1.0,
  259. &sub11[(z+1)+z*ld], 1,
  260. &sub11[z+(z+1)*ld], ld,
  261. &sub11[(z+1) + (z+1)*ld],ld);
  262. }
  263. break;
  264. #ifdef STARPU_USE_CUDA
  265. case 1:
  266. for (z = 0; z < nx; z++)
  267. {
  268. TYPE pivot;
  269. cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
  270. cudaStreamSynchronize(0);
  271. STARPU_ASSERT(pivot != 0.0);
  272. CUBLAS_SCAL(nx - z - 1, 1.0/pivot, &sub11[z+(z+1)*ld], ld);
  273. CUBLAS_GER(nx - z - 1, nx - z - 1, -1.0,
  274. &sub11[(z+1)+z*ld], 1,
  275. &sub11[z+(z+1)*ld], ld,
  276. &sub11[(z+1) + (z+1)*ld],ld);
  277. }
  278. status = cublasGetError();
  279. if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
  280. STARPU_ABORT();
  281. if (STARPU_UNLIKELY((cures = cudaThreadSynchronize()) != cudaSuccess))
  282. STARPU_CUDA_REPORT_ERROR(cures);
  283. cudaThreadSynchronize();
  284. break;
  285. #endif
  286. default:
  287. STARPU_ABORT();
  288. break;
  289. }
  290. }
  291. void STARPU_LU(cpu_u11)(void *descr[], void *_args)
  292. {
  293. STARPU_LU(common_u11)(descr, 0, _args);
  294. }
  295. #ifdef STARPU_USE_CUDA
  296. void STARPU_LU(cublas_u11)(void *descr[], void *_args)
  297. {
  298. STARPU_LU(common_u11)(descr, 1, _args);
  299. }
  300. #endif// STARPU_USE_CUDA
  301. static struct starpu_perfmodel_t STARPU_LU(model_11) = {
  302. .type = STARPU_HISTORY_BASED,
  303. #ifdef STARPU_ATLAS
  304. .symbol = STARPU_LU_STR(lu_model_11_atlas)
  305. #elif defined(STARPU_GOTO)
  306. .symbol = STARPU_LU_STR(lu_model_11_goto)
  307. #else
  308. .symbol = STARPU_LU_STR(lu_model_11)
  309. #endif
  310. };
  311. starpu_codelet cl11 = {
  312. .where = STARPU_CPU|STARPU_CUDA,
  313. .cpu_func = STARPU_LU(cpu_u11),
  314. #ifdef STARPU_USE_CUDA
  315. .cuda_func = STARPU_LU(cublas_u11),
  316. #endif
  317. .nbuffers = 1,
  318. .model = &STARPU_LU(model_11)
  319. };
  320. /*
  321. * U11 with pivoting
  322. */
  323. static inline void STARPU_LU(common_u11_pivot)(void *descr[],
  324. int s, void *_args)
  325. {
  326. TYPE *sub11;
  327. sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
  328. unsigned long nx = STARPU_MATRIX_GET_NX(descr[0]);
  329. unsigned long ld = STARPU_MATRIX_GET_LD(descr[0]);
  330. unsigned long z;
  331. struct piv_s *piv = _args;
  332. unsigned *ipiv = piv->piv;
  333. unsigned first = piv->first;
  334. switch (s) {
  335. case 0:
  336. for (z = 0; z < nx; z++)
  337. {
  338. TYPE pivot;
  339. pivot = sub11[z+z*ld];
  340. if (fabs((double)(pivot)) < PIVOT_THRESHHOLD)
  341. {
  342. /* find the pivot */
  343. int piv_ind = CPU_IAMAX(nx - z, &sub11[z*(ld+1)], ld);
  344. ipiv[z + first] = piv_ind + z + first;
  345. /* swap if needed */
  346. if (piv_ind != 0)
  347. {
  348. CPU_SWAP(nx, &sub11[z*ld], 1, &sub11[(z+piv_ind)*ld], 1);
  349. }
  350. pivot = sub11[z+z*ld];
  351. }
  352. STARPU_ASSERT(pivot != 0.0);
  353. CPU_SCAL(nx - z - 1, (1.0/pivot), &sub11[z+(z+1)*ld], ld);
  354. CPU_GER(nx - z - 1, nx - z - 1, -1.0,
  355. &sub11[(z+1)+z*ld], 1,
  356. &sub11[z+(z+1)*ld], ld,
  357. &sub11[(z+1) + (z+1)*ld],ld);
  358. }
  359. break;
  360. #ifdef STARPU_USE_CUDA
  361. case 1:
  362. for (z = 0; z < nx; z++)
  363. {
  364. TYPE pivot;
  365. cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
  366. cudaStreamSynchronize(0);
  367. if (fabs((double)(pivot)) < PIVOT_THRESHHOLD)
  368. {
  369. /* find the pivot */
  370. int piv_ind = CUBLAS_IAMAX(nx - z, &sub11[z*(ld+1)], ld) - 1;
  371. ipiv[z + first] = piv_ind + z + first;
  372. /* swap if needed */
  373. if (piv_ind != 0)
  374. {
  375. CUBLAS_SWAP(nx, &sub11[z*ld], 1, &sub11[(z+piv_ind)*ld], 1);
  376. }
  377. cudaMemcpy(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost);
  378. cudaStreamSynchronize(0);
  379. }
  380. STARPU_ASSERT(pivot != 0.0);
  381. CUBLAS_SCAL(nx - z - 1, 1.0/pivot, &sub11[z+(z+1)*ld], ld);
  382. CUBLAS_GER(nx - z - 1, nx - z - 1, -1.0,
  383. &sub11[(z+1)+z*ld], 1,
  384. &sub11[z+(z+1)*ld], ld,
  385. &sub11[(z+1) + (z+1)*ld],ld);
  386. }
  387. cudaThreadSynchronize();
  388. break;
  389. #endif
  390. default:
  391. STARPU_ABORT();
  392. break;
  393. }
  394. }
  395. void STARPU_LU(cpu_u11_pivot)(void *descr[], void *_args)
  396. {
  397. STARPU_LU(common_u11_pivot)(descr, 0, _args);
  398. }
  399. #ifdef STARPU_USE_CUDA
  400. void STARPU_LU(cublas_u11_pivot)(void *descr[], void *_args)
  401. {
  402. STARPU_LU(common_u11_pivot)(descr, 1, _args);
  403. }
  404. #endif// STARPU_USE_CUDA
  405. static struct starpu_perfmodel_t STARPU_LU(model_11_pivot) = {
  406. .type = STARPU_HISTORY_BASED,
  407. #ifdef STARPU_ATLAS
  408. .symbol = STARPU_LU_STR(lu_model_11_pivot_atlas)
  409. #elif defined(STARPU_GOTO)
  410. .symbol = STARPU_LU_STR(lu_model_11_pivot_goto)
  411. #else
  412. .symbol = STARPU_LU_STR(lu_model_11_pivot)
  413. #endif
  414. };
  415. starpu_codelet cl11_pivot = {
  416. .where = STARPU_CPU|STARPU_CUDA,
  417. .cpu_func = STARPU_LU(cpu_u11_pivot),
  418. #ifdef STARPU_USE_CUDA
  419. .cuda_func = STARPU_LU(cublas_u11_pivot),
  420. #endif
  421. .nbuffers = 1,
  422. .model = &STARPU_LU(model_11_pivot)
  423. };
  424. /*
  425. * Pivoting
  426. */
  427. static inline void STARPU_LU(common_pivot)(void *descr[],
  428. int s, void *_args)
  429. {
  430. TYPE *matrix;
  431. matrix = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
  432. unsigned long nx = STARPU_MATRIX_GET_NX(descr[0]);
  433. unsigned long ld = STARPU_MATRIX_GET_LD(descr[0]);
  434. unsigned row;
  435. struct piv_s *piv = _args;
  436. unsigned *ipiv = piv->piv;
  437. unsigned first = piv->first;
  438. switch (s) {
  439. case 0:
  440. for (row = 0; row < nx; row++)
  441. {
  442. unsigned rowpiv = ipiv[row+first] - first;
  443. if (rowpiv != row)
  444. {
  445. CPU_SWAP(nx, &matrix[row*ld], 1, &matrix[rowpiv*ld], 1);
  446. }
  447. }
  448. break;
  449. #ifdef STARPU_USE_CUDA
  450. case 1:
  451. for (row = 0; row < nx; row++)
  452. {
  453. unsigned rowpiv = ipiv[row+first] - first;
  454. if (rowpiv != row)
  455. {
  456. CUBLAS_SWAP(nx, &matrix[row*ld], 1, &matrix[rowpiv*ld], 1);
  457. }
  458. }
  459. cudaThreadSynchronize();
  460. break;
  461. #endif
  462. default:
  463. STARPU_ABORT();
  464. break;
  465. }
  466. }
  467. void STARPU_LU(cpu_pivot)(void *descr[], void *_args)
  468. {
  469. STARPU_LU(common_pivot)(descr, 0, _args);
  470. }
  471. #ifdef STARPU_USE_CUDA
  472. void STARPU_LU(cublas_pivot)(void *descr[], void *_args)
  473. {
  474. STARPU_LU(common_pivot)(descr, 1, _args);
  475. }
  476. #endif// STARPU_USE_CUDA
  477. static struct starpu_perfmodel_t STARPU_LU(model_pivot) = {
  478. .type = STARPU_HISTORY_BASED,
  479. #ifdef STARPU_ATLAS
  480. .symbol = STARPU_LU_STR(lu_model_pivot_atlas)
  481. #elif defined(STARPU_GOTO)
  482. .symbol = STARPU_LU_STR(lu_model_pivot_goto)
  483. #else
  484. .symbol = STARPU_LU_STR(lu_model_pivot)
  485. #endif
  486. };
  487. starpu_codelet cl_pivot = {
  488. .where = STARPU_CPU|STARPU_CUDA,
  489. .cpu_func = STARPU_LU(cpu_pivot),
  490. #ifdef STARPU_USE_CUDA
  491. .cuda_func = STARPU_LU(cublas_pivot),
  492. #endif
  493. .nbuffers = 1,
  494. .model = &STARPU_LU(model_pivot)
  495. };