xlu_kernels.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009-2020 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  4. *
  5. * StarPU is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * StarPU is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. *
  14. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. */
  16. /* LU Kernels */
  17. #include "xlu.h"
  18. #include <math.h>
  19. #include <complex.h>
  20. #ifdef STARPU_USE_CUDA
  21. #include <cublas.h>
  22. #include <starpu_cublas_v2.h>
  23. #endif
  24. #define str(s) #s
  25. #define xstr(s) str(s)
  26. #define STARPU_LU_STR(name) xstr(STARPU_LU(name))
  27. #ifdef STARPU_USE_CUDA
  28. static const TYPE p1 = 1.0f;
  29. static const TYPE m1 = -1.0f;
  30. #endif
  31. /*
  32. * U22
  33. */
  34. static inline void STARPU_LU(common_u22)(void *descr[], int s, void *_args)
  35. {
  36. (void)_args;
  37. TYPE *right = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
  38. TYPE *left = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
  39. TYPE *center = (TYPE *)STARPU_MATRIX_GET_PTR(descr[2]);
  40. unsigned dx = STARPU_MATRIX_GET_NX(descr[2]);
  41. unsigned dy = STARPU_MATRIX_GET_NY(descr[2]);
  42. unsigned dz = STARPU_MATRIX_GET_NY(descr[0]);
  43. unsigned ld12 = STARPU_MATRIX_GET_LD(descr[0]);
  44. unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);
  45. unsigned ld22 = STARPU_MATRIX_GET_LD(descr[2]);
  46. #ifdef STARPU_USE_CUDA
  47. cublasStatus status;
  48. cudaError_t cures;
  49. #endif
  50. switch (s)
  51. {
  52. case 0:
  53. CPU_GEMM("N", "N", dy, dx, dz,
  54. (TYPE)-1.0, right, ld21, left, ld12,
  55. (TYPE)1.0, center, ld22);
  56. break;
  57. #ifdef STARPU_USE_CUDA
  58. case 1:
  59. {
  60. status = CUBLAS_GEMM(starpu_cublas_get_local_handle(),
  61. CUBLAS_OP_N, CUBLAS_OP_N, dx, dy, dz,
  62. (CUBLAS_TYPE *)&m1, (CUBLAS_TYPE *)right, ld21, (CUBLAS_TYPE *)left, ld12,
  63. (CUBLAS_TYPE *)&p1, (CUBLAS_TYPE *)center, ld22);
  64. if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
  65. STARPU_CUBLAS_REPORT_ERROR(status);
  66. break;
  67. }
  68. #endif
  69. default:
  70. STARPU_ABORT();
  71. break;
  72. }
  73. }
  74. void STARPU_LU(cpu_u22)(void *descr[], void *_args)
  75. {
  76. STARPU_LU(common_u22)(descr, 0, _args);
  77. }
  78. #ifdef STARPU_USE_CUDA
  79. void STARPU_LU(cublas_u22)(void *descr[], void *_args)
  80. {
  81. STARPU_LU(common_u22)(descr, 1, _args);
  82. }
  83. #endif /* STARPU_USE_CUDA */
  84. static struct starpu_perfmodel STARPU_LU(model_22) =
  85. {
  86. .type = STARPU_HISTORY_BASED,
  87. #ifdef STARPU_ATLAS
  88. .symbol = STARPU_LU_STR(lu_model_22_atlas)
  89. #elif defined(STARPU_GOTO)
  90. .symbol = STARPU_LU_STR(lu_model_22_goto)
  91. #elif defined(STARPU_OPENBLAS)
  92. .symbol = STARPU_LU_STR(lu_model_22_openblas)
  93. #else
  94. .symbol = STARPU_LU_STR(lu_model_22)
  95. #endif
  96. };
  97. #ifdef STARPU_USE_CUDA
  98. static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
  99. {
  100. (void)task;
  101. (void)nimpl;
  102. enum starpu_worker_archtype type = starpu_worker_get_type(workerid);
  103. if (type == STARPU_CPU_WORKER || type == STARPU_MIC_WORKER)
  104. return 1;
  105. #ifdef STARPU_SIMGRID
  106. /* We don't know, let's assume it can */
  107. return 1;
  108. #else
  109. /* Cuda device */
  110. const struct cudaDeviceProp *props;
  111. props = starpu_cuda_get_device_properties(workerid);
  112. if (props->major >= 2 || props->minor >= 3)
  113. {
  114. /* At least compute capability 1.3, supports doubles */
  115. return 1;
  116. }
  117. else
  118. {
  119. /* Old card does not support doubles */
  120. return 0;
  121. }
  122. #endif
  123. }
  124. #endif
  125. #define STRINGIFY_(x) #x
  126. #define STRINGIFY(x) STRINGIFY_(x)
  127. struct starpu_codelet cl22 =
  128. {
  129. .cpu_funcs = {STARPU_LU(cpu_u22)},
  130. .cpu_funcs_name = {STRINGIFY(STARPU_LU(cpu_u22))},
  131. #ifdef STARPU_USE_CUDA
  132. .cuda_funcs = {STARPU_LU(cublas_u22)},
  133. CAN_EXECUTE
  134. #elif defined(STARPU_SIMGRID)
  135. .cuda_funcs = {(void*)1},
  136. #endif
  137. .cuda_flags = {STARPU_CUDA_ASYNC},
  138. .nbuffers = 3,
  139. .modes = {STARPU_R, STARPU_R, STARPU_RW},
  140. .model = &STARPU_LU(model_22)
  141. };
  142. /*
  143. * U12
  144. */
  145. static inline void STARPU_LU(common_u12)(void *descr[], int s, void *_args)
  146. {
  147. (void)_args;
  148. TYPE *sub11;
  149. TYPE *sub12;
  150. sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
  151. sub12 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
  152. unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
  153. unsigned ld12 = STARPU_MATRIX_GET_LD(descr[1]);
  154. unsigned nx12 = STARPU_MATRIX_GET_NX(descr[1]);
  155. unsigned ny12 = STARPU_MATRIX_GET_NY(descr[1]);
  156. #ifdef STARPU_USE_CUDA
  157. cublasStatus status;
  158. cudaError_t cures;
  159. #endif
  160. /* solve L11 U12 = A12 (find U12) */
  161. switch (s)
  162. {
  163. case 0:
  164. CPU_TRSM("L", "L", "N", "N", nx12, ny12,
  165. (TYPE)1.0, sub11, ld11, sub12, ld12);
  166. break;
  167. #ifdef STARPU_USE_CUDA
  168. case 1:
  169. status = CUBLAS_TRSM(starpu_cublas_get_local_handle(),
  170. CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
  171. ny12, nx12,
  172. (CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub12, ld12);
  173. if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
  174. STARPU_CUBLAS_REPORT_ERROR(status);
  175. break;
  176. #endif
  177. default:
  178. STARPU_ABORT();
  179. break;
  180. }
  181. }
  182. void STARPU_LU(cpu_u12)(void *descr[], void *_args)
  183. {
  184. STARPU_LU(common_u12)(descr, 0, _args);
  185. }
  186. #ifdef STARPU_USE_CUDA
  187. void STARPU_LU(cublas_u12)(void *descr[], void *_args)
  188. {
  189. STARPU_LU(common_u12)(descr, 1, _args);
  190. }
  191. #endif /* STARPU_USE_CUDA */
  192. static struct starpu_perfmodel STARPU_LU(model_12) =
  193. {
  194. .type = STARPU_HISTORY_BASED,
  195. #ifdef STARPU_ATLAS
  196. .symbol = STARPU_LU_STR(lu_model_12_atlas)
  197. #elif defined(STARPU_GOTO)
  198. .symbol = STARPU_LU_STR(lu_model_12_goto)
  199. #elif defined(STARPU_OPENBLAS)
  200. .symbol = STARPU_LU_STR(lu_model_12_openblas)
  201. #else
  202. .symbol = STARPU_LU_STR(lu_model_12)
  203. #endif
  204. };
  205. struct starpu_codelet cl12 =
  206. {
  207. .cpu_funcs = {STARPU_LU(cpu_u12)},
  208. .cpu_funcs_name = {STRINGIFY(STARPU_LU(cpu_u12))},
  209. #ifdef STARPU_USE_CUDA
  210. .cuda_funcs = {STARPU_LU(cublas_u12)},
  211. CAN_EXECUTE
  212. #elif defined(STARPU_SIMGRID)
  213. .cuda_funcs = {(void*)1},
  214. #endif
  215. .cuda_flags = {STARPU_CUDA_ASYNC},
  216. .nbuffers = 2,
  217. .modes = {STARPU_R, STARPU_RW},
  218. .model = &STARPU_LU(model_12)
  219. };
  220. /*
  221. * U21
  222. */
  223. static inline void STARPU_LU(common_u21)(void *descr[], int s, void *_args)
  224. {
  225. (void)_args;
  226. TYPE *sub11;
  227. TYPE *sub21;
  228. sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
  229. sub21 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
  230. unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
  231. unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);
  232. unsigned nx21 = STARPU_MATRIX_GET_NX(descr[1]);
  233. unsigned ny21 = STARPU_MATRIX_GET_NY(descr[1]);
  234. #ifdef STARPU_USE_CUDA
  235. cublasStatus status;
  236. #endif
  237. switch (s)
  238. {
  239. case 0:
  240. CPU_TRSM("R", "U", "N", "U", nx21, ny21,
  241. (TYPE)1.0, sub11, ld11, sub21, ld21);
  242. break;
  243. #ifdef STARPU_USE_CUDA
  244. case 1:
  245. status = CUBLAS_TRSM(starpu_cublas_get_local_handle(),
  246. CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_DIAG_UNIT,
  247. ny21, nx21,
  248. (CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub21, ld21);
  249. if (status != CUBLAS_STATUS_SUCCESS)
  250. STARPU_CUBLAS_REPORT_ERROR(status);
  251. break;
  252. #endif
  253. default:
  254. STARPU_ABORT();
  255. break;
  256. }
  257. }
  258. void STARPU_LU(cpu_u21)(void *descr[], void *_args)
  259. {
  260. STARPU_LU(common_u21)(descr, 0, _args);
  261. }
  262. #ifdef STARPU_USE_CUDA
  263. void STARPU_LU(cublas_u21)(void *descr[], void *_args)
  264. {
  265. STARPU_LU(common_u21)(descr, 1, _args);
  266. }
  267. #endif
  268. static struct starpu_perfmodel STARPU_LU(model_21) =
  269. {
  270. .type = STARPU_HISTORY_BASED,
  271. #ifdef STARPU_ATLAS
  272. .symbol = STARPU_LU_STR(lu_model_21_atlas)
  273. #elif defined(STARPU_GOTO)
  274. .symbol = STARPU_LU_STR(lu_model_21_goto)
  275. #elif defined(STARPU_OPENBLAS)
  276. .symbol = STARPU_LU_STR(lu_model_21_openblas)
  277. #else
  278. .symbol = STARPU_LU_STR(lu_model_21)
  279. #endif
  280. };
  281. struct starpu_codelet cl21 =
  282. {
  283. .cpu_funcs = {STARPU_LU(cpu_u21)},
  284. .cpu_funcs_name = {STRINGIFY(STARPU_LU(cpu_u21))},
  285. #ifdef STARPU_USE_CUDA
  286. .cuda_funcs = {STARPU_LU(cublas_u21)},
  287. CAN_EXECUTE
  288. #elif defined(STARPU_SIMGRID)
  289. .cuda_funcs = {(void*)1},
  290. #endif
  291. .cuda_flags = {STARPU_CUDA_ASYNC},
  292. .nbuffers = 2,
  293. .modes = {STARPU_R, STARPU_RW},
  294. .model = &STARPU_LU(model_21)
  295. };
  296. /*
  297. * U11
  298. */
  299. static inline void STARPU_LU(common_u11)(void *descr[], int s, void *_args)
  300. {
  301. (void)_args;
  302. TYPE *sub11;
  303. sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
  304. unsigned long nx = STARPU_MATRIX_GET_NX(descr[0]);
  305. unsigned long ld = STARPU_MATRIX_GET_LD(descr[0]);
  306. unsigned long z;
  307. #ifdef STARPU_USE_CUDA
  308. cublasStatus status;
  309. cublasHandle_t handle;
  310. cudaStream_t stream;
  311. #endif
  312. switch (s)
  313. {
  314. case 0:
  315. for (z = 0; z < nx; z++)
  316. {
  317. TYPE pivot;
  318. pivot = sub11[z+z*ld];
  319. STARPU_ASSERT(!ISZERO(pivot));
  320. CPU_SCAL(nx - z - 1, (1.0/pivot), &sub11[z+(z+1)*ld], ld);
  321. CPU_GER(nx - z - 1, nx - z - 1, -1.0,
  322. &sub11[(z+1)+z*ld], 1,
  323. &sub11[z+(z+1)*ld], ld,
  324. &sub11[(z+1) + (z+1)*ld],ld);
  325. }
  326. break;
  327. #ifdef STARPU_USE_CUDA
  328. case 1:
  329. handle = starpu_cublas_get_local_handle();
  330. stream = starpu_cuda_get_local_stream();
  331. for (z = 0; z < nx; z++)
  332. {
  333. TYPE pivot;
  334. TYPE inv_pivot;
  335. cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, stream);
  336. cudaStreamSynchronize(stream);
  337. STARPU_ASSERT(!ISZERO(pivot));
  338. inv_pivot = 1.0/pivot;
  339. status = CUBLAS_SCAL(handle,
  340. nx - z - 1,
  341. (CUBLAS_TYPE*)&inv_pivot, (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
  342. if (status != CUBLAS_STATUS_SUCCESS)
  343. STARPU_CUBLAS_REPORT_ERROR(status);
  344. status = CUBLAS_GER(handle,
  345. nx - z - 1, nx - z - 1,
  346. (CUBLAS_TYPE*)&m1,
  347. (CUBLAS_TYPE*)&sub11[(z+1)+z*ld], 1,
  348. (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld,
  349. (CUBLAS_TYPE*)&sub11[(z+1) + (z+1)*ld],ld);
  350. if (status != CUBLAS_STATUS_SUCCESS)
  351. STARPU_CUBLAS_REPORT_ERROR(status);
  352. }
  353. cudaStreamSynchronize(stream);
  354. break;
  355. #endif
  356. default:
  357. STARPU_ABORT();
  358. break;
  359. }
  360. }
  361. void STARPU_LU(cpu_u11)(void *descr[], void *_args)
  362. {
  363. STARPU_LU(common_u11)(descr, 0, _args);
  364. }
  365. #ifdef STARPU_USE_CUDA
  366. void STARPU_LU(cublas_u11)(void *descr[], void *_args)
  367. {
  368. STARPU_LU(common_u11)(descr, 1, _args);
  369. }
  370. #endif /* STARPU_USE_CUDA */
  371. static struct starpu_perfmodel STARPU_LU(model_11) =
  372. {
  373. .type = STARPU_HISTORY_BASED,
  374. #ifdef STARPU_ATLAS
  375. .symbol = STARPU_LU_STR(lu_model_11_atlas)
  376. #elif defined(STARPU_GOTO)
  377. .symbol = STARPU_LU_STR(lu_model_11_goto)
  378. #elif defined(STARPU_OPENBLAS)
  379. .symbol = STARPU_LU_STR(lu_model_11_openblas)
  380. #else
  381. .symbol = STARPU_LU_STR(lu_model_11)
  382. #endif
  383. };
  384. struct starpu_codelet cl11 =
  385. {
  386. .cpu_funcs = {STARPU_LU(cpu_u11)},
  387. .cpu_funcs_name = {STRINGIFY(STARPU_LU(cpu_u11))},
  388. #ifdef STARPU_USE_CUDA
  389. .cuda_funcs = {STARPU_LU(cublas_u11)},
  390. CAN_EXECUTE
  391. #elif defined(STARPU_SIMGRID)
  392. .cuda_funcs = {(void*)1},
  393. #endif
  394. .nbuffers = 1,
  395. .modes = {STARPU_RW},
  396. .model = &STARPU_LU(model_11)
  397. };
  398. /*
  399. * U11 with pivoting
  400. */
  401. static inline void STARPU_LU(common_u11_pivot)(void *descr[],
  402. int s, void *_args)
  403. {
  404. TYPE *sub11;
  405. sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
  406. unsigned long nx = STARPU_MATRIX_GET_NX(descr[0]);
  407. unsigned long ld = STARPU_MATRIX_GET_LD(descr[0]);
  408. unsigned long z;
  409. struct piv_s *piv = _args;
  410. unsigned *ipiv = piv->piv;
  411. unsigned first = piv->first;
  412. #ifdef STARPU_USE_CUDA
  413. cublasStatus status;
  414. cublasHandle_t handle;
  415. cudaStream_t stream;
  416. #endif
  417. switch (s)
  418. {
  419. case 0:
  420. for (z = 0; z < nx; z++)
  421. {
  422. TYPE pivot;
  423. pivot = sub11[z+z*ld];
  424. if (fabs((double)(pivot)) < PIVOT_THRESHHOLD)
  425. {
  426. /* find the pivot */
  427. int piv_ind = CPU_IAMAX(nx - z, &sub11[z*(ld+1)], ld);
  428. ipiv[z + first] = piv_ind + z + first;
  429. /* swap if needed */
  430. if (piv_ind != 0)
  431. {
  432. CPU_SWAP(nx, &sub11[z*ld], 1, &sub11[(z+piv_ind)*ld], 1);
  433. }
  434. pivot = sub11[z+z*ld];
  435. }
  436. STARPU_ASSERT(!ISZERO(pivot));
  437. CPU_SCAL(nx - z - 1, (1.0/pivot), &sub11[z+(z+1)*ld], ld);
  438. CPU_GER(nx - z - 1, nx - z - 1, -1.0,
  439. &sub11[(z+1)+z*ld], 1,
  440. &sub11[z+(z+1)*ld], ld,
  441. &sub11[(z+1) + (z+1)*ld],ld);
  442. }
  443. break;
  444. #ifdef STARPU_USE_CUDA
  445. case 1:
  446. handle = starpu_cublas_get_local_handle();
  447. stream = starpu_cuda_get_local_stream();
  448. for (z = 0; z < nx; z++)
  449. {
  450. TYPE pivot;
  451. TYPE inv_pivot;
  452. cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, stream);
  453. cudaStreamSynchronize(stream);
  454. if (fabs((double)(pivot)) < PIVOT_THRESHHOLD)
  455. {
  456. /* find the pivot */
  457. int piv_ind;
  458. status = CUBLAS_IAMAX(handle,
  459. nx - z, (CUBLAS_TYPE*)&sub11[z*(ld+1)], ld, &piv_ind);
  460. piv_ind -= 1;
  461. if (status != CUBLAS_STATUS_SUCCESS)
  462. STARPU_CUBLAS_REPORT_ERROR(status);
  463. ipiv[z + first] = piv_ind + z + first;
  464. /* swap if needed */
  465. if (piv_ind != 0)
  466. {
  467. status = CUBLAS_SWAP(handle,
  468. nx,
  469. (CUBLAS_TYPE*)&sub11[z*ld], 1,
  470. (CUBLAS_TYPE*)&sub11[(z+piv_ind)*ld], 1);
  471. if (status != CUBLAS_STATUS_SUCCESS)
  472. STARPU_CUBLAS_REPORT_ERROR(status);
  473. }
  474. cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, stream);
  475. cudaStreamSynchronize(stream);
  476. }
  477. STARPU_ASSERT(!ISZERO(pivot));
  478. inv_pivot = 1.0/pivot;
  479. status = CUBLAS_SCAL(handle,
  480. nx - z - 1,
  481. (CUBLAS_TYPE*)&inv_pivot,
  482. (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
  483. if (status != CUBLAS_STATUS_SUCCESS)
  484. STARPU_CUBLAS_REPORT_ERROR(status);
  485. status = CUBLAS_GER(handle,
  486. nx - z - 1, nx - z - 1,
  487. (CUBLAS_TYPE*)&m1,
  488. (CUBLAS_TYPE*)&sub11[(z+1)+z*ld], 1,
  489. (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld,
  490. (CUBLAS_TYPE*)&sub11[(z+1) + (z+1)*ld],ld);
  491. if (status != CUBLAS_STATUS_SUCCESS)
  492. STARPU_CUBLAS_REPORT_ERROR(status);
  493. }
  494. cudaStreamSynchronize(stream);
  495. break;
  496. #endif
  497. default:
  498. STARPU_ABORT();
  499. break;
  500. }
  501. }
  502. void STARPU_LU(cpu_u11_pivot)(void *descr[], void *_args)
  503. {
  504. STARPU_LU(common_u11_pivot)(descr, 0, _args);
  505. }
  506. #ifdef STARPU_USE_CUDA
  507. void STARPU_LU(cublas_u11_pivot)(void *descr[], void *_args)
  508. {
  509. STARPU_LU(common_u11_pivot)(descr, 1, _args);
  510. }
  511. #endif /* STARPU_USE_CUDA */
  512. static struct starpu_perfmodel STARPU_LU(model_11_pivot) =
  513. {
  514. .type = STARPU_HISTORY_BASED,
  515. #ifdef STARPU_ATLAS
  516. .symbol = STARPU_LU_STR(lu_model_11_pivot_atlas)
  517. #elif defined(STARPU_GOTO)
  518. .symbol = STARPU_LU_STR(lu_model_11_pivot_goto)
  519. #elif defined(STARPU_OPENBLAS)
  520. .symbol = STARPU_LU_STR(lu_model_11_pivot_openblas)
  521. #else
  522. .symbol = STARPU_LU_STR(lu_model_11_pivot)
  523. #endif
  524. };
  525. struct starpu_codelet cl11_pivot =
  526. {
  527. .cpu_funcs = {STARPU_LU(cpu_u11_pivot)},
  528. .cpu_funcs_name = {STRINGIFY(STARPU_LU(cpu_u11_pivot))},
  529. #ifdef STARPU_USE_CUDA
  530. .cuda_funcs = {STARPU_LU(cublas_u11_pivot)},
  531. CAN_EXECUTE
  532. #elif defined(STARPU_SIMGRID)
  533. .cuda_funcs = {(void*)1},
  534. #endif
  535. .nbuffers = 1,
  536. .modes = {STARPU_RW},
  537. .model = &STARPU_LU(model_11_pivot)
  538. };
  539. /*
  540. * Pivoting
  541. */
  542. static inline void STARPU_LU(common_pivot)(void *descr[],
  543. int s, void *_args)
  544. {
  545. TYPE *matrix;
  546. matrix = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
  547. unsigned long nx = STARPU_MATRIX_GET_NX(descr[0]);
  548. unsigned long ld = STARPU_MATRIX_GET_LD(descr[0]);
  549. unsigned row;
  550. struct piv_s *piv = _args;
  551. unsigned *ipiv = piv->piv;
  552. unsigned first = piv->first;
  553. #ifdef STARPU_USE_CUDA
  554. cublasStatus status;
  555. cublasHandle_t handle;
  556. #endif
  557. switch (s)
  558. {
  559. case 0:
  560. for (row = 0; row < nx; row++)
  561. {
  562. unsigned rowpiv = ipiv[row+first] - first;
  563. if (rowpiv != row)
  564. {
  565. CPU_SWAP(nx, &matrix[row*ld], 1, &matrix[rowpiv*ld], 1);
  566. }
  567. }
  568. break;
  569. #ifdef STARPU_USE_CUDA
  570. case 1:
  571. handle = starpu_cublas_get_local_handle();
  572. for (row = 0; row < nx; row++)
  573. {
  574. unsigned rowpiv = ipiv[row+first] - first;
  575. if (rowpiv != row)
  576. {
  577. status = CUBLAS_SWAP(handle,
  578. nx,
  579. (CUBLAS_TYPE*)&matrix[row*ld], 1,
  580. (CUBLAS_TYPE*)&matrix[rowpiv*ld], 1);
  581. if (status != CUBLAS_STATUS_SUCCESS)
  582. STARPU_CUBLAS_REPORT_ERROR(status);
  583. }
  584. }
  585. break;
  586. #endif
  587. default:
  588. STARPU_ABORT();
  589. break;
  590. }
  591. }
  592. void STARPU_LU(cpu_pivot)(void *descr[], void *_args)
  593. {
  594. STARPU_LU(common_pivot)(descr, 0, _args);
  595. }
  596. #ifdef STARPU_USE_CUDA
  597. void STARPU_LU(cublas_pivot)(void *descr[], void *_args)
  598. {
  599. STARPU_LU(common_pivot)(descr, 1, _args);
  600. }
  601. #endif /* STARPU_USE_CUDA */
  602. static struct starpu_perfmodel STARPU_LU(model_pivot) =
  603. {
  604. .type = STARPU_HISTORY_BASED,
  605. #ifdef STARPU_ATLAS
  606. .symbol = STARPU_LU_STR(lu_model_pivot_atlas)
  607. #elif defined(STARPU_GOTO)
  608. .symbol = STARPU_LU_STR(lu_model_pivot_goto)
  609. #elif defined(STARPU_OPENBLAS)
  610. .symbol = STARPU_LU_STR(lu_model_pivot_openblas)
  611. #else
  612. .symbol = STARPU_LU_STR(lu_model_pivot)
  613. #endif
  614. };
  615. struct starpu_codelet cl_pivot =
  616. {
  617. .cpu_funcs = {STARPU_LU(cpu_pivot)},
  618. .cpu_funcs_name = {STRINGIFY(STARPU_LU(cpu_pivot))},
  619. #ifdef STARPU_USE_CUDA
  620. .cuda_funcs = {STARPU_LU(cublas_pivot)},
  621. CAN_EXECUTE
  622. #elif defined(STARPU_SIMGRID)
  623. .cuda_funcs = {(void*)1},
  624. #endif
  625. .cuda_flags = {STARPU_CUDA_ASYNC},
  626. .nbuffers = 1,
  627. .modes = {STARPU_RW},
  628. .model = &STARPU_LU(model_pivot)
  629. };