xlu_kernels.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009, 2010-2012, 2014-2016 Université de Bordeaux
  4. * Copyright (C) 2010, 2011, 2012, 2015 CNRS
  5. *
  6. * StarPU is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU Lesser General Public License as published by
  8. * the Free Software Foundation; either version 2.1 of the License, or (at
  9. * your option) any later version.
  10. *
  11. * StarPU is distributed in the hope that it will be useful, but
  12. * WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14. *
  15. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  16. */
  17. /* LU Kernels */
  18. #include "xlu.h"
  19. #include <math.h>
  20. #include <complex.h>
  21. #define str(s) #s
  22. #define xstr(s) str(s)
  23. #define STARPU_LU_STR(name) xstr(STARPU_LU(name))
  24. #ifdef STARPU_USE_CUDA
  25. static const TYPE p1 = 1.0f;
  26. static const TYPE m1 = -1.0f;
  27. #endif
  28. /*
  29. * U22
  30. */
  31. static inline void STARPU_LU(common_u22)(void *descr[],
  32. int s, STARPU_ATTRIBUTE_UNUSED void *_args)
  33. {
  34. TYPE *right = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
  35. TYPE *left = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
  36. TYPE *center = (TYPE *)STARPU_MATRIX_GET_PTR(descr[2]);
  37. unsigned dx = STARPU_MATRIX_GET_NX(descr[2]);
  38. unsigned dy = STARPU_MATRIX_GET_NY(descr[2]);
  39. unsigned dz = STARPU_MATRIX_GET_NY(descr[0]);
  40. unsigned ld12 = STARPU_MATRIX_GET_LD(descr[0]);
  41. unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);
  42. unsigned ld22 = STARPU_MATRIX_GET_LD(descr[2]);
  43. #ifdef STARPU_USE_CUDA
  44. cublasStatus status;
  45. cudaError_t cures;
  46. #endif
  47. switch (s)
  48. {
  49. case 0:
  50. CPU_GEMM("N", "N", dy, dx, dz,
  51. (TYPE)-1.0, right, ld21, left, ld12,
  52. (TYPE)1.0, center, ld22);
  53. break;
  54. #ifdef STARPU_USE_CUDA
  55. case 1:
  56. {
  57. CUBLAS_GEMM('n', 'n', dx, dy, dz,
  58. *(CUBLAS_TYPE*)&m1, (CUBLAS_TYPE *)right, ld21, (CUBLAS_TYPE *)left, ld12,
  59. *(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE *)center, ld22);
  60. status = cublasGetError();
  61. if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
  62. STARPU_CUBLAS_REPORT_ERROR(status);
  63. break;
  64. }
  65. #endif
  66. default:
  67. STARPU_ABORT();
  68. break;
  69. }
  70. }
  71. void STARPU_LU(cpu_u22)(void *descr[], void *_args)
  72. {
  73. STARPU_LU(common_u22)(descr, 0, _args);
  74. }
  75. #ifdef STARPU_USE_CUDA
  76. void STARPU_LU(cublas_u22)(void *descr[], void *_args)
  77. {
  78. STARPU_LU(common_u22)(descr, 1, _args);
  79. }
  80. #endif /* STARPU_USE_CUDA */
  81. static struct starpu_perfmodel STARPU_LU(model_22) =
  82. {
  83. .type = STARPU_HISTORY_BASED,
  84. #ifdef STARPU_ATLAS
  85. .symbol = STARPU_LU_STR(lu_model_22_atlas)
  86. #elif defined(STARPU_GOTO)
  87. .symbol = STARPU_LU_STR(lu_model_22_goto)
  88. #else
  89. .symbol = STARPU_LU_STR(lu_model_22)
  90. #endif
  91. };
  92. #ifdef STARPU_USE_CUDA
  93. static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
  94. {
  95. enum starpu_worker_archtype type = starpu_worker_get_type(workerid);
  96. if (type == STARPU_CPU_WORKER || type == STARPU_MIC_WORKER || type == STARPU_SCC_WORKER)
  97. return 1;
  98. #ifdef STARPU_SIMGRID
  99. /* We don't know, let's assume it can */
  100. return 1;
  101. #else
  102. /* Cuda device */
  103. const struct cudaDeviceProp *props;
  104. props = starpu_cuda_get_device_properties(workerid);
  105. if (props->major >= 2 || props->minor >= 3)
  106. {
  107. /* At least compute capability 1.3, supports doubles */
  108. return 1;
  109. }
  110. else
  111. {
  112. /* Old card does not support doubles */
  113. return 0;
  114. }
  115. #endif
  116. }
  117. #endif
  118. #define STRINGIFY_(x) #x
  119. #define STRINGIFY(x) STRINGIFY_(x)
  120. struct starpu_codelet cl22 =
  121. {
  122. .cpu_funcs = {STARPU_LU(cpu_u22)},
  123. .cpu_funcs_name = {STRINGIFY(STARPU_LU(cpu_u22))},
  124. #ifdef STARPU_USE_CUDA
  125. .cuda_funcs = {STARPU_LU(cublas_u22)},
  126. CAN_EXECUTE
  127. #elif defined(STARPU_SIMGRID)
  128. .cuda_funcs = {(void*)1},
  129. #endif
  130. .cuda_flags = {STARPU_CUDA_ASYNC},
  131. .nbuffers = 3,
  132. .modes = {STARPU_R, STARPU_R, STARPU_RW},
  133. .model = &STARPU_LU(model_22)
  134. };
  135. /*
  136. * U12
  137. */
  138. static inline void STARPU_LU(common_u12)(void *descr[],
  139. int s, STARPU_ATTRIBUTE_UNUSED void *_args)
  140. {
  141. TYPE *sub11;
  142. TYPE *sub12;
  143. sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
  144. sub12 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
  145. unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
  146. unsigned ld12 = STARPU_MATRIX_GET_LD(descr[1]);
  147. unsigned nx12 = STARPU_MATRIX_GET_NX(descr[1]);
  148. unsigned ny12 = STARPU_MATRIX_GET_NY(descr[1]);
  149. #ifdef STARPU_USE_CUDA
  150. cublasStatus status;
  151. cudaError_t cures;
  152. #endif
  153. /* solve L11 U12 = A12 (find U12) */
  154. switch (s)
  155. {
  156. case 0:
  157. CPU_TRSM("L", "L", "N", "N", nx12, ny12,
  158. (TYPE)1.0, sub11, ld11, sub12, ld12);
  159. break;
  160. #ifdef STARPU_USE_CUDA
  161. case 1:
  162. CUBLAS_TRSM('L', 'L', 'N', 'N', ny12, nx12,
  163. *(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub12, ld12);
  164. status = cublasGetError();
  165. if (STARPU_UNLIKELY(status != CUBLAS_STATUS_SUCCESS))
  166. STARPU_CUBLAS_REPORT_ERROR(status);
  167. break;
  168. #endif
  169. default:
  170. STARPU_ABORT();
  171. break;
  172. }
  173. }
  174. void STARPU_LU(cpu_u12)(void *descr[], void *_args)
  175. {
  176. STARPU_LU(common_u12)(descr, 0, _args);
  177. }
  178. #ifdef STARPU_USE_CUDA
  179. void STARPU_LU(cublas_u12)(void *descr[], void *_args)
  180. {
  181. STARPU_LU(common_u12)(descr, 1, _args);
  182. }
  183. #endif /* STARPU_USE_CUDA */
  184. static struct starpu_perfmodel STARPU_LU(model_12) =
  185. {
  186. .type = STARPU_HISTORY_BASED,
  187. #ifdef STARPU_ATLAS
  188. .symbol = STARPU_LU_STR(lu_model_12_atlas)
  189. #elif defined(STARPU_GOTO)
  190. .symbol = STARPU_LU_STR(lu_model_12_goto)
  191. #else
  192. .symbol = STARPU_LU_STR(lu_model_12)
  193. #endif
  194. };
  195. struct starpu_codelet cl12 =
  196. {
  197. .cpu_funcs = {STARPU_LU(cpu_u12)},
  198. .cpu_funcs_name = {STRINGIFY(STARPU_LU(cpu_u12))},
  199. #ifdef STARPU_USE_CUDA
  200. .cuda_funcs = {STARPU_LU(cublas_u12)},
  201. CAN_EXECUTE
  202. #elif defined(STARPU_SIMGRID)
  203. .cuda_funcs = {(void*)1},
  204. #endif
  205. .cuda_flags = {STARPU_CUDA_ASYNC},
  206. .nbuffers = 2,
  207. .modes = {STARPU_R, STARPU_RW},
  208. .model = &STARPU_LU(model_12)
  209. };
  210. /*
  211. * U21
  212. */
  213. static inline void STARPU_LU(common_u21)(void *descr[],
  214. int s, STARPU_ATTRIBUTE_UNUSED void *_args)
  215. {
  216. TYPE *sub11;
  217. TYPE *sub21;
  218. sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
  219. sub21 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
  220. unsigned ld11 = STARPU_MATRIX_GET_LD(descr[0]);
  221. unsigned ld21 = STARPU_MATRIX_GET_LD(descr[1]);
  222. unsigned nx21 = STARPU_MATRIX_GET_NX(descr[1]);
  223. unsigned ny21 = STARPU_MATRIX_GET_NY(descr[1]);
  224. #ifdef STARPU_USE_CUDA
  225. cublasStatus status;
  226. #endif
  227. switch (s)
  228. {
  229. case 0:
  230. CPU_TRSM("R", "U", "N", "U", nx21, ny21,
  231. (TYPE)1.0, sub11, ld11, sub21, ld21);
  232. break;
  233. #ifdef STARPU_USE_CUDA
  234. case 1:
  235. CUBLAS_TRSM('R', 'U', 'N', 'U', ny21, nx21,
  236. *(CUBLAS_TYPE*)&p1, (CUBLAS_TYPE*)sub11, ld11, (CUBLAS_TYPE*)sub21, ld21);
  237. status = cublasGetError();
  238. if (status != CUBLAS_STATUS_SUCCESS)
  239. STARPU_CUBLAS_REPORT_ERROR(status);
  240. break;
  241. #endif
  242. default:
  243. STARPU_ABORT();
  244. break;
  245. }
  246. }
  247. void STARPU_LU(cpu_u21)(void *descr[], void *_args)
  248. {
  249. STARPU_LU(common_u21)(descr, 0, _args);
  250. }
  251. #ifdef STARPU_USE_CUDA
  252. void STARPU_LU(cublas_u21)(void *descr[], void *_args)
  253. {
  254. STARPU_LU(common_u21)(descr, 1, _args);
  255. }
  256. #endif
  257. static struct starpu_perfmodel STARPU_LU(model_21) =
  258. {
  259. .type = STARPU_HISTORY_BASED,
  260. #ifdef STARPU_ATLAS
  261. .symbol = STARPU_LU_STR(lu_model_21_atlas)
  262. #elif defined(STARPU_GOTO)
  263. .symbol = STARPU_LU_STR(lu_model_21_goto)
  264. #else
  265. .symbol = STARPU_LU_STR(lu_model_21)
  266. #endif
  267. };
  268. struct starpu_codelet cl21 =
  269. {
  270. .cpu_funcs = {STARPU_LU(cpu_u21)},
  271. .cpu_funcs_name = {STRINGIFY(STARPU_LU(cpu_u21))},
  272. #ifdef STARPU_USE_CUDA
  273. .cuda_funcs = {STARPU_LU(cublas_u21)},
  274. CAN_EXECUTE
  275. #elif defined(STARPU_SIMGRID)
  276. .cuda_funcs = {(void*)1},
  277. #endif
  278. .cuda_flags = {STARPU_CUDA_ASYNC},
  279. .nbuffers = 2,
  280. .modes = {STARPU_R, STARPU_RW},
  281. .model = &STARPU_LU(model_21)
  282. };
  283. /*
  284. * U11
  285. */
  286. static inline void STARPU_LU(common_u11)(void *descr[],
  287. int s, STARPU_ATTRIBUTE_UNUSED void *_args)
  288. {
  289. TYPE *sub11;
  290. sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
  291. unsigned long nx = STARPU_MATRIX_GET_NX(descr[0]);
  292. unsigned long ld = STARPU_MATRIX_GET_LD(descr[0]);
  293. unsigned long z;
  294. switch (s)
  295. {
  296. case 0:
  297. for (z = 0; z < nx; z++)
  298. {
  299. TYPE pivot;
  300. pivot = sub11[z+z*ld];
  301. #ifdef COMPLEX_LU
  302. STARPU_ASSERT(fpclassify(creal(pivot)) != FP_ZERO);
  303. STARPU_ASSERT(fpclassify(cimag(pivot)) != FP_ZERO);
  304. #else
  305. STARPU_ASSERT(fpclassify(pivot) != FP_ZERO);
  306. #endif
  307. CPU_SCAL(nx - z - 1, (1.0/pivot), &sub11[z+(z+1)*ld], ld);
  308. CPU_GER(nx - z - 1, nx - z - 1, -1.0,
  309. &sub11[(z+1)+z*ld], 1,
  310. &sub11[z+(z+1)*ld], ld,
  311. &sub11[(z+1) + (z+1)*ld],ld);
  312. }
  313. break;
  314. #ifdef STARPU_USE_CUDA
  315. case 1:
  316. for (z = 0; z < nx; z++)
  317. {
  318. TYPE pivot;
  319. TYPE inv_pivot;
  320. cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
  321. cudaStreamSynchronize(starpu_cuda_get_local_stream());
  322. #ifdef COMPLEX_LU
  323. STARPU_ASSERT(fpclassify(creal(pivot)) != FP_ZERO);
  324. STARPU_ASSERT(fpclassify(cimag(pivot)) != FP_ZERO);
  325. #else
  326. STARPU_ASSERT(fpclassify(pivot) != FP_ZERO);
  327. #endif
  328. inv_pivot = 1.0/pivot;
  329. CUBLAS_SCAL(nx - z - 1, *(CUBLAS_TYPE*)&inv_pivot, (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
  330. CUBLAS_GER(nx - z - 1, nx - z - 1, *(CUBLAS_TYPE*)&m1,
  331. (CUBLAS_TYPE*)&sub11[(z+1)+z*ld], 1,
  332. (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld,
  333. (CUBLAS_TYPE*)&sub11[(z+1) + (z+1)*ld],ld);
  334. }
  335. cudaStreamSynchronize(starpu_cuda_get_local_stream());
  336. break;
  337. #endif
  338. default:
  339. STARPU_ABORT();
  340. break;
  341. }
  342. }
  343. void STARPU_LU(cpu_u11)(void *descr[], void *_args)
  344. {
  345. STARPU_LU(common_u11)(descr, 0, _args);
  346. }
  347. #ifdef STARPU_USE_CUDA
  348. void STARPU_LU(cublas_u11)(void *descr[], void *_args)
  349. {
  350. STARPU_LU(common_u11)(descr, 1, _args);
  351. }
  352. #endif /* STARPU_USE_CUDA */
  353. static struct starpu_perfmodel STARPU_LU(model_11) =
  354. {
  355. .type = STARPU_HISTORY_BASED,
  356. #ifdef STARPU_ATLAS
  357. .symbol = STARPU_LU_STR(lu_model_11_atlas)
  358. #elif defined(STARPU_GOTO)
  359. .symbol = STARPU_LU_STR(lu_model_11_goto)
  360. #else
  361. .symbol = STARPU_LU_STR(lu_model_11)
  362. #endif
  363. };
  364. struct starpu_codelet cl11 =
  365. {
  366. .cpu_funcs = {STARPU_LU(cpu_u11)},
  367. .cpu_funcs_name = {STRINGIFY(STARPU_LU(cpu_u11))},
  368. #ifdef STARPU_USE_CUDA
  369. .cuda_funcs = {STARPU_LU(cublas_u11)},
  370. CAN_EXECUTE
  371. #elif defined(STARPU_SIMGRID)
  372. .cuda_funcs = {(void*)1},
  373. #endif
  374. .nbuffers = 1,
  375. .modes = {STARPU_RW},
  376. .model = &STARPU_LU(model_11)
  377. };
  378. /*
  379. * U11 with pivoting
  380. */
  381. static inline void STARPU_LU(common_u11_pivot)(void *descr[],
  382. int s, void *_args)
  383. {
  384. TYPE *sub11;
  385. sub11 = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
  386. unsigned long nx = STARPU_MATRIX_GET_NX(descr[0]);
  387. unsigned long ld = STARPU_MATRIX_GET_LD(descr[0]);
  388. unsigned long z;
  389. struct piv_s *piv = _args;
  390. unsigned *ipiv = piv->piv;
  391. unsigned first = piv->first;
  392. switch (s)
  393. {
  394. case 0:
  395. for (z = 0; z < nx; z++)
  396. {
  397. TYPE pivot;
  398. pivot = sub11[z+z*ld];
  399. if (fabs((double)(pivot)) < PIVOT_THRESHHOLD)
  400. {
  401. /* find the pivot */
  402. int piv_ind = CPU_IAMAX(nx - z, &sub11[z*(ld+1)], ld);
  403. ipiv[z + first] = piv_ind + z + first;
  404. /* swap if needed */
  405. if (piv_ind != 0)
  406. {
  407. CPU_SWAP(nx, &sub11[z*ld], 1, &sub11[(z+piv_ind)*ld], 1);
  408. }
  409. pivot = sub11[z+z*ld];
  410. }
  411. STARPU_ASSERT(pivot != 0.0);
  412. CPU_SCAL(nx - z - 1, (1.0/pivot), &sub11[z+(z+1)*ld], ld);
  413. CPU_GER(nx - z - 1, nx - z - 1, -1.0,
  414. &sub11[(z+1)+z*ld], 1,
  415. &sub11[z+(z+1)*ld], ld,
  416. &sub11[(z+1) + (z+1)*ld],ld);
  417. }
  418. break;
  419. #ifdef STARPU_USE_CUDA
  420. case 1:
  421. for (z = 0; z < nx; z++)
  422. {
  423. TYPE pivot;
  424. TYPE inv_pivot;
  425. cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
  426. cudaStreamSynchronize(starpu_cuda_get_local_stream());
  427. if (fabs((double)(pivot)) < PIVOT_THRESHHOLD)
  428. {
  429. /* find the pivot */
  430. int piv_ind = CUBLAS_IAMAX(nx - z, (CUBLAS_TYPE*)&sub11[z*(ld+1)], ld) - 1;
  431. ipiv[z + first] = piv_ind + z + first;
  432. /* swap if needed */
  433. if (piv_ind != 0)
  434. {
  435. CUBLAS_SWAP(nx, (CUBLAS_TYPE*)&sub11[z*ld], 1, (CUBLAS_TYPE*)&sub11[(z+piv_ind)*ld], 1);
  436. }
  437. cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
  438. cudaStreamSynchronize(starpu_cuda_get_local_stream());
  439. }
  440. STARPU_ASSERT(pivot != 0.0);
  441. inv_pivot = 1.0/pivot;
  442. CUBLAS_SCAL(nx - z - 1, *(CUBLAS_TYPE*)&inv_pivot, (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld);
  443. CUBLAS_GER(nx - z - 1, nx - z - 1, *(CUBLAS_TYPE*)&m1,
  444. (CUBLAS_TYPE*)&sub11[(z+1)+z*ld], 1,
  445. (CUBLAS_TYPE*)&sub11[z+(z+1)*ld], ld,
  446. (CUBLAS_TYPE*)&sub11[(z+1) + (z+1)*ld],ld);
  447. }
  448. cudaStreamSynchronize(starpu_cuda_get_local_stream());
  449. break;
  450. #endif
  451. default:
  452. STARPU_ABORT();
  453. break;
  454. }
  455. }
  456. void STARPU_LU(cpu_u11_pivot)(void *descr[], void *_args)
  457. {
  458. STARPU_LU(common_u11_pivot)(descr, 0, _args);
  459. }
  460. #ifdef STARPU_USE_CUDA
  461. void STARPU_LU(cublas_u11_pivot)(void *descr[], void *_args)
  462. {
  463. STARPU_LU(common_u11_pivot)(descr, 1, _args);
  464. }
  465. #endif /* STARPU_USE_CUDA */
  466. static struct starpu_perfmodel STARPU_LU(model_11_pivot) =
  467. {
  468. .type = STARPU_HISTORY_BASED,
  469. #ifdef STARPU_ATLAS
  470. .symbol = STARPU_LU_STR(lu_model_11_pivot_atlas)
  471. #elif defined(STARPU_GOTO)
  472. .symbol = STARPU_LU_STR(lu_model_11_pivot_goto)
  473. #else
  474. .symbol = STARPU_LU_STR(lu_model_11_pivot)
  475. #endif
  476. };
  477. struct starpu_codelet cl11_pivot =
  478. {
  479. .cpu_funcs = {STARPU_LU(cpu_u11_pivot)},
  480. .cpu_funcs_name = {STRINGIFY(STARPU_LU(cpu_u11_pivot))},
  481. #ifdef STARPU_USE_CUDA
  482. .cuda_funcs = {STARPU_LU(cublas_u11_pivot)},
  483. CAN_EXECUTE
  484. #elif defined(STARPU_SIMGRID)
  485. .cuda_funcs = {(void*)1},
  486. #endif
  487. .nbuffers = 1,
  488. .modes = {STARPU_RW},
  489. .model = &STARPU_LU(model_11_pivot)
  490. };
  491. /*
  492. * Pivoting
  493. */
  494. static inline void STARPU_LU(common_pivot)(void *descr[],
  495. int s, void *_args)
  496. {
  497. TYPE *matrix;
  498. matrix = (TYPE *)STARPU_MATRIX_GET_PTR(descr[0]);
  499. unsigned long nx = STARPU_MATRIX_GET_NX(descr[0]);
  500. unsigned long ld = STARPU_MATRIX_GET_LD(descr[0]);
  501. unsigned row;
  502. struct piv_s *piv = _args;
  503. unsigned *ipiv = piv->piv;
  504. unsigned first = piv->first;
  505. switch (s)
  506. {
  507. case 0:
  508. for (row = 0; row < nx; row++)
  509. {
  510. unsigned rowpiv = ipiv[row+first] - first;
  511. if (rowpiv != row)
  512. {
  513. CPU_SWAP(nx, &matrix[row*ld], 1, &matrix[rowpiv*ld], 1);
  514. }
  515. }
  516. break;
  517. #ifdef STARPU_USE_CUDA
  518. case 1:
  519. for (row = 0; row < nx; row++)
  520. {
  521. unsigned rowpiv = ipiv[row+first] - first;
  522. if (rowpiv != row)
  523. {
  524. CUBLAS_SWAP(nx, (CUBLAS_TYPE*)&matrix[row*ld], 1, (CUBLAS_TYPE*)&matrix[rowpiv*ld], 1);
  525. }
  526. }
  527. break;
  528. #endif
  529. default:
  530. STARPU_ABORT();
  531. break;
  532. }
  533. }
  534. void STARPU_LU(cpu_pivot)(void *descr[], void *_args)
  535. {
  536. STARPU_LU(common_pivot)(descr, 0, _args);
  537. }
  538. #ifdef STARPU_USE_CUDA
  539. void STARPU_LU(cublas_pivot)(void *descr[], void *_args)
  540. {
  541. STARPU_LU(common_pivot)(descr, 1, _args);
  542. }
  543. #endif /* STARPU_USE_CUDA */
  544. static struct starpu_perfmodel STARPU_LU(model_pivot) =
  545. {
  546. .type = STARPU_HISTORY_BASED,
  547. #ifdef STARPU_ATLAS
  548. .symbol = STARPU_LU_STR(lu_model_pivot_atlas)
  549. #elif defined(STARPU_GOTO)
  550. .symbol = STARPU_LU_STR(lu_model_pivot_goto)
  551. #else
  552. .symbol = STARPU_LU_STR(lu_model_pivot)
  553. #endif
  554. };
  555. struct starpu_codelet cl_pivot =
  556. {
  557. .cpu_funcs = {STARPU_LU(cpu_pivot)},
  558. .cpu_funcs_name = {STRINGIFY(STARPU_LU(cpu_pivot))},
  559. #ifdef STARPU_USE_CUDA
  560. .cuda_funcs = {STARPU_LU(cublas_pivot)},
  561. CAN_EXECUTE
  562. #elif defined(STARPU_SIMGRID)
  563. .cuda_funcs = {(void*)1},
  564. #endif
  565. .cuda_flags = {STARPU_CUDA_ASYNC},
  566. .nbuffers = 1,
  567. .modes = {STARPU_RW},
  568. .model = &STARPU_LU(model_pivot)
  569. };