dw_sparse_cg_kernels.c 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2008-2011,2013-2014,2017 Université de Bordeaux
  4. * Copyright (C) 2010-2011,2015,2017 CNRS
  5. *
  6. * StarPU is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU Lesser General Public License as published by
  8. * the Free Software Foundation; either version 2.1 of the License, or (at
  9. * your option) any later version.
  10. *
  11. * StarPU is distributed in the hope that it will be useful, but
  12. * WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14. *
  15. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  16. */
  17. #include "dw_sparse_cg.h"
  18. #ifdef STARPU_USE_CUDA
  19. #include <starpu_cublas_v2.h>
  20. #endif
  21. /*
  22. * Algorithm :
  23. *
  24. * i = 0
  25. * r = b - A x
  26. * ( d = A x ; r = r - d )
  27. * d = r
  28. * delta_new = trans(r) r
  29. * delta_0 = delta_new
  30. *
  31. * while (i < i_max && delta_new > eps^2 delta_0)
  32. * {
  33. * q = A d
  34. * alpha = delta_new / ( trans(d) q )
  35. * x = x + alpha d
  36. * if ( i is divisible by 50 )
  37. * r = b - A x
  38. * else
  39. * r = r - alpha q
  40. * delta_old = delta_new
  41. * delta_new = trans(r) r
  42. * beta = delta_new / delta_old
  43. * d = r + beta d
  44. * i = i + 1
  45. * }
  46. */
  47. /*
  48. * compute r = b - A x
  49. *
  50. * descr[0] = A, descr[1] = x, descr [2] = r, descr[3] = b
  51. */
  52. void cpu_codelet_func_1(void *descr[], void *arg)
  53. {
  54. (void)arg;
  55. float *nzval = (float *)STARPU_CSR_GET_NZVAL(descr[0]);
  56. uint32_t *colind = STARPU_CSR_GET_COLIND(descr[0]);
  57. uint32_t *rowptr = STARPU_CSR_GET_ROWPTR(descr[0]);
  58. uint32_t firstentry = STARPU_CSR_GET_ELEMSIZE(descr[0]);
  59. float *vecx = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  60. float *vecr = (float *)STARPU_VECTOR_GET_PTR(descr[2]);
  61. float *vecb = (float *)STARPU_VECTOR_GET_PTR(descr[3]);
  62. uint32_t nrow;
  63. nrow = STARPU_CSR_GET_NROW(descr[0]);
  64. unsigned row;
  65. for (row = 0; row < nrow; row++)
  66. {
  67. float tmp = 0.0f;
  68. unsigned index;
  69. unsigned firstindex = rowptr[row] - firstentry;
  70. unsigned lastindex = rowptr[row+1] - firstentry;
  71. for (index = firstindex; index < lastindex; index++)
  72. {
  73. unsigned col;
  74. col = colind[index];
  75. tmp += nzval[index]*vecx[col];
  76. }
  77. vecr[row] = vecb[row] - tmp;
  78. }
  79. }
  80. /*
  81. * compute d = r
  82. * descr[0] = d, descr[1] = r
  83. */
  84. void cpu_codelet_func_2(void *descr[], void *arg)
  85. {
  86. (void)arg;
  87. /* simply copy r into d */
  88. uint32_t nx = STARPU_VECTOR_GET_NX(descr[0]);
  89. size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);
  90. STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
  91. STARPU_ASSERT(STARPU_VECTOR_GET_ELEMSIZE(descr[0]) == STARPU_VECTOR_GET_ELEMSIZE(descr[1]));
  92. float *src = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  93. float *dst = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  94. memcpy(dst, src, nx*elemsize);
  95. }
  96. /*
  97. * compute delta_new = trans(r) r
  98. * delta_0 = delta_new
  99. *
  100. * args = &delta_new, &delta_0
  101. */
  102. void cpu_codelet_func_3(void *descr[], void *arg)
  103. {
  104. struct cg_problem *pb = arg;
  105. float dot;
  106. float *vec;
  107. int size;
  108. /* get the vector */
  109. vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  110. size = (int)STARPU_VECTOR_GET_NX(descr[0]);
  111. dot = STARPU_SDOT(size, vec, 1, vec, 1);
  112. fprintf(stderr, "func 3 : DOT = %f\n", dot);
  113. pb->delta_new = dot;
  114. pb->delta_0 = dot;
  115. }
  116. #ifdef STARPU_USE_CUDA
  117. void cublas_codelet_func_3(void *descr[], void *arg)
  118. {
  119. struct cg_problem *pb = arg;
  120. float dot;
  121. float *vec;
  122. uint32_t size;
  123. /* get the vector */
  124. vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  125. size = STARPU_VECTOR_GET_NX(descr[0]);
  126. cublasStatus_t status = cublasSdot (starpu_cublas_get_local_handle(), size, vec, 1, vec, 1, &dot);
  127. if (status != CUBLAS_STATUS_SUCCESS)
  128. STARPU_CUBLAS_REPORT_ERROR(status);
  129. cudaStreamSynchronize(starpu_cuda_get_local_stream());
  130. pb->delta_new = dot;
  131. pb->delta_0 = dot;
  132. }
  133. #endif
  134. /*
  135. * compute q with : q = A d
  136. *
  137. * descr[0] = A, descr[1] = d, descr [2] = q
  138. */
  139. void cpu_codelet_func_4(void *descr[], void *arg)
  140. {
  141. (void)arg;
  142. float *nzval = (float *)STARPU_CSR_GET_NZVAL(descr[0]);
  143. uint32_t *colind = STARPU_CSR_GET_COLIND(descr[0]);
  144. uint32_t *rowptr = STARPU_CSR_GET_ROWPTR(descr[0]);
  145. uint32_t firstentry = STARPU_CSR_GET_FIRSTENTRY(descr[0]);
  146. float *vecd = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  147. float *vecq = (float *)STARPU_VECTOR_GET_PTR(descr[2]);
  148. uint32_t nrow;
  149. nrow = STARPU_CSR_GET_NROW(descr[0]);
  150. unsigned row;
  151. for (row = 0; row < nrow; row++)
  152. {
  153. float tmp = 0.0f;
  154. unsigned index;
  155. unsigned firstindex = rowptr[row] - firstentry;
  156. unsigned lastindex = rowptr[row+1] - firstentry;
  157. for (index = firstindex; index < lastindex; index++)
  158. {
  159. unsigned col;
  160. col = colind[index];
  161. tmp += nzval[index]*vecd[col];
  162. }
  163. vecq[row] = tmp;
  164. }
  165. }
  166. /*
  167. * compute alpha = delta_new / ( trans(d) q )
  168. *
  169. * descr[0] = d, descr[1] = q
  170. * args = &alpha, &delta_new
  171. */
  172. void cpu_codelet_func_5(void *descr[], void *arg)
  173. {
  174. float dot;
  175. struct cg_problem *pb = arg;
  176. float *vecd, *vecq;
  177. uint32_t size;
  178. /* get the vector */
  179. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  180. vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  181. STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
  182. size = STARPU_VECTOR_GET_NX(descr[0]);
  183. dot = STARPU_SDOT(size, vecd, 1, vecq, 1);
  184. pb->alpha = pb->delta_new / dot;
  185. }
  186. #ifdef STARPU_USE_CUDA
  187. void cublas_codelet_func_5(void *descr[], void *arg)
  188. {
  189. float dot;
  190. struct cg_problem *pb = arg;
  191. float *vecd, *vecq;
  192. uint32_t size;
  193. /* get the vector */
  194. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  195. vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  196. STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
  197. size = STARPU_VECTOR_GET_NX(descr[0]);
  198. cublasStatus_t status = cublasSdot (starpu_cublas_get_local_handle(), size, vecd, 1, vecq, 1, &dot);
  199. if (status != CUBLAS_STATUS_SUCCESS)
  200. STARPU_CUBLAS_REPORT_ERROR(status);
  201. cudaStreamSynchronize(starpu_cuda_get_local_stream());
  202. pb->alpha = pb->delta_new / dot;
  203. }
  204. #endif
  205. /*
  206. * compute x = x + alpha d
  207. *
  208. * descr[0] : x, descr[1] : d
  209. * args = &alpha
  210. */
  211. void cpu_codelet_func_6(void *descr[], void *arg)
  212. {
  213. struct cg_problem *pb = arg;
  214. float *vecx, *vecd;
  215. uint32_t size;
  216. /* get the vector */
  217. vecx = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  218. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  219. size = STARPU_VECTOR_GET_NX(descr[0]);
  220. STARPU_SAXPY(size, pb->alpha, vecd, 1, vecx, 1);
  221. }
  222. #ifdef STARPU_USE_CUDA
  223. void cublas_codelet_func_6(void *descr[], void *arg)
  224. {
  225. struct cg_problem *pb = arg;
  226. float *vecx, *vecd;
  227. uint32_t size;
  228. /* get the vector */
  229. vecx = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  230. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  231. size = STARPU_VECTOR_GET_NX(descr[0]);
  232. cublasStatus_t status = cublasSaxpy (starpu_cublas_get_local_handle(), size, &pb->alpha, vecd, 1, vecx, 1);
  233. if (status != CUBLAS_STATUS_SUCCESS)
  234. STARPU_CUBLAS_REPORT_ERROR(status);
  235. }
  236. #endif
  237. /*
  238. * compute r = r - alpha q
  239. *
  240. * descr[0] : r, descr[1] : q
  241. * args = &alpha
  242. */
  243. void cpu_codelet_func_7(void *descr[], void *arg)
  244. {
  245. struct cg_problem *pb = arg;
  246. float *vecr, *vecq;
  247. uint32_t size;
  248. /* get the vector */
  249. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  250. vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  251. size = STARPU_VECTOR_GET_NX(descr[0]);
  252. STARPU_SAXPY(size, -pb->alpha, vecq, 1, vecr, 1);
  253. }
  254. #ifdef STARPU_USE_CUDA
  255. void cublas_codelet_func_7(void *descr[], void *arg)
  256. {
  257. struct cg_problem *pb = arg;
  258. float *vecr, *vecq;
  259. uint32_t size;
  260. /* get the vector */
  261. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  262. vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  263. size = STARPU_VECTOR_GET_NX(descr[0]);
  264. float scal = -pb->alpha;
  265. cublasStatus_t status = cublasSaxpy (starpu_cublas_get_local_handle(), size, &scal, vecq, 1, vecr, 1);
  266. if (status != CUBLAS_STATUS_SUCCESS)
  267. STARPU_CUBLAS_REPORT_ERROR(status);
  268. }
  269. #endif
  270. /*
  271. * compute delta_old = delta_new
  272. * delta_new = trans(r) r
  273. * beta = delta_new / delta_old
  274. *
  275. * descr[0] = r
  276. * args = &delta_old, &delta_new, &beta
  277. */
  278. void cpu_codelet_func_8(void *descr[], void *arg)
  279. {
  280. float dot;
  281. struct cg_problem *pb = arg;
  282. float *vecr;
  283. uint32_t size;
  284. /* get the vector */
  285. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  286. size = STARPU_VECTOR_GET_NX(descr[0]);
  287. dot = STARPU_SDOT(size, vecr, 1, vecr, 1);
  288. pb->delta_old = pb->delta_new;
  289. pb->delta_new = dot;
  290. pb->beta = pb->delta_new/pb->delta_old;
  291. }
  292. #ifdef STARPU_USE_CUDA
  293. void cublas_codelet_func_8(void *descr[], void *arg)
  294. {
  295. float dot;
  296. struct cg_problem *pb = arg;
  297. float *vecr;
  298. uint32_t size;
  299. /* get the vector */
  300. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  301. size = STARPU_VECTOR_GET_NX(descr[0]);
  302. cublasStatus_t status = cublasSdot(starpu_cublas_get_local_handle(), size, vecr, 1, vecr, 1, &dot);
  303. if (status != CUBLAS_STATUS_SUCCESS) STARPU_CUBLAS_REPORT_ERROR(status);
  304. cudaStreamSynchronize(starpu_cuda_get_local_stream());
  305. pb->delta_old = pb->delta_new;
  306. pb->delta_new = dot;
  307. pb->beta = pb->delta_new/pb->delta_old;
  308. }
  309. #endif
  310. /*
  311. * compute d = r + beta d
  312. *
  313. * descr[0] : d, descr[1] : r
  314. * args = &beta
  315. *
  316. */
  317. void cpu_codelet_func_9(void *descr[], void *arg)
  318. {
  319. struct cg_problem *pb = arg;
  320. float *vecd, *vecr;
  321. uint32_t size;
  322. /* get the vector */
  323. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  324. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  325. size = STARPU_VECTOR_GET_NX(descr[0]);
  326. /* d = beta d */
  327. STARPU_SSCAL(size, pb->beta, vecd, 1);
  328. /* d = r + d */
  329. STARPU_SAXPY (size, 1.0f, vecr, 1, vecd, 1);
  330. }
  331. #ifdef STARPU_USE_CUDA
  332. void cublas_codelet_func_9(void *descr[], void *arg)
  333. {
  334. struct cg_problem *pb = arg;
  335. float *vecd, *vecr;
  336. uint32_t size;
  337. /* get the vector */
  338. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  339. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  340. size = STARPU_VECTOR_GET_NX(descr[0]);
  341. /* d = beta d */
  342. cublasStatus_t status;
  343. status = cublasSscal(starpu_cublas_get_local_handle(), size, &pb->beta, vecd, 1);
  344. if (status != CUBLAS_STATUS_SUCCESS)
  345. STARPU_CUBLAS_REPORT_ERROR(status);
  346. /* d = r + d */
  347. float scal = 1.0f;
  348. status = cublasSaxpy (starpu_cublas_get_local_handle(), size, &scal, vecr, 1, vecd, 1);
  349. if (status != CUBLAS_STATUS_SUCCESS)
  350. STARPU_CUBLAS_REPORT_ERROR(status);
  351. }
  352. #endif