dw_sparse_cg_kernels.c 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009, 2010 Université de Bordeaux
  4. * Copyright (C) 2010 CNRS
  5. *
  6. * StarPU is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU Lesser General Public License as published by
  8. * the Free Software Foundation; either version 2.1 of the License, or (at
  9. * your option) any later version.
  10. *
  11. * StarPU is distributed in the hope that it will be useful, but
  12. * WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14. *
  15. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  16. */
  17. #include "dw_sparse_cg.h"
  18. #ifdef STARPU_USE_CUDA
  19. #include <starpu_cublas_v2.h>
  20. #endif
  21. /*
  22. * Algorithm :
  23. *
  24. * i = 0
  25. * r = b - A x
  26. * ( d = A x ; r = r - d )
  27. * d = r
  28. * delta_new = trans(r) r
  29. * delta_0 = delta_new
  30. *
  31. * while (i < i_max && delta_new > eps^2 delta_0)
  32. * {
  33. * q = A d
  34. * alpha = delta_new / ( trans(d) q )
  35. * x = x + alpha d
  36. * if ( i is divisible by 50 )
  37. * r = b - A x
  38. * else
  39. * r = r - alpha q
  40. * delta_old = delta_new
  41. * delta_new = trans(r) r
  42. * beta = delta_new / delta_old
  43. * d = r + beta d
  44. * i = i + 1
  45. * }
  46. */
  47. /*
  48. * compute r = b - A x
  49. *
  50. * descr[0] = A, descr[1] = x, descr [2] = r, descr[3] = b
  51. */
  52. void cpu_codelet_func_1(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
  53. {
  54. float *nzval = (float *)STARPU_CSR_GET_NZVAL(descr[0]);
  55. uint32_t *colind = STARPU_CSR_GET_COLIND(descr[0]);
  56. uint32_t *rowptr = STARPU_CSR_GET_ROWPTR(descr[0]);
  57. uint32_t firstentry = STARPU_CSR_GET_ELEMSIZE(descr[0]);
  58. float *vecx = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  59. float *vecr = (float *)STARPU_VECTOR_GET_PTR(descr[2]);
  60. float *vecb = (float *)STARPU_VECTOR_GET_PTR(descr[3]);
  61. uint32_t nrow;
  62. nrow = STARPU_CSR_GET_NROW(descr[0]);
  63. unsigned row;
  64. for (row = 0; row < nrow; row++)
  65. {
  66. float tmp = 0.0f;
  67. unsigned index;
  68. unsigned firstindex = rowptr[row] - firstentry;
  69. unsigned lastindex = rowptr[row+1] - firstentry;
  70. for (index = firstindex; index < lastindex; index++)
  71. {
  72. unsigned col;
  73. col = colind[index];
  74. tmp += nzval[index]*vecx[col];
  75. }
  76. vecr[row] = vecb[row] - tmp;
  77. }
  78. }
  79. /*
  80. * compute d = r
  81. * descr[0] = d, descr[1] = r
  82. */
  83. void cpu_codelet_func_2(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
  84. {
  85. /* simply copy r into d */
  86. uint32_t nx = STARPU_VECTOR_GET_NX(descr[0]);
  87. size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);
  88. STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
  89. STARPU_ASSERT(STARPU_VECTOR_GET_ELEMSIZE(descr[0]) == STARPU_VECTOR_GET_ELEMSIZE(descr[1]));
  90. float *src = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  91. float *dst = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  92. memcpy(dst, src, nx*elemsize);
  93. }
  94. /*
  95. * compute delta_new = trans(r) r
  96. * delta_0 = delta_new
  97. *
  98. * args = &delta_new, &delta_0
  99. */
  100. void cpu_codelet_func_3(void *descr[], void *arg)
  101. {
  102. struct cg_problem *pb = arg;
  103. float dot;
  104. float *vec;
  105. int size;
  106. /* get the vector */
  107. vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  108. size = (int)STARPU_VECTOR_GET_NX(descr[0]);
  109. dot = STARPU_SDOT(size, vec, 1, vec, 1);
  110. fprintf(stderr, "func 3 : DOT = %f\n", dot);
  111. pb->delta_new = dot;
  112. pb->delta_0 = dot;
  113. }
  114. #ifdef STARPU_USE_CUDA
  115. void cublas_codelet_func_3(void *descr[], void *arg)
  116. {
  117. struct cg_problem *pb = arg;
  118. float dot;
  119. float *vec;
  120. uint32_t size;
  121. /* get the vector */
  122. vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  123. size = STARPU_VECTOR_GET_NX(descr[0]);
  124. cublasStatus_t status = cublasSdot (starpu_cublas_get_local_handle(), size, vec, 1, vec, 1, &dot);
  125. if (status != CUBLAS_STATUS_SUCCESS)
  126. STARPU_CUBLAS_REPORT_ERROR(status);
  127. cudaStreamSynchronize(starpu_cuda_get_local_stream());
  128. pb->delta_new = dot;
  129. pb->delta_0 = dot;
  130. }
  131. #endif
  132. /*
  133. * compute q with : q = A d
  134. *
  135. * descr[0] = A, descr[1] = d, descr [2] = q
  136. */
  137. void cpu_codelet_func_4(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
  138. {
  139. float *nzval = (float *)STARPU_CSR_GET_NZVAL(descr[0]);
  140. uint32_t *colind = STARPU_CSR_GET_COLIND(descr[0]);
  141. uint32_t *rowptr = STARPU_CSR_GET_ROWPTR(descr[0]);
  142. uint32_t firstentry = STARPU_CSR_GET_FIRSTENTRY(descr[0]);
  143. float *vecd = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  144. float *vecq = (float *)STARPU_VECTOR_GET_PTR(descr[2]);
  145. uint32_t nrow;
  146. nrow = STARPU_CSR_GET_NROW(descr[0]);
  147. unsigned row;
  148. for (row = 0; row < nrow; row++)
  149. {
  150. float tmp = 0.0f;
  151. unsigned index;
  152. unsigned firstindex = rowptr[row] - firstentry;
  153. unsigned lastindex = rowptr[row+1] - firstentry;
  154. for (index = firstindex; index < lastindex; index++)
  155. {
  156. unsigned col;
  157. col = colind[index];
  158. tmp += nzval[index]*vecd[col];
  159. }
  160. vecq[row] = tmp;
  161. }
  162. }
  163. /*
  164. * compute alpha = delta_new / ( trans(d) q )
  165. *
  166. * descr[0] = d, descr[1] = q
  167. * args = &alpha, &delta_new
  168. */
  169. void cpu_codelet_func_5(void *descr[], void *arg)
  170. {
  171. float dot;
  172. struct cg_problem *pb = arg;
  173. float *vecd, *vecq;
  174. uint32_t size;
  175. /* get the vector */
  176. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  177. vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  178. STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
  179. size = STARPU_VECTOR_GET_NX(descr[0]);
  180. dot = STARPU_SDOT(size, vecd, 1, vecq, 1);
  181. pb->alpha = pb->delta_new / dot;
  182. }
  183. #ifdef STARPU_USE_CUDA
  184. void cublas_codelet_func_5(void *descr[], void *arg)
  185. {
  186. float dot;
  187. struct cg_problem *pb = arg;
  188. float *vecd, *vecq;
  189. uint32_t size;
  190. /* get the vector */
  191. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  192. vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  193. STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
  194. size = STARPU_VECTOR_GET_NX(descr[0]);
  195. cublasStatus_t status = cublasSdot (starpu_cublas_get_local_handle(), size, vecd, 1, vecq, 1, &dot);
  196. if (status != CUBLAS_STATUS_SUCCESS)
  197. STARPU_CUBLAS_REPORT_ERROR(status);
  198. cudaStreamSynchronize(starpu_cuda_get_local_stream());
  199. pb->alpha = pb->delta_new / dot;
  200. }
  201. #endif
  202. /*
  203. * compute x = x + alpha d
  204. *
  205. * descr[0] : x, descr[1] : d
  206. * args = &alpha
  207. */
  208. void cpu_codelet_func_6(void *descr[], void *arg)
  209. {
  210. struct cg_problem *pb = arg;
  211. float *vecx, *vecd;
  212. uint32_t size;
  213. /* get the vector */
  214. vecx = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  215. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  216. size = STARPU_VECTOR_GET_NX(descr[0]);
  217. STARPU_SAXPY(size, pb->alpha, vecd, 1, vecx, 1);
  218. }
  219. #ifdef STARPU_USE_CUDA
  220. void cublas_codelet_func_6(void *descr[], void *arg)
  221. {
  222. struct cg_problem *pb = arg;
  223. float *vecx, *vecd;
  224. uint32_t size;
  225. /* get the vector */
  226. vecx = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  227. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  228. size = STARPU_VECTOR_GET_NX(descr[0]);
  229. cublasStatus_t status = cublasSaxpy (starpu_cublas_get_local_handle(), size, &pb->alpha, vecd, 1, vecx, 1);
  230. if (status != CUBLAS_STATUS_SUCCESS)
  231. STARPU_CUBLAS_REPORT_ERROR(status);
  232. }
  233. #endif
  234. /*
  235. * compute r = r - alpha q
  236. *
  237. * descr[0] : r, descr[1] : q
  238. * args = &alpha
  239. */
  240. void cpu_codelet_func_7(void *descr[], void *arg)
  241. {
  242. struct cg_problem *pb = arg;
  243. float *vecr, *vecq;
  244. uint32_t size;
  245. /* get the vector */
  246. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  247. vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  248. size = STARPU_VECTOR_GET_NX(descr[0]);
  249. STARPU_SAXPY(size, -pb->alpha, vecq, 1, vecr, 1);
  250. }
  251. #ifdef STARPU_USE_CUDA
  252. void cublas_codelet_func_7(void *descr[], void *arg)
  253. {
  254. struct cg_problem *pb = arg;
  255. float *vecr, *vecq;
  256. uint32_t size;
  257. /* get the vector */
  258. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  259. vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  260. size = STARPU_VECTOR_GET_NX(descr[0]);
  261. float scal = -pb->alpha;
  262. cublasStatus_t status = cublasSaxpy (starpu_cublas_get_local_handle(), size, &scal, vecq, 1, vecr, 1);
  263. if (status != CUBLAS_STATUS_SUCCESS)
  264. STARPU_CUBLAS_REPORT_ERROR(status);
  265. }
  266. #endif
  267. /*
  268. * compute delta_old = delta_new
  269. * delta_new = trans(r) r
  270. * beta = delta_new / delta_old
  271. *
  272. * descr[0] = r
  273. * args = &delta_old, &delta_new, &beta
  274. */
  275. void cpu_codelet_func_8(void *descr[], void *arg)
  276. {
  277. float dot;
  278. struct cg_problem *pb = arg;
  279. float *vecr;
  280. uint32_t size;
  281. /* get the vector */
  282. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  283. size = STARPU_VECTOR_GET_NX(descr[0]);
  284. dot = STARPU_SDOT(size, vecr, 1, vecr, 1);
  285. pb->delta_old = pb->delta_new;
  286. pb->delta_new = dot;
  287. pb->beta = pb->delta_new/pb->delta_old;
  288. }
  289. #ifdef STARPU_USE_CUDA
  290. void cublas_codelet_func_8(void *descr[], void *arg)
  291. {
  292. float dot;
  293. struct cg_problem *pb = arg;
  294. float *vecr;
  295. uint32_t size;
  296. /* get the vector */
  297. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  298. size = STARPU_VECTOR_GET_NX(descr[0]);
  299. cublasStatus_t status = cublasSdot (starpu_cublas_get_local_handle(), size, vecr, 1, vecr, 1, &dot);
  300. cudaStreamSynchronize(starpu_cuda_get_local_stream());
  301. pb->delta_old = pb->delta_new;
  302. pb->delta_new = dot;
  303. pb->beta = pb->delta_new/pb->delta_old;
  304. }
  305. #endif
  306. /*
  307. * compute d = r + beta d
  308. *
  309. * descr[0] : d, descr[1] : r
  310. * args = &beta
  311. *
  312. */
  313. void cpu_codelet_func_9(void *descr[], void *arg)
  314. {
  315. struct cg_problem *pb = arg;
  316. float *vecd, *vecr;
  317. uint32_t size;
  318. /* get the vector */
  319. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  320. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  321. size = STARPU_VECTOR_GET_NX(descr[0]);
  322. /* d = beta d */
  323. STARPU_SSCAL(size, pb->beta, vecd, 1);
  324. /* d = r + d */
  325. STARPU_SAXPY (size, 1.0f, vecr, 1, vecd, 1);
  326. }
  327. #ifdef STARPU_USE_CUDA
  328. void cublas_codelet_func_9(void *descr[], void *arg)
  329. {
  330. struct cg_problem *pb = arg;
  331. float *vecd, *vecr;
  332. uint32_t size;
  333. /* get the vector */
  334. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  335. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  336. size = STARPU_VECTOR_GET_NX(descr[0]);
  337. /* d = beta d */
  338. cublasStatus_t status;
  339. status = cublasSscal(starpu_cublas_get_local_handle(), size, &pb->beta, vecd, 1);
  340. if (status != CUBLAS_STATUS_SUCCESS)
  341. STARPU_CUBLAS_REPORT_ERROR(status);
  342. /* d = r + d */
  343. float scal = 1.0f;
  344. status = cublasSaxpy (starpu_cublas_get_local_handle(), size, &scal, vecr, 1, vecd, 1);
  345. if (status != CUBLAS_STATUS_SUCCESS)
  346. STARPU_CUBLAS_REPORT_ERROR(status);
  347. }
  348. #endif