dw_sparse_cg_kernels.c 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009, 2010 Université de Bordeaux
  4. * Copyright (C) 2010 CNRS
  5. *
  6. * StarPU is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU Lesser General Public License as published by
  8. * the Free Software Foundation; either version 2.1 of the License, or (at
  9. * your option) any later version.
  10. *
  11. * StarPU is distributed in the hope that it will be useful, but
  12. * WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14. *
  15. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  16. */
  17. #include "dw_sparse_cg.h"
  18. /*
  19. * Algorithm :
  20. *
  21. * i = 0
  22. * r = b - A x
  23. * ( d = A x ; r = r - d )
  24. * d = r
  25. * delta_new = trans(r) r
  26. * delta_0 = delta_new
  27. *
  28. * while (i < i_max && delta_new > eps^2 delta_0)
  29. * {
  30. * q = A d
  31. * alpha = delta_new / ( trans(d) q )
  32. * x = x + alpha d
  33. * if ( i is divisible by 50 )
  34. * r = b - A x
  35. * else
  36. * r = r - alpha q
  37. * delta_old = delta_new
  38. * delta_new = trans(r) r
  39. * beta = delta_new / delta_old
  40. * d = r + beta d
  41. * i = i + 1
  42. * }
  43. */
  44. /*
  45. * compute r = b - A x
  46. *
  47. * descr[0] = A, descr[1] = x, descr [2] = r, descr[3] = b
  48. */
  49. void cpu_codelet_func_1(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
  50. {
  51. float *nzval = (float *)STARPU_CSR_GET_NZVAL(descr[0]);
  52. uint32_t *colind = STARPU_CSR_GET_COLIND(descr[0]);
  53. uint32_t *rowptr = STARPU_CSR_GET_ROWPTR(descr[0]);
  54. uint32_t firstentry = STARPU_CSR_GET_ELEMSIZE(descr[0]);
  55. float *vecx = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  56. float *vecr = (float *)STARPU_VECTOR_GET_PTR(descr[2]);
  57. float *vecb = (float *)STARPU_VECTOR_GET_PTR(descr[3]);
  58. uint32_t nrow;
  59. nrow = STARPU_CSR_GET_NROW(descr[0]);
  60. unsigned row;
  61. for (row = 0; row < nrow; row++)
  62. {
  63. float tmp = 0.0f;
  64. unsigned index;
  65. unsigned firstindex = rowptr[row] - firstentry;
  66. unsigned lastindex = rowptr[row+1] - firstentry;
  67. for (index = firstindex; index < lastindex; index++)
  68. {
  69. unsigned col;
  70. col = colind[index];
  71. tmp += nzval[index]*vecx[col];
  72. }
  73. vecr[row] = vecb[row] - tmp;
  74. }
  75. }
  76. /*
  77. * compute d = r
  78. * descr[0] = d, descr[1] = r
  79. */
  80. void cpu_codelet_func_2(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
  81. {
  82. /* simply copy r into d */
  83. uint32_t nx = STARPU_VECTOR_GET_NX(descr[0]);
  84. size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);
  85. STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
  86. STARPU_ASSERT(STARPU_VECTOR_GET_ELEMSIZE(descr[0]) == STARPU_VECTOR_GET_ELEMSIZE(descr[1]));
  87. float *src = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  88. float *dst = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  89. memcpy(dst, src, nx*elemsize);
  90. }
  91. /*
  92. * compute delta_new = trans(r) r
  93. * delta_0 = delta_new
  94. *
  95. * args = &delta_new, &delta_0
  96. */
  97. void cpu_codelet_func_3(void *descr[], void *arg)
  98. {
  99. struct cg_problem *pb = arg;
  100. float dot;
  101. float *vec;
  102. int size;
  103. /* get the vector */
  104. vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  105. size = (int)STARPU_VECTOR_GET_NX(descr[0]);
  106. dot = STARPU_SDOT(size, vec, 1, vec, 1);
  107. fprintf(stderr, "func 3 : DOT = %f\n", dot);
  108. pb->delta_new = dot;
  109. pb->delta_0 = dot;
  110. }
  111. #ifdef STARPU_USE_CUDA
  112. void cublas_codelet_func_3(void *descr[], void *arg)
  113. {
  114. struct cg_problem *pb = arg;
  115. float dot;
  116. float *vec;
  117. uint32_t size;
  118. /* get the vector */
  119. vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  120. size = STARPU_VECTOR_GET_NX(descr[0]);
  121. starpu_cublas_set_stream();
  122. dot = cublasSdot (size, vec, 1, vec, 1);
  123. pb->delta_new = dot;
  124. pb->delta_0 = dot;
  125. }
  126. #endif
  127. /*
  128. * compute q with : q = A d
  129. *
  130. * descr[0] = A, descr[1] = d, descr [2] = q
  131. */
  132. void cpu_codelet_func_4(void *descr[], STARPU_ATTRIBUTE_UNUSED void *arg)
  133. {
  134. float *nzval = (float *)STARPU_CSR_GET_NZVAL(descr[0]);
  135. uint32_t *colind = STARPU_CSR_GET_COLIND(descr[0]);
  136. uint32_t *rowptr = STARPU_CSR_GET_ROWPTR(descr[0]);
  137. uint32_t firstentry = STARPU_CSR_GET_FIRSTENTRY(descr[0]);
  138. float *vecd = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  139. float *vecq = (float *)STARPU_VECTOR_GET_PTR(descr[2]);
  140. uint32_t nrow;
  141. nrow = STARPU_CSR_GET_NROW(descr[0]);
  142. unsigned row;
  143. for (row = 0; row < nrow; row++)
  144. {
  145. float tmp = 0.0f;
  146. unsigned index;
  147. unsigned firstindex = rowptr[row] - firstentry;
  148. unsigned lastindex = rowptr[row+1] - firstentry;
  149. for (index = firstindex; index < lastindex; index++)
  150. {
  151. unsigned col;
  152. col = colind[index];
  153. tmp += nzval[index]*vecd[col];
  154. }
  155. vecq[row] = tmp;
  156. }
  157. }
  158. /*
  159. * compute alpha = delta_new / ( trans(d) q )
  160. *
  161. * descr[0] = d, descr[1] = q
  162. * args = &alpha, &delta_new
  163. */
  164. void cpu_codelet_func_5(void *descr[], void *arg)
  165. {
  166. float dot;
  167. struct cg_problem *pb = arg;
  168. float *vecd, *vecq;
  169. uint32_t size;
  170. /* get the vector */
  171. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  172. vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  173. STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
  174. size = STARPU_VECTOR_GET_NX(descr[0]);
  175. dot = STARPU_SDOT(size, vecd, 1, vecq, 1);
  176. pb->alpha = pb->delta_new / dot;
  177. }
  178. #ifdef STARPU_USE_CUDA
  179. void cublas_codelet_func_5(void *descr[], void *arg)
  180. {
  181. float dot;
  182. struct cg_problem *pb = arg;
  183. float *vecd, *vecq;
  184. uint32_t size;
  185. /* get the vector */
  186. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  187. vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  188. STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
  189. size = STARPU_VECTOR_GET_NX(descr[0]);
  190. starpu_cublas_set_stream();
  191. dot = cublasSdot (size, vecd, 1, vecq, 1);
  192. pb->alpha = pb->delta_new / dot;
  193. }
  194. #endif
  195. /*
  196. * compute x = x + alpha d
  197. *
  198. * descr[0] : x, descr[1] : d
  199. * args = &alpha
  200. */
  201. void cpu_codelet_func_6(void *descr[], void *arg)
  202. {
  203. struct cg_problem *pb = arg;
  204. float *vecx, *vecd;
  205. uint32_t size;
  206. /* get the vector */
  207. vecx = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  208. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  209. size = STARPU_VECTOR_GET_NX(descr[0]);
  210. STARPU_SAXPY(size, pb->alpha, vecd, 1, vecx, 1);
  211. }
  212. #ifdef STARPU_USE_CUDA
  213. void cublas_codelet_func_6(void *descr[], void *arg)
  214. {
  215. struct cg_problem *pb = arg;
  216. float *vecx, *vecd;
  217. uint32_t size;
  218. /* get the vector */
  219. vecx = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  220. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  221. size = STARPU_VECTOR_GET_NX(descr[0]);
  222. starpu_cublas_set_stream();
  223. cublasSaxpy (size, pb->alpha, vecd, 1, vecx, 1);
  224. }
  225. #endif
  226. /*
  227. * compute r = r - alpha q
  228. *
  229. * descr[0] : r, descr[1] : q
  230. * args = &alpha
  231. */
  232. void cpu_codelet_func_7(void *descr[], void *arg)
  233. {
  234. struct cg_problem *pb = arg;
  235. float *vecr, *vecq;
  236. uint32_t size;
  237. /* get the vector */
  238. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  239. vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  240. size = STARPU_VECTOR_GET_NX(descr[0]);
  241. STARPU_SAXPY(size, -pb->alpha, vecq, 1, vecr, 1);
  242. }
  243. #ifdef STARPU_USE_CUDA
  244. void cublas_codelet_func_7(void *descr[], void *arg)
  245. {
  246. struct cg_problem *pb = arg;
  247. float *vecr, *vecq;
  248. uint32_t size;
  249. /* get the vector */
  250. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  251. vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  252. size = STARPU_VECTOR_GET_NX(descr[0]);
  253. starpu_cublas_set_stream();
  254. cublasSaxpy (size, -pb->alpha, vecq, 1, vecr, 1);
  255. }
  256. #endif
  257. /*
  258. * compute delta_old = delta_new
  259. * delta_new = trans(r) r
  260. * beta = delta_new / delta_old
  261. *
  262. * descr[0] = r
  263. * args = &delta_old, &delta_new, &beta
  264. */
  265. void cpu_codelet_func_8(void *descr[], void *arg)
  266. {
  267. float dot;
  268. struct cg_problem *pb = arg;
  269. float *vecr;
  270. uint32_t size;
  271. /* get the vector */
  272. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  273. size = STARPU_VECTOR_GET_NX(descr[0]);
  274. dot = STARPU_SDOT(size, vecr, 1, vecr, 1);
  275. pb->delta_old = pb->delta_new;
  276. pb->delta_new = dot;
  277. pb->beta = pb->delta_new/pb->delta_old;
  278. }
  279. #ifdef STARPU_USE_CUDA
  280. void cublas_codelet_func_8(void *descr[], void *arg)
  281. {
  282. float dot;
  283. struct cg_problem *pb = arg;
  284. float *vecr;
  285. uint32_t size;
  286. /* get the vector */
  287. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  288. size = STARPU_VECTOR_GET_NX(descr[0]);
  289. starpu_cublas_set_stream();
  290. dot = cublasSdot (size, vecr, 1, vecr, 1);
  291. pb->delta_old = pb->delta_new;
  292. pb->delta_new = dot;
  293. pb->beta = pb->delta_new/pb->delta_old;
  294. }
  295. #endif
  296. /*
  297. * compute d = r + beta d
  298. *
  299. * descr[0] : d, descr[1] : r
  300. * args = &beta
  301. *
  302. */
  303. void cpu_codelet_func_9(void *descr[], void *arg)
  304. {
  305. struct cg_problem *pb = arg;
  306. float *vecd, *vecr;
  307. uint32_t size;
  308. /* get the vector */
  309. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  310. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  311. size = STARPU_VECTOR_GET_NX(descr[0]);
  312. /* d = beta d */
  313. STARPU_SSCAL(size, pb->beta, vecd, 1);
  314. /* d = r + d */
  315. STARPU_SAXPY (size, 1.0f, vecr, 1, vecd, 1);
  316. }
  317. #ifdef STARPU_USE_CUDA
  318. void cublas_codelet_func_9(void *descr[], void *arg)
  319. {
  320. struct cg_problem *pb = arg;
  321. float *vecd, *vecr;
  322. uint32_t size;
  323. /* get the vector */
  324. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  325. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  326. size = STARPU_VECTOR_GET_NX(descr[0]);
  327. starpu_cublas_set_stream();
  328. /* d = beta d */
  329. cublasSscal(size, pb->beta, vecd, 1);
  330. /* d = r + d */
  331. cublasSaxpy (size, 1.0f, vecr, 1, vecd, 1);
  332. }
  333. #endif