dw_sparse_cg_kernels.c 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424
  1. /*
  2. * StarPU
  3. * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. *
  14. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. */
  16. #include "dw_sparse_cg.h"
  17. /*
  18. * Algorithm :
  19. *
  20. * i = 0
  21. * r = b - A x
  22. * ( d = A x ; r = r - d )
  23. * d = r
  24. * delta_new = trans(r) r
  25. * delta_0 = delta_new
  26. *
  27. * while (i < i_max && delta_new > eps^2 delta_0)
  28. * {
  29. * q = A d
  30. * alpha = delta_new / ( trans(d) q )
  31. * x = x + alpha d
  32. * if ( i is divisible by 50 )
  33. * r = b - A x
  34. * else
  35. * r = r - alpha q
  36. * delta_old = delta_new
  37. * delta_new = trans(r) r
  38. * beta = delta_new / delta_old
  39. * d = r + beta d
  40. * i = i + 1
  41. * }
  42. */
  43. /*
  44. * compute r = b - A x
  45. *
  46. * descr[0] = A, descr[1] = x, descr [2] = r, descr[3] = b
  47. */
  48. void cpu_codelet_func_1(void *descr[], __attribute__((unused)) void *arg)
  49. {
  50. float *nzval = (float *)STARPU_GET_CSR_NZVAL(descr[0]);
  51. uint32_t *colind = STARPU_GET_CSR_COLIND(descr[0]);
  52. uint32_t *rowptr = STARPU_GET_CSR_ROWPTR(descr[0]);
  53. uint32_t firstentry = STARPU_GET_CSR_ELEMSIZE(descr[0]);
  54. float *vecx = (float *)STARPU_GET_VECTOR_PTR(descr[1]);
  55. float *vecr = (float *)STARPU_GET_VECTOR_PTR(descr[2]);
  56. float *vecb = (float *)STARPU_GET_VECTOR_PTR(descr[3]);
  57. uint32_t nnz;
  58. uint32_t nrow;
  59. nnz = STARPU_GET_CSR_NNZ(descr[0]);
  60. nrow = STARPU_GET_CSR_NROW(descr[0]);
  61. unsigned row;
  62. for (row = 0; row < nrow; row++)
  63. {
  64. float tmp = 0.0f;
  65. unsigned index;
  66. unsigned firstindex = rowptr[row] - firstentry;
  67. unsigned lastindex = rowptr[row+1] - firstentry;
  68. for (index = firstindex; index < lastindex; index++)
  69. {
  70. unsigned col;
  71. col = colind[index];
  72. tmp += nzval[index]*vecx[col];
  73. }
  74. vecr[row] = vecb[row] - tmp;
  75. }
  76. }
  77. /*
  78. * compute d = r
  79. * descr[0] = d, descr[1] = r
  80. */
  81. void cpu_codelet_func_2(void *descr[], __attribute__((unused)) void *arg)
  82. {
  83. /* simply copy r into d */
  84. uint32_t nx = STARPU_GET_VECTOR_NX(descr[0]);
  85. size_t elemsize = STARPU_GET_VECTOR_ELEMSIZE(descr[0]);
  86. STARPU_ASSERT(STARPU_GET_VECTOR_NX(descr[0]) == STARPU_GET_VECTOR_NX(descr[1]));
  87. STARPU_ASSERT(STARPU_GET_VECTOR_ELEMSIZE(descr[0]) == STARPU_GET_VECTOR_ELEMSIZE(descr[1]));
  88. float *src = (float *)STARPU_GET_VECTOR_PTR(descr[1]);
  89. float *dst = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
  90. memcpy(dst, src, nx*elemsize);
  91. }
  92. /*
  93. * compute delta_new = trans(r) r
  94. * delta_0 = delta_new
  95. *
  96. * args = &delta_new, &delta_0
  97. */
  98. void cpu_codelet_func_3(void *descr[], void *arg)
  99. {
  100. struct cg_problem *pb = arg;
  101. float dot;
  102. float *vec;
  103. int size;
  104. /* get the vector */
  105. vec = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
  106. size = (int)STARPU_GET_VECTOR_NX(descr[0]);
  107. dot = SDOT(size, vec, 1, vec, 1);
  108. fprintf(stderr, "func 3 : DOT = %f\n", dot);
  109. pb->delta_new = dot;
  110. pb->delta_0 = dot;
  111. }
  112. #ifdef STARPU_USE_CUDA
  113. void cublas_codelet_func_3(void *descr[], void *arg)
  114. {
  115. struct cg_problem *pb = arg;
  116. float dot;
  117. float *vec;
  118. uint32_t size;
  119. /* get the vector */
  120. vec = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
  121. size = STARPU_GET_VECTOR_NX(descr[0]);
  122. dot = cublasSdot (size, vec, 1, vec, 1);
  123. pb->delta_new = dot;
  124. pb->delta_0 = dot;
  125. }
  126. #endif
  127. /*
  128. * compute q with : q = A d
  129. *
  130. * descr[0] = A, descr[1] = d, descr [2] = q
  131. */
  132. void cpu_codelet_func_4(void *descr[], __attribute__((unused)) void *arg)
  133. {
  134. float *nzval = (float *)STARPU_GET_CSR_NZVAL(descr[0]);
  135. uint32_t *colind = STARPU_GET_CSR_COLIND(descr[0]);
  136. uint32_t *rowptr = STARPU_GET_CSR_ROWPTR(descr[0]);
  137. uint32_t firstentry = STARPU_GET_CSR_FIRSTENTRY(descr[0]);
  138. float *vecd = (float *)STARPU_GET_VECTOR_PTR(descr[1]);
  139. float *vecq = (float *)STARPU_GET_VECTOR_PTR(descr[2]);
  140. uint32_t nnz;
  141. uint32_t nrow;
  142. nnz = STARPU_GET_CSR_NNZ(descr[0]);
  143. nrow = STARPU_GET_CSR_NROW(descr[0]);
  144. unsigned row;
  145. for (row = 0; row < nrow; row++)
  146. {
  147. float tmp = 0.0f;
  148. unsigned index;
  149. unsigned firstindex = rowptr[row] - firstentry;
  150. unsigned lastindex = rowptr[row+1] - firstentry;
  151. for (index = firstindex; index < lastindex; index++)
  152. {
  153. unsigned col;
  154. col = colind[index];
  155. tmp += nzval[index]*vecd[col];
  156. }
  157. vecq[row] = tmp;
  158. }
  159. }
  160. /*
  161. * compute alpha = delta_new / ( trans(d) q )
  162. *
  163. * descr[0] = d, descr[1] = q
  164. * args = &alpha, &delta_new
  165. */
  166. void cpu_codelet_func_5(void *descr[], void *arg)
  167. {
  168. float dot;
  169. struct cg_problem *pb = arg;
  170. float *vecd, *vecq;
  171. uint32_t size;
  172. /* get the vector */
  173. vecd = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
  174. vecq = (float *)STARPU_GET_VECTOR_PTR(descr[1]);
  175. STARPU_ASSERT(STARPU_GET_VECTOR_NX(descr[0]) == STARPU_GET_VECTOR_NX(descr[1]));
  176. size = STARPU_GET_VECTOR_NX(descr[0]);
  177. dot = SDOT(size, vecd, 1, vecq, 1);
  178. pb->alpha = pb->delta_new / dot;
  179. }
  180. #ifdef STARPU_USE_CUDA
  181. void cublas_codelet_func_5(void *descr[], void *arg)
  182. {
  183. float dot;
  184. struct cg_problem *pb = arg;
  185. float *vecd, *vecq;
  186. uint32_t size;
  187. /* get the vector */
  188. vecd = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
  189. vecq = (float *)STARPU_GET_VECTOR_PTR(descr[1]);
  190. STARPU_ASSERT(STARPU_GET_VECTOR_NX(descr[0]) == STARPU_GET_VECTOR_NX(descr[1]));
  191. size = STARPU_GET_VECTOR_NX(descr[0]);
  192. dot = cublasSdot (size, vecd, 1, vecq, 1);
  193. pb->alpha = pb->delta_new / dot;
  194. }
  195. #endif
  196. /*
  197. * compute x = x + alpha d
  198. *
  199. * descr[0] : x, descr[1] : d
  200. * args = &alpha
  201. */
  202. void cpu_codelet_func_6(void *descr[], void *arg)
  203. {
  204. struct cg_problem *pb = arg;
  205. float *vecx, *vecd;
  206. uint32_t size;
  207. /* get the vector */
  208. vecx = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
  209. vecd = (float *)STARPU_GET_VECTOR_PTR(descr[1]);
  210. size = STARPU_GET_VECTOR_NX(descr[0]);
  211. SAXPY(size, pb->alpha, vecd, 1, vecx, 1);
  212. }
  213. #ifdef STARPU_USE_CUDA
  214. void cublas_codelet_func_6(void *descr[], void *arg)
  215. {
  216. struct cg_problem *pb = arg;
  217. float *vecx, *vecd;
  218. uint32_t size;
  219. /* get the vector */
  220. vecx = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
  221. vecd = (float *)STARPU_GET_VECTOR_PTR(descr[1]);
  222. size = STARPU_GET_VECTOR_NX(descr[0]);
  223. cublasSaxpy (size, pb->alpha, vecd, 1, vecx, 1);
  224. }
  225. #endif
  226. /*
  227. * compute r = r - alpha q
  228. *
  229. * descr[0] : r, descr[1] : q
  230. * args = &alpha
  231. */
  232. void cpu_codelet_func_7(void *descr[], void *arg)
  233. {
  234. struct cg_problem *pb = arg;
  235. float *vecr, *vecq;
  236. uint32_t size;
  237. /* get the vector */
  238. vecr = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
  239. vecq = (float *)STARPU_GET_VECTOR_PTR(descr[1]);
  240. size = STARPU_GET_VECTOR_NX(descr[0]);
  241. SAXPY(size, -pb->alpha, vecq, 1, vecr, 1);
  242. }
  243. #ifdef STARPU_USE_CUDA
  244. void cublas_codelet_func_7(void *descr[], void *arg)
  245. {
  246. struct cg_problem *pb = arg;
  247. float *vecr, *vecq;
  248. uint32_t size;
  249. /* get the vector */
  250. vecr = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
  251. vecq = (float *)STARPU_GET_VECTOR_PTR(descr[1]);
  252. size = STARPU_GET_VECTOR_NX(descr[0]);
  253. cublasSaxpy (size, -pb->alpha, vecq, 1, vecr, 1);
  254. }
  255. #endif
  256. /*
  257. * compute delta_old = delta_new
  258. * delta_new = trans(r) r
  259. * beta = delta_new / delta_old
  260. *
  261. * descr[0] = r
  262. * args = &delta_old, &delta_new, &beta
  263. */
  264. void cpu_codelet_func_8(void *descr[], void *arg)
  265. {
  266. float dot;
  267. struct cg_problem *pb = arg;
  268. float *vecr;
  269. uint32_t size;
  270. /* get the vector */
  271. vecr = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
  272. size = STARPU_GET_VECTOR_NX(descr[0]);
  273. dot = SDOT(size, vecr, 1, vecr, 1);
  274. pb->delta_old = pb->delta_new;
  275. pb->delta_new = dot;
  276. pb->beta = pb->delta_new/pb->delta_old;
  277. }
  278. #ifdef STARPU_USE_CUDA
  279. void cublas_codelet_func_8(void *descr[], void *arg)
  280. {
  281. float dot;
  282. struct cg_problem *pb = arg;
  283. float *vecr;
  284. uint32_t size;
  285. /* get the vector */
  286. vecr = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
  287. size = STARPU_GET_VECTOR_NX(descr[0]);
  288. dot = cublasSdot (size, vecr, 1, vecr, 1);
  289. pb->delta_old = pb->delta_new;
  290. pb->delta_new = dot;
  291. pb->beta = pb->delta_new/pb->delta_old;
  292. }
  293. #endif
  294. /*
  295. * compute d = r + beta d
  296. *
  297. * descr[0] : d, descr[1] : r
  298. * args = &beta
  299. *
  300. */
  301. void cpu_codelet_func_9(void *descr[], void *arg)
  302. {
  303. struct cg_problem *pb = arg;
  304. float *vecd, *vecr;
  305. uint32_t size;
  306. /* get the vector */
  307. vecd = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
  308. vecr = (float *)STARPU_GET_VECTOR_PTR(descr[1]);
  309. size = STARPU_GET_VECTOR_NX(descr[0]);
  310. /* d = beta d */
  311. SSCAL(size, pb->beta, vecd, 1);
  312. /* d = r + d */
  313. SAXPY (size, 1.0f, vecr, 1, vecd, 1);
  314. }
  315. #ifdef STARPU_USE_CUDA
  316. void cublas_codelet_func_9(void *descr[], void *arg)
  317. {
  318. struct cg_problem *pb = arg;
  319. float *vecd, *vecr;
  320. uint32_t size;
  321. /* get the vector */
  322. vecd = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
  323. vecr = (float *)STARPU_GET_VECTOR_PTR(descr[1]);
  324. size = STARPU_GET_VECTOR_NX(descr[0]);
  325. /* d = beta d */
  326. cublasSscal(size, pb->beta, vecd, 1);
  327. /* d = r + d */
  328. cublasSaxpy (size, 1.0f, vecr, 1, vecd, 1);
  329. }
  330. #endif