dw_sparse_cg_kernels.c 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009, 2010 Université de Bordeaux 1
  4. * Copyright (C) 2010 Centre National de la Recherche Scientifique
  5. *
  6. * StarPU is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU Lesser General Public License as published by
  8. * the Free Software Foundation; either version 2.1 of the License, or (at
  9. * your option) any later version.
  10. *
  11. * StarPU is distributed in the hope that it will be useful, but
  12. * WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14. *
  15. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  16. */
  17. #include "dw_sparse_cg.h"
  18. /*
  19. * Algorithm :
  20. *
  21. * i = 0
  22. * r = b - A x
  23. * ( d = A x ; r = r - d )
  24. * d = r
  25. * delta_new = trans(r) r
  26. * delta_0 = delta_new
  27. *
  28. * while (i < i_max && delta_new > eps^2 delta_0)
  29. * {
  30. * q = A d
  31. * alpha = delta_new / ( trans(d) q )
  32. * x = x + alpha d
  33. * if ( i is divisible by 50 )
  34. * r = b - A x
  35. * else
  36. * r = r - alpha q
  37. * delta_old = delta_new
  38. * delta_new = trans(r) r
  39. * beta = delta_new / delta_old
  40. * d = r + beta d
  41. * i = i + 1
  42. * }
  43. */
  44. /*
  45. * compute r = b - A x
  46. *
  47. * descr[0] = A, descr[1] = x, descr [2] = r, descr[3] = b
  48. */
  49. void cpu_codelet_func_1(void *descr[], __attribute__((unused)) void *arg)
  50. {
  51. float *nzval = (float *)STARPU_CSR_GET_NZVAL(descr[0]);
  52. uint32_t *colind = STARPU_CSR_GET_COLIND(descr[0]);
  53. uint32_t *rowptr = STARPU_CSR_GET_ROWPTR(descr[0]);
  54. uint32_t firstentry = STARPU_CSR_GET_ELEMSIZE(descr[0]);
  55. float *vecx = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  56. float *vecr = (float *)STARPU_VECTOR_GET_PTR(descr[2]);
  57. float *vecb = (float *)STARPU_VECTOR_GET_PTR(descr[3]);
  58. uint32_t nnz;
  59. uint32_t nrow;
  60. nnz = STARPU_CSR_GET_NNZ(descr[0]);
  61. nrow = STARPU_CSR_GET_NROW(descr[0]);
  62. unsigned row;
  63. for (row = 0; row < nrow; row++)
  64. {
  65. float tmp = 0.0f;
  66. unsigned index;
  67. unsigned firstindex = rowptr[row] - firstentry;
  68. unsigned lastindex = rowptr[row+1] - firstentry;
  69. for (index = firstindex; index < lastindex; index++)
  70. {
  71. unsigned col;
  72. col = colind[index];
  73. tmp += nzval[index]*vecx[col];
  74. }
  75. vecr[row] = vecb[row] - tmp;
  76. }
  77. }
  78. /*
  79. * compute d = r
  80. * descr[0] = d, descr[1] = r
  81. */
  82. void cpu_codelet_func_2(void *descr[], __attribute__((unused)) void *arg)
  83. {
  84. /* simply copy r into d */
  85. uint32_t nx = STARPU_VECTOR_GET_NX(descr[0]);
  86. size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);
  87. STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
  88. STARPU_ASSERT(STARPU_VECTOR_GET_ELEMSIZE(descr[0]) == STARPU_VECTOR_GET_ELEMSIZE(descr[1]));
  89. float *src = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  90. float *dst = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  91. memcpy(dst, src, nx*elemsize);
  92. }
  93. /*
  94. * compute delta_new = trans(r) r
  95. * delta_0 = delta_new
  96. *
  97. * args = &delta_new, &delta_0
  98. */
  99. void cpu_codelet_func_3(void *descr[], void *arg)
  100. {
  101. struct cg_problem *pb = arg;
  102. float dot;
  103. float *vec;
  104. int size;
  105. /* get the vector */
  106. vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  107. size = (int)STARPU_VECTOR_GET_NX(descr[0]);
  108. dot = SDOT(size, vec, 1, vec, 1);
  109. fprintf(stderr, "func 3 : DOT = %f\n", dot);
  110. pb->delta_new = dot;
  111. pb->delta_0 = dot;
  112. }
  113. #ifdef STARPU_USE_CUDA
  114. void cublas_codelet_func_3(void *descr[], void *arg)
  115. {
  116. struct cg_problem *pb = arg;
  117. float dot;
  118. float *vec;
  119. uint32_t size;
  120. /* get the vector */
  121. vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  122. size = STARPU_VECTOR_GET_NX(descr[0]);
  123. dot = cublasSdot (size, vec, 1, vec, 1);
  124. pb->delta_new = dot;
  125. pb->delta_0 = dot;
  126. }
  127. #endif
  128. /*
  129. * compute q with : q = A d
  130. *
  131. * descr[0] = A, descr[1] = d, descr [2] = q
  132. */
  133. void cpu_codelet_func_4(void *descr[], __attribute__((unused)) void *arg)
  134. {
  135. float *nzval = (float *)STARPU_CSR_GET_NZVAL(descr[0]);
  136. uint32_t *colind = STARPU_CSR_GET_COLIND(descr[0]);
  137. uint32_t *rowptr = STARPU_CSR_GET_ROWPTR(descr[0]);
  138. uint32_t firstentry = STARPU_CSR_GET_FIRSTENTRY(descr[0]);
  139. float *vecd = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  140. float *vecq = (float *)STARPU_VECTOR_GET_PTR(descr[2]);
  141. uint32_t nnz;
  142. uint32_t nrow;
  143. nnz = STARPU_CSR_GET_NNZ(descr[0]);
  144. nrow = STARPU_CSR_GET_NROW(descr[0]);
  145. unsigned row;
  146. for (row = 0; row < nrow; row++)
  147. {
  148. float tmp = 0.0f;
  149. unsigned index;
  150. unsigned firstindex = rowptr[row] - firstentry;
  151. unsigned lastindex = rowptr[row+1] - firstentry;
  152. for (index = firstindex; index < lastindex; index++)
  153. {
  154. unsigned col;
  155. col = colind[index];
  156. tmp += nzval[index]*vecd[col];
  157. }
  158. vecq[row] = tmp;
  159. }
  160. }
  161. /*
  162. * compute alpha = delta_new / ( trans(d) q )
  163. *
  164. * descr[0] = d, descr[1] = q
  165. * args = &alpha, &delta_new
  166. */
  167. void cpu_codelet_func_5(void *descr[], void *arg)
  168. {
  169. float dot;
  170. struct cg_problem *pb = arg;
  171. float *vecd, *vecq;
  172. uint32_t size;
  173. /* get the vector */
  174. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  175. vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  176. STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
  177. size = STARPU_VECTOR_GET_NX(descr[0]);
  178. dot = SDOT(size, vecd, 1, vecq, 1);
  179. pb->alpha = pb->delta_new / dot;
  180. }
  181. #ifdef STARPU_USE_CUDA
  182. void cublas_codelet_func_5(void *descr[], void *arg)
  183. {
  184. float dot;
  185. struct cg_problem *pb = arg;
  186. float *vecd, *vecq;
  187. uint32_t size;
  188. /* get the vector */
  189. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  190. vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  191. STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
  192. size = STARPU_VECTOR_GET_NX(descr[0]);
  193. dot = cublasSdot (size, vecd, 1, vecq, 1);
  194. pb->alpha = pb->delta_new / dot;
  195. }
  196. #endif
  197. /*
  198. * compute x = x + alpha d
  199. *
  200. * descr[0] : x, descr[1] : d
  201. * args = &alpha
  202. */
  203. void cpu_codelet_func_6(void *descr[], void *arg)
  204. {
  205. struct cg_problem *pb = arg;
  206. float *vecx, *vecd;
  207. uint32_t size;
  208. /* get the vector */
  209. vecx = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  210. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  211. size = STARPU_VECTOR_GET_NX(descr[0]);
  212. SAXPY(size, pb->alpha, vecd, 1, vecx, 1);
  213. }
  214. #ifdef STARPU_USE_CUDA
  215. void cublas_codelet_func_6(void *descr[], void *arg)
  216. {
  217. struct cg_problem *pb = arg;
  218. float *vecx, *vecd;
  219. uint32_t size;
  220. /* get the vector */
  221. vecx = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  222. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  223. size = STARPU_VECTOR_GET_NX(descr[0]);
  224. cublasSaxpy (size, pb->alpha, vecd, 1, vecx, 1);
  225. }
  226. #endif
  227. /*
  228. * compute r = r - alpha q
  229. *
  230. * descr[0] : r, descr[1] : q
  231. * args = &alpha
  232. */
  233. void cpu_codelet_func_7(void *descr[], void *arg)
  234. {
  235. struct cg_problem *pb = arg;
  236. float *vecr, *vecq;
  237. uint32_t size;
  238. /* get the vector */
  239. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  240. vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  241. size = STARPU_VECTOR_GET_NX(descr[0]);
  242. SAXPY(size, -pb->alpha, vecq, 1, vecr, 1);
  243. }
  244. #ifdef STARPU_USE_CUDA
  245. void cublas_codelet_func_7(void *descr[], void *arg)
  246. {
  247. struct cg_problem *pb = arg;
  248. float *vecr, *vecq;
  249. uint32_t size;
  250. /* get the vector */
  251. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  252. vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  253. size = STARPU_VECTOR_GET_NX(descr[0]);
  254. cublasSaxpy (size, -pb->alpha, vecq, 1, vecr, 1);
  255. }
  256. #endif
  257. /*
  258. * compute delta_old = delta_new
  259. * delta_new = trans(r) r
  260. * beta = delta_new / delta_old
  261. *
  262. * descr[0] = r
  263. * args = &delta_old, &delta_new, &beta
  264. */
  265. void cpu_codelet_func_8(void *descr[], void *arg)
  266. {
  267. float dot;
  268. struct cg_problem *pb = arg;
  269. float *vecr;
  270. uint32_t size;
  271. /* get the vector */
  272. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  273. size = STARPU_VECTOR_GET_NX(descr[0]);
  274. dot = SDOT(size, vecr, 1, vecr, 1);
  275. pb->delta_old = pb->delta_new;
  276. pb->delta_new = dot;
  277. pb->beta = pb->delta_new/pb->delta_old;
  278. }
  279. #ifdef STARPU_USE_CUDA
  280. void cublas_codelet_func_8(void *descr[], void *arg)
  281. {
  282. float dot;
  283. struct cg_problem *pb = arg;
  284. float *vecr;
  285. uint32_t size;
  286. /* get the vector */
  287. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  288. size = STARPU_VECTOR_GET_NX(descr[0]);
  289. dot = cublasSdot (size, vecr, 1, vecr, 1);
  290. pb->delta_old = pb->delta_new;
  291. pb->delta_new = dot;
  292. pb->beta = pb->delta_new/pb->delta_old;
  293. }
  294. #endif
  295. /*
  296. * compute d = r + beta d
  297. *
  298. * descr[0] : d, descr[1] : r
  299. * args = &beta
  300. *
  301. */
  302. void cpu_codelet_func_9(void *descr[], void *arg)
  303. {
  304. struct cg_problem *pb = arg;
  305. float *vecd, *vecr;
  306. uint32_t size;
  307. /* get the vector */
  308. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  309. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  310. size = STARPU_VECTOR_GET_NX(descr[0]);
  311. /* d = beta d */
  312. SSCAL(size, pb->beta, vecd, 1);
  313. /* d = r + d */
  314. SAXPY (size, 1.0f, vecr, 1, vecd, 1);
  315. }
  316. #ifdef STARPU_USE_CUDA
  317. void cublas_codelet_func_9(void *descr[], void *arg)
  318. {
  319. struct cg_problem *pb = arg;
  320. float *vecd, *vecr;
  321. uint32_t size;
  322. /* get the vector */
  323. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  324. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  325. size = STARPU_VECTOR_GET_NX(descr[0]);
  326. /* d = beta d */
  327. cublasSscal(size, pb->beta, vecd, 1);
  328. /* d = r + d */
  329. cublasSaxpy (size, 1.0f, vecr, 1, vecd, 1);
  330. }
  331. #endif