dw_sparse_cg_kernels.c 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009, 2010 Université de Bordeaux 1
  4. * Copyright (C) 2010 Centre National de la Recherche Scientifique
  5. *
  6. * StarPU is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU Lesser General Public License as published by
  8. * the Free Software Foundation; either version 2.1 of the License, or (at
  9. * your option) any later version.
  10. *
  11. * StarPU is distributed in the hope that it will be useful, but
  12. * WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14. *
  15. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  16. */
  17. #include "dw_sparse_cg.h"
  18. /*
  19. * Algorithm :
  20. *
  21. * i = 0
  22. * r = b - A x
  23. * ( d = A x ; r = r - d )
  24. * d = r
  25. * delta_new = trans(r) r
  26. * delta_0 = delta_new
  27. *
  28. * while (i < i_max && delta_new > eps^2 delta_0)
  29. * {
  30. * q = A d
  31. * alpha = delta_new / ( trans(d) q )
  32. * x = x + alpha d
  33. * if ( i is divisible by 50 )
  34. * r = b - A x
  35. * else
  36. * r = r - alpha q
  37. * delta_old = delta_new
  38. * delta_new = trans(r) r
  39. * beta = delta_new / delta_old
  40. * d = r + beta d
  41. * i = i + 1
  42. * }
  43. */
  44. /*
  45. * compute r = b - A x
  46. *
  47. * descr[0] = A, descr[1] = x, descr [2] = r, descr[3] = b
  48. */
  49. void cpu_codelet_func_1(void *descr[], __attribute__((unused)) void *arg)
  50. {
  51. float *nzval = (float *)STARPU_CSR_GET_NZVAL(descr[0]);
  52. uint32_t *colind = STARPU_CSR_GET_COLIND(descr[0]);
  53. uint32_t *rowptr = STARPU_CSR_GET_ROWPTR(descr[0]);
  54. uint32_t firstentry = STARPU_CSR_GET_ELEMSIZE(descr[0]);
  55. float *vecx = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  56. float *vecr = (float *)STARPU_VECTOR_GET_PTR(descr[2]);
  57. float *vecb = (float *)STARPU_VECTOR_GET_PTR(descr[3]);
  58. uint32_t nrow;
  59. nrow = STARPU_CSR_GET_NROW(descr[0]);
  60. unsigned row;
  61. for (row = 0; row < nrow; row++)
  62. {
  63. float tmp = 0.0f;
  64. unsigned index;
  65. unsigned firstindex = rowptr[row] - firstentry;
  66. unsigned lastindex = rowptr[row+1] - firstentry;
  67. for (index = firstindex; index < lastindex; index++)
  68. {
  69. unsigned col;
  70. col = colind[index];
  71. tmp += nzval[index]*vecx[col];
  72. }
  73. vecr[row] = vecb[row] - tmp;
  74. }
  75. }
  76. /*
  77. * compute d = r
  78. * descr[0] = d, descr[1] = r
  79. */
  80. void cpu_codelet_func_2(void *descr[], __attribute__((unused)) void *arg)
  81. {
  82. /* simply copy r into d */
  83. uint32_t nx = STARPU_VECTOR_GET_NX(descr[0]);
  84. size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);
  85. STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
  86. STARPU_ASSERT(STARPU_VECTOR_GET_ELEMSIZE(descr[0]) == STARPU_VECTOR_GET_ELEMSIZE(descr[1]));
  87. float *src = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  88. float *dst = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  89. memcpy(dst, src, nx*elemsize);
  90. }
  91. /*
  92. * compute delta_new = trans(r) r
  93. * delta_0 = delta_new
  94. *
  95. * args = &delta_new, &delta_0
  96. */
  97. void cpu_codelet_func_3(void *descr[], void *arg)
  98. {
  99. struct cg_problem *pb = arg;
  100. float dot;
  101. float *vec;
  102. int size;
  103. /* get the vector */
  104. vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  105. size = (int)STARPU_VECTOR_GET_NX(descr[0]);
  106. dot = SDOT(size, vec, 1, vec, 1);
  107. fprintf(stderr, "func 3 : DOT = %f\n", dot);
  108. pb->delta_new = dot;
  109. pb->delta_0 = dot;
  110. }
  111. #ifdef STARPU_USE_CUDA
  112. void cublas_codelet_func_3(void *descr[], void *arg)
  113. {
  114. struct cg_problem *pb = arg;
  115. float dot;
  116. float *vec;
  117. uint32_t size;
  118. /* get the vector */
  119. vec = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  120. size = STARPU_VECTOR_GET_NX(descr[0]);
  121. dot = cublasSdot (size, vec, 1, vec, 1);
  122. pb->delta_new = dot;
  123. pb->delta_0 = dot;
  124. }
  125. #endif
  126. /*
  127. * compute q with : q = A d
  128. *
  129. * descr[0] = A, descr[1] = d, descr [2] = q
  130. */
  131. void cpu_codelet_func_4(void *descr[], __attribute__((unused)) void *arg)
  132. {
  133. float *nzval = (float *)STARPU_CSR_GET_NZVAL(descr[0]);
  134. uint32_t *colind = STARPU_CSR_GET_COLIND(descr[0]);
  135. uint32_t *rowptr = STARPU_CSR_GET_ROWPTR(descr[0]);
  136. uint32_t firstentry = STARPU_CSR_GET_FIRSTENTRY(descr[0]);
  137. float *vecd = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  138. float *vecq = (float *)STARPU_VECTOR_GET_PTR(descr[2]);
  139. uint32_t nrow;
  140. nrow = STARPU_CSR_GET_NROW(descr[0]);
  141. unsigned row;
  142. for (row = 0; row < nrow; row++)
  143. {
  144. float tmp = 0.0f;
  145. unsigned index;
  146. unsigned firstindex = rowptr[row] - firstentry;
  147. unsigned lastindex = rowptr[row+1] - firstentry;
  148. for (index = firstindex; index < lastindex; index++)
  149. {
  150. unsigned col;
  151. col = colind[index];
  152. tmp += nzval[index]*vecd[col];
  153. }
  154. vecq[row] = tmp;
  155. }
  156. }
  157. /*
  158. * compute alpha = delta_new / ( trans(d) q )
  159. *
  160. * descr[0] = d, descr[1] = q
  161. * args = &alpha, &delta_new
  162. */
  163. void cpu_codelet_func_5(void *descr[], void *arg)
  164. {
  165. float dot;
  166. struct cg_problem *pb = arg;
  167. float *vecd, *vecq;
  168. uint32_t size;
  169. /* get the vector */
  170. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  171. vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  172. STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
  173. size = STARPU_VECTOR_GET_NX(descr[0]);
  174. dot = SDOT(size, vecd, 1, vecq, 1);
  175. pb->alpha = pb->delta_new / dot;
  176. }
  177. #ifdef STARPU_USE_CUDA
  178. void cublas_codelet_func_5(void *descr[], void *arg)
  179. {
  180. float dot;
  181. struct cg_problem *pb = arg;
  182. float *vecd, *vecq;
  183. uint32_t size;
  184. /* get the vector */
  185. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  186. vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  187. STARPU_ASSERT(STARPU_VECTOR_GET_NX(descr[0]) == STARPU_VECTOR_GET_NX(descr[1]));
  188. size = STARPU_VECTOR_GET_NX(descr[0]);
  189. dot = cublasSdot (size, vecd, 1, vecq, 1);
  190. pb->alpha = pb->delta_new / dot;
  191. }
  192. #endif
  193. /*
  194. * compute x = x + alpha d
  195. *
  196. * descr[0] : x, descr[1] : d
  197. * args = &alpha
  198. */
  199. void cpu_codelet_func_6(void *descr[], void *arg)
  200. {
  201. struct cg_problem *pb = arg;
  202. float *vecx, *vecd;
  203. uint32_t size;
  204. /* get the vector */
  205. vecx = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  206. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  207. size = STARPU_VECTOR_GET_NX(descr[0]);
  208. SAXPY(size, pb->alpha, vecd, 1, vecx, 1);
  209. }
  210. #ifdef STARPU_USE_CUDA
  211. void cublas_codelet_func_6(void *descr[], void *arg)
  212. {
  213. struct cg_problem *pb = arg;
  214. float *vecx, *vecd;
  215. uint32_t size;
  216. /* get the vector */
  217. vecx = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  218. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  219. size = STARPU_VECTOR_GET_NX(descr[0]);
  220. cublasSaxpy (size, pb->alpha, vecd, 1, vecx, 1);
  221. }
  222. #endif
  223. /*
  224. * compute r = r - alpha q
  225. *
  226. * descr[0] : r, descr[1] : q
  227. * args = &alpha
  228. */
  229. void cpu_codelet_func_7(void *descr[], void *arg)
  230. {
  231. struct cg_problem *pb = arg;
  232. float *vecr, *vecq;
  233. uint32_t size;
  234. /* get the vector */
  235. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  236. vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  237. size = STARPU_VECTOR_GET_NX(descr[0]);
  238. SAXPY(size, -pb->alpha, vecq, 1, vecr, 1);
  239. }
  240. #ifdef STARPU_USE_CUDA
  241. void cublas_codelet_func_7(void *descr[], void *arg)
  242. {
  243. struct cg_problem *pb = arg;
  244. float *vecr, *vecq;
  245. uint32_t size;
  246. /* get the vector */
  247. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  248. vecq = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  249. size = STARPU_VECTOR_GET_NX(descr[0]);
  250. cublasSaxpy (size, -pb->alpha, vecq, 1, vecr, 1);
  251. }
  252. #endif
  253. /*
  254. * compute delta_old = delta_new
  255. * delta_new = trans(r) r
  256. * beta = delta_new / delta_old
  257. *
  258. * descr[0] = r
  259. * args = &delta_old, &delta_new, &beta
  260. */
  261. void cpu_codelet_func_8(void *descr[], void *arg)
  262. {
  263. float dot;
  264. struct cg_problem *pb = arg;
  265. float *vecr;
  266. uint32_t size;
  267. /* get the vector */
  268. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  269. size = STARPU_VECTOR_GET_NX(descr[0]);
  270. dot = SDOT(size, vecr, 1, vecr, 1);
  271. pb->delta_old = pb->delta_new;
  272. pb->delta_new = dot;
  273. pb->beta = pb->delta_new/pb->delta_old;
  274. }
  275. #ifdef STARPU_USE_CUDA
  276. void cublas_codelet_func_8(void *descr[], void *arg)
  277. {
  278. float dot;
  279. struct cg_problem *pb = arg;
  280. float *vecr;
  281. uint32_t size;
  282. /* get the vector */
  283. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  284. size = STARPU_VECTOR_GET_NX(descr[0]);
  285. dot = cublasSdot (size, vecr, 1, vecr, 1);
  286. pb->delta_old = pb->delta_new;
  287. pb->delta_new = dot;
  288. pb->beta = pb->delta_new/pb->delta_old;
  289. }
  290. #endif
  291. /*
  292. * compute d = r + beta d
  293. *
  294. * descr[0] : d, descr[1] : r
  295. * args = &beta
  296. *
  297. */
  298. void cpu_codelet_func_9(void *descr[], void *arg)
  299. {
  300. struct cg_problem *pb = arg;
  301. float *vecd, *vecr;
  302. uint32_t size;
  303. /* get the vector */
  304. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  305. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  306. size = STARPU_VECTOR_GET_NX(descr[0]);
  307. /* d = beta d */
  308. SSCAL(size, pb->beta, vecd, 1);
  309. /* d = r + d */
  310. SAXPY (size, 1.0f, vecr, 1, vecd, 1);
  311. }
  312. #ifdef STARPU_USE_CUDA
  313. void cublas_codelet_func_9(void *descr[], void *arg)
  314. {
  315. struct cg_problem *pb = arg;
  316. float *vecd, *vecr;
  317. uint32_t size;
  318. /* get the vector */
  319. vecd = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  320. vecr = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  321. size = STARPU_VECTOR_GET_NX(descr[0]);
  322. /* d = beta d */
  323. cublasSscal(size, pb->beta, vecd, 1);
  324. /* d = r + d */
  325. cublasSaxpy (size, 1.0f, vecr, 1, vecd, 1);
  326. }
  327. #endif