cg_kernels.c 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2010-2021 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  4. *
  5. * StarPU is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * StarPU is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. *
  14. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. */
  16. /*
  17. * Standard BLAS kernels used by CG
  18. */
  19. #include "cg.h"
  20. #include <math.h>
  21. #include <limits.h>
  22. #ifdef STARPU_USE_CUDA
  23. #include <cuda.h>
  24. #include <starpu_cublas_v2.h>
  25. static const TYPE gp1 = 1.0;
  26. static const TYPE gm1 = -1.0;
  27. #endif
  28. #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
  29. static int long long n = 4096;
  30. static int nblocks = 8;
  31. #ifdef STARPU_QUICK_CHECK
  32. static int i_max = 5;
  33. #elif !defined(STARPU_LONG_CHECK)
  34. static int i_max = 100;
  35. #else
  36. static int i_max = 1000;
  37. #endif
  38. static double eps = (10e-14);
  39. int use_reduction = 1;
  40. int display_result = 0;
  41. HANDLE_TYPE_MATRIX A_handle;
  42. HANDLE_TYPE_VECTOR b_handle;
  43. HANDLE_TYPE_VECTOR x_handle;
  44. HANDLE_TYPE_VECTOR r_handle;
  45. HANDLE_TYPE_VECTOR d_handle;
  46. HANDLE_TYPE_VECTOR q_handle;
  47. starpu_data_handle_t dtq_handle;
  48. starpu_data_handle_t rtr_handle;
  49. TYPE dtq, rtr;
  50. #if 0
  51. static void print_vector_from_descr(unsigned nx, TYPE *v)
  52. {
  53. unsigned i;
  54. for (i = 0; i < nx; i++)
  55. {
  56. fprintf(stderr, "%2.2e ", v[i]);
  57. }
  58. fprintf(stderr, "\n");
  59. }
  60. static void print_matrix_from_descr(unsigned nx, unsigned ny, unsigned ld, TYPE *mat)
  61. {
  62. unsigned i, j;
  63. for (j = 0; j < nx; j++)
  64. {
  65. for (i = 0; i < ny; i++)
  66. {
  67. fprintf(stderr, "%2.2e ", mat[j+i*ld]);
  68. }
  69. fprintf(stderr, "\n");
  70. }
  71. }
  72. #endif
  73. static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
  74. {
  75. (void)task;
  76. (void)nimpl;
  77. enum starpu_worker_archtype type = starpu_worker_get_type(workerid);
  78. if (type == STARPU_CPU_WORKER || type == STARPU_OPENCL_WORKER || type == STARPU_MIC_WORKER)
  79. return 1;
  80. #ifdef STARPU_USE_CUDA
  81. #ifdef STARPU_SIMGRID
  82. /* We don't know, let's assume it can */
  83. return 1;
  84. #else
  85. /* Cuda device */
  86. const struct cudaDeviceProp *props;
  87. props = starpu_cuda_get_device_properties(workerid);
  88. if (props->major >= 2 || props->minor >= 3)
  89. /* At least compute capability 1.3, supports doubles */
  90. return 1;
  91. #endif
  92. #endif
  93. /* Old card, does not support doubles */
  94. return 0;
  95. }
  96. /*
  97. * Reduction accumulation methods
  98. */
  99. #ifdef STARPU_USE_CUDA
  100. static void accumulate_variable_cuda(void *descr[], void *cl_arg)
  101. {
  102. (void)cl_arg;
  103. TYPE *v_dst = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
  104. TYPE *v_src = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
  105. cublasStatus_t status = cublasaxpy(starpu_cublas_get_local_handle(), 1, &gp1, v_src, 1, v_dst, 1);
  106. if (status != CUBLAS_STATUS_SUCCESS)
  107. STARPU_CUBLAS_REPORT_ERROR(status);
  108. }
  109. #endif
  110. void accumulate_variable_cpu(void *descr[], void *cl_arg)
  111. {
  112. (void)cl_arg;
  113. TYPE *v_dst = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
  114. TYPE *v_src = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
  115. *v_dst = *v_dst + *v_src;
  116. }
  117. static struct starpu_perfmodel accumulate_variable_model =
  118. {
  119. .type = STARPU_HISTORY_BASED,
  120. .symbol = "accumulate_variable"
  121. };
  122. struct starpu_codelet accumulate_variable_cl =
  123. {
  124. .can_execute = can_execute,
  125. .cpu_funcs = {accumulate_variable_cpu},
  126. .cpu_funcs_name = {"accumulate_variable_cpu"},
  127. #ifdef STARPU_USE_CUDA
  128. .cuda_funcs = {accumulate_variable_cuda},
  129. .cuda_flags = {STARPU_CUDA_ASYNC},
  130. #endif
  131. .modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
  132. .nbuffers = 2,
  133. .model = &accumulate_variable_model
  134. };
  135. #ifdef STARPU_USE_CUDA
  136. static void accumulate_vector_cuda(void *descr[], void *cl_arg)
  137. {
  138. (void)cl_arg;
  139. TYPE *v_dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
  140. TYPE *v_src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
  141. unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  142. cublasStatus_t status = cublasaxpy(starpu_cublas_get_local_handle(), n, &gp1, v_src, 1, v_dst, 1);
  143. if (status != CUBLAS_STATUS_SUCCESS)
  144. STARPU_CUBLAS_REPORT_ERROR(status);
  145. }
  146. #endif
  147. void accumulate_vector_cpu(void *descr[], void *cl_arg)
  148. {
  149. (void)cl_arg;
  150. TYPE *v_dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
  151. TYPE *v_src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
  152. unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  153. AXPY(n, (TYPE)1.0, v_src, 1, v_dst, 1);
  154. }
  155. static struct starpu_perfmodel accumulate_vector_model =
  156. {
  157. .type = STARPU_HISTORY_BASED,
  158. .symbol = "accumulate_vector"
  159. };
  160. struct starpu_codelet accumulate_vector_cl =
  161. {
  162. .can_execute = can_execute,
  163. .cpu_funcs = {accumulate_vector_cpu},
  164. .cpu_funcs_name = {"accumulate_vector_cpu"},
  165. #ifdef STARPU_USE_CUDA
  166. .cuda_funcs = {accumulate_vector_cuda},
  167. .cuda_flags = {STARPU_CUDA_ASYNC},
  168. #endif
  169. .modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
  170. .nbuffers = 2,
  171. .model = &accumulate_vector_model
  172. };
  173. /*
  174. * Reduction initialization methods
  175. */
  176. #ifdef STARPU_USE_CUDA
  177. extern void zero_vector(TYPE *x, unsigned nelems);
  178. static void bzero_variable_cuda(void *descr[], void *cl_arg)
  179. {
  180. (void)cl_arg;
  181. TYPE *v = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
  182. size_t size = STARPU_VARIABLE_GET_ELEMSIZE(descr[0]);
  183. cudaMemsetAsync(v, 0, size, starpu_cuda_get_local_stream());
  184. }
  185. #endif
  186. void bzero_variable_cpu(void *descr[], void *cl_arg)
  187. {
  188. (void)cl_arg;
  189. TYPE *v = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
  190. *v = (TYPE)0.0;
  191. }
  192. static struct starpu_perfmodel bzero_variable_model =
  193. {
  194. .type = STARPU_HISTORY_BASED,
  195. .symbol = "bzero_variable"
  196. };
  197. struct starpu_codelet bzero_variable_cl =
  198. {
  199. .can_execute = can_execute,
  200. .cpu_funcs = {bzero_variable_cpu},
  201. .cpu_funcs_name = {"bzero_variable_cpu"},
  202. #ifdef STARPU_USE_CUDA
  203. .cuda_funcs = {bzero_variable_cuda},
  204. .cuda_flags = {STARPU_CUDA_ASYNC},
  205. #endif
  206. .modes = {STARPU_W},
  207. .nbuffers = 1,
  208. .model = &bzero_variable_model
  209. };
  210. #ifdef STARPU_USE_CUDA
  211. static void bzero_vector_cuda(void *descr[], void *cl_arg)
  212. {
  213. (void)cl_arg;
  214. TYPE *v = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
  215. unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  216. size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);
  217. cudaMemsetAsync(v, 0, n * elemsize, starpu_cuda_get_local_stream());
  218. }
  219. #endif
  220. void bzero_vector_cpu(void *descr[], void *cl_arg)
  221. {
  222. (void)cl_arg;
  223. TYPE *v = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
  224. unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  225. memset(v, 0, n*sizeof(TYPE));
  226. }
  227. static struct starpu_perfmodel bzero_vector_model =
  228. {
  229. .type = STARPU_HISTORY_BASED,
  230. .symbol = "bzero_vector"
  231. };
  232. struct starpu_codelet bzero_vector_cl =
  233. {
  234. .can_execute = can_execute,
  235. .cpu_funcs = {bzero_vector_cpu},
  236. .cpu_funcs_name = {"bzero_vector_cpu"},
  237. #ifdef STARPU_USE_CUDA
  238. .cuda_funcs = {bzero_vector_cuda},
  239. .cuda_flags = {STARPU_CUDA_ASYNC},
  240. #endif
  241. .modes = {STARPU_W},
  242. .nbuffers = 1,
  243. .model = &bzero_vector_model
  244. };
  245. /*
  246. * DOT kernel : s = dot(v1, v2)
  247. */
  248. #ifdef STARPU_USE_CUDA
  249. static void dot_kernel_cuda(void *descr[], void *cl_arg)
  250. {
  251. (void)cl_arg;
  252. TYPE *dot = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
  253. TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
  254. TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[2]);
  255. unsigned n = STARPU_VECTOR_GET_NX(descr[1]);
  256. cublasHandle_t handle = starpu_cublas_get_local_handle();
  257. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
  258. cublasStatus_t status = cublasdot(handle,
  259. n, v1, 1, v2, 1, dot);
  260. if (status != CUBLAS_STATUS_SUCCESS)
  261. STARPU_CUBLAS_REPORT_ERROR(status);
  262. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
  263. }
  264. #endif
  265. void dot_kernel_cpu(void *descr[], void *cl_arg)
  266. {
  267. (void)cl_arg;
  268. TYPE *dot = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
  269. TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
  270. TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[2]);
  271. unsigned n = STARPU_VECTOR_GET_NX(descr[1]);
  272. TYPE local_dot;
  273. /* Note that we explicitely cast the result of the DOT kernel because
  274. * some BLAS library will return a double for sdot for instance. */
  275. local_dot = (TYPE)DOT(n, v1, 1, v2, 1);
  276. *dot = *dot + local_dot;
  277. }
  278. static struct starpu_perfmodel dot_kernel_model =
  279. {
  280. .type = STARPU_HISTORY_BASED,
  281. .symbol = "dot_kernel"
  282. };
  283. static struct starpu_codelet dot_kernel_cl =
  284. {
  285. .can_execute = can_execute,
  286. .cpu_funcs = {dot_kernel_cpu},
  287. .cpu_funcs_name = {"dot_kernel_cpu"},
  288. #ifdef STARPU_USE_CUDA
  289. .cuda_funcs = {dot_kernel_cuda},
  290. #endif
  291. .cuda_flags = {STARPU_CUDA_ASYNC},
  292. .nbuffers = 3,
  293. .model = &dot_kernel_model
  294. };
  295. int dot_kernel(HANDLE_TYPE_VECTOR v1,
  296. HANDLE_TYPE_VECTOR v2,
  297. starpu_data_handle_t s,
  298. unsigned nblocks,
  299. int use_reduction)
  300. {
  301. int ret;
  302. /* Blank the accumulation variable */
  303. if (use_reduction)
  304. starpu_data_invalidate_submit(s);
  305. else
  306. {
  307. ret = TASK_INSERT(&bzero_variable_cl, STARPU_W, s, 0);
  308. if (ret == -ENODEV) return ret;
  309. STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
  310. }
  311. unsigned b;
  312. for (b = 0; b < nblocks; b++)
  313. {
  314. ret = TASK_INSERT(&dot_kernel_cl,
  315. use_reduction?STARPU_REDUX:STARPU_RW, s,
  316. STARPU_R, GET_VECTOR_BLOCK(v1, b),
  317. STARPU_R, GET_VECTOR_BLOCK(v2, b),
  318. STARPU_TAG_ONLY, (starpu_tag_t) b,
  319. 0);
  320. STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
  321. }
  322. return 0;
  323. }
  324. /*
  325. * SCAL kernel : v1 = p1 v1
  326. */
  327. #ifdef STARPU_USE_CUDA
  328. static void scal_kernel_cuda(void *descr[], void *cl_arg)
  329. {
  330. TYPE p1;
  331. starpu_codelet_unpack_args(cl_arg, &p1);
  332. TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
  333. unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  334. /* v1 = p1 v1 */
  335. TYPE alpha = p1;
  336. cublasStatus_t status = cublasscal(starpu_cublas_get_local_handle(), n, &alpha, v1, 1);
  337. if (status != CUBLAS_STATUS_SUCCESS)
  338. STARPU_CUBLAS_REPORT_ERROR(status);
  339. }
  340. #endif
  341. void scal_kernel_cpu(void *descr[], void *cl_arg)
  342. {
  343. TYPE alpha;
  344. starpu_codelet_unpack_args(cl_arg, &alpha);
  345. TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
  346. unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  347. /* v1 = alpha v1 */
  348. SCAL(n, alpha, v1, 1);
  349. }
  350. static struct starpu_perfmodel scal_kernel_model =
  351. {
  352. .type = STARPU_HISTORY_BASED,
  353. .symbol = "scal_kernel"
  354. };
  355. static struct starpu_codelet scal_kernel_cl =
  356. {
  357. .can_execute = can_execute,
  358. .cpu_funcs = {scal_kernel_cpu},
  359. .cpu_funcs_name = {"scal_kernel_cpu"},
  360. #ifdef STARPU_USE_CUDA
  361. .cuda_funcs = {scal_kernel_cuda},
  362. .cuda_flags = {STARPU_CUDA_ASYNC},
  363. #endif
  364. .nbuffers = 1,
  365. .model = &scal_kernel_model
  366. };
  367. /*
  368. * GEMV kernel : v1 = p1 * v1 + p2 * M v2
  369. */
  370. #ifdef STARPU_USE_CUDA
  371. static void gemv_kernel_cuda(void *descr[], void *cl_arg)
  372. {
  373. TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
  374. TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[2]);
  375. TYPE *M = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
  376. unsigned ld = STARPU_MATRIX_GET_LD(descr[1]);
  377. unsigned nx = STARPU_MATRIX_GET_NX(descr[1]);
  378. unsigned ny = STARPU_MATRIX_GET_NY(descr[1]);
  379. TYPE alpha, beta;
  380. starpu_codelet_unpack_args(cl_arg, &beta, &alpha);
  381. /* Compute v1 = alpha M v2 + beta v1 */
  382. cublasStatus_t status = cublasgemv(starpu_cublas_get_local_handle(),
  383. CUBLAS_OP_N, nx, ny, &alpha, M, ld, v2, 1, &beta, v1, 1);
  384. if (status != CUBLAS_STATUS_SUCCESS)
  385. STARPU_CUBLAS_REPORT_ERROR(status);
  386. }
  387. #endif
  388. void gemv_kernel_cpu(void *descr[], void *cl_arg)
  389. {
  390. TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
  391. TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[2]);
  392. TYPE *M = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
  393. unsigned ld = STARPU_MATRIX_GET_LD(descr[1]);
  394. unsigned nx = STARPU_MATRIX_GET_NX(descr[1]);
  395. unsigned ny = STARPU_MATRIX_GET_NY(descr[1]);
  396. TYPE alpha, beta;
  397. starpu_codelet_unpack_args(cl_arg, &beta, &alpha);
  398. int worker_size = starpu_combined_worker_get_size();
  399. if (worker_size > 1)
  400. {
  401. /* Parallel CPU task */
  402. unsigned rank = starpu_combined_worker_get_rank();
  403. unsigned block_size = (ny + worker_size - 1)/worker_size;
  404. unsigned new_nx = STARPU_MIN(nx, block_size*(rank+1)) - block_size*rank;
  405. nx = new_nx;
  406. v1 = &v1[block_size*rank];
  407. M = &M[block_size*rank];
  408. }
  409. /* Compute v1 = alpha M v2 + beta v1 */
  410. GEMV("N", nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
  411. }
  412. static struct starpu_perfmodel gemv_kernel_model =
  413. {
  414. .type = STARPU_HISTORY_BASED,
  415. .symbol = "gemv_kernel"
  416. };
  417. static struct starpu_codelet gemv_kernel_cl =
  418. {
  419. .can_execute = can_execute,
  420. .type = STARPU_SPMD,
  421. .max_parallelism = INT_MAX,
  422. .cpu_funcs = {gemv_kernel_cpu},
  423. .cpu_funcs_name = {"gemv_kernel_cpu"},
  424. #ifdef STARPU_USE_CUDA
  425. .cuda_funcs = {gemv_kernel_cuda},
  426. .cuda_flags = {STARPU_CUDA_ASYNC},
  427. #endif
  428. .nbuffers = 3,
  429. .model = &gemv_kernel_model
  430. };
  431. int gemv_kernel(HANDLE_TYPE_VECTOR v1,
  432. HANDLE_TYPE_MATRIX matrix,
  433. HANDLE_TYPE_VECTOR v2,
  434. TYPE p1, TYPE p2,
  435. unsigned nblocks,
  436. int use_reduction)
  437. {
  438. unsigned b1, b2;
  439. int ret;
  440. for (b2 = 0; b2 < nblocks; b2++)
  441. {
  442. ret = TASK_INSERT(&scal_kernel_cl,
  443. STARPU_RW, GET_VECTOR_BLOCK(v1, b2),
  444. STARPU_VALUE, &p1, sizeof(p1),
  445. STARPU_TAG_ONLY, (starpu_tag_t) b2,
  446. 0);
  447. if (ret == -ENODEV) return ret;
  448. STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
  449. }
  450. for (b2 = 0; b2 < nblocks; b2++)
  451. {
  452. for (b1 = 0; b1 < nblocks; b1++)
  453. {
  454. TYPE one = 1.0;
  455. ret = TASK_INSERT(&gemv_kernel_cl,
  456. use_reduction?STARPU_REDUX:STARPU_RW, GET_VECTOR_BLOCK(v1, b2),
  457. STARPU_R, GET_MATRIX_BLOCK(matrix, b2, b1),
  458. STARPU_R, GET_VECTOR_BLOCK(v2, b1),
  459. STARPU_VALUE, &one, sizeof(one),
  460. STARPU_VALUE, &p2, sizeof(p2),
  461. STARPU_TAG_ONLY, ((starpu_tag_t)b2) * nblocks + b1,
  462. 0);
  463. STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
  464. }
  465. }
  466. return 0;
  467. }
  468. /*
  469. * AXPY + SCAL kernel : v1 = p1 * v1 + p2 * v2
  470. */
  471. #ifdef STARPU_USE_CUDA
  472. static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
  473. {
  474. TYPE p1, p2;
  475. starpu_codelet_unpack_args(cl_arg, &p1, &p2);
  476. TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
  477. TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
  478. unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  479. /* Compute v1 = p1 * v1 + p2 * v2.
  480. * v1 = p1 v1
  481. * v1 = v1 + p2 v2
  482. */
  483. cublasStatus_t status;
  484. status = cublasscal(starpu_cublas_get_local_handle(), n, &p1, v1, 1);
  485. if (status != CUBLAS_STATUS_SUCCESS)
  486. STARPU_CUBLAS_REPORT_ERROR(status);
  487. status = cublasaxpy(starpu_cublas_get_local_handle(), n, &p2, v2, 1, v1, 1);
  488. if (status != CUBLAS_STATUS_SUCCESS)
  489. STARPU_CUBLAS_REPORT_ERROR(status);
  490. }
  491. #endif
  492. void scal_axpy_kernel_cpu(void *descr[], void *cl_arg)
  493. {
  494. TYPE p1, p2;
  495. starpu_codelet_unpack_args(cl_arg, &p1, &p2);
  496. TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
  497. TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
  498. unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
  499. /* Compute v1 = p1 * v1 + p2 * v2.
  500. * v1 = p1 v1
  501. * v1 = v1 + p2 v2
  502. */
  503. SCAL(nx, p1, v1, 1);
  504. AXPY(nx, p2, v2, 1, v1, 1);
  505. }
  506. static struct starpu_perfmodel scal_axpy_kernel_model =
  507. {
  508. .type = STARPU_HISTORY_BASED,
  509. .symbol = "scal_axpy_kernel"
  510. };
  511. static struct starpu_codelet scal_axpy_kernel_cl =
  512. {
  513. .can_execute = can_execute,
  514. .cpu_funcs = {scal_axpy_kernel_cpu},
  515. .cpu_funcs_name = {"scal_axpy_kernel_cpu"},
  516. #ifdef STARPU_USE_CUDA
  517. .cuda_funcs = {scal_axpy_kernel_cuda},
  518. .cuda_flags = {STARPU_CUDA_ASYNC},
  519. #endif
  520. .nbuffers = 2,
  521. .model = &scal_axpy_kernel_model
  522. };
  523. int scal_axpy_kernel(HANDLE_TYPE_VECTOR v1, TYPE p1,
  524. HANDLE_TYPE_VECTOR v2, TYPE p2,
  525. unsigned nblocks)
  526. {
  527. unsigned b;
  528. for (b = 0; b < nblocks; b++)
  529. {
  530. int ret;
  531. ret = TASK_INSERT(&scal_axpy_kernel_cl,
  532. STARPU_RW, GET_VECTOR_BLOCK(v1, b),
  533. STARPU_R, GET_VECTOR_BLOCK(v2, b),
  534. STARPU_VALUE, &p1, sizeof(p1),
  535. STARPU_VALUE, &p2, sizeof(p2),
  536. STARPU_TAG_ONLY, (starpu_tag_t) b,
  537. 0);
  538. if (ret == -ENODEV) return ret;
  539. STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
  540. }
  541. return 0;
  542. }
  543. /*
  544. * AXPY kernel : v1 = v1 + p1 * v2
  545. */
  546. #ifdef STARPU_USE_CUDA
  547. static void axpy_kernel_cuda(void *descr[], void *cl_arg)
  548. {
  549. TYPE p1;
  550. starpu_codelet_unpack_args(cl_arg, &p1);
  551. TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
  552. TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
  553. unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  554. /* Compute v1 = v1 + p1 * v2.
  555. */
  556. cublasStatus_t status = cublasaxpy(starpu_cublas_get_local_handle(),
  557. n, &p1, v2, 1, v1, 1);
  558. if (status != CUBLAS_STATUS_SUCCESS)
  559. STARPU_CUBLAS_REPORT_ERROR(status);
  560. }
  561. #endif
  562. void axpy_kernel_cpu(void *descr[], void *cl_arg)
  563. {
  564. TYPE p1;
  565. starpu_codelet_unpack_args(cl_arg, &p1);
  566. TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
  567. TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
  568. unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
  569. /* Compute v1 = p1 * v1 + p2 * v2.
  570. */
  571. AXPY(nx, p1, v2, 1, v1, 1);
  572. }
  573. static struct starpu_perfmodel axpy_kernel_model =
  574. {
  575. .type = STARPU_HISTORY_BASED,
  576. .symbol = "axpy_kernel"
  577. };
  578. static struct starpu_codelet axpy_kernel_cl =
  579. {
  580. .can_execute = can_execute,
  581. .cpu_funcs = {axpy_kernel_cpu},
  582. .cpu_funcs_name = {"axpy_kernel_cpu"},
  583. #ifdef STARPU_USE_CUDA
  584. .cuda_funcs = {axpy_kernel_cuda},
  585. .cuda_flags = {STARPU_CUDA_ASYNC},
  586. #endif
  587. .nbuffers = 2,
  588. .model = &axpy_kernel_model
  589. };
  590. int axpy_kernel(HANDLE_TYPE_VECTOR v1,
  591. HANDLE_TYPE_VECTOR v2, TYPE p1,
  592. unsigned nblocks)
  593. {
  594. unsigned b;
  595. for (b = 0; b < nblocks; b++)
  596. {
  597. int ret;
  598. ret = TASK_INSERT(&axpy_kernel_cl,
  599. STARPU_RW, GET_VECTOR_BLOCK(v1, b),
  600. STARPU_R, GET_VECTOR_BLOCK(v2, b),
  601. STARPU_VALUE, &p1, sizeof(p1),
  602. STARPU_TAG_ONLY, (starpu_tag_t) b,
  603. 0);
  604. if (ret == -ENODEV) return ret;
  605. STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
  606. }
  607. return 0;
  608. }
  609. /*
  610. * Main loop
  611. */
  612. int cg(void)
  613. {
  614. TYPE delta_new, delta_0, error, delta_old, alpha, beta;
  615. double start, end, timing;
  616. int i = 0, ret;
  617. /* r <- b */
  618. ret = copy_handle(r_handle, b_handle, nblocks);
  619. if (ret == -ENODEV) return ret;
  620. /* r <- r - A x */
  621. ret = gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks, use_reduction);
  622. if (ret == -ENODEV) return ret;
  623. /* d <- r */
  624. ret = copy_handle(d_handle, r_handle, nblocks);
  625. if (ret == -ENODEV) return ret;
  626. /* delta_new = dot(r,r) */
  627. ret = dot_kernel(r_handle, r_handle, rtr_handle, nblocks, use_reduction);
  628. if (ret == -ENODEV) return ret;
  629. GET_DATA_HANDLE(rtr_handle);
  630. starpu_data_acquire(rtr_handle, STARPU_R);
  631. delta_new = rtr;
  632. delta_0 = delta_new;
  633. starpu_data_release(rtr_handle);
  634. FPRINTF_SERVER(stderr, "Delta limit: %e\n", (double) (eps*eps*delta_0));
  635. FPRINTF_SERVER(stderr, "**************** INITIAL ****************\n");
  636. FPRINTF_SERVER(stderr, "Delta 0: %e\n", delta_new);
  637. BARRIER();
  638. start = starpu_timing_now();
  639. while ((i < i_max) && ((double)delta_new > (double)(eps*eps*delta_0)))
  640. {
  641. starpu_iteration_push(i);
  642. /* q <- A d */
  643. gemv_kernel(q_handle, A_handle, d_handle, 0.0, 1.0, nblocks, use_reduction);
  644. /* dtq <- dot(d,q) */
  645. dot_kernel(d_handle, q_handle, dtq_handle, nblocks, use_reduction);
  646. /* alpha = delta_new / dtq */
  647. GET_DATA_HANDLE(dtq_handle);
  648. starpu_data_acquire(dtq_handle, STARPU_R);
  649. alpha = delta_new / dtq;
  650. starpu_data_release(dtq_handle);
  651. /* x <- x + alpha d */
  652. axpy_kernel(x_handle, d_handle, alpha, nblocks);
  653. if ((i % 50) == 0)
  654. {
  655. /* r <- b */
  656. copy_handle(r_handle, b_handle, nblocks);
  657. /* r <- r - A x */
  658. gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks, use_reduction);
  659. }
  660. else
  661. {
  662. /* r <- r - alpha q */
  663. axpy_kernel(r_handle, q_handle, -alpha, nblocks);
  664. }
  665. /* delta_new = dot(r,r) */
  666. dot_kernel(r_handle, r_handle, rtr_handle, nblocks, use_reduction);
  667. GET_DATA_HANDLE(rtr_handle);
  668. starpu_data_acquire(rtr_handle, STARPU_R);
  669. delta_old = delta_new;
  670. delta_new = rtr;
  671. beta = delta_new / delta_old;
  672. starpu_data_release(rtr_handle);
  673. /* d <- beta d + r */
  674. scal_axpy_kernel(d_handle, beta, r_handle, 1.0, nblocks);
  675. if ((i % 10) == 0)
  676. {
  677. /* We here take the error as ||r||_2 / (n||b||_2) */
  678. error = sqrt(delta_new/delta_0)/(1.0*n);
  679. FPRINTF_SERVER(stderr, "*****************************************\n");
  680. FPRINTF_SERVER(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
  681. }
  682. starpu_iteration_pop();
  683. i++;
  684. }
  685. BARRIER();
  686. end = starpu_timing_now();
  687. timing = end - start;
  688. error = sqrt(delta_new/delta_0)/(1.0*n);
  689. FPRINTF_SERVER(stderr, "*****************************************\n");
  690. FPRINTF_SERVER(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
  691. FPRINTF_SERVER(stderr, "Total timing : %2.2f seconds\n", timing/10e6);
  692. FPRINTF_SERVER(stderr, "Seconds per iteration : %2.2e seconds\n", timing/10e6/i);
  693. FPRINTF_SERVER(stderr, "Number of iterations per second : %2.2e it/s\n", i/(timing/10e6));
  694. return 0;
  695. }
  696. void parse_common_args(int argc, char **argv)
  697. {
  698. int i;
  699. for (i = 1; i < argc; i++)
  700. {
  701. if (strcmp(argv[i], "-n") == 0)
  702. {
  703. n = (int long long)atoi(argv[++i]);
  704. continue;
  705. }
  706. if (strcmp(argv[i], "-display-result") == 0)
  707. {
  708. display_result = 1;
  709. continue;
  710. }
  711. if (strcmp(argv[i], "-maxiter") == 0)
  712. {
  713. i_max = atoi(argv[++i]);
  714. if (i_max <= 0)
  715. {
  716. FPRINTF_SERVER(stderr, "the number of iterations must be positive, not %d\n", i_max);
  717. exit(EXIT_FAILURE);
  718. }
  719. continue;
  720. }
  721. if (strcmp(argv[i], "-nblocks") == 0)
  722. {
  723. nblocks = atoi(argv[++i]);
  724. continue;
  725. }
  726. if (strcmp(argv[i], "-no-reduction") == 0)
  727. {
  728. use_reduction = 0;
  729. continue;
  730. }
  731. }
  732. }