cg_kernels.c 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2010-2021 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  4. *
  5. * StarPU is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * StarPU is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. *
  14. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. */
  16. /*
  17. * Standard BLAS kernels used by CG
  18. */
  19. #include "cg.h"
  20. #include <math.h>
  21. #include <limits.h>
  22. #ifdef STARPU_USE_CUDA
  23. #include <cuda.h>
  24. #include <starpu_cublas_v2.h>
  25. static const TYPE gp1 = 1.0;
  26. static const TYPE gm1 = -1.0;
  27. #endif
  28. #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
  29. static int nblocks = 8;
  30. #ifdef STARPU_QUICK_CHECK
  31. static int i_max = 5;
  32. static int long long n = 2048;
  33. #elif !defined(STARPU_LONG_CHECK)
  34. static int long long n = 4096;
  35. static int i_max = 100;
  36. #else
  37. static int long long n = 4096;
  38. static int i_max = 1000;
  39. #endif
  40. static double eps = (10e-14);
  41. int use_reduction = 1;
  42. int display_result = 0;
  43. HANDLE_TYPE_MATRIX A_handle;
  44. HANDLE_TYPE_VECTOR b_handle;
  45. HANDLE_TYPE_VECTOR x_handle;
  46. HANDLE_TYPE_VECTOR r_handle;
  47. HANDLE_TYPE_VECTOR d_handle;
  48. HANDLE_TYPE_VECTOR q_handle;
  49. starpu_data_handle_t dtq_handle;
  50. starpu_data_handle_t rtr_handle;
  51. TYPE dtq, rtr;
  52. #if 0
  53. static void print_vector_from_descr(unsigned nx, TYPE *v)
  54. {
  55. unsigned i;
  56. for (i = 0; i < nx; i++)
  57. {
  58. fprintf(stderr, "%2.2e ", v[i]);
  59. }
  60. fprintf(stderr, "\n");
  61. }
  62. static void print_matrix_from_descr(unsigned nx, unsigned ny, unsigned ld, TYPE *mat)
  63. {
  64. unsigned i, j;
  65. for (j = 0; j < nx; j++)
  66. {
  67. for (i = 0; i < ny; i++)
  68. {
  69. fprintf(stderr, "%2.2e ", mat[j+i*ld]);
  70. }
  71. fprintf(stderr, "\n");
  72. }
  73. }
  74. #endif
  75. static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
  76. {
  77. (void)task;
  78. (void)nimpl;
  79. enum starpu_worker_archtype type = starpu_worker_get_type(workerid);
  80. if (type == STARPU_CPU_WORKER || type == STARPU_OPENCL_WORKER || type == STARPU_MIC_WORKER)
  81. return 1;
  82. #ifdef STARPU_USE_CUDA
  83. #ifdef STARPU_SIMGRID
  84. /* We don't know, let's assume it can */
  85. return 1;
  86. #else
  87. /* Cuda device */
  88. const struct cudaDeviceProp *props;
  89. props = starpu_cuda_get_device_properties(workerid);
  90. if (props->major >= 2 || props->minor >= 3)
  91. /* At least compute capability 1.3, supports doubles */
  92. return 1;
  93. #endif
  94. #endif
  95. /* Old card, does not support doubles */
  96. return 0;
  97. }
  98. /*
  99. * Reduction accumulation methods
  100. */
  101. #ifdef STARPU_USE_CUDA
  102. static void accumulate_variable_cuda(void *descr[], void *cl_arg)
  103. {
  104. (void)cl_arg;
  105. TYPE *v_dst = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
  106. TYPE *v_src = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
  107. cublasStatus_t status = cublasaxpy(starpu_cublas_get_local_handle(), 1, &gp1, v_src, 1, v_dst, 1);
  108. if (status != CUBLAS_STATUS_SUCCESS)
  109. STARPU_CUBLAS_REPORT_ERROR(status);
  110. }
  111. #endif
  112. void accumulate_variable_cpu(void *descr[], void *cl_arg)
  113. {
  114. (void)cl_arg;
  115. TYPE *v_dst = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
  116. TYPE *v_src = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
  117. *v_dst = *v_dst + *v_src;
  118. }
  119. static struct starpu_perfmodel accumulate_variable_model =
  120. {
  121. .type = STARPU_HISTORY_BASED,
  122. .symbol = "accumulate_variable"
  123. };
  124. struct starpu_codelet accumulate_variable_cl =
  125. {
  126. .can_execute = can_execute,
  127. .cpu_funcs = {accumulate_variable_cpu},
  128. .cpu_funcs_name = {"accumulate_variable_cpu"},
  129. #ifdef STARPU_USE_CUDA
  130. .cuda_funcs = {accumulate_variable_cuda},
  131. .cuda_flags = {STARPU_CUDA_ASYNC},
  132. #endif
  133. .modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
  134. .nbuffers = 2,
  135. .model = &accumulate_variable_model
  136. };
  137. #ifdef STARPU_USE_CUDA
  138. static void accumulate_vector_cuda(void *descr[], void *cl_arg)
  139. {
  140. (void)cl_arg;
  141. TYPE *v_dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
  142. TYPE *v_src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
  143. unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  144. cublasStatus_t status = cublasaxpy(starpu_cublas_get_local_handle(), n, &gp1, v_src, 1, v_dst, 1);
  145. if (status != CUBLAS_STATUS_SUCCESS)
  146. STARPU_CUBLAS_REPORT_ERROR(status);
  147. }
  148. #endif
  149. void accumulate_vector_cpu(void *descr[], void *cl_arg)
  150. {
  151. (void)cl_arg;
  152. TYPE *v_dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
  153. TYPE *v_src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
  154. unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  155. AXPY(n, (TYPE)1.0, v_src, 1, v_dst, 1);
  156. }
  157. static struct starpu_perfmodel accumulate_vector_model =
  158. {
  159. .type = STARPU_HISTORY_BASED,
  160. .symbol = "accumulate_vector"
  161. };
  162. struct starpu_codelet accumulate_vector_cl =
  163. {
  164. .can_execute = can_execute,
  165. .cpu_funcs = {accumulate_vector_cpu},
  166. .cpu_funcs_name = {"accumulate_vector_cpu"},
  167. #ifdef STARPU_USE_CUDA
  168. .cuda_funcs = {accumulate_vector_cuda},
  169. .cuda_flags = {STARPU_CUDA_ASYNC},
  170. #endif
  171. .modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
  172. .nbuffers = 2,
  173. .model = &accumulate_vector_model
  174. };
  175. /*
  176. * Reduction initialization methods
  177. */
  178. #ifdef STARPU_USE_CUDA
  179. extern void zero_vector(TYPE *x, unsigned nelems);
  180. static void bzero_variable_cuda(void *descr[], void *cl_arg)
  181. {
  182. (void)cl_arg;
  183. TYPE *v = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
  184. size_t size = STARPU_VARIABLE_GET_ELEMSIZE(descr[0]);
  185. cudaMemsetAsync(v, 0, size, starpu_cuda_get_local_stream());
  186. }
  187. #endif
  188. void bzero_variable_cpu(void *descr[], void *cl_arg)
  189. {
  190. (void)cl_arg;
  191. TYPE *v = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
  192. *v = (TYPE)0.0;
  193. }
  194. static struct starpu_perfmodel bzero_variable_model =
  195. {
  196. .type = STARPU_HISTORY_BASED,
  197. .symbol = "bzero_variable"
  198. };
  199. struct starpu_codelet bzero_variable_cl =
  200. {
  201. .can_execute = can_execute,
  202. .cpu_funcs = {bzero_variable_cpu},
  203. .cpu_funcs_name = {"bzero_variable_cpu"},
  204. #ifdef STARPU_USE_CUDA
  205. .cuda_funcs = {bzero_variable_cuda},
  206. .cuda_flags = {STARPU_CUDA_ASYNC},
  207. #endif
  208. .modes = {STARPU_W},
  209. .nbuffers = 1,
  210. .model = &bzero_variable_model
  211. };
  212. #ifdef STARPU_USE_CUDA
  213. static void bzero_vector_cuda(void *descr[], void *cl_arg)
  214. {
  215. (void)cl_arg;
  216. TYPE *v = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
  217. unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  218. size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);
  219. cudaMemsetAsync(v, 0, n * elemsize, starpu_cuda_get_local_stream());
  220. }
  221. #endif
  222. void bzero_vector_cpu(void *descr[], void *cl_arg)
  223. {
  224. (void)cl_arg;
  225. TYPE *v = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
  226. unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  227. memset(v, 0, n*sizeof(TYPE));
  228. }
  229. static struct starpu_perfmodel bzero_vector_model =
  230. {
  231. .type = STARPU_HISTORY_BASED,
  232. .symbol = "bzero_vector"
  233. };
  234. struct starpu_codelet bzero_vector_cl =
  235. {
  236. .can_execute = can_execute,
  237. .cpu_funcs = {bzero_vector_cpu},
  238. .cpu_funcs_name = {"bzero_vector_cpu"},
  239. #ifdef STARPU_USE_CUDA
  240. .cuda_funcs = {bzero_vector_cuda},
  241. .cuda_flags = {STARPU_CUDA_ASYNC},
  242. #endif
  243. .modes = {STARPU_W},
  244. .nbuffers = 1,
  245. .model = &bzero_vector_model
  246. };
  247. /*
  248. * DOT kernel : s = dot(v1, v2)
  249. */
  250. #ifdef STARPU_USE_CUDA
  251. static void dot_kernel_cuda(void *descr[], void *cl_arg)
  252. {
  253. (void)cl_arg;
  254. TYPE *dot = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
  255. TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
  256. TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[2]);
  257. unsigned n = STARPU_VECTOR_GET_NX(descr[1]);
  258. cublasHandle_t handle = starpu_cublas_get_local_handle();
  259. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
  260. cublasStatus_t status = cublasdot(handle,
  261. n, v1, 1, v2, 1, dot);
  262. if (status != CUBLAS_STATUS_SUCCESS)
  263. STARPU_CUBLAS_REPORT_ERROR(status);
  264. cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
  265. }
  266. #endif
  267. void dot_kernel_cpu(void *descr[], void *cl_arg)
  268. {
  269. (void)cl_arg;
  270. TYPE *dot = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
  271. TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
  272. TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[2]);
  273. unsigned n = STARPU_VECTOR_GET_NX(descr[1]);
  274. TYPE local_dot;
  275. /* Note that we explicitely cast the result of the DOT kernel because
  276. * some BLAS library will return a double for sdot for instance. */
  277. local_dot = (TYPE)DOT(n, v1, 1, v2, 1);
  278. *dot = *dot + local_dot;
  279. }
  280. static struct starpu_perfmodel dot_kernel_model =
  281. {
  282. .type = STARPU_HISTORY_BASED,
  283. .symbol = "dot_kernel"
  284. };
  285. static struct starpu_codelet dot_kernel_cl =
  286. {
  287. .can_execute = can_execute,
  288. .cpu_funcs = {dot_kernel_cpu},
  289. .cpu_funcs_name = {"dot_kernel_cpu"},
  290. #ifdef STARPU_USE_CUDA
  291. .cuda_funcs = {dot_kernel_cuda},
  292. #endif
  293. .cuda_flags = {STARPU_CUDA_ASYNC},
  294. .nbuffers = 3,
  295. .model = &dot_kernel_model
  296. };
  297. int dot_kernel(HANDLE_TYPE_VECTOR v1,
  298. HANDLE_TYPE_VECTOR v2,
  299. starpu_data_handle_t s,
  300. unsigned nblocks,
  301. int use_reduction)
  302. {
  303. int ret;
  304. /* Blank the accumulation variable */
  305. if (use_reduction)
  306. starpu_data_invalidate_submit(s);
  307. else
  308. {
  309. ret = TASK_INSERT(&bzero_variable_cl, STARPU_W, s, 0);
  310. if (ret == -ENODEV) return ret;
  311. STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
  312. }
  313. unsigned b;
  314. for (b = 0; b < nblocks; b++)
  315. {
  316. ret = TASK_INSERT(&dot_kernel_cl,
  317. use_reduction?STARPU_REDUX:STARPU_RW, s,
  318. STARPU_R, GET_VECTOR_BLOCK(v1, b),
  319. STARPU_R, GET_VECTOR_BLOCK(v2, b),
  320. STARPU_TAG_ONLY, (starpu_tag_t) b,
  321. 0);
  322. STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
  323. }
  324. return 0;
  325. }
  326. /*
  327. * SCAL kernel : v1 = p1 v1
  328. */
  329. #ifdef STARPU_USE_CUDA
  330. static void scal_kernel_cuda(void *descr[], void *cl_arg)
  331. {
  332. TYPE p1;
  333. starpu_codelet_unpack_args(cl_arg, &p1);
  334. TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
  335. unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  336. /* v1 = p1 v1 */
  337. TYPE alpha = p1;
  338. cublasStatus_t status = cublasscal(starpu_cublas_get_local_handle(), n, &alpha, v1, 1);
  339. if (status != CUBLAS_STATUS_SUCCESS)
  340. STARPU_CUBLAS_REPORT_ERROR(status);
  341. }
  342. #endif
  343. void scal_kernel_cpu(void *descr[], void *cl_arg)
  344. {
  345. TYPE alpha;
  346. starpu_codelet_unpack_args(cl_arg, &alpha);
  347. TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
  348. unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  349. /* v1 = alpha v1 */
  350. SCAL(n, alpha, v1, 1);
  351. }
  352. static struct starpu_perfmodel scal_kernel_model =
  353. {
  354. .type = STARPU_HISTORY_BASED,
  355. .symbol = "scal_kernel"
  356. };
  357. static struct starpu_codelet scal_kernel_cl =
  358. {
  359. .can_execute = can_execute,
  360. .cpu_funcs = {scal_kernel_cpu},
  361. .cpu_funcs_name = {"scal_kernel_cpu"},
  362. #ifdef STARPU_USE_CUDA
  363. .cuda_funcs = {scal_kernel_cuda},
  364. .cuda_flags = {STARPU_CUDA_ASYNC},
  365. #endif
  366. .nbuffers = 1,
  367. .model = &scal_kernel_model
  368. };
  369. /*
  370. * GEMV kernel : v1 = p1 * v1 + p2 * M v2
  371. */
  372. #ifdef STARPU_USE_CUDA
  373. static void gemv_kernel_cuda(void *descr[], void *cl_arg)
  374. {
  375. TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
  376. TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[2]);
  377. TYPE *M = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
  378. unsigned ld = STARPU_MATRIX_GET_LD(descr[1]);
  379. unsigned nx = STARPU_MATRIX_GET_NX(descr[1]);
  380. unsigned ny = STARPU_MATRIX_GET_NY(descr[1]);
  381. TYPE alpha, beta;
  382. starpu_codelet_unpack_args(cl_arg, &beta, &alpha);
  383. /* Compute v1 = alpha M v2 + beta v1 */
  384. cublasStatus_t status = cublasgemv(starpu_cublas_get_local_handle(),
  385. CUBLAS_OP_N, nx, ny, &alpha, M, ld, v2, 1, &beta, v1, 1);
  386. if (status != CUBLAS_STATUS_SUCCESS)
  387. STARPU_CUBLAS_REPORT_ERROR(status);
  388. }
  389. #endif
  390. void gemv_kernel_cpu(void *descr[], void *cl_arg)
  391. {
  392. TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
  393. TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[2]);
  394. TYPE *M = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);
  395. unsigned ld = STARPU_MATRIX_GET_LD(descr[1]);
  396. unsigned nx = STARPU_MATRIX_GET_NX(descr[1]);
  397. unsigned ny = STARPU_MATRIX_GET_NY(descr[1]);
  398. TYPE alpha, beta;
  399. starpu_codelet_unpack_args(cl_arg, &beta, &alpha);
  400. int worker_size = starpu_combined_worker_get_size();
  401. if (worker_size > 1)
  402. {
  403. /* Parallel CPU task */
  404. unsigned rank = starpu_combined_worker_get_rank();
  405. unsigned block_size = (ny + worker_size - 1)/worker_size;
  406. unsigned new_nx = STARPU_MIN(nx, block_size*(rank+1)) - block_size*rank;
  407. nx = new_nx;
  408. v1 = &v1[block_size*rank];
  409. M = &M[block_size*rank];
  410. }
  411. /* Compute v1 = alpha M v2 + beta v1 */
  412. GEMV("N", nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
  413. }
  414. static struct starpu_perfmodel gemv_kernel_model =
  415. {
  416. .type = STARPU_HISTORY_BASED,
  417. .symbol = "gemv_kernel"
  418. };
  419. static struct starpu_codelet gemv_kernel_cl =
  420. {
  421. .can_execute = can_execute,
  422. .type = STARPU_SPMD,
  423. .max_parallelism = INT_MAX,
  424. .cpu_funcs = {gemv_kernel_cpu},
  425. .cpu_funcs_name = {"gemv_kernel_cpu"},
  426. #ifdef STARPU_USE_CUDA
  427. .cuda_funcs = {gemv_kernel_cuda},
  428. .cuda_flags = {STARPU_CUDA_ASYNC},
  429. #endif
  430. .nbuffers = 3,
  431. .model = &gemv_kernel_model
  432. };
  433. int gemv_kernel(HANDLE_TYPE_VECTOR v1,
  434. HANDLE_TYPE_MATRIX matrix,
  435. HANDLE_TYPE_VECTOR v2,
  436. TYPE p1, TYPE p2,
  437. unsigned nblocks,
  438. int use_reduction)
  439. {
  440. unsigned b1, b2;
  441. int ret;
  442. for (b2 = 0; b2 < nblocks; b2++)
  443. {
  444. ret = TASK_INSERT(&scal_kernel_cl,
  445. STARPU_RW, GET_VECTOR_BLOCK(v1, b2),
  446. STARPU_VALUE, &p1, sizeof(p1),
  447. STARPU_TAG_ONLY, (starpu_tag_t) b2,
  448. 0);
  449. if (ret == -ENODEV) return ret;
  450. STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
  451. }
  452. for (b2 = 0; b2 < nblocks; b2++)
  453. {
  454. for (b1 = 0; b1 < nblocks; b1++)
  455. {
  456. TYPE one = 1.0;
  457. ret = TASK_INSERT(&gemv_kernel_cl,
  458. use_reduction?STARPU_REDUX:STARPU_RW, GET_VECTOR_BLOCK(v1, b2),
  459. STARPU_R, GET_MATRIX_BLOCK(matrix, b2, b1),
  460. STARPU_R, GET_VECTOR_BLOCK(v2, b1),
  461. STARPU_VALUE, &one, sizeof(one),
  462. STARPU_VALUE, &p2, sizeof(p2),
  463. STARPU_TAG_ONLY, ((starpu_tag_t)b2) * nblocks + b1,
  464. 0);
  465. STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
  466. }
  467. }
  468. return 0;
  469. }
  470. /*
  471. * AXPY + SCAL kernel : v1 = p1 * v1 + p2 * v2
  472. */
  473. #ifdef STARPU_USE_CUDA
  474. static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
  475. {
  476. TYPE p1, p2;
  477. starpu_codelet_unpack_args(cl_arg, &p1, &p2);
  478. TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
  479. TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
  480. unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  481. /* Compute v1 = p1 * v1 + p2 * v2.
  482. * v1 = p1 v1
  483. * v1 = v1 + p2 v2
  484. */
  485. cublasStatus_t status;
  486. status = cublasscal(starpu_cublas_get_local_handle(), n, &p1, v1, 1);
  487. if (status != CUBLAS_STATUS_SUCCESS)
  488. STARPU_CUBLAS_REPORT_ERROR(status);
  489. status = cublasaxpy(starpu_cublas_get_local_handle(), n, &p2, v2, 1, v1, 1);
  490. if (status != CUBLAS_STATUS_SUCCESS)
  491. STARPU_CUBLAS_REPORT_ERROR(status);
  492. }
  493. #endif
  494. void scal_axpy_kernel_cpu(void *descr[], void *cl_arg)
  495. {
  496. TYPE p1, p2;
  497. starpu_codelet_unpack_args(cl_arg, &p1, &p2);
  498. TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
  499. TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
  500. unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
  501. /* Compute v1 = p1 * v1 + p2 * v2.
  502. * v1 = p1 v1
  503. * v1 = v1 + p2 v2
  504. */
  505. SCAL(nx, p1, v1, 1);
  506. AXPY(nx, p2, v2, 1, v1, 1);
  507. }
  508. static struct starpu_perfmodel scal_axpy_kernel_model =
  509. {
  510. .type = STARPU_HISTORY_BASED,
  511. .symbol = "scal_axpy_kernel"
  512. };
  513. static struct starpu_codelet scal_axpy_kernel_cl =
  514. {
  515. .can_execute = can_execute,
  516. .cpu_funcs = {scal_axpy_kernel_cpu},
  517. .cpu_funcs_name = {"scal_axpy_kernel_cpu"},
  518. #ifdef STARPU_USE_CUDA
  519. .cuda_funcs = {scal_axpy_kernel_cuda},
  520. .cuda_flags = {STARPU_CUDA_ASYNC},
  521. #endif
  522. .nbuffers = 2,
  523. .model = &scal_axpy_kernel_model
  524. };
  525. int scal_axpy_kernel(HANDLE_TYPE_VECTOR v1, TYPE p1,
  526. HANDLE_TYPE_VECTOR v2, TYPE p2,
  527. unsigned nblocks)
  528. {
  529. unsigned b;
  530. for (b = 0; b < nblocks; b++)
  531. {
  532. int ret;
  533. ret = TASK_INSERT(&scal_axpy_kernel_cl,
  534. STARPU_RW, GET_VECTOR_BLOCK(v1, b),
  535. STARPU_R, GET_VECTOR_BLOCK(v2, b),
  536. STARPU_VALUE, &p1, sizeof(p1),
  537. STARPU_VALUE, &p2, sizeof(p2),
  538. STARPU_TAG_ONLY, (starpu_tag_t) b,
  539. 0);
  540. if (ret == -ENODEV) return ret;
  541. STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
  542. }
  543. return 0;
  544. }
  545. /*
  546. * AXPY kernel : v1 = v1 + p1 * v2
  547. */
  548. #ifdef STARPU_USE_CUDA
  549. static void axpy_kernel_cuda(void *descr[], void *cl_arg)
  550. {
  551. TYPE p1;
  552. starpu_codelet_unpack_args(cl_arg, &p1);
  553. TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
  554. TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
  555. unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  556. /* Compute v1 = v1 + p1 * v2.
  557. */
  558. cublasStatus_t status = cublasaxpy(starpu_cublas_get_local_handle(),
  559. n, &p1, v2, 1, v1, 1);
  560. if (status != CUBLAS_STATUS_SUCCESS)
  561. STARPU_CUBLAS_REPORT_ERROR(status);
  562. }
  563. #endif
  564. void axpy_kernel_cpu(void *descr[], void *cl_arg)
  565. {
  566. TYPE p1;
  567. starpu_codelet_unpack_args(cl_arg, &p1);
  568. TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
  569. TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
  570. unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
  571. /* Compute v1 = p1 * v1 + p2 * v2.
  572. */
  573. AXPY(nx, p1, v2, 1, v1, 1);
  574. }
  575. static struct starpu_perfmodel axpy_kernel_model =
  576. {
  577. .type = STARPU_HISTORY_BASED,
  578. .symbol = "axpy_kernel"
  579. };
  580. static struct starpu_codelet axpy_kernel_cl =
  581. {
  582. .can_execute = can_execute,
  583. .cpu_funcs = {axpy_kernel_cpu},
  584. .cpu_funcs_name = {"axpy_kernel_cpu"},
  585. #ifdef STARPU_USE_CUDA
  586. .cuda_funcs = {axpy_kernel_cuda},
  587. .cuda_flags = {STARPU_CUDA_ASYNC},
  588. #endif
  589. .nbuffers = 2,
  590. .model = &axpy_kernel_model
  591. };
  592. int axpy_kernel(HANDLE_TYPE_VECTOR v1,
  593. HANDLE_TYPE_VECTOR v2, TYPE p1,
  594. unsigned nblocks)
  595. {
  596. unsigned b;
  597. for (b = 0; b < nblocks; b++)
  598. {
  599. int ret;
  600. ret = TASK_INSERT(&axpy_kernel_cl,
  601. STARPU_RW, GET_VECTOR_BLOCK(v1, b),
  602. STARPU_R, GET_VECTOR_BLOCK(v2, b),
  603. STARPU_VALUE, &p1, sizeof(p1),
  604. STARPU_TAG_ONLY, (starpu_tag_t) b,
  605. 0);
  606. if (ret == -ENODEV) return ret;
  607. STARPU_CHECK_RETURN_VALUE(ret, "TASK_INSERT");
  608. }
  609. return 0;
  610. }
  611. /*
  612. * Main loop
  613. */
  614. int cg(void)
  615. {
  616. TYPE delta_new, delta_0, error, delta_old, alpha, beta;
  617. double start, end, timing;
  618. int i = 0, ret;
  619. /* r <- b */
  620. ret = copy_handle(r_handle, b_handle, nblocks);
  621. if (ret == -ENODEV) return ret;
  622. /* r <- r - A x */
  623. ret = gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks, use_reduction);
  624. if (ret == -ENODEV) return ret;
  625. /* d <- r */
  626. ret = copy_handle(d_handle, r_handle, nblocks);
  627. if (ret == -ENODEV) return ret;
  628. /* delta_new = dot(r,r) */
  629. ret = dot_kernel(r_handle, r_handle, rtr_handle, nblocks, use_reduction);
  630. if (ret == -ENODEV) return ret;
  631. GET_DATA_HANDLE(rtr_handle);
  632. starpu_data_acquire(rtr_handle, STARPU_R);
  633. delta_new = rtr;
  634. delta_0 = delta_new;
  635. starpu_data_release(rtr_handle);
  636. FPRINTF_SERVER(stderr, "Delta limit: %e\n", (double) (eps*eps*delta_0));
  637. FPRINTF_SERVER(stderr, "**************** INITIAL ****************\n");
  638. FPRINTF_SERVER(stderr, "Delta 0: %e\n", delta_new);
  639. BARRIER();
  640. start = starpu_timing_now();
  641. while ((i < i_max) && ((double)delta_new > (double)(eps*eps*delta_0)))
  642. {
  643. starpu_iteration_push(i);
  644. /* q <- A d */
  645. gemv_kernel(q_handle, A_handle, d_handle, 0.0, 1.0, nblocks, use_reduction);
  646. /* dtq <- dot(d,q) */
  647. dot_kernel(d_handle, q_handle, dtq_handle, nblocks, use_reduction);
  648. /* alpha = delta_new / dtq */
  649. GET_DATA_HANDLE(dtq_handle);
  650. starpu_data_acquire(dtq_handle, STARPU_R);
  651. alpha = delta_new / dtq;
  652. starpu_data_release(dtq_handle);
  653. /* x <- x + alpha d */
  654. axpy_kernel(x_handle, d_handle, alpha, nblocks);
  655. if ((i % 50) == 0)
  656. {
  657. /* r <- b */
  658. copy_handle(r_handle, b_handle, nblocks);
  659. /* r <- r - A x */
  660. gemv_kernel(r_handle, A_handle, x_handle, 1.0, -1.0, nblocks, use_reduction);
  661. }
  662. else
  663. {
  664. /* r <- r - alpha q */
  665. axpy_kernel(r_handle, q_handle, -alpha, nblocks);
  666. }
  667. /* delta_new = dot(r,r) */
  668. dot_kernel(r_handle, r_handle, rtr_handle, nblocks, use_reduction);
  669. GET_DATA_HANDLE(rtr_handle);
  670. starpu_data_acquire(rtr_handle, STARPU_R);
  671. delta_old = delta_new;
  672. delta_new = rtr;
  673. beta = delta_new / delta_old;
  674. starpu_data_release(rtr_handle);
  675. /* d <- beta d + r */
  676. scal_axpy_kernel(d_handle, beta, r_handle, 1.0, nblocks);
  677. if ((i % 10) == 0)
  678. {
  679. /* We here take the error as ||r||_2 / (n||b||_2) */
  680. error = sqrt(delta_new/delta_0)/(1.0*n);
  681. FPRINTF_SERVER(stderr, "*****************************************\n");
  682. FPRINTF_SERVER(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
  683. }
  684. starpu_iteration_pop();
  685. i++;
  686. }
  687. BARRIER();
  688. end = starpu_timing_now();
  689. timing = end - start;
  690. error = sqrt(delta_new/delta_0)/(1.0*n);
  691. FPRINTF_SERVER(stderr, "*****************************************\n");
  692. FPRINTF_SERVER(stderr, "iter %d DELTA %e - %e\n", i, delta_new, error);
  693. FPRINTF_SERVER(stderr, "Total timing : %2.2f seconds\n", timing/10e6);
  694. FPRINTF_SERVER(stderr, "Seconds per iteration : %2.2e seconds\n", timing/10e6/i);
  695. FPRINTF_SERVER(stderr, "Number of iterations per second : %2.2e it/s\n", i/(timing/10e6));
  696. return 0;
  697. }
  698. void parse_common_args(int argc, char **argv)
  699. {
  700. int i;
  701. for (i = 1; i < argc; i++)
  702. {
  703. if (strcmp(argv[i], "-n") == 0)
  704. {
  705. n = (int long long)atoi(argv[++i]);
  706. continue;
  707. }
  708. if (strcmp(argv[i], "-display-result") == 0)
  709. {
  710. display_result = 1;
  711. continue;
  712. }
  713. if (strcmp(argv[i], "-maxiter") == 0)
  714. {
  715. i_max = atoi(argv[++i]);
  716. if (i_max <= 0)
  717. {
  718. FPRINTF_SERVER(stderr, "the number of iterations must be positive, not %d\n", i_max);
  719. exit(EXIT_FAILURE);
  720. }
  721. continue;
  722. }
  723. if (strcmp(argv[i], "-nblocks") == 0)
  724. {
  725. nblocks = atoi(argv[++i]);
  726. continue;
  727. }
  728. if (strcmp(argv[i], "-no-reduction") == 0)
  729. {
  730. use_reduction = 0;
  731. continue;
  732. }
  733. }
  734. }