cg_kernels.c

/*
 * StarPU
 * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * See the GNU Lesser General Public License in COPYING.LGPL for more details.
 */

#include "cg.h"
#include <math.h>
#include <stdio.h>	/* fprintf */
#include <stdlib.h>	/* malloc, free */
#include <string.h>	/* memcpy, memset */
#include <assert.h>	/* assert */

/* Scalar parameters passed to the kernels through cl_arg */
struct kernel_params {
	TYPE p1;
	TYPE p2;
};
#if 0
static void print_vector_from_descr(unsigned nx, TYPE *v)
{
	unsigned i;
	for (i = 0; i < nx; i++)
	{
		fprintf(stderr, "%2.2e ", v[i]);
	}
	fprintf(stderr, "\n");
}

static void print_matrix_from_descr(unsigned nx, unsigned ny, unsigned ld, TYPE *mat)
{
	unsigned i, j;
	for (j = 0; j < nx; j++)
	{
		for (i = 0; i < ny; i++)
		{
			fprintf(stderr, "%2.2e ", mat[j+i*ld]);
		}
		fprintf(stderr, "\n");
	}
}
#endif
/*
 * Reduction accumulation methods
 */

#ifdef STARPU_USE_CUDA
static void accumulate_variable_cuda(void *descr[], void *cl_arg)
{
	TYPE *v_dst = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
	TYPE *v_src = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);

	cublasaxpy(1, (TYPE)1.0, v_src, 1, v_dst, 1);
	cudaError_t ret = cudaThreadSynchronize();
	if (ret)
		STARPU_CUDA_REPORT_ERROR(ret);
}
#endif

static void accumulate_variable_cpu(void *descr[], void *cl_arg)
{
	TYPE *v_dst = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
	TYPE *v_src = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);

	*v_dst = *v_dst + *v_src;
}

static struct starpu_perfmodel_t accumulate_variable_model = {
	.type = STARPU_HISTORY_BASED,
	.symbol = "accumulate_variable"
};

starpu_codelet accumulate_variable_cl = {
	.where = STARPU_CPU|STARPU_CUDA,
	.cpu_func = accumulate_variable_cpu,
#ifdef STARPU_USE_CUDA
	.cuda_func = accumulate_variable_cuda,
#endif
	.nbuffers = 2,
	.model = &accumulate_variable_model
};
#ifdef STARPU_USE_CUDA
static void accumulate_vector_cuda(void *descr[], void *cl_arg)
{
	TYPE *v_dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	TYPE *v_src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	cublasaxpy(n, (TYPE)1.0, v_src, 1, v_dst, 1);
	cudaError_t ret = cudaThreadSynchronize();
	if (ret)
		STARPU_CUDA_REPORT_ERROR(ret);
}
#endif

static void accumulate_vector_cpu(void *descr[], void *cl_arg)
{
	TYPE *v_dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	TYPE *v_src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	AXPY(n, (TYPE)1.0, v_src, 1, v_dst, 1);
}

static struct starpu_perfmodel_t accumulate_vector_model = {
	.type = STARPU_HISTORY_BASED,
	.symbol = "accumulate_vector"
};

starpu_codelet accumulate_vector_cl = {
	.where = STARPU_CPU|STARPU_CUDA,
	.cpu_func = accumulate_vector_cpu,
#ifdef STARPU_USE_CUDA
	.cuda_func = accumulate_vector_cuda,
#endif
	.nbuffers = 2,
	.model = &accumulate_vector_model
};
/*
 * Reduction initialization methods
 */

#ifdef STARPU_USE_CUDA
static void bzero_variable_cuda(void *descr[], void *cl_arg)
{
	TYPE *v = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);

	cublasscal(1, (TYPE)0.0, v, 1);
	cudaThreadSynchronize();
}
#endif

static void bzero_variable_cpu(void *descr[], void *cl_arg)
{
	TYPE *v = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);

	*v = (TYPE)0.0;
}

static struct starpu_perfmodel_t bzero_variable_model = {
	.type = STARPU_HISTORY_BASED,
	.symbol = "bzero_variable"
};

starpu_codelet bzero_variable_cl = {
	.where = STARPU_CPU|STARPU_CUDA,
	.cpu_func = bzero_variable_cpu,
#ifdef STARPU_USE_CUDA
	.cuda_func = bzero_variable_cuda,
#endif
	.nbuffers = 1,
	.model = &bzero_variable_model
};
#ifdef STARPU_USE_CUDA
static void bzero_vector_cuda(void *descr[], void *cl_arg)
{
	TYPE *v = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	cublasscal(n, (TYPE)0.0, v, 1);
	cudaError_t ret = cudaThreadSynchronize();
	if (ret)
		STARPU_CUDA_REPORT_ERROR(ret);
}
#endif

static void bzero_vector_cpu(void *descr[], void *cl_arg)
{
	TYPE *v = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	memset(v, 0, n*sizeof(TYPE));
}

static struct starpu_perfmodel_t bzero_vector_model = {
	.type = STARPU_HISTORY_BASED,
	.symbol = "bzero_vector"
};

starpu_codelet bzero_vector_cl = {
	.where = STARPU_CPU|STARPU_CUDA,
	.cpu_func = bzero_vector_cpu,
#ifdef STARPU_USE_CUDA
	.cuda_func = bzero_vector_cuda,
#endif
	.nbuffers = 1,
	.model = &bzero_vector_model
};
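
/* Illustrative sketch, not part of the original file: the accumulation and
 * initialization codelets above are meant to be paired on a data handle via
 * starpu_data_set_reduction_methods(), so that STARPU_REDUX accesses know
 * how to initialize per-worker copies and fold them back together. The
 * handle name dtq_handle is hypothetical. */
#if 0
static void example_enable_reduction(starpu_data_handle dtq_handle)
{
	/* dtq_handle is the dot-product accumulator accessed in STARPU_REDUX
	 * mode by dot_kernel() below: per-worker copies are zeroed by
	 * bzero_variable_cl and summed by accumulate_variable_cl. */
	starpu_data_set_reduction_methods(dtq_handle,
					  &accumulate_variable_cl,
					  &bzero_variable_cl);
}
#endif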
/*
 * DOT kernel : s = dot(v1, v2)
 */

#ifdef STARPU_USE_CUDA
static void dot_kernel_cuda(void *descr[], void *cl_arg)
{
	cudaError_t ret;
	TYPE *dot = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[2]);
	unsigned n = STARPU_VECTOR_GET_NX(descr[1]);

	/* Get current value */
	TYPE host_dot;
	cudaMemcpy(&host_dot, dot, sizeof(TYPE), cudaMemcpyDeviceToHost);
	ret = cudaThreadSynchronize();
	if (ret)
		STARPU_CUDA_REPORT_ERROR(ret);

	TYPE local_dot = cublasdot(n, v1, 1, v2, 1);
	host_dot += local_dot;
	ret = cudaThreadSynchronize();
	if (ret)
		STARPU_CUDA_REPORT_ERROR(ret);

	cudaMemcpy(dot, &host_dot, sizeof(TYPE), cudaMemcpyHostToDevice);
	ret = cudaThreadSynchronize();
	if (ret)
		STARPU_CUDA_REPORT_ERROR(ret);
}
#endif
static void dot_kernel_cpu(void *descr[], void *cl_arg)
{
	TYPE *dot = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[2]);
	unsigned n = STARPU_VECTOR_GET_NX(descr[1]);

	TYPE local_dot = 0.0;
	/* Note that we explicitly cast the result of the DOT kernel because
	 * some BLAS libraries return a double for sdot, for instance. */
	local_dot = (TYPE)DOT(n, v1, 1, v2, 1);

	*dot = *dot + local_dot;
}
static struct starpu_perfmodel_t dot_kernel_model = {
	.type = STARPU_HISTORY_BASED,
	.symbol = "dot_kernel"
};

static starpu_codelet dot_kernel_cl = {
	.where = STARPU_CPU|STARPU_CUDA,
	.cpu_func = dot_kernel_cpu,
#ifdef STARPU_USE_CUDA
	.cuda_func = dot_kernel_cuda,
#endif
	.nbuffers = 3,
	.model = &dot_kernel_model
};

void dot_kernel(starpu_data_handle v1,
		starpu_data_handle v2,
		starpu_data_handle s,
		unsigned nblocks,
		int use_reduction)
{
	int ret;
	struct starpu_task *task;

	/* Blank the accumulation variable */
	task = starpu_task_create();
	task->cl = &bzero_variable_cl;
	task->buffers[0].handle = s;
	task->buffers[0].mode = STARPU_W;
	ret = starpu_task_submit(task);
	assert(!ret);

	if (use_reduction)
		starpu_task_wait_for_all();

	unsigned b;
	for (b = 0; b < nblocks; b++)
	{
		task = starpu_task_create();
		task->cl = &dot_kernel_cl;
		task->buffers[0].handle = s;
		task->buffers[0].mode = use_reduction?STARPU_REDUX:STARPU_RW;
		task->buffers[1].handle = starpu_data_get_sub_data(v1, 1, b);
		task->buffers[1].mode = STARPU_R;
		task->buffers[2].handle = starpu_data_get_sub_data(v2, 1, b);
		task->buffers[2].mode = STARPU_R;
		ret = starpu_task_submit(task);
		assert(!ret);
	}

	if (use_reduction)
		starpu_task_wait_for_all();
}
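
/* Illustrative sketch, not part of the original file: how dot_kernel() is
 * typically fed. Both vectors are partitioned into nblocks contiguous
 * chunks before the call; this assumes the block vector filter
 * (starpu_block_filter_func_vector) shipped with this StarPU release, and
 * the handle names are hypothetical. */
#if 0
static void example_dot(starpu_data_handle v1, starpu_data_handle v2,
			starpu_data_handle dtq, unsigned nblocks)
{
	struct starpu_data_filter f = {
		.filter_func = starpu_block_filter_func_vector,
		.nchildren = nblocks
	};

	starpu_data_partition(v1, &f);
	starpu_data_partition(v2, &f);

	/* dtq accumulates dot(v1, v2) over all blocks */
	dot_kernel(v1, v2, dtq, nblocks, 1);
}
#endif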
/*
 * SCAL kernel : v1 = p1 v1
 */

#ifdef STARPU_USE_CUDA
static void scal_kernel_cuda(void *descr[], void *cl_arg)
{
	struct kernel_params *params = cl_arg;

	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	/* v1 = p1 v1 */
	TYPE alpha = params->p1;
	cublasscal(n, alpha, v1, 1);
	cudaThreadSynchronize();

	free(params);
}
#endif

static void scal_kernel_cpu(void *descr[], void *cl_arg)
{
	struct kernel_params *params = cl_arg;

	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	/* v1 = p1 v1 */
	TYPE alpha = params->p1;
	SCAL(n, alpha, v1, 1);

	free(params);
}

static struct starpu_perfmodel_t scal_kernel_model = {
	.type = STARPU_HISTORY_BASED,
	.symbol = "scal_kernel"
};

static starpu_codelet scal_kernel_cl = {
	.where = STARPU_CPU|STARPU_CUDA,
	.cpu_func = scal_kernel_cpu,
#ifdef STARPU_USE_CUDA
	.cuda_func = scal_kernel_cuda,
#endif
	.nbuffers = 1,
	.model = &scal_kernel_model
};
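
/* Illustrative sketch, not part of the original file: scal_kernel_cl is
 * only submitted from gemv_kernel() below, following the cl_arg ownership
 * convention used throughout this file: the submitter heap-allocates the
 * parameters and the kernel implementation frees them. */
#if 0
static void example_scal_block(starpu_data_handle v_block, TYPE factor)
{
	struct starpu_task *task = starpu_task_create();
	task->cl = &scal_kernel_cl;
	task->buffers[0].handle = v_block;
	task->buffers[0].mode = STARPU_RW;

	struct kernel_params *params = malloc(sizeof(struct kernel_params));
	assert(params);
	params->p1 = factor;		/* v_block = factor * v_block */
	task->cl_arg = params;		/* freed by scal_kernel_{cpu,cuda} */

	int ret = starpu_task_submit(task);
	assert(!ret);
}
#endif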
/*
 * GEMV kernel : v1 = p1 * v1 + p2 * M v2
 */

#ifdef STARPU_USE_CUDA
static void gemv_kernel_cuda(void *descr[], void *cl_arg)
{
	struct kernel_params *params = cl_arg;

	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[2]);
	TYPE *M = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);

	unsigned ld = STARPU_MATRIX_GET_LD(descr[1]);
	unsigned nx = STARPU_MATRIX_GET_NX(descr[1]);
	unsigned ny = STARPU_MATRIX_GET_NY(descr[1]);

	TYPE alpha = params->p2;
	TYPE beta = params->p1;

	/* Compute v1 = alpha M v2 + beta v1 */
	cublasgemv('N', nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
	cudaThreadSynchronize();

	free(params);
}
#endif

static void gemv_kernel_cpu(void *descr[], void *cl_arg)
{
	struct kernel_params *params = cl_arg;

	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[2]);
	TYPE *M = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);

	unsigned ld = STARPU_MATRIX_GET_LD(descr[1]);
	unsigned nx = STARPU_MATRIX_GET_NX(descr[1]);
	unsigned ny = STARPU_MATRIX_GET_NY(descr[1]);

	TYPE alpha = params->p2;
	TYPE beta = params->p1;

	/* Compute v1 = alpha M v2 + beta v1 */
	GEMV("N", nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);

	free(params);
}

static struct starpu_perfmodel_t gemv_kernel_model = {
	.type = STARPU_HISTORY_BASED,
	.symbol = "gemv_kernel"
};

static starpu_codelet gemv_kernel_cl = {
	.where = STARPU_CPU|STARPU_CUDA,
	.cpu_func = gemv_kernel_cpu,
#ifdef STARPU_USE_CUDA
	.cuda_func = gemv_kernel_cuda,
#endif
	.nbuffers = 3,
	.model = &gemv_kernel_model
};
void gemv_kernel(starpu_data_handle v1,
		starpu_data_handle matrix,
		starpu_data_handle v2,
		TYPE p1, TYPE p2,
		unsigned nblocks,
		int use_reduction)
{
	int ret;
	unsigned b1, b2;

	if (use_reduction)
		starpu_task_wait_for_all();

	/* First scale each block of v1: v1 = p1 v1 */
	for (b2 = 0; b2 < nblocks; b2++)
	{
		struct starpu_task *task = starpu_task_create();
		task->cl = &scal_kernel_cl;
		task->buffers[0].handle = starpu_data_get_sub_data(v1, 1, b2);
		task->buffers[0].mode = STARPU_RW;

		struct kernel_params *params = malloc(sizeof(struct kernel_params));
		assert(params);
		params->p1 = p1;
		task->cl_arg = params;

		ret = starpu_task_submit(task);
		assert(!ret);
	}

	if (use_reduction)
		starpu_task_wait_for_all();

	/* Then accumulate the tile products: v1(b2) += p2 * M(b2,b1) v2(b1) */
	for (b2 = 0; b2 < nblocks; b2++)
	{
		for (b1 = 0; b1 < nblocks; b1++)
		{
			struct starpu_task *task = starpu_task_create();
			task->cl = &gemv_kernel_cl;
			task->buffers[0].handle = starpu_data_get_sub_data(v1, 1, b2);
			task->buffers[0].mode = use_reduction?STARPU_REDUX:STARPU_RW;
			task->buffers[1].handle = starpu_data_get_sub_data(matrix, 2, b2, b1);
			task->buffers[1].mode = STARPU_R;
			task->buffers[2].handle = starpu_data_get_sub_data(v2, 1, b1);
			task->buffers[2].mode = STARPU_R;

			struct kernel_params *params = malloc(sizeof(struct kernel_params));
			assert(params);
			params->p1 = 1.0;
			params->p2 = p2;
			task->cl_arg = params;

			ret = starpu_task_submit(task);
			assert(!ret);
		}
	}

	if (use_reduction)
		starpu_task_wait_for_all();
}
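
/* Illustrative sketch, not part of the original file: in a CG iteration
 * this kernel computes q = A p, i.e. p1 = 0 and p2 = 1, with the matrix
 * partitioned in nblocks x nblocks tiles. Handle names are hypothetical. */
#if 0
static void example_q_equals_Ap(starpu_data_handle q, starpu_data_handle A,
				starpu_data_handle p, unsigned nblocks,
				int use_reduction)
{
	/* q <- 0.0 * q + 1.0 * A p */
	gemv_kernel(q, A, p, (TYPE)0.0, (TYPE)1.0, nblocks, use_reduction);
}
#endif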
/*
 * AXPY + SCAL kernel : v1 = p1 * v1 + p2 * v2
 */

#ifdef STARPU_USE_CUDA
static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
{
	struct kernel_params *params = cl_arg;

	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	/* Compute v1 = p1 * v1 + p2 * v2.
	 *	v1 = p1 v1
	 *	v1 = v1 + p2 v2
	 */
	cublasscal(n, params->p1, v1, 1);
	cublasaxpy(n, params->p2, v2, 1, v1, 1);
	cudaThreadSynchronize();

	free(params);
}
#endif

static void scal_axpy_kernel_cpu(void *descr[], void *cl_arg)
{
	struct kernel_params *params = cl_arg;

	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);

	/* Compute v1 = p1 * v1 + p2 * v2.
	 *	v1 = p1 v1
	 *	v1 = v1 + p2 v2
	 */
	SCAL(nx, params->p1, v1, 1);
	AXPY(nx, params->p2, v2, 1, v1, 1);

	free(params);
}

static struct starpu_perfmodel_t scal_axpy_kernel_model = {
	.type = STARPU_HISTORY_BASED,
	.symbol = "scal_axpy_kernel"
};

static starpu_codelet scal_axpy_kernel_cl = {
	.where = STARPU_CPU|STARPU_CUDA,
	.cpu_func = scal_axpy_kernel_cpu,
#ifdef STARPU_USE_CUDA
	.cuda_func = scal_axpy_kernel_cuda,
#endif
	.nbuffers = 2,
	.model = &scal_axpy_kernel_model
};
void scal_axpy_kernel(starpu_data_handle v1, TYPE p1,
			starpu_data_handle v2, TYPE p2,
			unsigned nblocks)
{
	int ret;
	unsigned b;
	for (b = 0; b < nblocks; b++)
	{
		struct starpu_task *task = starpu_task_create();
		task->cl = &scal_axpy_kernel_cl;
		task->buffers[0].handle = starpu_data_get_sub_data(v1, 1, b);
		task->buffers[0].mode = STARPU_RW;
		task->buffers[1].handle = starpu_data_get_sub_data(v2, 1, b);
		task->buffers[1].mode = STARPU_R;

		struct kernel_params *params = malloc(sizeof(struct kernel_params));
		assert(params);
		params->p1 = p1;
		params->p2 = p2;
		task->cl_arg = params;

		ret = starpu_task_submit(task);
		assert(!ret);
	}
}
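
/* Illustrative sketch, not part of the original file: in CG this kernel
 * maps to the search-direction update p = r + beta p. Handle names are
 * hypothetical. */
#if 0
static void example_update_direction(starpu_data_handle p, starpu_data_handle r,
				     TYPE beta, unsigned nblocks)
{
	/* p <- beta * p + 1.0 * r */
	scal_axpy_kernel(p, beta, r, (TYPE)1.0, nblocks);
}
#endif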
/*
 * AXPY kernel : v1 = v1 + p1 * v2
 */

#ifdef STARPU_USE_CUDA
static void axpy_kernel_cuda(void *descr[], void *cl_arg)
{
	struct kernel_params *params = cl_arg;

	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	/* Compute v1 = v1 + p1 * v2 */
	cublasaxpy(n, params->p1, v2, 1, v1, 1);
	cudaThreadSynchronize();

	free(params);
}
#endif
static void axpy_kernel_cpu(void *descr[], void *cl_arg)
{
	struct kernel_params *params = cl_arg;

	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);

	/* Compute v1 = v1 + p1 * v2 */
	AXPY(nx, params->p1, v2, 1, v1, 1);

	free(params);
}
static struct starpu_perfmodel_t axpy_kernel_model = {
	.type = STARPU_HISTORY_BASED,
	.symbol = "axpy_kernel"
};

static starpu_codelet axpy_kernel_cl = {
	.where = STARPU_CPU|STARPU_CUDA,
	.cpu_func = axpy_kernel_cpu,
#ifdef STARPU_USE_CUDA
	.cuda_func = axpy_kernel_cuda,
#endif
	.nbuffers = 2,
	.model = &axpy_kernel_model
};

void axpy_kernel(starpu_data_handle v1,
		starpu_data_handle v2, TYPE p1,
		unsigned nblocks)
{
	int ret;
	unsigned b;
	for (b = 0; b < nblocks; b++)
	{
		struct starpu_task *task = starpu_task_create();
		task->cl = &axpy_kernel_cl;
		task->buffers[0].handle = starpu_data_get_sub_data(v1, 1, b);
		task->buffers[0].mode = STARPU_RW;
		task->buffers[1].handle = starpu_data_get_sub_data(v2, 1, b);
		task->buffers[1].mode = STARPU_R;

		struct kernel_params *params = malloc(sizeof(struct kernel_params));
		assert(params);
		params->p1 = p1;
		task->cl_arg = params;

		ret = starpu_task_submit(task);
		assert(!ret);
	}
}
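
/* Illustrative sketch, not part of the original file: in CG this kernel
 * covers both per-iteration vector updates, x = x + alpha p and
 * r = r - alpha q. Handle names are hypothetical. */
#if 0
static void example_update_x_and_r(starpu_data_handle x, starpu_data_handle p,
				   starpu_data_handle r, starpu_data_handle q,
				   TYPE alpha, unsigned nblocks)
{
	axpy_kernel(x, p, alpha, nblocks);	/* x <- x + alpha p */
	axpy_kernel(r, q, -alpha, nblocks);	/* r <- r - alpha q */
}
#endif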
/*
 * COPY kernel : vector_dst <- vector_src
 */

static void copy_handle_cpu(void *descr[], void *cl_arg)
{
	TYPE *dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	TYPE *src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);

	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
	size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);

	memcpy(dst, src, nx*elemsize);
}

#ifdef STARPU_USE_CUDA
static void copy_handle_cuda(void *descr[], void *cl_arg)
{
	TYPE *dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	TYPE *src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);

	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
	size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);

	cudaMemcpy(dst, src, nx*elemsize, cudaMemcpyDeviceToDevice);
	cudaThreadSynchronize();
}
#endif

static struct starpu_perfmodel_t copy_handle_model = {
	.type = STARPU_HISTORY_BASED,
	.symbol = "copy_handle"
};

static starpu_codelet copy_handle_cl = {
	.where = STARPU_CPU|STARPU_CUDA,
	.cpu_func = copy_handle_cpu,
#ifdef STARPU_USE_CUDA
	.cuda_func = copy_handle_cuda,
#endif
	.nbuffers = 2,
	.model = &copy_handle_model
};
void copy_handle(starpu_data_handle dst, starpu_data_handle src, unsigned nblocks)
{
	int ret;
	unsigned b;
	for (b = 0; b < nblocks; b++)
	{
		struct starpu_task *task = starpu_task_create();
		task->cl = &copy_handle_cl;
		task->buffers[0].handle = starpu_data_get_sub_data(dst, 1, b);
		task->buffers[0].mode = STARPU_W;
		task->buffers[1].handle = starpu_data_get_sub_data(src, 1, b);
		task->buffers[1].mode = STARPU_R;
		ret = starpu_task_submit(task);
		assert(!ret);
	}
}
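
/* Illustrative sketch, not part of the original file: in CG the copy kernel
 * seeds the first search direction from the initial residual, p = r.
 * Handle names are hypothetical. */
#if 0
static void example_init_direction(starpu_data_handle p, starpu_data_handle r,
				   unsigned nblocks)
{
	/* p <- r, block by block */
	copy_handle(p, r, nblocks);
	starpu_task_wait_for_all();
}
#endif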