/*
 * StarPU
 * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * See the GNU Lesser General Public License in COPYING.LGPL for more details.
 */
#include "cg.h"
#include <math.h>
#include <string.h>	/* memset, memcpy */
#include <stdlib.h>	/* malloc, free */
#include <assert.h>

/* Scalar parameters passed to the kernels through cl_arg */
struct kernel_params {
	TYPE p1;
	TYPE p2;
};
#if 0
static void print_vector_from_descr(unsigned nx, TYPE *v)
{
	unsigned i;
	for (i = 0; i < nx; i++)
	{
		fprintf(stderr, "%2.2e ", v[i]);
	}
	fprintf(stderr, "\n");
}

static void print_matrix_from_descr(unsigned nx, unsigned ny, unsigned ld, TYPE *mat)
{
	unsigned i, j;
	for (j = 0; j < nx; j++)
	{
		for (i = 0; i < ny; i++)
		{
			fprintf(stderr, "%2.2e ", mat[j+i*ld]);
		}
		fprintf(stderr, "\n");
	}
}
#endif
/*
 * Reduction accumulation methods
 */
static void accumulate_variable_cuda(void *descr[], void *cl_arg)
{
	TYPE *v_dst = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
	TYPE *v_src = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);

	cublasaxpy(1, (TYPE)1.0, v_src, 1, v_dst, 1);
	cudaError_t ret = cudaThreadSynchronize();
	if (ret)
		STARPU_CUDA_REPORT_ERROR(ret);
}

static void accumulate_variable_cpu(void *descr[], void *cl_arg)
{
	TYPE *v_dst = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
	TYPE *v_src = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);

	*v_dst = *v_dst + *v_src;
}

static struct starpu_perfmodel_t accumulate_variable_model = {
	.type = STARPU_HISTORY_BASED,
	.symbol = "accumulate_variable"
};

starpu_codelet accumulate_variable_cl = {
	.where = STARPU_CPU|STARPU_CUDA,
	.cpu_func = accumulate_variable_cpu,
	.cuda_func = accumulate_variable_cuda,
	.nbuffers = 2,
	.model = &accumulate_variable_model
};
static void accumulate_vector_cuda(void *descr[], void *cl_arg)
{
	TYPE *v_dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	TYPE *v_src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	cublasaxpy(n, (TYPE)1.0, v_src, 1, v_dst, 1);
	cudaError_t ret = cudaThreadSynchronize();
	if (ret)
		STARPU_CUDA_REPORT_ERROR(ret);
}

static void accumulate_vector_cpu(void *descr[], void *cl_arg)
{
	TYPE *v_dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	TYPE *v_src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	AXPY(n, (TYPE)1.0, v_src, 1, v_dst, 1);
}

static struct starpu_perfmodel_t accumulate_vector_model = {
	.type = STARPU_HISTORY_BASED,
	.symbol = "accumulate_vector"
};

starpu_codelet accumulate_vector_cl = {
	.where = STARPU_CPU|STARPU_CUDA,
	.cpu_func = accumulate_vector_cpu,
	.cuda_func = accumulate_vector_cuda,
	.nbuffers = 2,
	.model = &accumulate_vector_model
};
/*
 * Reduction initialization methods
 */
static void bzero_variable_cuda(void *descr[], void *cl_arg)
{
	TYPE *v = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);

	cublasscal(1, (TYPE)0.0, v, 1);
	cudaError_t ret = cudaThreadSynchronize();
	if (ret)
		STARPU_CUDA_REPORT_ERROR(ret);
}

static void bzero_variable_cpu(void *descr[], void *cl_arg)
{
	TYPE *v = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
	*v = (TYPE)0.0;
}

static struct starpu_perfmodel_t bzero_variable_model = {
	.type = STARPU_HISTORY_BASED,
	.symbol = "bzero_variable"
};

starpu_codelet bzero_variable_cl = {
	.where = STARPU_CPU|STARPU_CUDA,
	.cpu_func = bzero_variable_cpu,
	.cuda_func = bzero_variable_cuda,
	.nbuffers = 1,
	.model = &bzero_variable_model
};
static void bzero_vector_cuda(void *descr[], void *cl_arg)
{
	TYPE *v = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	cublasscal(n, (TYPE)0.0, v, 1);
	cudaError_t ret = cudaThreadSynchronize();
	if (ret)
		STARPU_CUDA_REPORT_ERROR(ret);
}

static void bzero_vector_cpu(void *descr[], void *cl_arg)
{
	TYPE *v = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
	memset(v, 0, n*sizeof(TYPE));
}

static struct starpu_perfmodel_t bzero_vector_model = {
	.type = STARPU_HISTORY_BASED,
	.symbol = "bzero_vector"
};

starpu_codelet bzero_vector_cl = {
	.where = STARPU_CPU|STARPU_CUDA,
	.cpu_func = bzero_vector_cpu,
	.cuda_func = bzero_vector_cuda,
	.nbuffers = 1,
	.model = &bzero_vector_model
};
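
/*
 * Illustration (not compiled): how the accumulation and initialization
 * codelets above are meant to be paired. Before tasks can access a handle
 * in STARPU_REDUX mode, the application registers a reduction operator
 * (accumulate_*) and a neutral-element initializer (bzero_*) for it via
 * starpu_data_set_reduction_methods(). This is a sketch; the handle name
 * dtq_handle is hypothetical.
 */
#if 0
static void declare_reduction_example(starpu_data_handle dtq_handle)
{
	/* dtq_handle holds a single TYPE scalar used as a dot-product
	 * accumulator: initialize it with bzero_variable_cl and combine
	 * per-worker contributions with accumulate_variable_cl. */
	starpu_data_set_reduction_methods(dtq_handle,
					&accumulate_variable_cl,
					&bzero_variable_cl);
}
#endif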
/*
 * DOT kernel : s = dot(v1, v2)
 */
static void dot_kernel_cuda(void *descr[], void *cl_arg)
{
	cudaError_t ret;
	TYPE *dot = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[2]);

	unsigned n = STARPU_VECTOR_GET_NX(descr[1]);

	/* Get the current value of the accumulator */
	TYPE host_dot;
	cudaMemcpy(&host_dot, dot, sizeof(TYPE), cudaMemcpyDeviceToHost);
	ret = cudaThreadSynchronize();
	if (ret)
		STARPU_CUDA_REPORT_ERROR(ret);

	TYPE local_dot = cublasdot(n, v1, 1, v2, 1);
	host_dot += local_dot;
	ret = cudaThreadSynchronize();
	if (ret)
		STARPU_CUDA_REPORT_ERROR(ret);

	cudaMemcpy(dot, &host_dot, sizeof(TYPE), cudaMemcpyHostToDevice);
	ret = cudaThreadSynchronize();
	if (ret)
		STARPU_CUDA_REPORT_ERROR(ret);
}

static void dot_kernel_cpu(void *descr[], void *cl_arg)
{
	TYPE *dot = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[2]);

	unsigned n = STARPU_VECTOR_GET_NX(descr[1]);

	TYPE local_dot = 0.0;
	/* Note that we explicitly cast the result of the DOT kernel because
	 * some BLAS libraries return a double for sdot, for instance. */
	local_dot = (TYPE)DOT(n, v1, 1, v2, 1);

	*dot = *dot + local_dot;
}

static struct starpu_perfmodel_t dot_kernel_model = {
	.type = STARPU_HISTORY_BASED,
	.symbol = "dot_kernel"
};

static starpu_codelet dot_kernel_cl = {
	.where = STARPU_CPU|STARPU_CUDA,
	.cpu_func = dot_kernel_cpu,
	.cuda_func = dot_kernel_cuda,
	.nbuffers = 3,
	.model = &dot_kernel_model
};
void dot_kernel(starpu_data_handle v1,
		starpu_data_handle v2,
		starpu_data_handle s,
		unsigned nblocks,
		int use_reduction)
{
	int ret;
	struct starpu_task *task;

	/* Blank the accumulation variable */
	task = starpu_task_create();
	task->cl = &bzero_variable_cl;
	task->buffers[0].handle = s;
	task->buffers[0].mode = STARPU_W;
	ret = starpu_task_submit(task);
	assert(!ret);

	if (use_reduction)
		starpu_task_wait_for_all();

	unsigned b;
	for (b = 0; b < nblocks; b++)
	{
		task = starpu_task_create();
		task->cl = &dot_kernel_cl;
		task->buffers[0].handle = s;
		task->buffers[0].mode = use_reduction?STARPU_REDUX:STARPU_RW;
		task->buffers[1].handle = starpu_data_get_sub_data(v1, 1, b);
		task->buffers[1].mode = STARPU_R;
		task->buffers[2].handle = starpu_data_get_sub_data(v2, 1, b);
		task->buffers[2].mode = STARPU_R;
		ret = starpu_task_submit(task);
		assert(!ret);
	}

	if (use_reduction)
		starpu_task_wait_for_all();
}
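
/*
 * Illustration (not compiled): a typical use of dot_kernel in the CG loop
 * is computing delta = r . r, with r partitioned into nblocks pieces and
 * the scalar result held in a variable handle. This is a sketch; the
 * handle names r_handle and rtr_handle are hypothetical.
 */
#if 0
static void compute_rtr_example(starpu_data_handle r_handle,
				starpu_data_handle rtr_handle,
				unsigned nblocks)
{
	/* rtr <- r . r, accumulated block by block (STARPU_REDUX mode
	 * when use_reduction is non-zero, plain STARPU_RW otherwise) */
	dot_kernel(r_handle, r_handle, rtr_handle, nblocks, 1);
	starpu_task_wait_for_all();
}
#endif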
/*
 * SCAL kernel : v1 = p1 v1
 */
static void scal_kernel_cuda(void *descr[], void *cl_arg)
{
	struct kernel_params *params = cl_arg;

	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	/* v1 = p1 v1 */
	TYPE alpha = params->p1;
	cublasscal(n, alpha, v1, 1);
	cudaThreadSynchronize();

	free(params);
}

static void scal_kernel_cpu(void *descr[], void *cl_arg)
{
	struct kernel_params *params = cl_arg;

	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	/* v1 = p1 v1 */
	TYPE alpha = params->p1;
	SCAL(n, alpha, v1, 1);

	free(params);
}

static struct starpu_perfmodel_t scal_kernel_model = {
	.type = STARPU_HISTORY_BASED,
	.symbol = "scal_kernel"
};

static starpu_codelet scal_kernel_cl = {
	.where = STARPU_CPU|STARPU_CUDA,
	.cpu_func = scal_kernel_cpu,
	.cuda_func = scal_kernel_cuda,
	.nbuffers = 1,
	.model = &scal_kernel_model
};
/*
 * GEMV kernel : v1 = p1 * v1 + p2 * M v2
 */
static void gemv_kernel_cuda(void *descr[], void *cl_arg)
{
	struct kernel_params *params = cl_arg;

	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[2]);
	TYPE *M = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);

	unsigned ld = STARPU_MATRIX_GET_LD(descr[1]);
	unsigned nx = STARPU_MATRIX_GET_NX(descr[1]);
	unsigned ny = STARPU_MATRIX_GET_NY(descr[1]);

	TYPE alpha = params->p2;
	TYPE beta = params->p1;

	/* Compute v1 = alpha M v2 + beta v1 */
	cublasgemv('N', nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);
	cudaThreadSynchronize();

	free(params);
}

static void gemv_kernel_cpu(void *descr[], void *cl_arg)
{
	struct kernel_params *params = cl_arg;

	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[2]);
	TYPE *M = (TYPE *)STARPU_MATRIX_GET_PTR(descr[1]);

	unsigned ld = STARPU_MATRIX_GET_LD(descr[1]);
	unsigned nx = STARPU_MATRIX_GET_NX(descr[1]);
	unsigned ny = STARPU_MATRIX_GET_NY(descr[1]);

	TYPE alpha = params->p2;
	TYPE beta = params->p1;

	/* Compute v1 = alpha M v2 + beta v1 */
	GEMV("N", nx, ny, alpha, M, ld, v2, 1, beta, v1, 1);

	free(params);
}

static struct starpu_perfmodel_t gemv_kernel_model = {
	.type = STARPU_HISTORY_BASED,
	.symbol = "gemv_kernel"
};

static starpu_codelet gemv_kernel_cl = {
	.where = STARPU_CPU|STARPU_CUDA,
	.cpu_func = gemv_kernel_cpu,
	.cuda_func = gemv_kernel_cuda,
	.nbuffers = 3,
	.model = &gemv_kernel_model
};
void gemv_kernel(starpu_data_handle v1,
		starpu_data_handle matrix,
		starpu_data_handle v2,
		TYPE p1, TYPE p2,
		unsigned nblocks,
		int use_reduction)
{
	int ret;
	unsigned b1, b2;

	if (use_reduction)
		starpu_task_wait_for_all();

	/* First scale v1: v1 = p1 v1 */
	for (b2 = 0; b2 < nblocks; b2++)
	{
		struct starpu_task *task = starpu_task_create();
		task->cl = &scal_kernel_cl;
		task->buffers[0].handle = starpu_data_get_sub_data(v1, 1, b2);
		task->buffers[0].mode = STARPU_RW;

		struct kernel_params *params = malloc(sizeof(struct kernel_params));
		assert(params);
		params->p1 = p1;

		task->cl_arg = params;

		ret = starpu_task_submit(task);
		assert(!ret);
	}

	if (use_reduction)
		starpu_task_wait_for_all();

	/* Then accumulate the block products: v1 += p2 M v2 */
	for (b2 = 0; b2 < nblocks; b2++)
	{
		for (b1 = 0; b1 < nblocks; b1++)
		{
			struct starpu_task *task = starpu_task_create();
			task->cl = &gemv_kernel_cl;
			task->buffers[0].handle = starpu_data_get_sub_data(v1, 1, b2);
			task->buffers[0].mode = use_reduction?STARPU_REDUX:STARPU_RW;
			task->buffers[1].handle = starpu_data_get_sub_data(matrix, 2, b2, b1);
			task->buffers[1].mode = STARPU_R;
			task->buffers[2].handle = starpu_data_get_sub_data(v2, 1, b1);
			task->buffers[2].mode = STARPU_R;

			struct kernel_params *params = malloc(sizeof(struct kernel_params));
			assert(params);
			params->p1 = 1.0;
			params->p2 = p2;

			task->cl_arg = params;

			ret = starpu_task_submit(task);
			assert(!ret);
		}
	}

	if (use_reduction)
		starpu_task_wait_for_all();
}
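
/*
 * Illustration (not compiled): in CG the main use of gemv_kernel is the
 * matrix-vector product q = A p, i.e. p1 = 0.0 (discard the previous
 * contents of q) and p2 = 1.0. This is a sketch; the handle names
 * q_handle, A_handle and p_handle are hypothetical.
 */
#if 0
static void compute_q_example(starpu_data_handle q_handle,
			starpu_data_handle A_handle,
			starpu_data_handle p_handle,
			unsigned nblocks, int use_reduction)
{
	/* q <- 0.0 * q + 1.0 * A p */
	gemv_kernel(q_handle, A_handle, p_handle,
			(TYPE)0.0, (TYPE)1.0, nblocks, use_reduction);
}
#endif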
/*
 * AXPY + SCAL kernel : v1 = p1 * v1 + p2 * v2
 */
static void scal_axpy_kernel_cuda(void *descr[], void *cl_arg)
{
	struct kernel_params *params = cl_arg;

	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);

	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	/* Compute v1 = p1 * v1 + p2 * v2.
	 *	v1 = p1 v1
	 *	v1 = v1 + p2 v2
	 */
	cublasscal(n, params->p1, v1, 1);
	cublasaxpy(n, params->p2, v2, 1, v1, 1);
	cudaThreadSynchronize();

	free(params);
}

static void scal_axpy_kernel_cpu(void *descr[], void *cl_arg)
{
	struct kernel_params *params = cl_arg;

	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);

	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);

	/* Compute v1 = p1 * v1 + p2 * v2.
	 *	v1 = p1 v1
	 *	v1 = v1 + p2 v2
	 */
	SCAL(nx, params->p1, v1, 1);
	AXPY(nx, params->p2, v2, 1, v1, 1);

	free(params);
}

static struct starpu_perfmodel_t scal_axpy_kernel_model = {
	.type = STARPU_HISTORY_BASED,
	.symbol = "scal_axpy_kernel"
};

static starpu_codelet scal_axpy_kernel_cl = {
	.where = STARPU_CPU|STARPU_CUDA,
	.cpu_func = scal_axpy_kernel_cpu,
	.cuda_func = scal_axpy_kernel_cuda,
	.nbuffers = 2,
	.model = &scal_axpy_kernel_model
};
void scal_axpy_kernel(starpu_data_handle v1, TYPE p1,
		starpu_data_handle v2, TYPE p2,
		unsigned nblocks)
{
	int ret;
	unsigned b;
	for (b = 0; b < nblocks; b++)
	{
		struct starpu_task *task = starpu_task_create();
		task->cl = &scal_axpy_kernel_cl;
		task->buffers[0].handle = starpu_data_get_sub_data(v1, 1, b);
		task->buffers[0].mode = STARPU_RW;
		task->buffers[1].handle = starpu_data_get_sub_data(v2, 1, b);
		task->buffers[1].mode = STARPU_R;

		struct kernel_params *params = malloc(sizeof(struct kernel_params));
		assert(params);
		params->p1 = p1;
		params->p2 = p2;

		task->cl_arg = params;

		ret = starpu_task_submit(task);
		assert(!ret);
	}
}
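
/*
 * Illustration (not compiled): the CG update of the search direction,
 * p = r + beta p, maps onto scal_axpy_kernel with p1 = beta and p2 = 1.0.
 * This is a sketch; the handle names p_handle and r_handle are hypothetical.
 */
#if 0
static void update_p_example(starpu_data_handle p_handle,
			starpu_data_handle r_handle,
			TYPE beta, unsigned nblocks)
{
	/* p <- beta * p + 1.0 * r */
	scal_axpy_kernel(p_handle, beta, r_handle, (TYPE)1.0, nblocks);
}
#endif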
/*
 * AXPY kernel : v1 = v1 + p1 * v2
 */
static void axpy_kernel_cuda(void *descr[], void *cl_arg)
{
	struct kernel_params *params = cl_arg;

	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);

	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	/* Compute v1 = v1 + p1 * v2 */
	cublasaxpy(n, params->p1, v2, 1, v1, 1);
	cudaThreadSynchronize();

	free(params);
}

static void axpy_kernel_cpu(void *descr[], void *cl_arg)
{
	struct kernel_params *params = cl_arg;

	TYPE *v1 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	TYPE *v2 = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);

	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);

	/* Compute v1 = v1 + p1 * v2 */
	AXPY(nx, params->p1, v2, 1, v1, 1);

	free(params);
}

static struct starpu_perfmodel_t axpy_kernel_model = {
	.type = STARPU_HISTORY_BASED,
	.symbol = "axpy_kernel"
};

static starpu_codelet axpy_kernel_cl = {
	.where = STARPU_CPU|STARPU_CUDA,
	.cpu_func = axpy_kernel_cpu,
	.cuda_func = axpy_kernel_cuda,
	.nbuffers = 2,
	.model = &axpy_kernel_model
};
void axpy_kernel(starpu_data_handle v1,
		starpu_data_handle v2, TYPE p1,
		unsigned nblocks)
{
	int ret;
	unsigned b;
	for (b = 0; b < nblocks; b++)
	{
		struct starpu_task *task = starpu_task_create();
		task->cl = &axpy_kernel_cl;
		task->buffers[0].handle = starpu_data_get_sub_data(v1, 1, b);
		task->buffers[0].mode = STARPU_RW;
		task->buffers[1].handle = starpu_data_get_sub_data(v2, 1, b);
		task->buffers[1].mode = STARPU_R;

		struct kernel_params *params = malloc(sizeof(struct kernel_params));
		assert(params);
		params->p1 = p1;

		task->cl_arg = params;

		ret = starpu_task_submit(task);
		assert(!ret);
	}
}
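
/*
 * Illustration (not compiled): axpy_kernel covers the two CG vector
 * updates x = x + alpha p and r = r - alpha q, by passing p1 = alpha and
 * p1 = -alpha respectively. This is a sketch; all handle names are
 * hypothetical.
 */
#if 0
static void update_x_r_example(starpu_data_handle x_handle,
			starpu_data_handle r_handle,
			starpu_data_handle p_handle,
			starpu_data_handle q_handle,
			TYPE alpha, unsigned nblocks)
{
	axpy_kernel(x_handle, p_handle, alpha, nblocks);	/* x <- x + alpha p */
	axpy_kernel(r_handle, q_handle, -alpha, nblocks);	/* r <- r - alpha q */
}
#endif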
/*
 * COPY kernel : vector_dst <- vector_src
 */
static void copy_handle_cpu(void *descr[], void *cl_arg)
{
	TYPE *dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	TYPE *src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);

	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
	size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);

	memcpy(dst, src, nx*elemsize);
}

static void copy_handle_cuda(void *descr[], void *cl_arg)
{
	TYPE *dst = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
	TYPE *src = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);

	unsigned nx = STARPU_VECTOR_GET_NX(descr[0]);
	size_t elemsize = STARPU_VECTOR_GET_ELEMSIZE(descr[0]);

	cudaMemcpy(dst, src, nx*elemsize, cudaMemcpyDeviceToDevice);
	cudaThreadSynchronize();
}

static struct starpu_perfmodel_t copy_handle_model = {
	.type = STARPU_HISTORY_BASED,
	.symbol = "copy_handle"
};

static starpu_codelet copy_handle_cl = {
	.where = STARPU_CPU|STARPU_CUDA,
	.cpu_func = copy_handle_cpu,
	.cuda_func = copy_handle_cuda,
	.nbuffers = 2,
	.model = &copy_handle_model
};
void copy_handle(starpu_data_handle dst, starpu_data_handle src, unsigned nblocks)
{
	int ret;
	unsigned b;
	for (b = 0; b < nblocks; b++)
	{
		struct starpu_task *task = starpu_task_create();
		task->cl = &copy_handle_cl;
		task->buffers[0].handle = starpu_data_get_sub_data(dst, 1, b);
		task->buffers[0].mode = STARPU_W;
		task->buffers[1].handle = starpu_data_get_sub_data(src, 1, b);
		task->buffers[1].mode = STARPU_R;
		ret = starpu_task_submit(task);
		assert(!ret);
	}
}
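
/*
 * Illustration (not compiled): how the kernels above chain into one CG
 * iteration. A sketch under assumed names: the accumulator handles ptq_h
 * and rtr_h are variable handles registered from the host scalars ptq and
 * rtr_new, read back with starpu_data_acquire(). The real driver lives in
 * cg.c and may differ in details such as the convergence test.
 */
#if 0
static TYPE ptq, rtr_new, rtr_old;

static void cg_iteration_example(starpu_data_handle A, starpu_data_handle x,
				starpu_data_handle r, starpu_data_handle p,
				starpu_data_handle q,
				starpu_data_handle ptq_h, starpu_data_handle rtr_h,
				unsigned nblocks, int use_reduction)
{
	TYPE alpha, beta;

	/* q <- A p ; ptq <- p . q */
	gemv_kernel(q, A, p, (TYPE)0.0, (TYPE)1.0, nblocks, use_reduction);
	dot_kernel(p, q, ptq_h, nblocks, use_reduction);

	starpu_data_acquire(ptq_h, STARPU_R);
	alpha = rtr_old / ptq;
	starpu_data_release(ptq_h);

	/* x <- x + alpha p ; r <- r - alpha q */
	axpy_kernel(x, p, alpha, nblocks);
	axpy_kernel(r, q, -alpha, nblocks);

	/* rtr_new <- r . r ; beta = rtr_new / rtr_old */
	dot_kernel(r, r, rtr_h, nblocks, use_reduction);
	starpu_data_acquire(rtr_h, STARPU_R);
	beta = rtr_new / rtr_old;
	rtr_old = rtr_new;
	starpu_data_release(rtr_h);

	/* p <- r + beta p */
	scal_axpy_kernel(p, beta, r, (TYPE)1.0, nblocks);
}
#endif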