csr_interface.c 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009, 2010 Université de Bordeaux 1
  4. * Copyright (C) 2010 Mehdi Juhoor <mjuhoor@gmail.com>
  5. * Copyright (C) 2010, 2011 Centre National de la Recherche Scientifique
  6. *
  7. * StarPU is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU Lesser General Public License as published by
  9. * the Free Software Foundation; either version 2.1 of the License, or (at
  10. * your option) any later version.
  11. *
  12. * StarPU is distributed in the hope that it will be useful, but
  13. * WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  15. *
  16. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  17. */
  18. #include <starpu.h>
  19. #include <common/config.h>
  20. #include <datawizard/coherency.h>
  21. #include <datawizard/copy_driver.h>
  22. #include <datawizard/filters.h>
  23. #include <common/hash.h>
  24. #include <starpu_cuda.h>
  25. #include <starpu_opencl.h>
  26. #include <drivers/opencl/driver_opencl.h>
  27. static int copy_ram_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
  28. #ifdef STARPU_USE_CUDA
  29. static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
  30. static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
  31. static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
  32. static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
  33. static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
  34. static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
  35. #endif
  36. #ifdef STARPU_USE_OPENCL
  37. static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
  38. static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
  39. #endif
  40. static const struct starpu_data_copy_methods csr_copy_data_methods_s = {
  41. .ram_to_ram = copy_ram_to_ram,
  42. .ram_to_spu = NULL,
  43. #ifdef STARPU_USE_CUDA
  44. .ram_to_cuda = copy_ram_to_cuda,
  45. .cuda_to_ram = copy_cuda_to_ram,
  46. .cuda_to_cuda = copy_cuda_to_cuda,
  47. .ram_to_cuda_async = copy_ram_to_cuda_async,
  48. .cuda_to_ram_async = copy_cuda_to_ram_async,
  49. .cuda_to_cuda_async = copy_cuda_to_cuda_async,
  50. #endif
  51. #ifdef STARPU_USE_OPENCL
  52. .ram_to_opencl = copy_ram_to_opencl,
  53. .opencl_to_ram = copy_opencl_to_ram,
  54. #endif
  55. .cuda_to_spu = NULL,
  56. .spu_to_ram = NULL,
  57. .spu_to_cuda = NULL,
  58. .spu_to_spu = NULL
  59. };
  60. static void register_csr_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
  61. static ssize_t allocate_csr_buffer_on_node(void *data_interface_, uint32_t dst_node);
  62. static void free_csr_buffer_on_node(void *data_interface, uint32_t node);
  63. static size_t csr_interface_get_size(starpu_data_handle handle);
  64. static int csr_compare(void *data_interface_a, void *data_interface_b);
  65. static uint32_t footprint_csr_interface_crc32(starpu_data_handle handle);
  66. static struct starpu_data_interface_ops_t interface_csr_ops = {
  67. .register_data_handle = register_csr_handle,
  68. .allocate_data_on_node = allocate_csr_buffer_on_node,
  69. .free_data_on_node = free_csr_buffer_on_node,
  70. .copy_methods = &csr_copy_data_methods_s,
  71. .get_size = csr_interface_get_size,
  72. .interfaceid = STARPU_CSR_INTERFACE_ID,
  73. .interface_size = sizeof(starpu_csr_interface_t),
  74. .footprint = footprint_csr_interface_crc32,
  75. .compare = csr_compare
  76. };
  77. static void register_csr_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface)
  78. {
  79. starpu_csr_interface_t *csr_interface = data_interface;
  80. unsigned node;
  81. for (node = 0; node < STARPU_MAXNODES; node++)
  82. {
  83. starpu_csr_interface_t *local_interface =
  84. starpu_data_get_interface_on_node(handle, node);
  85. if (node == home_node) {
  86. local_interface->nzval = csr_interface->nzval;
  87. local_interface->colind = csr_interface->colind;
  88. }
  89. else {
  90. local_interface->nzval = 0;
  91. local_interface->colind = NULL;
  92. }
  93. local_interface->rowptr = csr_interface->rowptr;
  94. local_interface->nnz = csr_interface->nnz;
  95. local_interface->nrow = csr_interface->nrow;
  96. local_interface->firstentry = csr_interface->firstentry;
  97. local_interface->elemsize = csr_interface->elemsize;
  98. }
  99. }
  100. /* declare a new data with the BLAS interface */
  101. void starpu_csr_data_register(starpu_data_handle *handleptr, uint32_t home_node,
  102. uint32_t nnz, uint32_t nrow, uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, size_t elemsize)
  103. {
  104. starpu_csr_interface_t csr_interface = {
  105. .nnz = nnz,
  106. .nrow = nrow,
  107. .nzval = nzval,
  108. .colind = colind,
  109. .rowptr = rowptr,
  110. .firstentry = firstentry,
  111. .elemsize = elemsize
  112. };
  113. starpu_data_register(handleptr, home_node, &csr_interface, &interface_csr_ops);
  114. }
  115. static uint32_t footprint_csr_interface_crc32(starpu_data_handle handle)
  116. {
  117. return _starpu_crc32_be(starpu_csr_get_nnz(handle), 0);
  118. }
  119. static int csr_compare(void *data_interface_a, void *data_interface_b)
  120. {
  121. starpu_csr_interface_t *csr_a = data_interface_a;
  122. starpu_csr_interface_t *csr_b = data_interface_b;
  123. /* Two matricess are considered compatible if they have the same size */
  124. return ((csr_a->nnz == csr_b->nnz)
  125. && (csr_a->nrow == csr_b->nrow)
  126. && (csr_a->elemsize == csr_b->elemsize));
  127. }
  128. /* offer an access to the data parameters */
  129. uint32_t starpu_csr_get_nnz(starpu_data_handle handle)
  130. {
  131. starpu_csr_interface_t *csr_interface =
  132. starpu_data_get_interface_on_node(handle, 0);
  133. return csr_interface->nnz;
  134. }
  135. uint32_t starpu_csr_get_nrow(starpu_data_handle handle)
  136. {
  137. starpu_csr_interface_t *csr_interface =
  138. starpu_data_get_interface_on_node(handle, 0);
  139. return csr_interface->nrow;
  140. }
  141. uint32_t starpu_csr_get_firstentry(starpu_data_handle handle)
  142. {
  143. starpu_csr_interface_t *csr_interface =
  144. starpu_data_get_interface_on_node(handle, 0);
  145. return csr_interface->firstentry;
  146. }
  147. size_t starpu_csr_get_elemsize(starpu_data_handle handle)
  148. {
  149. starpu_csr_interface_t *csr_interface =
  150. starpu_data_get_interface_on_node(handle, 0);
  151. return csr_interface->elemsize;
  152. }
  153. uintptr_t starpu_csr_get_local_nzval(starpu_data_handle handle)
  154. {
  155. unsigned node;
  156. node = _starpu_get_local_memory_node();
  157. STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
  158. starpu_csr_interface_t *csr_interface =
  159. starpu_data_get_interface_on_node(handle, node);
  160. return csr_interface->nzval;
  161. }
  162. uint32_t *starpu_csr_get_local_colind(starpu_data_handle handle)
  163. {
  164. unsigned node;
  165. node = _starpu_get_local_memory_node();
  166. STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
  167. starpu_csr_interface_t *csr_interface =
  168. starpu_data_get_interface_on_node(handle, node);
  169. return csr_interface->colind;
  170. }
  171. uint32_t *starpu_csr_get_local_rowptr(starpu_data_handle handle)
  172. {
  173. unsigned node;
  174. node = _starpu_get_local_memory_node();
  175. STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
  176. starpu_csr_interface_t *csr_interface =
  177. starpu_data_get_interface_on_node(handle, node);
  178. return csr_interface->rowptr;
  179. }
  180. static size_t csr_interface_get_size(starpu_data_handle handle)
  181. {
  182. size_t size;
  183. uint32_t nnz = starpu_csr_get_nnz(handle);
  184. uint32_t nrow = starpu_csr_get_nrow(handle);
  185. size_t elemsize = starpu_csr_get_elemsize(handle);
  186. size = nnz*elemsize + nnz*sizeof(uint32_t) + (nrow+1)*sizeof(uint32_t);
  187. return size;
  188. }
  189. /* memory allocation/deallocation primitives for the BLAS interface */
  190. /* returns the size of the allocated area */
  191. static ssize_t allocate_csr_buffer_on_node(void *data_interface_, uint32_t dst_node)
  192. {
  193. uintptr_t addr_nzval;
  194. uint32_t *addr_colind, *addr_rowptr;
  195. ssize_t allocated_memory;
  196. /* we need the 3 arrays to be allocated */
  197. starpu_csr_interface_t *csr_interface = data_interface_;
  198. uint32_t nnz = csr_interface->nnz;
  199. uint32_t nrow = csr_interface->nrow;
  200. size_t elemsize = csr_interface->elemsize;
  201. starpu_node_kind kind = _starpu_get_node_kind(dst_node);
  202. switch(kind) {
  203. case STARPU_CPU_RAM:
  204. addr_nzval = (uintptr_t)malloc(nnz*elemsize);
  205. if (!addr_nzval)
  206. goto fail_nzval;
  207. addr_colind = malloc(nnz*sizeof(uint32_t));
  208. if (!addr_colind)
  209. goto fail_colind;
  210. addr_rowptr = malloc((nrow+1)*sizeof(uint32_t));
  211. if (!addr_rowptr)
  212. goto fail_rowptr;
  213. break;
  214. #ifdef STARPU_USE_CUDA
  215. case STARPU_CUDA_RAM:
  216. cudaMalloc((void **)&addr_nzval, nnz*elemsize);
  217. if (!addr_nzval)
  218. goto fail_nzval;
  219. cudaMalloc((void **)&addr_colind, nnz*sizeof(uint32_t));
  220. if (!addr_colind)
  221. goto fail_colind;
  222. cudaMalloc((void **)&addr_rowptr, (nrow+1)*sizeof(uint32_t));
  223. if (!addr_rowptr)
  224. goto fail_rowptr;
  225. break;
  226. #endif
  227. #ifdef STARPU_USE_OPENCL
  228. case STARPU_OPENCL_RAM:
  229. {
  230. int ret;
  231. void *ptr;
  232. ret = _starpu_opencl_allocate_memory(&ptr, nnz*elemsize, CL_MEM_READ_WRITE);
  233. addr_nzval = (uintptr_t)ptr;
  234. if (ret) goto fail_nzval;
  235. ret = _starpu_opencl_allocate_memory(&ptr, nnz*sizeof(uint32_t), CL_MEM_READ_WRITE);
  236. addr_colind = ptr;
  237. if (ret) goto fail_colind;
  238. ret = _starpu_opencl_allocate_memory(&ptr, (nrow+1)*sizeof(uint32_t), CL_MEM_READ_WRITE);
  239. addr_rowptr = ptr;
  240. if (ret) goto fail_rowptr;
  241. break;
  242. }
  243. #endif
  244. default:
  245. assert(0);
  246. }
  247. /* allocation succeeded */
  248. allocated_memory =
  249. nnz*elemsize + nnz*sizeof(uint32_t) + (nrow+1)*sizeof(uint32_t);
  250. /* update the data properly in consequence */
  251. csr_interface->nzval = addr_nzval;
  252. csr_interface->colind = addr_colind;
  253. csr_interface->rowptr = addr_rowptr;
  254. return allocated_memory;
  255. fail_rowptr:
  256. switch(kind) {
  257. case STARPU_CPU_RAM:
  258. free((void *)addr_colind);
  259. #ifdef STARPU_USE_CUDA
  260. case STARPU_CUDA_RAM:
  261. cudaFree((void*)addr_colind);
  262. break;
  263. #endif
  264. #ifdef STARPU_USE_OPENCL
  265. case STARPU_OPENCL_RAM:
  266. clReleaseMemObject((void*)addr_colind);
  267. break;
  268. #endif
  269. default:
  270. assert(0);
  271. }
  272. fail_colind:
  273. switch(kind) {
  274. case STARPU_CPU_RAM:
  275. free((void *)addr_nzval);
  276. #ifdef STARPU_USE_CUDA
  277. case STARPU_CUDA_RAM:
  278. cudaFree((void*)addr_nzval);
  279. break;
  280. #endif
  281. #ifdef STARPU_USE_OPENCL
  282. case STARPU_OPENCL_RAM:
  283. clReleaseMemObject((void*)addr_nzval);
  284. break;
  285. #endif
  286. default:
  287. assert(0);
  288. }
  289. fail_nzval:
  290. /* allocation failed */
  291. return -ENOMEM;
  292. }
  293. static void free_csr_buffer_on_node(void *data_interface, uint32_t node)
  294. {
  295. starpu_csr_interface_t *csr_interface = data_interface;
  296. starpu_node_kind kind = _starpu_get_node_kind(node);
  297. switch(kind) {
  298. case STARPU_CPU_RAM:
  299. free((void*)csr_interface->nzval);
  300. free((void*)csr_interface->colind);
  301. free((void*)csr_interface->rowptr);
  302. break;
  303. #ifdef STARPU_USE_CUDA
  304. case STARPU_CUDA_RAM:
  305. cudaFree((void*)csr_interface->nzval);
  306. cudaFree((void*)csr_interface->colind);
  307. cudaFree((void*)csr_interface->rowptr);
  308. break;
  309. #endif
  310. #ifdef STARPU_USE_OPENCL
  311. case STARPU_OPENCL_RAM:
  312. clReleaseMemObject((void*)csr_interface->nzval);
  313. clReleaseMemObject((void*)csr_interface->colind);
  314. clReleaseMemObject((void*)csr_interface->rowptr);
  315. break;
  316. #endif
  317. default:
  318. assert(0);
  319. }
  320. }
  321. #ifdef STARPU_USE_CUDA
  322. static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), enum cudaMemcpyKind kind)
  323. {
  324. starpu_csr_interface_t *src_csr = src_interface;
  325. starpu_csr_interface_t *dst_csr = dst_interface;
  326. uint32_t nnz = src_csr->nnz;
  327. uint32_t nrow = src_csr->nrow;
  328. size_t elemsize = src_csr->elemsize;
  329. cudaError_t cures;
  330. cures = cudaMemcpy((char *)dst_csr->nzval, (char *)src_csr->nzval, nnz*elemsize, kind);
  331. if (STARPU_UNLIKELY(cures))
  332. STARPU_CUDA_REPORT_ERROR(cures);
  333. cures = cudaMemcpy((char *)dst_csr->colind, (char *)src_csr->colind, nnz*sizeof(uint32_t), kind);
  334. if (STARPU_UNLIKELY(cures))
  335. STARPU_CUDA_REPORT_ERROR(cures);
  336. cures = cudaMemcpy((char *)dst_csr->rowptr, (char *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t), kind);
  337. if (STARPU_UNLIKELY(cures))
  338. STARPU_CUDA_REPORT_ERROR(cures);
  339. STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
  340. return 0;
  341. }
  342. static int copy_cuda_common_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), enum cudaMemcpyKind kind, cudaStream_t stream)
  343. {
  344. starpu_csr_interface_t *src_csr = src_interface;
  345. starpu_csr_interface_t *dst_csr = dst_interface;
  346. uint32_t nnz = src_csr->nnz;
  347. uint32_t nrow = src_csr->nrow;
  348. size_t elemsize = src_csr->elemsize;
  349. cudaError_t cures;
  350. int synchronous_fallback = 0;
  351. cures = cudaMemcpyAsync((char *)dst_csr->nzval, (char *)src_csr->nzval, nnz*elemsize, kind, stream);
  352. if (cures)
  353. {
  354. synchronous_fallback = 1;
  355. cures = cudaMemcpy((char *)dst_csr->nzval, (char *)src_csr->nzval, nnz*elemsize, kind);
  356. if (STARPU_UNLIKELY(cures))
  357. STARPU_CUDA_REPORT_ERROR(cures);
  358. }
  359. if (!synchronous_fallback)
  360. {
  361. cures = cudaMemcpyAsync((char *)dst_csr->colind, (char *)src_csr->colind, nnz*sizeof(uint32_t), kind, stream);
  362. }
  363. if (synchronous_fallback || cures != cudaSuccess)
  364. {
  365. synchronous_fallback = 1;
  366. cures = cudaMemcpy((char *)dst_csr->colind, (char *)src_csr->colind, nnz*sizeof(uint32_t), kind);
  367. if (STARPU_UNLIKELY(cures))
  368. STARPU_CUDA_REPORT_ERROR(cures);
  369. }
  370. if (!synchronous_fallback)
  371. {
  372. cures = cudaMemcpyAsync((char *)dst_csr->rowptr, (char *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t), kind, stream);
  373. }
  374. if (synchronous_fallback || cures != cudaSuccess)
  375. {
  376. synchronous_fallback = 1;
  377. cures = cudaMemcpy((char *)dst_csr->rowptr, (char *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t), kind);
  378. if (STARPU_UNLIKELY(cures))
  379. STARPU_CUDA_REPORT_ERROR(cures);
  380. }
  381. if (synchronous_fallback)
  382. {
  383. STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
  384. return 0;
  385. }
  386. else {
  387. return -EAGAIN;
  388. }
  389. }
  390. static int copy_cuda_peer(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
  391. {
  392. #ifdef HAVE_CUDA_MEMCPY_PEER
  393. starpu_csr_interface_t *src_csr = src_interface;
  394. starpu_csr_interface_t *dst_csr = dst_interface;
  395. uint32_t nnz = src_csr->nnz;
  396. uint32_t nrow = src_csr->nrow;
  397. size_t elemsize = src_csr->elemsize;
  398. int src_dev = starpu_memory_node_to_devid(src_node);
  399. int dst_dev = starpu_memory_node_to_devid(dst_node);
  400. cudaError_t cures;
  401. cures = cudaMemcpyPeer((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize);
  402. if (STARPU_UNLIKELY(cures))
  403. STARPU_CUDA_REPORT_ERROR(cures);
  404. cures = cudaMemcpyPeer((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t));
  405. if (STARPU_UNLIKELY(cures))
  406. STARPU_CUDA_REPORT_ERROR(cures);
  407. cures = cudaMemcpyPeer((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t));
  408. if (STARPU_UNLIKELY(cures))
  409. STARPU_CUDA_REPORT_ERROR(cures);
  410. STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
  411. return 0;
  412. #else
  413. STARPU_ABORT();
  414. return 0;
  415. #endif
  416. }
  417. static int copy_cuda_peer_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t stream)
  418. {
  419. #ifdef HAVE_CUDA_MEMCPY_PEER
  420. starpu_csr_interface_t *src_csr = src_interface;
  421. starpu_csr_interface_t *dst_csr = dst_interface;
  422. uint32_t nnz = src_csr->nnz;
  423. uint32_t nrow = src_csr->nrow;
  424. size_t elemsize = src_csr->elemsize;
  425. cudaError_t cures;
  426. int src_dev = starpu_memory_node_to_devid(src_node);
  427. int dst_dev = starpu_memory_node_to_devid(dst_node);
  428. int synchronous_fallback = 0;
  429. cures = cudaMemcpyPeerAsync((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize, stream);
  430. if (cures)
  431. {
  432. synchronous_fallback = 1;
  433. cures = cudaMemcpyPeer((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize);
  434. if (STARPU_UNLIKELY(cures))
  435. STARPU_CUDA_REPORT_ERROR(cures);
  436. }
  437. if (!synchronous_fallback)
  438. {
  439. cures = cudaMemcpyPeerAsync((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t), stream);
  440. }
  441. if (synchronous_fallback || cures != cudaSuccess)
  442. {
  443. synchronous_fallback = 1;
  444. cures = cudaMemcpyPeer((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t));
  445. if (STARPU_UNLIKELY(cures))
  446. STARPU_CUDA_REPORT_ERROR(cures);
  447. }
  448. if (!synchronous_fallback)
  449. {
  450. cures = cudaMemcpyPeerAsync((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t), stream);
  451. }
  452. if (synchronous_fallback || cures != cudaSuccess)
  453. {
  454. synchronous_fallback = 1;
  455. cures = cudaMemcpyPeer((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t));
  456. if (STARPU_UNLIKELY(cures))
  457. STARPU_CUDA_REPORT_ERROR(cures);
  458. }
  459. if (synchronous_fallback)
  460. {
  461. STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
  462. return 0;
  463. }
  464. else {
  465. return -EAGAIN;
  466. }
  467. #else
  468. /* Illegal without Peer tranfers */
  469. STARPU_ABORT();
  470. return 0;
  471. #endif
  472. }
  473. static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
  474. {
  475. return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
  476. }
  477. static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
  478. {
  479. return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
  480. }
  481. static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
  482. {
  483. if (src_node == dst_node)
  484. return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
  485. else
  486. return copy_cuda_peer(src_interface, src_node, dst_interface, dst_node);
  487. }
  488. static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
  489. {
  490. return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, stream);
  491. }
  492. static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
  493. {
  494. return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, stream);
  495. }
  496. static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
  497. {
  498. if (src_node == dst_node)
  499. return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, stream);
  500. else
  501. return copy_cuda_peer_async(src_interface, src_node, dst_interface, dst_node, stream);
  502. }
  503. #endif // STARPU_USE_CUDA
  504. #ifdef STARPU_USE_OPENCL
  505. static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
  506. {
  507. starpu_csr_interface_t *src_csr = src_interface;
  508. starpu_csr_interface_t *dst_csr = dst_interface;
  509. uint32_t nnz = src_csr->nnz;
  510. uint32_t nrow = src_csr->nrow;
  511. size_t elemsize = src_csr->elemsize;
  512. int err;
  513. err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->nzval, (void *)dst_csr->nzval, nnz*elemsize, 0, NULL);
  514. if (STARPU_UNLIKELY(err))
  515. STARPU_OPENCL_REPORT_ERROR(err);
  516. err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->colind, (void *)dst_csr->colind, nnz*sizeof(uint32_t), 0, NULL);
  517. if (STARPU_UNLIKELY(err))
  518. STARPU_OPENCL_REPORT_ERROR(err);
  519. err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->rowptr, (void *)dst_csr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
  520. if (STARPU_UNLIKELY(err))
  521. STARPU_OPENCL_REPORT_ERROR(err);
  522. STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
  523. return 0;
  524. }
  525. static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
  526. {
  527. starpu_csr_interface_t *src_csr = src_interface;
  528. starpu_csr_interface_t *dst_csr = dst_interface;
  529. uint32_t nnz = src_csr->nnz;
  530. uint32_t nrow = src_csr->nrow;
  531. size_t elemsize = src_csr->elemsize;
  532. int err;
  533. err = _starpu_opencl_copy_ram_to_opencl((void *)src_csr->nzval, (cl_mem)dst_csr->nzval, nnz*elemsize, 0, NULL);
  534. if (STARPU_UNLIKELY(err))
  535. STARPU_OPENCL_REPORT_ERROR(err);
  536. err = _starpu_opencl_copy_ram_to_opencl((void *)src_csr->colind, (cl_mem)dst_csr->colind, nnz*sizeof(uint32_t), 0, NULL);
  537. if (STARPU_UNLIKELY(err))
  538. STARPU_OPENCL_REPORT_ERROR(err);
  539. err = _starpu_opencl_copy_ram_to_opencl((void *)src_csr->rowptr, (cl_mem)dst_csr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
  540. if (STARPU_UNLIKELY(err))
  541. STARPU_OPENCL_REPORT_ERROR(err);
  542. STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
  543. return 0;
  544. }
  545. #endif // STARPU_USE_OPENCL
  546. /* as not all platform easily have a BLAS lib installed ... */
  547. static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
  548. {
  549. starpu_csr_interface_t *src_csr = src_interface;
  550. starpu_csr_interface_t *dst_csr = dst_interface;
  551. uint32_t nnz = src_csr->nnz;
  552. uint32_t nrow = src_csr->nrow;
  553. size_t elemsize = src_csr->elemsize;
  554. memcpy((void *)dst_csr->nzval, (void *)src_csr->nzval, nnz*elemsize);
  555. memcpy((void *)dst_csr->colind, (void *)src_csr->colind, nnz*sizeof(uint32_t));
  556. memcpy((void *)dst_csr->rowptr, (void *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t));
  557. STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
  558. return 0;
  559. }