csr_interface.c 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009, 2010 Université de Bordeaux 1
  4. * Copyright (C) 2010 Mehdi Juhoor <mjuhoor@gmail.com>
  5. * Copyright (C) 2010, 2011 Centre National de la Recherche Scientifique
  6. *
  7. * StarPU is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU Lesser General Public License as published by
  9. * the Free Software Foundation; either version 2.1 of the License, or (at
  10. * your option) any later version.
  11. *
  12. * StarPU is distributed in the hope that it will be useful, but
  13. * WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  15. *
  16. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  17. */
  18. #include <starpu.h>
  19. #include <common/config.h>
  20. #include <datawizard/coherency.h>
  21. #include <datawizard/copy_driver.h>
  22. #include <datawizard/filters.h>
  23. #include <common/hash.h>
  24. #include <starpu_cuda.h>
  25. #include <starpu_opencl.h>
  26. #include <drivers/opencl/driver_opencl.h>
  27. static int copy_ram_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
  28. #ifdef STARPU_USE_CUDA
  29. static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
  30. static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
  31. static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
  32. static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
  33. static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
  34. static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
  35. #endif
  36. #ifdef STARPU_USE_OPENCL
  37. static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
  38. static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
  39. #endif
  40. static const struct starpu_data_copy_methods csr_copy_data_methods_s = {
  41. .ram_to_ram = copy_ram_to_ram,
  42. .ram_to_spu = NULL,
  43. #ifdef STARPU_USE_CUDA
  44. .ram_to_cuda = copy_ram_to_cuda,
  45. .cuda_to_ram = copy_cuda_to_ram,
  46. .cuda_to_cuda = copy_cuda_to_cuda,
  47. .ram_to_cuda_async = copy_ram_to_cuda_async,
  48. .cuda_to_ram_async = copy_cuda_to_ram_async,
  49. .cuda_to_cuda_async = copy_cuda_to_cuda_async,
  50. #endif
  51. #ifdef STARPU_USE_OPENCL
  52. .ram_to_opencl = copy_ram_to_opencl,
  53. .opencl_to_ram = copy_opencl_to_ram,
  54. #endif
  55. .cuda_to_spu = NULL,
  56. .spu_to_ram = NULL,
  57. .spu_to_cuda = NULL,
  58. .spu_to_spu = NULL
  59. };
  60. static void register_csr_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
  61. static ssize_t allocate_csr_buffer_on_node(void *data_interface_, uint32_t dst_node);
  62. static void free_csr_buffer_on_node(void *data_interface, uint32_t node);
  63. static size_t csr_interface_get_size(starpu_data_handle handle);
  64. static int csr_compare(void *data_interface_a, void *data_interface_b);
  65. static uint32_t footprint_csr_interface_crc32(starpu_data_handle handle);
  66. static struct starpu_data_interface_ops_t interface_csr_ops = {
  67. .register_data_handle = register_csr_handle,
  68. .allocate_data_on_node = allocate_csr_buffer_on_node,
  69. .free_data_on_node = free_csr_buffer_on_node,
  70. .copy_methods = &csr_copy_data_methods_s,
  71. .get_size = csr_interface_get_size,
  72. .interfaceid = STARPU_CSR_INTERFACE_ID,
  73. .interface_size = sizeof(starpu_csr_interface_t),
  74. .footprint = footprint_csr_interface_crc32,
  75. .compare = csr_compare
  76. };
  77. static void register_csr_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface)
  78. {
  79. starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *) data_interface;
  80. unsigned node;
  81. for (node = 0; node < STARPU_MAXNODES; node++)
  82. {
  83. starpu_csr_interface_t *local_interface = (starpu_csr_interface_t *)
  84. starpu_data_get_interface_on_node(handle, node);
  85. if (node == home_node) {
  86. local_interface->nzval = csr_interface->nzval;
  87. local_interface->colind = csr_interface->colind;
  88. }
  89. else {
  90. local_interface->nzval = 0;
  91. local_interface->colind = NULL;
  92. }
  93. local_interface->rowptr = csr_interface->rowptr;
  94. local_interface->nnz = csr_interface->nnz;
  95. local_interface->nrow = csr_interface->nrow;
  96. local_interface->firstentry = csr_interface->firstentry;
  97. local_interface->elemsize = csr_interface->elemsize;
  98. }
  99. }
  100. /* declare a new data with the BLAS interface */
  101. void starpu_csr_data_register(starpu_data_handle *handleptr, uint32_t home_node,
  102. uint32_t nnz, uint32_t nrow, uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, size_t elemsize)
  103. {
  104. starpu_csr_interface_t csr_interface = {
  105. .nnz = nnz,
  106. .nrow = nrow,
  107. .nzval = nzval,
  108. .colind = colind,
  109. .rowptr = rowptr,
  110. .firstentry = firstentry,
  111. .elemsize = elemsize
  112. };
  113. starpu_data_register(handleptr, home_node, &csr_interface, &interface_csr_ops);
  114. }
  115. static uint32_t footprint_csr_interface_crc32(starpu_data_handle handle)
  116. {
  117. return _starpu_crc32_be(starpu_csr_get_nnz(handle), 0);
  118. }
  119. static int csr_compare(void *data_interface_a, void *data_interface_b)
  120. {
  121. starpu_csr_interface_t *csr_a = (starpu_csr_interface_t *) data_interface_a;
  122. starpu_csr_interface_t *csr_b = (starpu_csr_interface_t *) data_interface_b;
  123. /* Two matricess are considered compatible if they have the same size */
  124. return ((csr_a->nnz == csr_b->nnz)
  125. && (csr_a->nrow == csr_b->nrow)
  126. && (csr_a->elemsize == csr_b->elemsize));
  127. }
  128. /* offer an access to the data parameters */
  129. uint32_t starpu_csr_get_nnz(starpu_data_handle handle)
  130. {
  131. starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *)
  132. starpu_data_get_interface_on_node(handle, 0);
  133. return csr_interface->nnz;
  134. }
  135. uint32_t starpu_csr_get_nrow(starpu_data_handle handle)
  136. {
  137. starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *)
  138. starpu_data_get_interface_on_node(handle, 0);
  139. return csr_interface->nrow;
  140. }
  141. uint32_t starpu_csr_get_firstentry(starpu_data_handle handle)
  142. {
  143. starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *)
  144. starpu_data_get_interface_on_node(handle, 0);
  145. return csr_interface->firstentry;
  146. }
  147. size_t starpu_csr_get_elemsize(starpu_data_handle handle)
  148. {
  149. starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *)
  150. starpu_data_get_interface_on_node(handle, 0);
  151. return csr_interface->elemsize;
  152. }
  153. uintptr_t starpu_csr_get_local_nzval(starpu_data_handle handle)
  154. {
  155. unsigned node;
  156. node = _starpu_get_local_memory_node();
  157. STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
  158. starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *)
  159. starpu_data_get_interface_on_node(handle, node);
  160. return csr_interface->nzval;
  161. }
  162. uint32_t *starpu_csr_get_local_colind(starpu_data_handle handle)
  163. {
  164. unsigned node;
  165. node = _starpu_get_local_memory_node();
  166. STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
  167. starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *)
  168. starpu_data_get_interface_on_node(handle, node);
  169. return csr_interface->colind;
  170. }
  171. uint32_t *starpu_csr_get_local_rowptr(starpu_data_handle handle)
  172. {
  173. unsigned node;
  174. node = _starpu_get_local_memory_node();
  175. STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
  176. starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *)
  177. starpu_data_get_interface_on_node(handle, node);
  178. return csr_interface->rowptr;
  179. }
  180. static size_t csr_interface_get_size(starpu_data_handle handle)
  181. {
  182. size_t size;
  183. uint32_t nnz = starpu_csr_get_nnz(handle);
  184. uint32_t nrow = starpu_csr_get_nrow(handle);
  185. size_t elemsize = starpu_csr_get_elemsize(handle);
  186. size = nnz*elemsize + nnz*sizeof(uint32_t) + (nrow+1)*sizeof(uint32_t);
  187. return size;
  188. }
  189. /* memory allocation/deallocation primitives for the BLAS interface */
  190. /* returns the size of the allocated area */
  191. static ssize_t allocate_csr_buffer_on_node(void *data_interface_, uint32_t dst_node)
  192. {
  193. uintptr_t addr_nzval;
  194. uint32_t *addr_colind, *addr_rowptr;
  195. ssize_t allocated_memory;
  196. /* we need the 3 arrays to be allocated */
  197. starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *) data_interface_;
  198. uint32_t nnz = csr_interface->nnz;
  199. uint32_t nrow = csr_interface->nrow;
  200. size_t elemsize = csr_interface->elemsize;
  201. starpu_node_kind kind = _starpu_get_node_kind(dst_node);
  202. switch(kind) {
  203. case STARPU_CPU_RAM:
  204. addr_nzval = (uintptr_t)malloc(nnz*elemsize);
  205. if (!addr_nzval)
  206. goto fail_nzval;
  207. addr_colind = (uint32_t *) malloc(nnz*sizeof(uint32_t));
  208. if (!addr_colind)
  209. goto fail_colind;
  210. addr_rowptr = (uint32_t *) malloc((nrow+1)*sizeof(uint32_t));
  211. if (!addr_rowptr)
  212. goto fail_rowptr;
  213. break;
  214. #ifdef STARPU_USE_CUDA
  215. case STARPU_CUDA_RAM:
  216. cudaMalloc((void **)&addr_nzval, nnz*elemsize);
  217. if (!addr_nzval)
  218. goto fail_nzval;
  219. cudaMalloc((void **)&addr_colind, nnz*sizeof(uint32_t));
  220. if (!addr_colind)
  221. goto fail_colind;
  222. cudaMalloc((void **)&addr_rowptr, (nrow+1)*sizeof(uint32_t));
  223. if (!addr_rowptr)
  224. goto fail_rowptr;
  225. break;
  226. #endif
  227. #ifdef STARPU_USE_OPENCL
  228. case STARPU_OPENCL_RAM:
  229. {
  230. int ret;
  231. void *ptr;
  232. ret = _starpu_opencl_allocate_memory(&ptr, nnz*elemsize, CL_MEM_READ_WRITE);
  233. addr_nzval = (uintptr_t)ptr;
  234. if (ret) goto fail_nzval;
  235. ret = _starpu_opencl_allocate_memory(&ptr, nnz*sizeof(uint32_t), CL_MEM_READ_WRITE);
  236. addr_colind = ptr;
  237. if (ret) goto fail_colind;
  238. ret = _starpu_opencl_allocate_memory(&ptr, (nrow+1)*sizeof(uint32_t), CL_MEM_READ_WRITE);
  239. addr_rowptr = ptr;
  240. if (ret) goto fail_rowptr;
  241. break;
  242. }
  243. #endif
  244. default:
  245. assert(0);
  246. }
  247. /* allocation succeeded */
  248. allocated_memory =
  249. nnz*elemsize + nnz*sizeof(uint32_t) + (nrow+1)*sizeof(uint32_t);
  250. /* update the data properly in consequence */
  251. csr_interface->nzval = addr_nzval;
  252. csr_interface->colind = addr_colind;
  253. csr_interface->rowptr = addr_rowptr;
  254. return allocated_memory;
  255. fail_rowptr:
  256. switch(kind) {
  257. case STARPU_CPU_RAM:
  258. free((void *)addr_colind);
  259. #ifdef STARPU_USE_CUDA
  260. case STARPU_CUDA_RAM:
  261. cudaFree((void*)addr_colind);
  262. break;
  263. #endif
  264. #ifdef STARPU_USE_OPENCL
  265. case STARPU_OPENCL_RAM:
  266. clReleaseMemObject((void*)addr_colind);
  267. break;
  268. #endif
  269. default:
  270. assert(0);
  271. }
  272. fail_colind:
  273. switch(kind) {
  274. case STARPU_CPU_RAM:
  275. free((void *)addr_nzval);
  276. #ifdef STARPU_USE_CUDA
  277. case STARPU_CUDA_RAM:
  278. cudaFree((void*)addr_nzval);
  279. break;
  280. #endif
  281. #ifdef STARPU_USE_OPENCL
  282. case STARPU_OPENCL_RAM:
  283. clReleaseMemObject((void*)addr_nzval);
  284. break;
  285. #endif
  286. default:
  287. assert(0);
  288. }
  289. fail_nzval:
  290. /* allocation failed */
  291. return -ENOMEM;
  292. }
  293. static void free_csr_buffer_on_node(void *data_interface, uint32_t node)
  294. {
  295. starpu_csr_interface_t *csr_interface = (starpu_csr_interface_t *) data_interface;
  296. starpu_node_kind kind = _starpu_get_node_kind(node);
  297. switch(kind) {
  298. case STARPU_CPU_RAM:
  299. free((void*)csr_interface->nzval);
  300. free((void*)csr_interface->colind);
  301. free((void*)csr_interface->rowptr);
  302. break;
  303. #ifdef STARPU_USE_CUDA
  304. case STARPU_CUDA_RAM:
  305. cudaFree((void*)csr_interface->nzval);
  306. cudaFree((void*)csr_interface->colind);
  307. cudaFree((void*)csr_interface->rowptr);
  308. break;
  309. #endif
  310. #ifdef STARPU_USE_OPENCL
  311. case STARPU_OPENCL_RAM:
  312. clReleaseMemObject((void*)csr_interface->nzval);
  313. clReleaseMemObject((void*)csr_interface->colind);
  314. clReleaseMemObject((void*)csr_interface->rowptr);
  315. break;
  316. #endif
  317. default:
  318. assert(0);
  319. }
  320. }
  321. #ifdef STARPU_USE_CUDA
  322. static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
  323. {
  324. starpu_csr_interface_t *src_csr = src_interface;
  325. starpu_csr_interface_t *dst_csr = dst_interface;
  326. uint32_t nnz = src_csr->nnz;
  327. uint32_t nrow = src_csr->nrow;
  328. size_t elemsize = src_csr->elemsize;
  329. cudaError_t cures;
  330. cures = cudaMemcpy((char *)dst_csr->nzval, (char *)src_csr->nzval, nnz*elemsize, kind);
  331. if (STARPU_UNLIKELY(cures))
  332. STARPU_CUDA_REPORT_ERROR(cures);
  333. cures = cudaMemcpy((char *)dst_csr->colind, (char *)src_csr->colind, nnz*sizeof(uint32_t), kind);
  334. if (STARPU_UNLIKELY(cures))
  335. STARPU_CUDA_REPORT_ERROR(cures);
  336. cures = cudaMemcpy((char *)dst_csr->rowptr, (char *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t), kind);
  337. if (STARPU_UNLIKELY(cures))
  338. STARPU_CUDA_REPORT_ERROR(cures);
  339. STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
  340. return 0;
  341. }
  342. static int copy_cuda_common_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind, cudaStream_t stream)
  343. {
  344. starpu_csr_interface_t *src_csr = src_interface;
  345. starpu_csr_interface_t *dst_csr = dst_interface;
  346. uint32_t nnz = src_csr->nnz;
  347. uint32_t nrow = src_csr->nrow;
  348. size_t elemsize = src_csr->elemsize;
  349. cudaError_t cures;
  350. int synchronous_fallback = 0;
  351. cures = cudaMemcpyAsync((char *)dst_csr->nzval, (char *)src_csr->nzval, nnz*elemsize, kind, stream);
  352. if (cures)
  353. {
  354. synchronous_fallback = 1;
  355. cures = cudaMemcpy((char *)dst_csr->nzval, (char *)src_csr->nzval, nnz*elemsize, kind);
  356. if (STARPU_UNLIKELY(cures))
  357. STARPU_CUDA_REPORT_ERROR(cures);
  358. }
  359. if (!synchronous_fallback)
  360. {
  361. cures = cudaMemcpyAsync((char *)dst_csr->colind, (char *)src_csr->colind, nnz*sizeof(uint32_t), kind, stream);
  362. }
  363. if (synchronous_fallback || cures != cudaSuccess)
  364. {
  365. synchronous_fallback = 1;
  366. cures = cudaMemcpy((char *)dst_csr->colind, (char *)src_csr->colind, nnz*sizeof(uint32_t), kind);
  367. if (STARPU_UNLIKELY(cures))
  368. STARPU_CUDA_REPORT_ERROR(cures);
  369. }
  370. if (!synchronous_fallback)
  371. {
  372. cures = cudaMemcpyAsync((char *)dst_csr->rowptr, (char *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t), kind, stream);
  373. }
  374. if (synchronous_fallback || cures != cudaSuccess)
  375. {
  376. synchronous_fallback = 1;
  377. cures = cudaMemcpy((char *)dst_csr->rowptr, (char *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t), kind);
  378. if (STARPU_UNLIKELY(cures))
  379. STARPU_CUDA_REPORT_ERROR(cures);
  380. }
  381. if (synchronous_fallback)
  382. {
  383. STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
  384. return 0;
  385. }
  386. else {
  387. return -EAGAIN;
  388. }
  389. }
  390. static int copy_cuda_peer(void *src_interface STARPU_ATTRIBUTE_UNUSED, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface STARPU_ATTRIBUTE_UNUSED, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
  391. {
  392. #ifdef HAVE_CUDA_MEMCPY_PEER
  393. starpu_csr_interface_t *src_csr = src_interface;
  394. starpu_csr_interface_t *dst_csr = dst_interface;
  395. uint32_t nnz = src_csr->nnz;
  396. uint32_t nrow = src_csr->nrow;
  397. size_t elemsize = src_csr->elemsize;
  398. int src_dev = starpu_memory_node_to_devid(src_node);
  399. int dst_dev = starpu_memory_node_to_devid(dst_node);
  400. cudaError_t cures;
  401. cures = cudaMemcpyPeer((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize);
  402. if (STARPU_UNLIKELY(cures))
  403. STARPU_CUDA_REPORT_ERROR(cures);
  404. cures = cudaMemcpyPeer((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t));
  405. if (STARPU_UNLIKELY(cures))
  406. STARPU_CUDA_REPORT_ERROR(cures);
  407. cures = cudaMemcpyPeer((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t));
  408. if (STARPU_UNLIKELY(cures))
  409. STARPU_CUDA_REPORT_ERROR(cures);
  410. STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
  411. return 0;
  412. #else
  413. STARPU_ABORT();
  414. return 0;
  415. #endif
  416. }
  417. static int copy_cuda_peer_async(void *src_interface STARPU_ATTRIBUTE_UNUSED, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
  418. void *dst_interface STARPU_ATTRIBUTE_UNUSED, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream STARPU_ATTRIBUTE_UNUSED)
  419. {
  420. #ifdef HAVE_CUDA_MEMCPY_PEER
  421. starpu_csr_interface_t *src_csr = src_interface;
  422. starpu_csr_interface_t *dst_csr = dst_interface;
  423. uint32_t nnz = src_csr->nnz;
  424. uint32_t nrow = src_csr->nrow;
  425. size_t elemsize = src_csr->elemsize;
  426. cudaError_t cures;
  427. int src_dev = starpu_memory_node_to_devid(src_node);
  428. int dst_dev = starpu_memory_node_to_devid(dst_node);
  429. int synchronous_fallback = 0;
  430. cures = cudaMemcpyPeerAsync((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize, stream);
  431. if (cures)
  432. {
  433. synchronous_fallback = 1;
  434. cures = cudaMemcpyPeer((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize);
  435. if (STARPU_UNLIKELY(cures))
  436. STARPU_CUDA_REPORT_ERROR(cures);
  437. }
  438. if (!synchronous_fallback)
  439. {
  440. cures = cudaMemcpyPeerAsync((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t), stream);
  441. }
  442. if (synchronous_fallback || cures != cudaSuccess)
  443. {
  444. synchronous_fallback = 1;
  445. cures = cudaMemcpyPeer((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t));
  446. if (STARPU_UNLIKELY(cures))
  447. STARPU_CUDA_REPORT_ERROR(cures);
  448. }
  449. if (!synchronous_fallback)
  450. {
  451. cures = cudaMemcpyPeerAsync((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t), stream);
  452. }
  453. if (synchronous_fallback || cures != cudaSuccess)
  454. {
  455. synchronous_fallback = 1;
  456. cures = cudaMemcpyPeer((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t));
  457. if (STARPU_UNLIKELY(cures))
  458. STARPU_CUDA_REPORT_ERROR(cures);
  459. }
  460. if (synchronous_fallback)
  461. {
  462. STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
  463. return 0;
  464. }
  465. else {
  466. return -EAGAIN;
  467. }
  468. #else
  469. /* Illegal without Peer tranfers */
  470. STARPU_ABORT();
  471. return 0;
  472. #endif
  473. }
  474. static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
  475. {
  476. return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
  477. }
  478. static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
  479. {
  480. return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
  481. }
  482. static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
  483. {
  484. if (src_node == dst_node)
  485. return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
  486. else
  487. return copy_cuda_peer(src_interface, src_node, dst_interface, dst_node);
  488. }
  489. static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
  490. {
  491. return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, stream);
  492. }
  493. static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
  494. {
  495. return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, stream);
  496. }
  497. static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
  498. {
  499. if (src_node == dst_node)
  500. return copy_cuda_common_async(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, stream);
  501. else
  502. return copy_cuda_peer_async(src_interface, src_node, dst_interface, dst_node, stream);
  503. }
  504. #endif // STARPU_USE_CUDA
  505. #ifdef STARPU_USE_OPENCL
  506. static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
  507. {
  508. starpu_csr_interface_t *src_csr = src_interface;
  509. starpu_csr_interface_t *dst_csr = dst_interface;
  510. uint32_t nnz = src_csr->nnz;
  511. uint32_t nrow = src_csr->nrow;
  512. size_t elemsize = src_csr->elemsize;
  513. int err;
  514. err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->nzval, (void *)dst_csr->nzval, nnz*elemsize, 0, NULL);
  515. if (STARPU_UNLIKELY(err))
  516. STARPU_OPENCL_REPORT_ERROR(err);
  517. err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->colind, (void *)dst_csr->colind, nnz*sizeof(uint32_t), 0, NULL);
  518. if (STARPU_UNLIKELY(err))
  519. STARPU_OPENCL_REPORT_ERROR(err);
  520. err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_csr->rowptr, (void *)dst_csr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
  521. if (STARPU_UNLIKELY(err))
  522. STARPU_OPENCL_REPORT_ERROR(err);
  523. STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
  524. return 0;
  525. }
  526. static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
  527. {
  528. starpu_csr_interface_t *src_csr = src_interface;
  529. starpu_csr_interface_t *dst_csr = dst_interface;
  530. uint32_t nnz = src_csr->nnz;
  531. uint32_t nrow = src_csr->nrow;
  532. size_t elemsize = src_csr->elemsize;
  533. int err;
  534. err = _starpu_opencl_copy_ram_to_opencl((void *)src_csr->nzval, (cl_mem)dst_csr->nzval, nnz*elemsize, 0, NULL);
  535. if (STARPU_UNLIKELY(err))
  536. STARPU_OPENCL_REPORT_ERROR(err);
  537. err = _starpu_opencl_copy_ram_to_opencl((void *)src_csr->colind, (cl_mem)dst_csr->colind, nnz*sizeof(uint32_t), 0, NULL);
  538. if (STARPU_UNLIKELY(err))
  539. STARPU_OPENCL_REPORT_ERROR(err);
  540. err = _starpu_opencl_copy_ram_to_opencl((void *)src_csr->rowptr, (cl_mem)dst_csr->rowptr, (nrow+1)*sizeof(uint32_t), 0, NULL);
  541. if (STARPU_UNLIKELY(err))
  542. STARPU_OPENCL_REPORT_ERROR(err);
  543. STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
  544. return 0;
  545. }
  546. #endif // STARPU_USE_OPENCL
  547. /* as not all platform easily have a BLAS lib installed ... */
  548. static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
  549. {
  550. starpu_csr_interface_t *src_csr = (starpu_csr_interface_t *) src_interface;
  551. starpu_csr_interface_t *dst_csr = (starpu_csr_interface_t *) dst_interface;
  552. uint32_t nnz = src_csr->nnz;
  553. uint32_t nrow = src_csr->nrow;
  554. size_t elemsize = src_csr->elemsize;
  555. memcpy((void *)dst_csr->nzval, (void *)src_csr->nzval, nnz*elemsize);
  556. memcpy((void *)dst_csr->colind, (void *)src_csr->colind, nnz*sizeof(uint32_t));
  557. memcpy((void *)dst_csr->rowptr, (void *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t));
  558. STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
  559. return 0;
  560. }