block_interface.c 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009-2011 Université de Bordeaux 1
  4. * Copyright (C) 2010, 2011 Centre National de la Recherche Scientifique
  5. *
  6. * StarPU is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU Lesser General Public License as published by
  8. * the Free Software Foundation; either version 2.1 of the License, or (at
  9. * your option) any later version.
  10. *
  11. * StarPU is distributed in the hope that it will be useful, but
  12. * WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14. *
  15. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  16. */
  17. #include <starpu.h>
  18. #include <common/config.h>
  19. #include <datawizard/coherency.h>
  20. #include <datawizard/copy_driver.h>
  21. #include <datawizard/filters.h>
  22. #include <common/hash.h>
  23. #include <starpu_cuda.h>
  24. #include <starpu_opencl.h>
  25. #include <drivers/opencl/driver_opencl.h>
  26. static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
  27. #ifdef STARPU_USE_CUDA
  28. static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
  29. static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
  30. static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
  31. static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
  32. static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
  33. #endif
  34. #ifdef STARPU_USE_OPENCL
  35. static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
  36. static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
  37. static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event);
  38. static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event);
  39. #endif
/* Copy-method table for the BLOCK interface: one entry per (source kind,
 * destination kind) pair of memory nodes.  NULL entries mean the transfer
 * is not supported (all SPU paths are unimplemented here). */
static const struct starpu_data_copy_methods block_copy_data_methods_s = {
	.ram_to_ram = copy_ram_to_ram,
	.ram_to_spu = NULL,
#ifdef STARPU_USE_CUDA
	.ram_to_cuda = copy_ram_to_cuda,
	.cuda_to_ram = copy_cuda_to_ram,
	.ram_to_cuda_async = copy_ram_to_cuda_async,
	.cuda_to_ram_async = copy_cuda_to_ram_async,
	.cuda_to_cuda = copy_cuda_to_cuda,
#endif
#ifdef STARPU_USE_OPENCL
	.ram_to_opencl = copy_ram_to_opencl,
	.opencl_to_ram = copy_opencl_to_ram,
	.ram_to_opencl_async = copy_ram_to_opencl_async,
	.opencl_to_ram_async = copy_opencl_to_ram_async,
#endif
	.cuda_to_spu = NULL,
	.spu_to_ram = NULL,
	.spu_to_cuda = NULL,
	.spu_to_spu = NULL
};
  61. static void register_block_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface);
  62. static void *block_handle_to_pointer(starpu_data_handle data_handle, uint32_t node);
  63. static ssize_t allocate_block_buffer_on_node(void *data_interface_, uint32_t dst_node);
  64. static void free_block_buffer_on_node(void *data_interface, uint32_t node);
  65. static size_t block_interface_get_size(starpu_data_handle handle);
  66. static uint32_t footprint_block_interface_crc32(starpu_data_handle handle);
  67. static int block_compare(void *data_interface_a, void *data_interface_b);
  68. static void display_block_interface(starpu_data_handle handle, FILE *f);
  69. #ifdef STARPU_USE_GORDON
  70. static int convert_block_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss);
  71. #endif
/* Operation table registered with the StarPU core for the BLOCK interface:
 * registration, per-node allocation/free, footprint hashing for performance
 * models, structural comparison and debug display. */
static struct starpu_data_interface_ops_t interface_block_ops = {
	.register_data_handle = register_block_handle,
	.allocate_data_on_node = allocate_block_buffer_on_node,
	.handle_to_pointer = block_handle_to_pointer,
	.free_data_on_node = free_block_buffer_on_node,
	.copy_methods = &block_copy_data_methods_s,
	.get_size = block_interface_get_size,
	.footprint = footprint_block_interface_crc32,
	.compare = block_compare,
#ifdef STARPU_USE_GORDON
	.convert_to_gordon = convert_block_to_gordon,
#endif
	.interfaceid = STARPU_BLOCK_INTERFACE_ID,
	.interface_size = sizeof(starpu_block_interface_t),
	.display = display_block_interface
};
#ifdef STARPU_USE_GORDON
/* Conversion of a block interface into a Gordon (Cell SPU) stride
 * descriptor: not implemented yet, aborts if ever reached. */
int convert_block_to_gordon(void *data_interface, uint64_t *ptr, gordon_strideSize_t *ss)
{
	/* TODO */
	STARPU_ABORT();

	return 0;
}
#endif
  96. static void *block_handle_to_pointer(starpu_data_handle handle, uint32_t node)
  97. {
  98. STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
  99. starpu_block_interface_t *block_interface =
  100. starpu_data_get_interface_on_node(handle, node);
  101. return (void*) block_interface->ptr;
  102. }
  103. static void register_block_handle(starpu_data_handle handle, uint32_t home_node, void *data_interface)
  104. {
  105. starpu_block_interface_t *block_interface = data_interface;
  106. unsigned node;
  107. for (node = 0; node < STARPU_MAXNODES; node++)
  108. {
  109. starpu_block_interface_t *local_interface =
  110. starpu_data_get_interface_on_node(handle, node);
  111. if (node == home_node) {
  112. local_interface->ptr = block_interface->ptr;
  113. local_interface->dev_handle = block_interface->dev_handle;
  114. local_interface->offset = block_interface->offset;
  115. local_interface->ldy = block_interface->ldy;
  116. local_interface->ldz = block_interface->ldz;
  117. }
  118. else {
  119. local_interface->ptr = 0;
  120. local_interface->dev_handle = 0;
  121. local_interface->offset = 0;
  122. local_interface->ldy = 0;
  123. local_interface->ldz = 0;
  124. }
  125. local_interface->nx = block_interface->nx;
  126. local_interface->ny = block_interface->ny;
  127. local_interface->nz = block_interface->nz;
  128. local_interface->elemsize = block_interface->elemsize;
  129. }
  130. }
  131. /* declare a new data with the BLAS interface */
  132. void starpu_block_data_register(starpu_data_handle *handleptr, uint32_t home_node,
  133. uintptr_t ptr, uint32_t ldy, uint32_t ldz, uint32_t nx,
  134. uint32_t ny, uint32_t nz, size_t elemsize)
  135. {
  136. starpu_block_interface_t block_interface = {
  137. .ptr = ptr,
  138. .dev_handle = ptr,
  139. .offset = 0,
  140. .ldy = ldy,
  141. .ldz = ldz,
  142. .nx = nx,
  143. .ny = ny,
  144. .nz = nz,
  145. .elemsize = elemsize
  146. };
  147. starpu_data_register(handleptr, home_node, &block_interface, &interface_block_ops);
  148. }
  149. static uint32_t footprint_block_interface_crc32(starpu_data_handle handle)
  150. {
  151. uint32_t hash;
  152. hash = _starpu_crc32_be(starpu_block_get_nx(handle), 0);
  153. hash = _starpu_crc32_be(starpu_block_get_ny(handle), hash);
  154. hash = _starpu_crc32_be(starpu_block_get_nz(handle), hash);
  155. return hash;
  156. }
  157. static int block_compare(void *data_interface_a, void *data_interface_b)
  158. {
  159. starpu_block_interface_t *block_a = data_interface_a;
  160. starpu_block_interface_t *block_b = data_interface_b;
  161. /* Two matricess are considered compatible if they have the same size */
  162. return ((block_a->nx == block_b->nx)
  163. && (block_a->ny == block_b->ny)
  164. && (block_a->nz == block_b->nz)
  165. && (block_a->elemsize == block_b->elemsize));
  166. }
  167. static void display_block_interface(starpu_data_handle handle, FILE *f)
  168. {
  169. starpu_block_interface_t *block_interface;
  170. block_interface = starpu_data_get_interface_on_node(handle, 0);
  171. fprintf(f, "%u\t%u\t%u\t", block_interface->nx, block_interface->ny, block_interface->nz);
  172. }
  173. static size_t block_interface_get_size(starpu_data_handle handle)
  174. {
  175. size_t size;
  176. starpu_block_interface_t *block_interface;
  177. block_interface = starpu_data_get_interface_on_node(handle, 0);
  178. size = block_interface->nx*block_interface->ny*block_interface->nz*block_interface->elemsize;
  179. return size;
  180. }
  181. /* offer an access to the data parameters */
  182. uint32_t starpu_block_get_nx(starpu_data_handle handle)
  183. {
  184. starpu_block_interface_t *block_interface =
  185. starpu_data_get_interface_on_node(handle, 0);
  186. return block_interface->nx;
  187. }
  188. uint32_t starpu_block_get_ny(starpu_data_handle handle)
  189. {
  190. starpu_block_interface_t *block_interface =
  191. starpu_data_get_interface_on_node(handle, 0);
  192. return block_interface->ny;
  193. }
  194. uint32_t starpu_block_get_nz(starpu_data_handle handle)
  195. {
  196. starpu_block_interface_t *block_interface =
  197. starpu_data_get_interface_on_node(handle, 0);
  198. return block_interface->nz;
  199. }
  200. uint32_t starpu_block_get_local_ldy(starpu_data_handle handle)
  201. {
  202. unsigned node;
  203. node = _starpu_get_local_memory_node();
  204. STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
  205. starpu_block_interface_t *block_interface =
  206. starpu_data_get_interface_on_node(handle, node);
  207. return block_interface->ldy;
  208. }
  209. uint32_t starpu_block_get_local_ldz(starpu_data_handle handle)
  210. {
  211. unsigned node;
  212. node = _starpu_get_local_memory_node();
  213. STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
  214. starpu_block_interface_t *block_interface =
  215. starpu_data_get_interface_on_node(handle, node);
  216. return block_interface->ldz;
  217. }
  218. uintptr_t starpu_block_get_local_ptr(starpu_data_handle handle)
  219. {
  220. unsigned node;
  221. node = _starpu_get_local_memory_node();
  222. STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
  223. starpu_block_interface_t *block_interface =
  224. starpu_data_get_interface_on_node(handle, node);
  225. return block_interface->ptr;
  226. }
  227. size_t starpu_block_get_elemsize(starpu_data_handle handle)
  228. {
  229. starpu_block_interface_t *block_interface =
  230. starpu_data_get_interface_on_node(handle, 0);
  231. return block_interface->elemsize;
  232. }
  233. /* memory allocation/deallocation primitives for the BLOCK interface */
  234. /* returns the size of the allocated area */
  235. static ssize_t allocate_block_buffer_on_node(void *data_interface_, uint32_t dst_node)
  236. {
  237. uintptr_t addr = 0;
  238. unsigned fail = 0;
  239. ssize_t allocated_memory;
  240. #ifdef STARPU_USE_CUDA
  241. cudaError_t status;
  242. #endif
  243. starpu_block_interface_t *dst_block = data_interface_;
  244. uint32_t nx = dst_block->nx;
  245. uint32_t ny = dst_block->ny;
  246. uint32_t nz = dst_block->nz;
  247. size_t elemsize = dst_block->elemsize;
  248. starpu_node_kind kind = _starpu_get_node_kind(dst_node);
  249. switch(kind) {
  250. case STARPU_CPU_RAM:
  251. addr = (uintptr_t)malloc(nx*ny*nz*elemsize);
  252. if (!addr)
  253. fail = 1;
  254. break;
  255. #ifdef STARPU_USE_CUDA
  256. case STARPU_CUDA_RAM:
  257. status = cudaMalloc((void **)&addr, nx*ny*nz*elemsize);
  258. //_STARPU_DEBUG("cudaMalloc -> addr %p\n", addr);
  259. if (!addr || status != cudaSuccess)
  260. {
  261. if (STARPU_UNLIKELY(status != cudaErrorMemoryAllocation))
  262. STARPU_CUDA_REPORT_ERROR(status);
  263. fail = 1;
  264. }
  265. break;
  266. #endif
  267. #ifdef STARPU_USE_OPENCL
  268. case STARPU_OPENCL_RAM:
  269. {
  270. int ret;
  271. void *ptr;
  272. ret = _starpu_opencl_allocate_memory(&ptr, nx*ny*nz*elemsize, CL_MEM_READ_WRITE);
  273. addr = (uintptr_t)ptr;
  274. if (ret) {
  275. fail = 1;
  276. }
  277. break;
  278. }
  279. #endif
  280. default:
  281. assert(0);
  282. }
  283. if (!fail) {
  284. /* allocation succeeded */
  285. allocated_memory = nx*ny*nz*elemsize;
  286. /* update the data properly in consequence */
  287. dst_block->ptr = addr;
  288. dst_block->dev_handle = addr;
  289. dst_block->offset = 0;
  290. dst_block->ldy = nx;
  291. dst_block->ldz = nx*ny;
  292. } else {
  293. /* allocation failed */
  294. allocated_memory = -ENOMEM;
  295. }
  296. return allocated_memory;
  297. }
  298. static void free_block_buffer_on_node(void *data_interface, uint32_t node)
  299. {
  300. starpu_block_interface_t *block_interface = data_interface;
  301. #ifdef STARPU_USE_CUDA
  302. cudaError_t status;
  303. #endif
  304. starpu_node_kind kind = _starpu_get_node_kind(node);
  305. switch(kind) {
  306. case STARPU_CPU_RAM:
  307. free((void*)block_interface->ptr);
  308. break;
  309. #ifdef STARPU_USE_CUDA
  310. case STARPU_CUDA_RAM:
  311. status = cudaFree((void*)block_interface->ptr);
  312. if (STARPU_UNLIKELY(status))
  313. STARPU_CUDA_REPORT_ERROR(status);
  314. break;
  315. #endif
  316. #ifdef STARPU_USE_OPENCL
  317. case STARPU_OPENCL_RAM:
  318. clReleaseMemObject((void *)block_interface->ptr);
  319. break;
  320. #endif
  321. default:
  322. assert(0);
  323. }
  324. }
  325. #ifdef STARPU_USE_CUDA
  326. static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
  327. {
  328. starpu_block_interface_t *src_block = src_interface;
  329. starpu_block_interface_t *dst_block = dst_interface;
  330. uint32_t nx = src_block->nx;
  331. uint32_t ny = src_block->ny;
  332. uint32_t nz = src_block->nz;
  333. size_t elemsize = src_block->elemsize;
  334. cudaError_t cures;
  335. if ((nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
  336. {
  337. /* Is that a single contiguous buffer ? */
  338. if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
  339. {
  340. cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
  341. nx*ny*nz*elemsize, kind);
  342. if (STARPU_UNLIKELY(cures))
  343. STARPU_CUDA_REPORT_ERROR(cures);
  344. }
  345. else {
  346. /* Are all plans contiguous */
  347. cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
  348. (char *)src_block->ptr, src_block->ldz*elemsize,
  349. nx*ny*elemsize, nz, kind);
  350. if (STARPU_UNLIKELY(cures))
  351. STARPU_CUDA_REPORT_ERROR(cures);
  352. }
  353. }
  354. else {
  355. /* Default case: we transfer all lines one by one: ny*nz transfers */
  356. unsigned layer;
  357. for (layer = 0; layer < src_block->nz; layer++)
  358. {
  359. uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
  360. uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
  361. cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
  362. (char *)src_ptr, src_block->ldy*elemsize,
  363. nx*elemsize, ny, kind);
  364. if (STARPU_UNLIKELY(cures))
  365. STARPU_CUDA_REPORT_ERROR(cures);
  366. }
  367. }
  368. STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->elemsize*src_block->elemsize);
  369. return 0;
  370. }
/* Asynchronous CUDA copy of a whole block, in the direction given by kind.
 * Tries the async variant of the best-shaped transfer first and falls back
 * to the synchronous one when the async call fails.
 * Returns -EAGAIN when the transfer was queued on `stream` (completion must
 * be awaited by the caller), 0 when it was performed synchronously. */
static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream, enum cudaMemcpyKind kind)
{
	starpu_block_interface_t *src_block = src_interface;
	starpu_block_interface_t *dst_block = dst_interface;

	uint32_t nx = src_block->nx;
	uint32_t ny = src_block->ny;
	uint32_t nz = src_block->nz;
	size_t elemsize = src_block->elemsize;

	cudaError_t cures;

	int ret;

	/* We may have a contiguous buffer for the entire block, or contiguous
	 * plans within the block, we can avoid many small transfers that way */
	if ((nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
	{
		/* Is that a single contiguous buffer ? */
		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
		{
			/* Whole block in one async memcpy */
			cures = cudaMemcpyAsync((char *)dst_block->ptr, (char *)src_block->ptr,
					nx*ny*nz*elemsize, kind, stream);
			if (STARPU_UNLIKELY(cures))
			{
				/* Async submission failed: retry synchronously */
				cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
						nx*ny*nz*elemsize, kind);
				if (STARPU_UNLIKELY(cures))
					STARPU_CUDA_REPORT_ERROR(cures);

				ret = 0;
			}
			else {
				ret = -EAGAIN;
			}
		}
		else {
			/* Are all plans contiguous */
			/* Each plane is contiguous: one async 2D memcpy over planes */
			cures = cudaMemcpy2DAsync((char *)dst_block->ptr, dst_block->ldz*elemsize,
					(char *)src_block->ptr, src_block->ldz*elemsize,
					nx*ny*elemsize, nz, kind, stream);
			if (STARPU_UNLIKELY(cures))
			{
				/* Async submission failed: retry synchronously */
				cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
						(char *)src_block->ptr, src_block->ldz*elemsize,
						nx*ny*elemsize, nz, kind);
				if (STARPU_UNLIKELY(cures))
					STARPU_CUDA_REPORT_ERROR(cures);

				ret = 0;
			}
			else {
				ret = -EAGAIN;
			}
		}
	}
	else {
		/* Default case: we transfer all lines one by one: ny*nz transfers */
		unsigned layer;
		for (layer = 0; layer < src_block->nz; layer++)
		{
			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;

			cures = cudaMemcpy2DAsync((char *)dst_ptr, dst_block->ldy*elemsize,
					(char *)src_ptr, src_block->ldy*elemsize,
					nx*elemsize, ny, kind, stream);

			if (STARPU_UNLIKELY(cures))
			{
				/* I don't know how to do that "better"
				 * NOTE: planes already submitted asynchronously are
				 * re-copied synchronously below, which is redundant but
				 * safe (same source data, same destination). */
				goto no_async_default;
			}
		}

		ret = -EAGAIN;
	}

	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);

	return ret;

no_async_default:
	/* Synchronous fallback: redo the whole plane-by-plane transfer. */
	{
	unsigned layer;
	for (layer = 0; layer < src_block->nz; layer++)
	{
		uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
		uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;

		cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
				(char *)src_ptr, src_block->ldy*elemsize,
				nx*elemsize, ny, kind);
		if (STARPU_UNLIKELY(cures))
			STARPU_CUDA_REPORT_ERROR(cures);
	}

	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
	return 0;
	}
}
/* Synchronous CUDA device-to-host block copy. */
static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
{
	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
}
/* Synchronous CUDA host-to-device block copy. */
static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
{
	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
}
/* Synchronous CUDA device-to-device block copy. */
static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
{
	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
}
/* Asynchronous CUDA device-to-host block copy on `stream`
 * (returns -EAGAIN when queued, 0 when done synchronously). */
static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
{
	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
}
/* Asynchronous CUDA host-to-device block copy on `stream`
 * (returns -EAGAIN when queued, 0 when done synchronously). */
static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
{
	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
}
  478. #endif // STARPU_USE_CUDA
  479. #ifdef STARPU_USE_OPENCL
  480. static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
  481. {
  482. starpu_block_interface_t *src_block = src_interface;
  483. starpu_block_interface_t *dst_block = dst_interface;
  484. int err,ret;
  485. uint32_t nx = src_block->nx;
  486. uint32_t ny = src_block->ny;
  487. /* We may have a contiguous buffer for the entire block, or contiguous
  488. * plans within the block, we can avoid many small transfers that way */
  489. if ((nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
  490. {
  491. /* Is that a single contiguous buffer ? */
  492. if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
  493. {
  494. err = _starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_block->ptr, (cl_mem)dst_block->dev_handle,
  495. src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
  496. dst_block->offset, (cl_event*)_event, &ret);
  497. if (STARPU_UNLIKELY(err))
  498. STARPU_OPENCL_REPORT_ERROR(err);
  499. }
  500. else {
  501. /* Are all plans contiguous */
  502. /* XXX non contiguous buffers are not properly supported yet. (TODO) */
  503. STARPU_ASSERT(0);
  504. }
  505. }
  506. else {
  507. /* Default case: we transfer all lines one by one: ny*nz transfers */
  508. unsigned layer;
  509. for (layer = 0; layer < src_block->nz; layer++)
  510. {
  511. unsigned j;
  512. for(j=0 ; j<src_block->ny ; j++) {
  513. void *ptr = (void*)src_block->ptr+(layer*src_block->ldz*src_block->elemsize)+(j*src_block->ldy*src_block->elemsize);
  514. err = _starpu_opencl_copy_ram_to_opencl(ptr, (cl_mem)dst_block->dev_handle,
  515. src_block->nx*src_block->elemsize,
  516. layer*dst_block->ldz*dst_block->elemsize + j*dst_block->ldy*dst_block->elemsize
  517. + dst_block->offset, NULL);
  518. if (STARPU_UNLIKELY(err))
  519. STARPU_OPENCL_REPORT_ERROR(err);
  520. }
  521. // int *foo = (int *)(src_block->ptr+(layer*src_block->ldz*src_block->elemsize));
  522. // fprintf(stderr, "layer %d --> value %d\n", layer, foo[1]);
  523. // const size_t buffer_origin[3] = {layer*src_block->ldz*src_block->elemsize, 0, 0};
  524. // //const size_t buffer_origin[3] = {0, 0, 0};
  525. // const size_t host_origin[3] = {layer*dst_block->ldz*dst_block->elemsize+dst_block->offset, 0, 0};
  526. // size_t region[3] = {src_block->nx*src_block->elemsize,src_block->ny, 1};
  527. // size_t buffer_row_pitch=region[0];
  528. // size_t buffer_slice_pitch=region[1] * buffer_row_pitch;
  529. // size_t host_row_pitch=region[0];
  530. // size_t host_slice_pitch=region[1] * host_row_pitch;
  531. //
  532. // _starpu_opencl_copy_rect_ram_to_opencl((void *)src_block->ptr, (cl_mem)dst_block->dev_handle,
  533. // buffer_origin, host_origin, region,
  534. // buffer_row_pitch, buffer_slice_pitch,
  535. // host_row_pitch, host_slice_pitch, NULL);
  536. }
  537. }
  538. STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
  539. return ret;
  540. }
  541. static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, void *_event)
  542. {
  543. starpu_block_interface_t *src_block = src_interface;
  544. starpu_block_interface_t *dst_block = dst_interface;
  545. int err, ret;
  546. /* We may have a contiguous buffer for the entire block, or contiguous
  547. * plans within the block, we can avoid many small transfers that way */
  548. if ((src_block->nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
  549. {
  550. /* Is that a single contiguous buffer ? */
  551. if (((src_block->nx*src_block->ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
  552. {
  553. err = _starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_block->dev_handle, (void*)dst_block->ptr,
  554. src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
  555. src_block->offset, (cl_event*)_event, &ret);
  556. if (STARPU_UNLIKELY(err))
  557. STARPU_OPENCL_REPORT_ERROR(err);
  558. }
  559. else {
  560. /* Are all plans contiguous */
  561. /* XXX non contiguous buffers are not properly supported yet. (TODO) */
  562. STARPU_ASSERT(0);
  563. }
  564. }
  565. else {
  566. /* Default case: we transfer all lines one by one: ny*nz transfers */
  567. /* XXX non contiguous buffers are not properly supported yet. (TODO) */
  568. unsigned layer;
  569. for (layer = 0; layer < src_block->nz; layer++)
  570. {
  571. unsigned j;
  572. for(j=0 ; j<src_block->ny ; j++) {
  573. void *ptr = (void *)dst_block->ptr+(layer*dst_block->ldz*dst_block->elemsize)+(j*dst_block->ldy*dst_block->elemsize);
  574. err = _starpu_opencl_copy_opencl_to_ram((void*)src_block->dev_handle, ptr,
  575. src_block->nx*src_block->elemsize,
  576. layer*src_block->ldz*src_block->elemsize+j*src_block->ldy*src_block->elemsize+
  577. src_block->offset, NULL);
  578. }
  579. // const size_t buffer_origin[3] = {src_block->offset, 0, 0};
  580. // const size_t host_origin[3] = {layer*src_block->ldz*src_block->elemsize, 0, 0};
  581. // size_t region[3] = {src_block->nx*src_block->elemsize,src_block->ny, 1};
  582. // size_t buffer_row_pitch=region[0];
  583. // size_t buffer_slice_pitch=region[1] * buffer_row_pitch;
  584. // size_t host_row_pitch=region[0];
  585. // size_t host_slice_pitch=region[1] * host_row_pitch;
  586. //
  587. // _starpu_opencl_copy_rect_opencl_to_ram((cl_mem)src_block->dev_handle, (void *)dst_block->ptr,
  588. // buffer_origin, host_origin, region,
  589. // buffer_row_pitch, buffer_slice_pitch,
  590. // host_row_pitch, host_slice_pitch, NULL);
  591. }
  592. }
  593. STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
  594. return ret;
  595. }
/* Synchronous host-to-OpenCL block copy (no event: the async helper then
 * performs the transfer synchronously). */
static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
{
	return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
}
/* Synchronous OpenCL-to-host block copy (no event: the async helper then
 * performs the transfer synchronously). */
static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
{
	return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
}
  604. #endif
  605. /* as not all platform easily have a BLAS lib installed ... */
  606. static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
  607. {
  608. starpu_block_interface_t *src_block = src_interface;
  609. starpu_block_interface_t *dst_block = dst_interface;
  610. uint32_t nx = dst_block->nx;
  611. uint32_t ny = dst_block->ny;
  612. uint32_t nz = dst_block->nz;
  613. size_t elemsize = dst_block->elemsize;
  614. uint32_t ldy_src = src_block->ldy;
  615. uint32_t ldz_src = src_block->ldz;
  616. uint32_t ldy_dst = dst_block->ldy;
  617. uint32_t ldz_dst = dst_block->ldz;
  618. uintptr_t ptr_src = src_block->ptr;
  619. uintptr_t ptr_dst = dst_block->ptr;
  620. unsigned y, z;
  621. for (z = 0; z < nz; z++)
  622. for (y = 0; y < ny; y++)
  623. {
  624. uint32_t src_offset = (y*ldy_src + y*z*ldz_src)*elemsize;
  625. uint32_t dst_offset = (y*ldy_dst + y*z*ldz_dst)*elemsize;
  626. memcpy((void *)(ptr_dst + dst_offset),
  627. (void *)(ptr_src + src_offset), nx*elemsize);
  628. }
  629. STARPU_TRACE_DATA_COPY(src_node, dst_node, nx*ny*nz*elemsize);
  630. return 0;
  631. }