block_interface.c 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816
  1. /*
  2. * StarPU
  3. * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. *
  14. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. */
  16. #include <starpu.h>
  17. #include <common/config.h>
  18. #include <datawizard/coherency.h>
  19. #include <datawizard/copy_driver.h>
  20. #include <datawizard/filters.h>
  21. #include <common/hash.h>
  22. #ifdef STARPU_USE_OPENCL
  23. #include <starpu_opencl.h>
  24. #include <drivers/opencl/driver_opencl.h>
  25. #endif
  /*
   * Prototypes of the transfer routines referenced by the copy-method
   * table defined right after them.
   */
  static int dummy_copy_ram_to_ram(void *src_interface,
          unsigned src_node __attribute__((unused)),
          void *dst_interface,
          unsigned dst_node __attribute__((unused)));

  #ifdef STARPU_USE_CUDA
  /* Host <-> CUDA transfers that complete before returning. */
  static int copy_ram_to_cuda(void *src_interface,
          unsigned src_node __attribute__((unused)),
          void *dst_interface,
          unsigned dst_node __attribute__((unused)));
  static int copy_cuda_to_ram(void *src_interface,
          unsigned src_node __attribute__((unused)),
          void *dst_interface,
          unsigned dst_node __attribute__((unused)));

  /* Host <-> CUDA transfers submitted on a caller-provided stream. */
  static int copy_ram_to_cuda_async(void *src_interface,
          unsigned src_node __attribute__((unused)),
          void *dst_interface,
          unsigned dst_node __attribute__((unused)),
          cudaStream_t *stream);
  static int copy_cuda_to_ram_async(void *src_interface,
          unsigned src_node __attribute__((unused)),
          void *dst_interface,
          unsigned dst_node __attribute__((unused)),
          cudaStream_t *stream);
  #endif

  #ifdef STARPU_USE_OPENCL
  /* Host <-> OpenCL transfers without an event. */
  static int copy_ram_to_opencl(void *src_interface,
          unsigned src_node __attribute__((unused)),
          void *dst_interface,
          unsigned dst_node __attribute__((unused)));
  static int copy_opencl_to_ram(void *src_interface,
          unsigned src_node __attribute__((unused)),
          void *dst_interface,
          unsigned dst_node __attribute__((unused)));

  /* Host <-> OpenCL transfers taking an optional event pointer. */
  static int copy_ram_to_opencl_async(void *src_interface,
          unsigned src_node __attribute__((unused)),
          void *dst_interface,
          unsigned dst_node __attribute__((unused)),
          void *_event);
  static int copy_opencl_to_ram_async(void *src_interface,
          unsigned src_node __attribute__((unused)),
          void *dst_interface,
          unsigned dst_node __attribute__((unused)),
          void *_event);
  #endif
  39. static const struct starpu_data_copy_methods block_copy_data_methods_s = {
  40. .ram_to_ram = dummy_copy_ram_to_ram,
  41. .ram_to_spu = NULL,
  42. #ifdef STARPU_USE_CUDA
  43. .ram_to_cuda = copy_ram_to_cuda,
  44. .cuda_to_ram = copy_cuda_to_ram,
  45. .ram_to_cuda_async = copy_ram_to_cuda_async,
  46. .cuda_to_ram_async = copy_cuda_to_ram_async,
  47. #endif
  48. #ifdef STARPU_USE_OPENCL
  49. .ram_to_opencl = copy_ram_to_opencl,
  50. .opencl_to_ram = copy_opencl_to_ram,
  51. .ram_to_opencl_async = copy_ram_to_opencl_async,
  52. .opencl_to_ram_async = copy_opencl_to_ram_async,
  53. #endif
  54. .cuda_to_cuda = NULL,
  55. .cuda_to_spu = NULL,
  56. .spu_to_ram = NULL,
  57. .spu_to_cuda = NULL,
  58. .spu_to_spu = NULL
  59. };
  60. static void register_block_handle(starpu_data_handle handle, uint32_t home_node, void *interface);
  61. static size_t allocate_block_buffer_on_node(void *interface_, uint32_t dst_node);
  62. static void free_block_buffer_on_node(void *interface, uint32_t node);
  63. static size_t block_interface_get_size(starpu_data_handle handle);
  64. static uint32_t footprint_block_interface_crc32(starpu_data_handle handle);
  65. static int block_compare(void *interface_a, void *interface_b);
  66. static void display_block_interface(starpu_data_handle handle, FILE *f);
  67. #ifdef STARPU_USE_GORDON
  68. static int convert_block_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t *ss);
  69. #endif
  70. static struct starpu_data_interface_ops_t interface_block_ops = {
  71. .register_data_handle = register_block_handle,
  72. .allocate_data_on_node = allocate_block_buffer_on_node,
  73. .free_data_on_node = free_block_buffer_on_node,
  74. .copy_methods = &block_copy_data_methods_s,
  75. .get_size = block_interface_get_size,
  76. .footprint = footprint_block_interface_crc32,
  77. .compare = block_compare,
  78. #ifdef STARPU_USE_GORDON
  79. .convert_to_gordon = convert_block_to_gordon,
  80. #endif
  81. .interfaceid = STARPU_BLOCK_INTERFACE_ID,
  82. .interface_size = sizeof(starpu_block_interface_t),
  83. .display = display_block_interface
  84. };
  #ifdef STARPU_USE_GORDON
  /*
   * Gordon conversion hook of the block interface.
   *
   * Not implemented: the function aborts through STARPU_ABORT().  The
   * definition carries the same `static` storage class as its forward
   * declaration, and the parameters are explicitly marked as unused.
   *
   * Returns 0 (only reachable if STARPU_ABORT() returns).
   */
  static int convert_block_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t *ss)
  {
      (void)interface;
      (void)ptr;
      (void)ss;

      /* TODO */
      STARPU_ABORT();

      return 0;
  }
  #endif
  93. static void register_block_handle(starpu_data_handle handle, uint32_t home_node, void *interface)
  94. {
  95. starpu_block_interface_t *block_interface = interface;
  96. unsigned node;
  97. for (node = 0; node < STARPU_MAXNODES; node++)
  98. {
  99. starpu_block_interface_t *local_interface =
  100. starpu_data_get_interface_on_node(handle, node);
  101. if (node == home_node) {
  102. local_interface->ptr = block_interface->ptr;
  103. local_interface->dev_handle = block_interface->dev_handle;
  104. local_interface->offset = block_interface->offset;
  105. local_interface->ldy = block_interface->ldy;
  106. local_interface->ldz = block_interface->ldz;
  107. }
  108. else {
  109. local_interface->ptr = 0;
  110. local_interface->dev_handle = 0;
  111. local_interface->offset = 0;
  112. local_interface->ldy = 0;
  113. local_interface->ldz = 0;
  114. }
  115. local_interface->nx = block_interface->nx;
  116. local_interface->ny = block_interface->ny;
  117. local_interface->nz = block_interface->nz;
  118. local_interface->elemsize = block_interface->elemsize;
  119. }
  120. }
  121. /* declare a new data with the BLAS interface */
  122. void starpu_block_data_register(starpu_data_handle *handleptr, uint32_t home_node,
  123. uintptr_t ptr, uint32_t ldy, uint32_t ldz, uint32_t nx,
  124. uint32_t ny, uint32_t nz, size_t elemsize)
  125. {
  126. starpu_block_interface_t interface = {
  127. .ptr = ptr,
  128. .dev_handle = ptr,
  129. .offset = 0,
  130. .ldy = ldy,
  131. .ldz = ldz,
  132. .nx = nx,
  133. .ny = ny,
  134. .nz = nz,
  135. .elemsize = elemsize
  136. };
  137. starpu_data_register(handleptr, home_node, &interface, &interface_block_ops);
  138. }
  139. static uint32_t footprint_block_interface_crc32(starpu_data_handle handle)
  140. {
  141. uint32_t hash;
  142. hash = _starpu_crc32_be(starpu_block_get_nx(handle), 0);
  143. hash = _starpu_crc32_be(starpu_block_get_ny(handle), hash);
  144. hash = _starpu_crc32_be(starpu_block_get_nz(handle), hash);
  145. return hash;
  146. }
  147. static int block_compare(void *interface_a, void *interface_b)
  148. {
  149. starpu_block_interface_t *block_a = interface_a;
  150. starpu_block_interface_t *block_b = interface_b;
  151. /* Two matricess are considered compatible if they have the same size */
  152. return ((block_a->nx == block_b->nx)
  153. && (block_a->ny == block_b->ny)
  154. && (block_a->nz == block_b->nz)
  155. && (block_a->elemsize == block_b->elemsize));
  156. }
  157. static void display_block_interface(starpu_data_handle handle, FILE *f)
  158. {
  159. starpu_block_interface_t *interface;
  160. interface = starpu_data_get_interface_on_node(handle, 0);
  161. fprintf(f, "%u\t%u\t%u\t", interface->nx, interface->ny, interface->nz);
  162. }
  163. static size_t block_interface_get_size(starpu_data_handle handle)
  164. {
  165. size_t size;
  166. starpu_block_interface_t *interface;
  167. interface = starpu_data_get_interface_on_node(handle, 0);
  168. size = interface->nx*interface->ny*interface->nz*interface->elemsize;
  169. return size;
  170. }
  171. /* offer an access to the data parameters */
  172. uint32_t starpu_block_get_nx(starpu_data_handle handle)
  173. {
  174. starpu_block_interface_t *interface =
  175. starpu_data_get_interface_on_node(handle, 0);
  176. return interface->nx;
  177. }
  178. uint32_t starpu_block_get_ny(starpu_data_handle handle)
  179. {
  180. starpu_block_interface_t *interface =
  181. starpu_data_get_interface_on_node(handle, 0);
  182. return interface->ny;
  183. }
  184. uint32_t starpu_block_get_nz(starpu_data_handle handle)
  185. {
  186. starpu_block_interface_t *interface =
  187. starpu_data_get_interface_on_node(handle, 0);
  188. return interface->nz;
  189. }
  190. uint32_t starpu_block_get_local_ldy(starpu_data_handle handle)
  191. {
  192. unsigned node;
  193. node = _starpu_get_local_memory_node();
  194. STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
  195. starpu_block_interface_t *interface =
  196. starpu_data_get_interface_on_node(handle, node);
  197. return interface->ldy;
  198. }
  199. uint32_t starpu_block_get_local_ldz(starpu_data_handle handle)
  200. {
  201. unsigned node;
  202. node = _starpu_get_local_memory_node();
  203. STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
  204. starpu_block_interface_t *interface =
  205. starpu_data_get_interface_on_node(handle, node);
  206. return interface->ldz;
  207. }
  208. uintptr_t starpu_block_get_local_ptr(starpu_data_handle handle)
  209. {
  210. unsigned node;
  211. node = _starpu_get_local_memory_node();
  212. STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));
  213. starpu_block_interface_t *interface =
  214. starpu_data_get_interface_on_node(handle, node);
  215. return interface->ptr;
  216. }
  217. size_t starpu_block_get_elemsize(starpu_data_handle handle)
  218. {
  219. starpu_block_interface_t *interface =
  220. starpu_data_get_interface_on_node(handle, 0);
  221. return interface->elemsize;
  222. }
  223. /* memory allocation/deallocation primitives for the BLOCK interface */
  224. /* returns the size of the allocated area */
  225. static size_t allocate_block_buffer_on_node(void *interface_, uint32_t dst_node)
  226. {
  227. uintptr_t addr = 0;
  228. unsigned fail = 0;
  229. size_t allocated_memory;
  230. #ifdef STARPU_USE_CUDA
  231. cudaError_t status;
  232. #endif
  233. starpu_block_interface_t *dst_block = interface_;
  234. uint32_t nx = dst_block->nx;
  235. uint32_t ny = dst_block->ny;
  236. uint32_t nz = dst_block->nz;
  237. size_t elemsize = dst_block->elemsize;
  238. starpu_node_kind kind = _starpu_get_node_kind(dst_node);
  239. switch(kind) {
  240. case STARPU_CPU_RAM:
  241. addr = (uintptr_t)malloc(nx*ny*nz*elemsize);
  242. if (!addr)
  243. fail = 1;
  244. break;
  245. #ifdef STARPU_USE_CUDA
  246. case STARPU_CUDA_RAM:
  247. status = cudaMalloc((void **)&addr, nx*ny*nz*elemsize);
  248. //fprintf(stderr, "cudaMalloc -> addr %p\n", addr);
  249. if (!addr || status != cudaSuccess)
  250. {
  251. if (STARPU_UNLIKELY(status != cudaErrorMemoryAllocation))
  252. STARPU_CUDA_REPORT_ERROR(status);
  253. fail = 1;
  254. }
  255. break;
  256. #endif
  257. #ifdef STARPU_USE_OPENCL
  258. case STARPU_OPENCL_RAM:
  259. {
  260. int ret;
  261. void *ptr;
  262. ret = _starpu_opencl_allocate_memory(&ptr, nx*ny*nz*elemsize, CL_MEM_READ_WRITE);
  263. addr = (uintptr_t)ptr;
  264. if (ret) {
  265. fail = 1;
  266. }
  267. break;
  268. }
  269. #endif
  270. default:
  271. assert(0);
  272. }
  273. if (!fail) {
  274. /* allocation succeeded */
  275. allocated_memory = nx*ny*nz*elemsize;
  276. /* update the data properly in consequence */
  277. dst_block->ptr = addr;
  278. dst_block->dev_handle = addr;
  279. dst_block->offset = 0;
  280. dst_block->ldy = nx;
  281. dst_block->ldz = nx*ny;
  282. } else {
  283. /* allocation failed */
  284. allocated_memory = 0;
  285. }
  286. return allocated_memory;
  287. }
  288. static void free_block_buffer_on_node(void *interface, uint32_t node)
  289. {
  290. starpu_block_interface_t *block_interface = interface;
  291. #ifdef STARPU_USE_CUDA
  292. cudaError_t status;
  293. #endif
  294. starpu_node_kind kind = _starpu_get_node_kind(node);
  295. switch(kind) {
  296. case STARPU_CPU_RAM:
  297. free((void*)block_interface->ptr);
  298. break;
  299. #ifdef STARPU_USE_CUDA
  300. case STARPU_CUDA_RAM:
  301. status = cudaFree((void*)block_interface->ptr);
  302. if (STARPU_UNLIKELY(status))
  303. STARPU_CUDA_REPORT_ERROR(status);
  304. break;
  305. #endif
  306. #ifdef STARPU_USE_OPENCL
  307. case STARPU_OPENCL_RAM:
  308. clReleaseMemObject((void *)block_interface->ptr);
  309. break;
  310. #endif
  311. default:
  312. assert(0);
  313. }
  314. }
  315. #ifdef STARPU_USE_CUDA
  316. static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
  317. {
  318. starpu_block_interface_t *src_block = src_interface;
  319. starpu_block_interface_t *dst_block = dst_interface;
  320. uint32_t nx = src_block->nx;
  321. uint32_t ny = src_block->ny;
  322. uint32_t nz = src_block->nz;
  323. size_t elemsize = src_block->elemsize;
  324. cudaError_t cures;
  325. if ((nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
  326. {
  327. /* Is that a single contiguous buffer ? */
  328. if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
  329. {
  330. cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
  331. nx*ny*nz*elemsize, cudaMemcpyDeviceToHost);
  332. if (STARPU_UNLIKELY(cures))
  333. STARPU_CUDA_REPORT_ERROR(cures);
  334. }
  335. else {
  336. /* Are all plans contiguous */
  337. cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
  338. (char *)src_block->ptr, src_block->ldz*elemsize,
  339. nx*ny*elemsize, nz, cudaMemcpyDeviceToHost);
  340. if (STARPU_UNLIKELY(cures))
  341. STARPU_CUDA_REPORT_ERROR(cures);
  342. }
  343. }
  344. else {
  345. /* Default case: we transfer all lines one by one: ny*nz transfers */
  346. unsigned layer;
  347. for (layer = 0; layer < src_block->nz; layer++)
  348. {
  349. uint8_t *src_ptr = ((uint8_t *)src_block->ptr)
  350. + src_block->ldz*src_block->elemsize;
  351. uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr)
  352. + dst_block->ldz*dst_block->elemsize;
  353. cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
  354. (char *)src_ptr, src_block->ldy*elemsize,
  355. nx*elemsize, ny, cudaMemcpyDeviceToHost);
  356. if (STARPU_UNLIKELY(cures))
  357. STARPU_CUDA_REPORT_ERROR(cures);
  358. }
  359. }
  360. cudaThreadSynchronize();
  361. STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->elemsize*src_block->elemsize);
  362. return 0;
  363. }
  364. static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
  365. {
  366. starpu_block_interface_t *src_block = src_interface;
  367. starpu_block_interface_t *dst_block = dst_interface;
  368. uint32_t nx = src_block->nx;
  369. uint32_t ny = src_block->ny;
  370. uint32_t nz = src_block->nz;
  371. size_t elemsize = src_block->elemsize;
  372. cudaError_t cures;
  373. int ret;
  374. /* We may have a contiguous buffer for the entire block, or contiguous
  375. * plans within the block, we can avoid many small transfers that way */
  376. if ((nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
  377. {
  378. /* Is that a single contiguous buffer ? */
  379. if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
  380. {
  381. cures = cudaMemcpyAsync((char *)dst_block->ptr, (char *)src_block->ptr,
  382. nx*ny*nz*elemsize, cudaMemcpyDeviceToHost, *stream);
  383. if (STARPU_UNLIKELY(cures))
  384. {
  385. cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
  386. nx*ny*nz*elemsize, cudaMemcpyDeviceToHost);
  387. if (STARPU_UNLIKELY(cures))
  388. STARPU_CUDA_REPORT_ERROR(cures);
  389. cudaThreadSynchronize();
  390. ret = 0;
  391. }
  392. else {
  393. ret = EAGAIN;
  394. }
  395. }
  396. else {
  397. /* Are all plans contiguous */
  398. cures = cudaMemcpy2DAsync((char *)dst_block->ptr, dst_block->ldz*elemsize,
  399. (char *)src_block->ptr, src_block->ldz*elemsize,
  400. nx*ny*elemsize, nz, cudaMemcpyDeviceToHost, *stream);
  401. if (STARPU_UNLIKELY(cures))
  402. {
  403. cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
  404. (char *)src_block->ptr, src_block->ldz*elemsize,
  405. nx*ny*elemsize, nz, cudaMemcpyDeviceToHost);
  406. if (STARPU_UNLIKELY(cures))
  407. STARPU_CUDA_REPORT_ERROR(cures);
  408. cudaThreadSynchronize();
  409. ret = 0;
  410. }
  411. else {
  412. ret = EAGAIN;
  413. }
  414. }
  415. }
  416. else {
  417. /* Default case: we transfer all lines one by one: ny*nz transfers */
  418. unsigned layer;
  419. for (layer = 0; layer < src_block->nz; layer++)
  420. {
  421. uint8_t *src_ptr = ((uint8_t *)src_block->ptr)
  422. + src_block->ldz*src_block->elemsize;
  423. uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr)
  424. + dst_block->ldz*dst_block->elemsize;
  425. cures = cudaMemcpy2DAsync((char *)dst_ptr, dst_block->ldy*elemsize,
  426. (char *)src_ptr, src_block->ldy*elemsize,
  427. nx*elemsize, ny, cudaMemcpyDeviceToHost, *stream);
  428. if (STARPU_UNLIKELY(cures))
  429. {
  430. /* I don't know how to do that "better" */
  431. goto no_async_default;
  432. }
  433. }
  434. ret = EAGAIN;
  435. }
  436. STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
  437. return ret;
  438. no_async_default:
  439. {
  440. unsigned layer;
  441. for (layer = 0; layer < src_block->nz; layer++)
  442. {
  443. uint8_t *src_ptr = ((uint8_t *)src_block->ptr)
  444. + src_block->ldz*src_block->elemsize;
  445. uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr)
  446. + dst_block->ldz*dst_block->elemsize;
  447. cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
  448. (char *)src_ptr, src_block->ldy*elemsize,
  449. nx*elemsize, ny, cudaMemcpyDeviceToHost);
  450. if (STARPU_UNLIKELY(cures))
  451. STARPU_CUDA_REPORT_ERROR(cures);
  452. }
  453. cudaThreadSynchronize();
  454. STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
  455. return 0;
  456. }
  457. }
  458. static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
  459. {
  460. starpu_block_interface_t *src_block = src_interface;
  461. starpu_block_interface_t *dst_block = dst_interface;
  462. uint32_t nx = src_block->nx;
  463. uint32_t ny = src_block->ny;
  464. uint32_t nz = src_block->nz;
  465. size_t elemsize = src_block->elemsize;
  466. cudaError_t cures;
  467. int ret;
  468. /* We may have a contiguous buffer for the entire block, or contiguous
  469. * plans within the block, we can avoid many small transfers that way */
  470. if ((nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
  471. {
  472. /* Is that a single contiguous buffer ? */
  473. if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
  474. {
  475. cures = cudaMemcpyAsync((char *)dst_block->ptr, (char *)src_block->ptr,
  476. nx*ny*nz*elemsize, cudaMemcpyHostToDevice, *stream);
  477. if (STARPU_UNLIKELY(cures))
  478. {
  479. cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
  480. nx*ny*nz*elemsize, cudaMemcpyHostToDevice);
  481. if (STARPU_UNLIKELY(cures))
  482. STARPU_CUDA_REPORT_ERROR(cures);
  483. cudaThreadSynchronize();
  484. ret = 0;
  485. }
  486. else {
  487. ret = EAGAIN;
  488. }
  489. }
  490. else {
  491. /* Are all plans contiguous */
  492. cures = cudaMemcpy2DAsync((char *)dst_block->ptr, dst_block->ldz*elemsize,
  493. (char *)src_block->ptr, src_block->ldz*elemsize,
  494. nx*ny*elemsize, nz, cudaMemcpyHostToDevice, *stream);
  495. if (STARPU_UNLIKELY(cures))
  496. {
  497. cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
  498. (char *)src_block->ptr, src_block->ldz*elemsize,
  499. nx*ny*elemsize, nz, cudaMemcpyHostToDevice);
  500. if (STARPU_UNLIKELY(cures))
  501. STARPU_CUDA_REPORT_ERROR(cures);
  502. cudaThreadSynchronize();
  503. ret = 0;
  504. }
  505. else {
  506. ret = EAGAIN;
  507. }
  508. }
  509. }
  510. else {
  511. /* Default case: we transfer all lines one by one: ny*nz transfers */
  512. unsigned layer;
  513. for (layer = 0; layer < src_block->nz; layer++)
  514. {
  515. uint8_t *src_ptr = ((uint8_t *)src_block->ptr)
  516. + src_block->ldz*src_block->elemsize;
  517. uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr)
  518. + dst_block->ldz*dst_block->elemsize;
  519. cures = cudaMemcpy2DAsync((char *)dst_ptr, dst_block->ldy*elemsize,
  520. (char *)src_ptr, src_block->ldy*elemsize,
  521. nx*elemsize, ny, cudaMemcpyHostToDevice, *stream);
  522. if (STARPU_UNLIKELY(cures))
  523. {
  524. /* I don't know how to do that "better" */
  525. goto no_async_default;
  526. }
  527. }
  528. ret = EAGAIN;
  529. }
  530. STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
  531. return ret;
  532. no_async_default:
  533. {
  534. unsigned layer;
  535. for (layer = 0; layer < src_block->nz; layer++)
  536. {
  537. uint8_t *src_ptr = ((uint8_t *)src_block->ptr)
  538. + src_block->ldz*src_block->elemsize;
  539. uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr)
  540. + dst_block->ldz*dst_block->elemsize;
  541. cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
  542. (char *)src_ptr, src_block->ldy*elemsize,
  543. nx*elemsize, ny, cudaMemcpyHostToDevice);
  544. if (STARPU_UNLIKELY(cures))
  545. STARPU_CUDA_REPORT_ERROR(cures);
  546. }
  547. cudaThreadSynchronize();
  548. STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
  549. return 0;
  550. }
  551. }
  552. static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
  553. {
  554. cudaError_t cures;
  555. starpu_block_interface_t *src_block = src_interface;
  556. starpu_block_interface_t *dst_block = dst_interface;
  557. uint32_t nx = src_block->nx;
  558. uint32_t ny = src_block->ny;
  559. uint32_t nz = src_block->nz;
  560. size_t elemsize = src_block->elemsize;
  561. if ((src_block->nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
  562. {
  563. /* we are lucky */
  564. cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
  565. nx*ny*nz*elemsize, cudaMemcpyHostToDevice);
  566. if (STARPU_UNLIKELY(cures))
  567. STARPU_CUDA_REPORT_ERROR(cures);
  568. }
  569. else {
  570. unsigned layer;
  571. for (layer = 0; layer < src_block->nz; layer++)
  572. {
  573. uint8_t *src_ptr = ((uint8_t *)src_block->ptr)
  574. + src_block->ldz*src_block->elemsize;
  575. uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr)
  576. + dst_block->ldz*dst_block->elemsize;
  577. cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
  578. (char *)src_ptr, src_block->ldy*elemsize,
  579. nx*elemsize, ny, cudaMemcpyHostToDevice);
  580. if (STARPU_UNLIKELY(cures))
  581. STARPU_CUDA_REPORT_ERROR(cures);
  582. }
  583. }
  584. cudaThreadSynchronize();
  585. STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
  586. return 0;
  587. }
  588. #endif // STARPU_USE_CUDA
  589. #ifdef STARPU_USE_OPENCL
  590. static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event)
  591. {
  592. starpu_block_interface_t *src_block = src_interface;
  593. starpu_block_interface_t *dst_block = dst_interface;
  594. int err,ret;
  595. /* XXX non contiguous buffers are not properly supported yet. (TODO) */
  596. STARPU_ASSERT((src_block->nx == src_block->ldy) && (src_block->ldy == dst_block->ldy));
  597. err = _starpu_opencl_copy_to_opencl_async_sync((void*)src_block->ptr, (cl_mem)dst_block->dev_handle,
  598. src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
  599. dst_block->offset, (cl_event*)_event, &ret);
  600. if (STARPU_UNLIKELY(err))
  601. STARPU_OPENCL_REPORT_ERROR(err);
  602. STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
  603. return ret;
  604. }
  605. static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event)
  606. {
  607. starpu_block_interface_t *src_block = src_interface;
  608. starpu_block_interface_t *dst_block = dst_interface;
  609. int err,ret=EAGAIN;
  610. /* XXX non contiguous buffers are not properly supported yet. (TODO) */
  611. STARPU_ASSERT((src_block->nx == src_block->ldy) && (src_block->ldy == dst_block->ldy));
  612. err = _starpu_opencl_copy_from_opencl((cl_mem)src_block->dev_handle, (void*)dst_block->ptr,
  613. src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
  614. src_block->offset, (cl_event*)_event);
  615. if (STARPU_UNLIKELY(err)) {
  616. if (_event) {
  617. err = _starpu_opencl_copy_from_opencl((cl_mem)src_block->dev_handle, (void*)dst_block->ptr,
  618. src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
  619. src_block->offset, NULL);
  620. ret = 0;
  621. }
  622. if (STARPU_UNLIKELY(err))
  623. STARPU_OPENCL_REPORT_ERROR(err);
  624. }
  625. STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
  626. return ret;
  627. }
  /*
   * Host-to-OpenCL copy without an event: delegates to
   * copy_ram_to_opencl_async() with a NULL event, discards its return value
   * and always returns 0.
   */
  static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
  {
      (void)copy_ram_to_opencl_async(src_interface, src_node,
              dst_interface, dst_node, NULL);

      return 0;
  }
  /*
   * OpenCL-to-host copy without an event: delegates to
   * copy_opencl_to_ram_async() with a NULL event, discards its return value
   * and always returns 0.
   */
  static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
  {
      (void)copy_opencl_to_ram_async(src_interface, src_node,
              dst_interface, dst_node, NULL);

      return 0;
  }
  638. #endif
  639. /* as not all platform easily have a BLAS lib installed ... */
  640. static int dummy_copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
  641. {
  642. starpu_block_interface_t *src_block = src_interface;
  643. starpu_block_interface_t *dst_block = dst_interface;
  644. uint32_t nx = dst_block->nx;
  645. uint32_t ny = dst_block->ny;
  646. uint32_t nz = dst_block->nz;
  647. size_t elemsize = dst_block->elemsize;
  648. uint32_t ldy_src = src_block->ldy;
  649. uint32_t ldz_src = src_block->ldz;
  650. uint32_t ldy_dst = dst_block->ldy;
  651. uint32_t ldz_dst = dst_block->ldz;
  652. uintptr_t ptr_src = src_block->ptr;
  653. uintptr_t ptr_dst = dst_block->ptr;
  654. unsigned y, z;
  655. for (z = 0; z < nz; z++)
  656. for (y = 0; y < ny; y++)
  657. {
  658. uint32_t src_offset = (y*ldy_src + y*z*ldz_src)*elemsize;
  659. uint32_t dst_offset = (y*ldy_dst + y*z*ldz_dst)*elemsize;
  660. memcpy((void *)(ptr_dst + dst_offset),
  661. (void *)(ptr_src + src_offset), nx*elemsize);
  662. }
  663. STARPU_TRACE_DATA_COPY(src_node, dst_node, nx*ny*nz*elemsize);
  664. return 0;
  665. }