block_interface.c

/*
 * StarPU
 * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * See the GNU Lesser General Public License in COPYING.LGPL for more details.
 */

#include <starpu.h>
#include <common/config.h>
#include <datawizard/coherency.h>
#include <datawizard/copy_driver.h>
#include <datawizard/filters.h>
#include <common/hash.h>
#include <starpu_cuda.h>
#include <starpu_opencl.h>
#include <drivers/opencl/driver_opencl.h>

static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
#ifdef STARPU_USE_CUDA
static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
#endif
#ifdef STARPU_USE_OPENCL
static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event);
static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event);
#endif

static const struct starpu_data_copy_methods block_copy_data_methods_s = {
	.ram_to_ram = copy_ram_to_ram,
	.ram_to_spu = NULL,
#ifdef STARPU_USE_CUDA
	.ram_to_cuda = copy_ram_to_cuda,
	.cuda_to_ram = copy_cuda_to_ram,
	.ram_to_cuda_async = copy_ram_to_cuda_async,
	.cuda_to_ram_async = copy_cuda_to_ram_async,
#endif
#ifdef STARPU_USE_OPENCL
	.ram_to_opencl = copy_ram_to_opencl,
	.opencl_to_ram = copy_opencl_to_ram,
	.ram_to_opencl_async = copy_ram_to_opencl_async,
	.opencl_to_ram_async = copy_opencl_to_ram_async,
#endif
	.cuda_to_cuda = NULL,
	.cuda_to_spu = NULL,
	.spu_to_ram = NULL,
	.spu_to_cuda = NULL,
	.spu_to_spu = NULL
};
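
/* The methods left NULL above (cuda_to_cuda and the SPU paths) are transfer
 * directions for which this interface provides no direct copy routine. */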
static void register_block_handle(starpu_data_handle handle, uint32_t home_node, void *interface);
static ssize_t allocate_block_buffer_on_node(void *interface_, uint32_t dst_node);
static void free_block_buffer_on_node(void *interface, uint32_t node);
static size_t block_interface_get_size(starpu_data_handle handle);
static uint32_t footprint_block_interface_crc32(starpu_data_handle handle);
static int block_compare(void *interface_a, void *interface_b);
static void display_block_interface(starpu_data_handle handle, FILE *f);
#ifdef STARPU_USE_GORDON
static int convert_block_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t *ss);
#endif

static struct starpu_data_interface_ops_t interface_block_ops = {
	.register_data_handle = register_block_handle,
	.allocate_data_on_node = allocate_block_buffer_on_node,
	.free_data_on_node = free_block_buffer_on_node,
	.copy_methods = &block_copy_data_methods_s,
	.get_size = block_interface_get_size,
	.footprint = footprint_block_interface_crc32,
	.compare = block_compare,
#ifdef STARPU_USE_GORDON
	.convert_to_gordon = convert_block_to_gordon,
#endif
	.interfaceid = STARPU_BLOCK_INTERFACE_ID,
	.interface_size = sizeof(starpu_block_interface_t),
	.display = display_block_interface
};

#ifdef STARPU_USE_GORDON
int convert_block_to_gordon(void *interface, uint64_t *ptr, gordon_strideSize_t *ss)
{
	/* TODO */
	STARPU_ABORT();

	return 0;
}
#endif
static void register_block_handle(starpu_data_handle handle, uint32_t home_node, void *interface)
{
	starpu_block_interface_t *block_interface = interface;

	unsigned node;
	for (node = 0; node < STARPU_MAXNODES; node++)
	{
		starpu_block_interface_t *local_interface =
			starpu_data_get_interface_on_node(handle, node);

		if (node == home_node) {
			local_interface->ptr = block_interface->ptr;
			local_interface->dev_handle = block_interface->dev_handle;
			local_interface->offset = block_interface->offset;
			local_interface->ldy = block_interface->ldy;
			local_interface->ldz = block_interface->ldz;
		}
		else {
			local_interface->ptr = 0;
			local_interface->dev_handle = 0;
			local_interface->offset = 0;
			local_interface->ldy = 0;
			local_interface->ldz = 0;
		}

		local_interface->nx = block_interface->nx;
		local_interface->ny = block_interface->ny;
		local_interface->nz = block_interface->nz;
		local_interface->elemsize = block_interface->elemsize;
	}
}
/* declare a new data with the BLOCK interface */
void starpu_block_data_register(starpu_data_handle *handleptr, uint32_t home_node,
			uintptr_t ptr, uint32_t ldy, uint32_t ldz, uint32_t nx,
			uint32_t ny, uint32_t nz, size_t elemsize)
{
	starpu_block_interface_t interface = {
		.ptr = ptr,
		.dev_handle = ptr,
		.offset = 0,
		.ldy = ldy,
		.ldz = ldz,
		.nx = nx,
		.ny = ny,
		.nz = nz,
		.elemsize = elemsize
	};

	starpu_data_register(handleptr, home_node, &interface, &interface_block_ops);
}
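
/* Usage sketch (illustration only, not part of the original file):
 * registering a contiguous nx*ny*nz block of floats that lives in main
 * memory (memory node 0). For a dense layout the leading dimensions are
 * ldy = nx and ldz = nx*ny:
 *
 *	float *block = malloc(nx*ny*nz*sizeof(float));
 *	starpu_data_handle handle;
 *	starpu_block_data_register(&handle, 0, (uintptr_t)block,
 *			nx, nx*ny, nx, ny, nz, sizeof(float));
 */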
static uint32_t footprint_block_interface_crc32(starpu_data_handle handle)
{
	uint32_t hash;

	hash = _starpu_crc32_be(starpu_block_get_nx(handle), 0);
	hash = _starpu_crc32_be(starpu_block_get_ny(handle), hash);
	hash = _starpu_crc32_be(starpu_block_get_nz(handle), hash);

	return hash;
}
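
/* Note that only the three dimensions are hashed, not elemsize: two blocks
 * with equal nx/ny/nz but different element sizes share a footprint and are
 * only told apart by block_compare below, which does check elemsize. */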
static int block_compare(void *interface_a, void *interface_b)
{
	starpu_block_interface_t *block_a = interface_a;
	starpu_block_interface_t *block_b = interface_b;

	/* Two blocks are considered compatible if they have the same size */
	return ((block_a->nx == block_b->nx)
			&& (block_a->ny == block_b->ny)
			&& (block_a->nz == block_b->nz)
			&& (block_a->elemsize == block_b->elemsize));
}
static void display_block_interface(starpu_data_handle handle, FILE *f)
{
	starpu_block_interface_t *interface;
	interface = starpu_data_get_interface_on_node(handle, 0);

	fprintf(f, "%u\t%u\t%u\t", interface->nx, interface->ny, interface->nz);
}

static size_t block_interface_get_size(starpu_data_handle handle)
{
	size_t size;
	starpu_block_interface_t *interface;

	interface = starpu_data_get_interface_on_node(handle, 0);

	size = interface->nx*interface->ny*interface->nz*interface->elemsize;

	return size;
}
/* offer access to the data parameters */
uint32_t starpu_block_get_nx(starpu_data_handle handle)
{
	starpu_block_interface_t *interface =
		starpu_data_get_interface_on_node(handle, 0);

	return interface->nx;
}

uint32_t starpu_block_get_ny(starpu_data_handle handle)
{
	starpu_block_interface_t *interface =
		starpu_data_get_interface_on_node(handle, 0);

	return interface->ny;
}

uint32_t starpu_block_get_nz(starpu_data_handle handle)
{
	starpu_block_interface_t *interface =
		starpu_data_get_interface_on_node(handle, 0);

	return interface->nz;
}

uint32_t starpu_block_get_local_ldy(starpu_data_handle handle)
{
	unsigned node;
	node = _starpu_get_local_memory_node();

	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));

	starpu_block_interface_t *interface =
		starpu_data_get_interface_on_node(handle, node);

	return interface->ldy;
}

uint32_t starpu_block_get_local_ldz(starpu_data_handle handle)
{
	unsigned node;
	node = _starpu_get_local_memory_node();

	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));

	starpu_block_interface_t *interface =
		starpu_data_get_interface_on_node(handle, node);

	return interface->ldz;
}

uintptr_t starpu_block_get_local_ptr(starpu_data_handle handle)
{
	unsigned node;
	node = _starpu_get_local_memory_node();

	STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));

	starpu_block_interface_t *interface =
		starpu_data_get_interface_on_node(handle, node);

	return interface->ptr;
}

size_t starpu_block_get_elemsize(starpu_data_handle handle)
{
	starpu_block_interface_t *interface =
		starpu_data_get_interface_on_node(handle, 0);

	return interface->elemsize;
}
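
/* Addressing sketch (illustration only): within a worker, element (x, y, z)
 * of a locally allocated block is reached through the local leading
 * dimensions, matching the offset computation used by the copy methods
 * below:
 *
 *	uintptr_t ptr = starpu_block_get_local_ptr(handle);
 *	uint32_t ldy = starpu_block_get_local_ldy(handle);
 *	uint32_t ldz = starpu_block_get_local_ldz(handle);
 *	size_t elemsize = starpu_block_get_elemsize(handle);
 *	void *elem = (char *)ptr + (x + y*ldy + z*ldz)*elemsize;
 */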
/* memory allocation/deallocation primitives for the BLOCK interface */

/* returns the size of the allocated area */
static ssize_t allocate_block_buffer_on_node(void *interface_, uint32_t dst_node)
{
	uintptr_t addr = 0;
	unsigned fail = 0;
	ssize_t allocated_memory;

#ifdef STARPU_USE_CUDA
	cudaError_t status;
#endif

	starpu_block_interface_t *dst_block = interface_;

	uint32_t nx = dst_block->nx;
	uint32_t ny = dst_block->ny;
	uint32_t nz = dst_block->nz;
	size_t elemsize = dst_block->elemsize;

	starpu_node_kind kind = _starpu_get_node_kind(dst_node);

	switch(kind) {
		case STARPU_CPU_RAM:
			addr = (uintptr_t)malloc(nx*ny*nz*elemsize);
			if (!addr)
				fail = 1;
			break;
#ifdef STARPU_USE_CUDA
		case STARPU_CUDA_RAM:
			status = cudaMalloc((void **)&addr, nx*ny*nz*elemsize);
			//_STARPU_DEBUG("cudaMalloc -> addr %p\n", addr);
			if (!addr || status != cudaSuccess)
			{
				/* only report unexpected errors: running out of
				 * memory is signalled to the caller by the
				 * -ENOMEM return value below */
				if (STARPU_UNLIKELY(status != cudaErrorMemoryAllocation))
					STARPU_CUDA_REPORT_ERROR(status);

				fail = 1;
			}
			break;
#endif
#ifdef STARPU_USE_OPENCL
		case STARPU_OPENCL_RAM:
		{
			int ret;
			void *ptr;
			ret = _starpu_opencl_allocate_memory(&ptr, nx*ny*nz*elemsize, CL_MEM_READ_WRITE);
			addr = (uintptr_t)ptr;
			if (ret) {
				fail = 1;
			}
			break;
		}
#endif
		default:
			assert(0);
	}

	if (!fail) {
		/* allocation succeeded */
		allocated_memory = nx*ny*nz*elemsize;

		/* update the interface description accordingly: a freshly
		 * allocated buffer is dense, hence ldy = nx and ldz = nx*ny */
		dst_block->ptr = addr;
		dst_block->dev_handle = addr;
		dst_block->offset = 0;
		dst_block->ldy = nx;
		dst_block->ldz = nx*ny;
	} else {
		/* allocation failed */
		allocated_memory = -ENOMEM;
	}

	return allocated_memory;
}
static void free_block_buffer_on_node(void *interface, uint32_t node)
{
	starpu_block_interface_t *block_interface = interface;

#ifdef STARPU_USE_CUDA
	cudaError_t status;
#endif

	starpu_node_kind kind = _starpu_get_node_kind(node);
	switch(kind) {
		case STARPU_CPU_RAM:
			free((void*)block_interface->ptr);
			break;
#ifdef STARPU_USE_CUDA
		case STARPU_CUDA_RAM:
			status = cudaFree((void*)block_interface->ptr);
			if (STARPU_UNLIKELY(status))
				STARPU_CUDA_REPORT_ERROR(status);
			break;
#endif
#ifdef STARPU_USE_OPENCL
		case STARPU_OPENCL_RAM:
			clReleaseMemObject((cl_mem)block_interface->ptr);
			break;
#endif
		default:
			assert(0);
	}
}
#ifdef STARPU_USE_CUDA
static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
{
	starpu_block_interface_t *src_block = src_interface;
	starpu_block_interface_t *dst_block = dst_interface;

	uint32_t nx = src_block->nx;
	uint32_t ny = src_block->ny;
	uint32_t nz = src_block->nz;
	size_t elemsize = src_block->elemsize;

	cudaError_t cures;

	if ((nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
	{
		/* Is this a single contiguous buffer? */
		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
		{
			cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
					nx*ny*nz*elemsize, cudaMemcpyDeviceToHost);
			if (STARPU_UNLIKELY(cures))
				STARPU_CUDA_REPORT_ERROR(cures);
		}
		else {
			/* All planes are contiguous */
			cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
					(char *)src_block->ptr, src_block->ldz*elemsize,
					nx*ny*elemsize, nz, cudaMemcpyDeviceToHost);
			if (STARPU_UNLIKELY(cures))
				STARPU_CUDA_REPORT_ERROR(cures);
		}
	}
	else {
		/* Default case: we transfer all lines one by one: ny*nz transfers */
		unsigned layer;
		for (layer = 0; layer < src_block->nz; layer++)
		{
			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;

			cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
					(char *)src_ptr, src_block->ldy*elemsize,
					nx*elemsize, ny, cudaMemcpyDeviceToHost);

			if (STARPU_UNLIKELY(cures))
				STARPU_CUDA_REPORT_ERROR(cures);
		}
	}

	cudaThreadSynchronize();

	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);

	return 0;
}
static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
{
	starpu_block_interface_t *src_block = src_interface;
	starpu_block_interface_t *dst_block = dst_interface;

	uint32_t nx = src_block->nx;
	uint32_t ny = src_block->ny;
	uint32_t nz = src_block->nz;
	size_t elemsize = src_block->elemsize;

	cudaError_t cures;

	int ret;

	/* We may have a contiguous buffer for the entire block, or contiguous
	 * planes within the block: either way we can avoid many small transfers */
	if ((nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
	{
		/* Is this a single contiguous buffer? */
		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
		{
			cures = cudaMemcpyAsync((char *)dst_block->ptr, (char *)src_block->ptr,
					nx*ny*nz*elemsize, cudaMemcpyDeviceToHost, *stream);
			if (STARPU_UNLIKELY(cures))
			{
				/* the asynchronous copy was refused, fall back
				 * to a synchronous one */
				cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
						nx*ny*nz*elemsize, cudaMemcpyDeviceToHost);
				if (STARPU_UNLIKELY(cures))
					STARPU_CUDA_REPORT_ERROR(cures);

				cudaThreadSynchronize();

				ret = 0;
			}
			else {
				ret = -EAGAIN;
			}
		}
		else {
			/* All planes are contiguous */
			cures = cudaMemcpy2DAsync((char *)dst_block->ptr, dst_block->ldz*elemsize,
					(char *)src_block->ptr, src_block->ldz*elemsize,
					nx*ny*elemsize, nz, cudaMemcpyDeviceToHost, *stream);
			if (STARPU_UNLIKELY(cures))
			{
				cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
						(char *)src_block->ptr, src_block->ldz*elemsize,
						nx*ny*elemsize, nz, cudaMemcpyDeviceToHost);
				if (STARPU_UNLIKELY(cures))
					STARPU_CUDA_REPORT_ERROR(cures);

				cudaThreadSynchronize();

				ret = 0;
			}
			else {
				ret = -EAGAIN;
			}
		}
	}
	else {
		/* Default case: we transfer all lines one by one: ny*nz transfers */
		unsigned layer;
		for (layer = 0; layer < src_block->nz; layer++)
		{
			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;

			cures = cudaMemcpy2DAsync((char *)dst_ptr, dst_block->ldy*elemsize,
					(char *)src_ptr, src_block->ldy*elemsize,
					nx*elemsize, ny, cudaMemcpyDeviceToHost, *stream);

			if (STARPU_UNLIKELY(cures))
			{
				/* fall back to a fully synchronous copy */
				goto no_async_default;
			}
		}

		ret = -EAGAIN;
	}

	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);

	return ret;

no_async_default:
	{
	unsigned layer;
	for (layer = 0; layer < src_block->nz; layer++)
	{
		uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
		uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;

		cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
				(char *)src_ptr, src_block->ldy*elemsize,
				nx*elemsize, ny, cudaMemcpyDeviceToHost);

		if (STARPU_UNLIKELY(cures))
			STARPU_CUDA_REPORT_ERROR(cures);
	}

	cudaThreadSynchronize();

	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
	return 0;
	}
}
static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
{
	starpu_block_interface_t *src_block = src_interface;
	starpu_block_interface_t *dst_block = dst_interface;

	uint32_t nx = src_block->nx;
	uint32_t ny = src_block->ny;
	uint32_t nz = src_block->nz;
	size_t elemsize = src_block->elemsize;

	cudaError_t cures;

	int ret;

	/* We may have a contiguous buffer for the entire block, or contiguous
	 * planes within the block: either way we can avoid many small transfers */
	if ((nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
	{
		/* Is this a single contiguous buffer? */
		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
		{
			cures = cudaMemcpyAsync((char *)dst_block->ptr, (char *)src_block->ptr,
					nx*ny*nz*elemsize, cudaMemcpyHostToDevice, *stream);
			if (STARPU_UNLIKELY(cures))
			{
				/* the asynchronous copy was refused, fall back
				 * to a synchronous one */
				cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
						nx*ny*nz*elemsize, cudaMemcpyHostToDevice);
				if (STARPU_UNLIKELY(cures))
					STARPU_CUDA_REPORT_ERROR(cures);

				cudaThreadSynchronize();

				ret = 0;
			}
			else {
				ret = -EAGAIN;
			}
		}
		else {
			/* All planes are contiguous */
			cures = cudaMemcpy2DAsync((char *)dst_block->ptr, dst_block->ldz*elemsize,
					(char *)src_block->ptr, src_block->ldz*elemsize,
					nx*ny*elemsize, nz, cudaMemcpyHostToDevice, *stream);
			if (STARPU_UNLIKELY(cures))
			{
				cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
						(char *)src_block->ptr, src_block->ldz*elemsize,
						nx*ny*elemsize, nz, cudaMemcpyHostToDevice);
				if (STARPU_UNLIKELY(cures))
					STARPU_CUDA_REPORT_ERROR(cures);

				cudaThreadSynchronize();

				ret = 0;
			}
			else {
				ret = -EAGAIN;
			}
		}
	}
	else {
		/* Default case: we transfer all lines one by one: ny*nz transfers */
		unsigned layer;
		for (layer = 0; layer < src_block->nz; layer++)
		{
			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;

			cures = cudaMemcpy2DAsync((char *)dst_ptr, dst_block->ldy*elemsize,
					(char *)src_ptr, src_block->ldy*elemsize,
					nx*elemsize, ny, cudaMemcpyHostToDevice, *stream);

			if (STARPU_UNLIKELY(cures))
			{
				/* fall back to a fully synchronous copy */
				goto no_async_default;
			}
		}

		ret = -EAGAIN;
	}

	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);

	return ret;

no_async_default:
	{
	unsigned layer;
	for (layer = 0; layer < src_block->nz; layer++)
	{
		uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
		uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;

		cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
				(char *)src_ptr, src_block->ldy*elemsize,
				nx*elemsize, ny, cudaMemcpyHostToDevice);

		if (STARPU_UNLIKELY(cures))
			STARPU_CUDA_REPORT_ERROR(cures);
	}

	cudaThreadSynchronize();

	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
	return 0;
	}
}
static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
{
	starpu_block_interface_t *src_block = src_interface;
	starpu_block_interface_t *dst_block = dst_interface;

	uint32_t nx = src_block->nx;
	uint32_t ny = src_block->ny;
	uint32_t nz = src_block->nz;
	size_t elemsize = src_block->elemsize;

	cudaError_t cures;

	/* We may have a contiguous buffer for the entire block, or contiguous
	 * planes within the block: either way we can avoid many small transfers */
	if ((nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
	{
		/* Is this a single contiguous buffer? */
		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
		{
			cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
					nx*ny*nz*elemsize, cudaMemcpyHostToDevice);
		}
		else {
			/* All planes are contiguous */
			cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
					(char *)src_block->ptr, src_block->ldz*elemsize,
					nx*ny*elemsize, nz, cudaMemcpyHostToDevice);
		}

		if (STARPU_UNLIKELY(cures))
			STARPU_CUDA_REPORT_ERROR(cures);
	}
	else {
		/* Default case: we transfer all lines one by one: ny*nz transfers */
		unsigned layer;
		for (layer = 0; layer < src_block->nz; layer++)
		{
			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;

			cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
					(char *)src_ptr, src_block->ldy*elemsize,
					nx*elemsize, ny, cudaMemcpyHostToDevice);

			if (STARPU_UNLIKELY(cures))
				STARPU_CUDA_REPORT_ERROR(cures);
		}
	}

	cudaThreadSynchronize();

	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);

	return 0;
}
#endif // STARPU_USE_CUDA
#ifdef STARPU_USE_OPENCL
static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event)
{
	starpu_block_interface_t *src_block = src_interface;
	starpu_block_interface_t *dst_block = dst_interface;
	int err, ret;

	uint32_t nx = src_block->nx;
	uint32_t ny = src_block->ny;

	/* We may have a contiguous buffer for the entire block, or contiguous
	 * planes within the block: either way we can avoid many small transfers */
	if ((nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
	{
		/* Is this a single contiguous buffer? */
		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
		{
			err = _starpu_opencl_copy_ram_to_opencl_async_sync((void*)src_block->ptr, (cl_mem)dst_block->dev_handle,
					src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
					dst_block->offset, (cl_event*)_event, &ret);
			if (STARPU_UNLIKELY(err))
				STARPU_OPENCL_REPORT_ERROR(err);
		}
		else {
			/* All planes are contiguous */
			/* XXX non-contiguous buffers are not properly supported yet. (TODO) */
			STARPU_ASSERT(0);
		}
	}
	else {
		/* Default case: we transfer all lines one by one: ny*nz transfers */
		unsigned layer;
		for (layer = 0; layer < src_block->nz; layer++)
		{
			unsigned j;
			for (j = 0; j < src_block->ny; j++) {
				void *ptr = (void *)(src_block->ptr + (layer*src_block->ldz + j*src_block->ldy)*src_block->elemsize);
				err = _starpu_opencl_copy_ram_to_opencl(ptr, (cl_mem)dst_block->dev_handle,
						src_block->nx*src_block->elemsize,
						layer*dst_block->ldz*dst_block->elemsize + j*dst_block->ldy*dst_block->elemsize
						+ dst_block->offset, NULL);
				if (STARPU_UNLIKELY(err))
					STARPU_OPENCL_REPORT_ERROR(err);
			}

//			int *foo = (int *)(src_block->ptr+(layer*src_block->ldz*src_block->elemsize));
//			fprintf(stderr, "layer %d --> value %d\n", layer, foo[1]);
//			const size_t buffer_origin[3] = {layer*src_block->ldz*src_block->elemsize, 0, 0};
//			//const size_t buffer_origin[3] = {0, 0, 0};
//			const size_t host_origin[3] = {layer*dst_block->ldz*dst_block->elemsize+dst_block->offset, 0, 0};
//			size_t region[3] = {src_block->nx*src_block->elemsize, src_block->ny, 1};
//			size_t buffer_row_pitch = region[0];
//			size_t buffer_slice_pitch = region[1] * buffer_row_pitch;
//			size_t host_row_pitch = region[0];
//			size_t host_slice_pitch = region[1] * host_row_pitch;
//
//			_starpu_opencl_copy_rect_ram_to_opencl((void *)src_block->ptr, (cl_mem)dst_block->dev_handle,
//					buffer_origin, host_origin, region,
//					buffer_row_pitch, buffer_slice_pitch,
//					host_row_pitch, host_slice_pitch, NULL);
		}

		/* the line-by-line copies above are synchronous */
		ret = 0;
	}

	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);

	return ret;
}
static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), void *_event)
{
	starpu_block_interface_t *src_block = src_interface;
	starpu_block_interface_t *dst_block = dst_interface;
	int err, ret;

	/* We may have a contiguous buffer for the entire block, or contiguous
	 * planes within the block: either way we can avoid many small transfers */
	if ((src_block->nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
	{
		/* Is this a single contiguous buffer? */
		if (((src_block->nx*src_block->ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
		{
			err = _starpu_opencl_copy_opencl_to_ram_async_sync((cl_mem)src_block->dev_handle, (void*)dst_block->ptr,
					src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
					src_block->offset, (cl_event*)_event, &ret);
			if (STARPU_UNLIKELY(err))
				STARPU_OPENCL_REPORT_ERROR(err);
		}
		else {
			/* All planes are contiguous */
			/* XXX non-contiguous buffers are not properly supported yet. (TODO) */
			STARPU_ASSERT(0);
		}
	}
	else {
		/* Default case: we transfer all lines one by one: ny*nz transfers */
		/* XXX non-contiguous buffers are not properly supported yet. (TODO) */
		unsigned layer;
		for (layer = 0; layer < src_block->nz; layer++)
		{
			unsigned j;
			for (j = 0; j < src_block->ny; j++) {
				void *ptr = (void *)(dst_block->ptr + (layer*dst_block->ldz + j*dst_block->ldy)*dst_block->elemsize);
				err = _starpu_opencl_copy_opencl_to_ram((cl_mem)src_block->dev_handle, ptr,
						src_block->nx*src_block->elemsize,
						layer*src_block->ldz*src_block->elemsize + j*src_block->ldy*src_block->elemsize
						+ src_block->offset, NULL);
				if (STARPU_UNLIKELY(err))
					STARPU_OPENCL_REPORT_ERROR(err);
			}

//			const size_t buffer_origin[3] = {src_block->offset, 0, 0};
//			const size_t host_origin[3] = {layer*src_block->ldz*src_block->elemsize, 0, 0};
//			size_t region[3] = {src_block->nx*src_block->elemsize, src_block->ny, 1};
//			size_t buffer_row_pitch = region[0];
//			size_t buffer_slice_pitch = region[1] * buffer_row_pitch;
//			size_t host_row_pitch = region[0];
//			size_t host_slice_pitch = region[1] * host_row_pitch;
//
//			_starpu_opencl_copy_rect_opencl_to_ram((cl_mem)src_block->dev_handle, (void *)dst_block->ptr,
//					buffer_origin, host_origin, region,
//					buffer_row_pitch, buffer_slice_pitch,
//					host_row_pitch, host_slice_pitch, NULL);
		}

		/* the line-by-line copies above are synchronous */
		ret = 0;
	}

	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);

	return ret;
}
static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
{
	return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
}

static int copy_opencl_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
{
	return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
}
#endif
/* as not all platforms easily have a BLAS library installed ... */
static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
{
	starpu_block_interface_t *src_block = src_interface;
	starpu_block_interface_t *dst_block = dst_interface;

	uint32_t nx = dst_block->nx;
	uint32_t ny = dst_block->ny;
	uint32_t nz = dst_block->nz;
	size_t elemsize = dst_block->elemsize;

	uint32_t ldy_src = src_block->ldy;
	uint32_t ldz_src = src_block->ldz;
	uint32_t ldy_dst = dst_block->ldy;
	uint32_t ldz_dst = dst_block->ldz;

	uintptr_t ptr_src = src_block->ptr;
	uintptr_t ptr_dst = dst_block->ptr;

	unsigned y, z;
	for (z = 0; z < nz; z++)
	for (y = 0; y < ny; y++)
	{
		uint32_t src_offset = (y*ldy_src + z*ldz_src)*elemsize;
		uint32_t dst_offset = (y*ldy_dst + z*ldz_dst)*elemsize;

		memcpy((void *)(ptr_dst + dst_offset),
			(void *)(ptr_src + src_offset), nx*elemsize);
	}

	STARPU_TRACE_DATA_COPY(src_node, dst_node, nx*ny*nz*elemsize);

	return 0;
}