implicit-stencil-kernels.c
/* StarPU --- Runtime system for heterogeneous multicore architectures.
 *
 * Copyright (C) 2010-2015  Université de Bordeaux
 * Copyright (C) 2012, 2013, 2016, 2017  CNRS
 *
 * StarPU is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * StarPU is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * See the GNU Lesser General Public License in COPYING.LGPL for more details.
 */
#include "implicit-stencil.h"

/* Computation Kernels */

/*
 * There are three codelets:
 *
 * - cl_update, which takes a block and the boundaries of its neighbours,
 *   loads the boundaries into the block and performs some update loops:
 *
 *   comp. buffer   save. buffers   comp. buffer   save. buffers   comp. buffer
 *   | ... |
 *   | | +------------------+ +------------------+
 *   | #N+1 | | #N+1 bottom copy====>#N+1 bottom copy |
 *   +-------------+ +------------------+ +------------------+
 *   | #N top copy | | #N top copy | | |
 *   +-------------+ +------------------+ | |
 *   | #N |
 *   ...
 *   | | +----------------+ +----------------------+
 *   | | | #N bottom copy | | block #N bottom copy |
 * ^ +------------------+ +----------------+ +----------------------+
 * | | #N-1 top copy <====#N-1 top copy | | block #N-1 |
 * | +------------------+ +----------------+ | |
 * Z ...
 *
 * - save_cl_top, which takes a block and its top boundary, and saves the top
 *   of the block into the boundary (to be given as bottom of the neighbour
 *   above this block).
 *
 *   comp. buffer   save. buffers   comp. buffer   save. buffers   comp. buffer
 *   | ... |
 *   | | +------------------+ +------------------+
 *   | #N+1 | | #N+1 bottom copy | | #N+1 bottom copy |
 *   +-------------+ +------------------+ +------------------+
 *   | #N top copy | | #N top copy <==== |
 *   +-------------+ +------------------+ |..................|
 *   | #N |
 *   ...
 *   | | +----------------+ +----------------------+
 *   | | | #N bottom copy | | block #N bottom copy |
 * ^ +------------------+ +----------------+ +----------------------+
 * | | #N-1 top copy | | #N-1 top copy | | block #N-1 |
 * | +------------------+ +----------------+ | |
 * Z ...
 *
 * - save_cl_bottom, same for the bottom.
 *
 *   comp. buffer   save. buffers   comp. buffer   save. buffers   comp. buffer
 *   | ... |
 *   | | +------------------+ +------------------+
 *   | #N+1 | | #N+1 bottom copy | | #N+1 bottom copy |
 *   +-------------+ +------------------+ +------------------+
 *   | #N top copy | | #N top copy | | |
 *   +-------------+ +------------------+ | |
 *   | #N |
 *   ...
 *   |..................| +----------------+ +----------------------+
 *   | ====>#N bottom copy | | block #N bottom copy |
 * ^ +------------------+ +----------------+ +----------------------+
 * | | #N-1 top copy | | #N-1 top copy | | block #N-1 |
 * | +------------------+ +----------------+ | |
 * Z ...
 *
 * The idea is that the computation buffers thus do not have to move; only
 * their boundaries are copied to buffers that do move (be it CPU/GPU,
 * GPU/GPU, or via MPI).
 *
 * For each of the buffers above, there are two (0/1) instances, to make the
 * new/old switch costless.
 */
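
/*
 * For orientation, a cl_update task submission would look roughly like the
 * sketch below. This is a hedged illustration, not the actual driver code
 * (task submission lives outside this file); the handle field names
 * layers_handle and boundaries_handle are assumptions for the example:
 *
 *	unsigned z = block->bz;
 *	starpu_task_insert(&cl_update,
 *			   STARPU_RW, block->layers_handle[0],
 *			   STARPU_RW, block->layers_handle[1],
 *			   STARPU_R, top_neighbour->boundaries_handle[B][0],
 *			   STARPU_R, top_neighbour->boundaries_handle[B][1],
 *			   STARPU_R, bottom_neighbour->boundaries_handle[T][0],
 *			   STARPU_R, bottom_neighbour->boundaries_handle[T][1],
 *			   STARPU_VALUE, &z, sizeof(z),
 *			   0);
 *
 * The access modes must match cl_update.modes below; StarPU then infers the
 * task dependencies implicitly from the data handles.
 */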
#if 0
# define DEBUG(fmt, ...) fprintf(stderr,fmt,##__VA_ARGS__)
#else
# define DEBUG(fmt, ...) (void) 0
#endif

/* Record which GPU ran which block, for nice pictures */
int who_runs_what_len;
int *who_runs_what;
int *who_runs_what_index;
double *last_tick;

/* Achieved iterations */
static int achieved_iter;

/* Record how many updates each worker performed */
unsigned update_per_worker[STARPU_NMAXWORKERS];

static void record_who_runs_what(struct block_description *block)
{
	double now, now2, diff, delta = get_ticks() * 1000;
	int workerid = starpu_worker_get_id_check();

	now = starpu_timing_now();
	now2 = now - start;
	diff = now2 - last_tick[block->bz];
	while (diff >= delta)
	{
		last_tick[block->bz] += delta;
		diff = now2 - last_tick[block->bz];
		if (who_runs_what_index[block->bz] < who_runs_what_len)
			who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = -1;
	}

	if (who_runs_what_index[block->bz] < who_runs_what_len)
		who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = global_workerid(workerid);
}

static void check_load(struct starpu_block_interface *block, struct starpu_block_interface *boundary)
{
	/* Sanity checks */
	STARPU_ASSERT(block->nx == boundary->nx);
	STARPU_ASSERT(block->ny == boundary->ny);
	STARPU_ASSERT(boundary->nz == K);

	/* NB: this is not fully guaranteed ... but it's *very* likely and that
	 * makes our life much simpler */
	STARPU_ASSERT(block->ldy == boundary->ldy);
	STARPU_ASSERT(block->ldz == boundary->ldz);
}

/*
 * Load a neighbour's boundary into block, CPU version
 */
static void load_subblock_from_buffer_cpu(void *_block,
					void *_boundary,
					unsigned firstz)
{
	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
	check_load(block, boundary);

	/* We do a contiguous memory transfer */
	size_t boundary_size = K*block->ldz*block->elemsize;

	unsigned offset = firstz*block->ldz;
	TYPE *block_data = (TYPE *)block->ptr;
	TYPE *boundary_data = (TYPE *)boundary->ptr;
	memcpy(&block_data[offset], boundary_data, boundary_size);
}
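
/*
 * Worked example of the copy arithmetic above, with illustrative numbers
 * (not taken from the code): assume K == 1, a 16x16 interior, and
 * TYPE == float. Each z-plane then holds ldz == (16+2*K)*(16+2*K) == 324
 * elements, so boundary_size == 1*324*4 == 1296 bytes, and a call with
 * firstz == block_size_z + K == 17 writes the K planes sitting just above
 * the interior, i.e. the top ghost zone.
 */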
/*
 * Load a neighbour's boundary into block, CUDA version
 */
#ifdef STARPU_USE_CUDA
static void load_subblock_from_buffer_cuda(void *_block,
					void *_boundary,
					unsigned firstz)
{
	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
	check_load(block, boundary);

	/* We do a contiguous memory transfer */
	size_t boundary_size = K*block->ldz*block->elemsize;

	unsigned offset = firstz*block->ldz;
	TYPE *block_data = (TYPE *)block->ptr;
	TYPE *boundary_data = (TYPE *)boundary->ptr;
	cudaMemcpyAsync(&block_data[offset], boundary_data, boundary_size, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
}

/*
 * cl_update (CUDA version)
 */
static void update_func_cuda(void *descr[], void *arg)
{
	unsigned z;
	starpu_codelet_unpack_args(arg, &z);
	struct block_description *block = get_block_description(z);

	int workerid = starpu_worker_get_id_check();
	DEBUG("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
	if (block->bz == 0)
		FPRINTF(stderr,"!!! DO update_func_cuda z %u CUDA%d !!!\n", block->bz, workerid);
	else
		DEBUG("!!! DO update_func_cuda z %u CUDA%d !!!\n", block->bz, workerid);

#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
	int rank = 0;
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	DEBUG("!!! RANK %d !!!\n", rank);
#endif
	DEBUG("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");

	unsigned block_size_z = get_block_size(block->bz);
	unsigned i;
	update_per_worker[workerid]++;

	record_who_runs_what(block);

	/*
	 * Load neighbours' boundaries : TOP
	 */

	/* The offset along the z axis is (block_size_z + K) */
	load_subblock_from_buffer_cuda(descr[0], descr[2], block_size_z+K);
	load_subblock_from_buffer_cuda(descr[1], descr[3], block_size_z+K);

	/*
	 * Load neighbours' boundaries : BOTTOM
	 */
	load_subblock_from_buffer_cuda(descr[0], descr[4], 0);
	load_subblock_from_buffer_cuda(descr[1], descr[5], 0);

	/*
	 * Stencils ... do the actual work here :) TODO
	 */
	for (i=1; i<=K; i++)
	{
		struct starpu_block_interface *oldb = descr[i%2], *newb = descr[(i+1)%2];
		TYPE *old = (void*) oldb->ptr, *newer = (void*) newb->ptr;

		/* Shadow data */
		cuda_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);

		/* And perform actual computation */
#ifdef LIFE
		cuda_life_update_host(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
#else
		cudaMemcpyAsync(newer, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
#endif /* LIFE */
	}

	if (block->bz == 0)
		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
}
#endif /* STARPU_USE_CUDA */
/*
 * Load a neighbour's boundary into block, OpenCL version
 */
#ifdef STARPU_USE_OPENCL
static void load_subblock_from_buffer_opencl(struct starpu_block_interface *block,
					struct starpu_block_interface *boundary,
					unsigned firstz)
{
	check_load(block, boundary);

	/* We do a contiguous memory transfer */
	size_t boundary_size = K*block->ldz*block->elemsize;

	/* clEnqueueCopyBuffer() offsets are expressed in bytes, not elements */
	size_t offset = firstz*block->ldz*block->elemsize;
	cl_mem block_data = (cl_mem)block->dev_handle;
	cl_mem boundary_data = (cl_mem)boundary->dev_handle;
	cl_command_queue cq;
	starpu_opencl_get_current_queue(&cq);
	cl_int ret = clEnqueueCopyBuffer(cq, boundary_data, block_data, 0, offset, boundary_size, 0, NULL, NULL);
	if (ret != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(ret);
}

/*
 * cl_update (OpenCL version)
 */
static void update_func_opencl(void *descr[], void *arg)
{
	unsigned z;
	starpu_codelet_unpack_args(arg, &z);
	struct block_description *block = get_block_description(z);

	int workerid = starpu_worker_get_id_check();
	DEBUG("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
	if (block->bz == 0)
		FPRINTF(stderr,"!!! DO update_func_opencl z %u OPENCL%d !!!\n", block->bz, workerid);
	else
		DEBUG("!!! DO update_func_opencl z %u OPENCL%d !!!\n", block->bz, workerid);

#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
	int rank = 0;
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	DEBUG("!!! RANK %d !!!\n", rank);
#endif
	DEBUG("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");

	unsigned block_size_z = get_block_size(block->bz);
	unsigned i;
	update_per_worker[workerid]++;

	record_who_runs_what(block);

	cl_command_queue cq;
	starpu_opencl_get_current_queue(&cq);

	/*
	 * Load neighbours' boundaries : TOP
	 */

	/* The offset along the z axis is (block_size_z + K) */
	load_subblock_from_buffer_opencl(descr[0], descr[2], block_size_z+K);
	load_subblock_from_buffer_opencl(descr[1], descr[3], block_size_z+K);

	/*
	 * Load neighbours' boundaries : BOTTOM
	 */
	load_subblock_from_buffer_opencl(descr[0], descr[4], 0);
	load_subblock_from_buffer_opencl(descr[1], descr[5], 0);

	/*
	 * Stencils ... do the actual work here :) TODO
	 */
	for (i=1; i<=K; i++)
	{
		struct starpu_block_interface *oldb = descr[i%2], *newb = descr[(i+1)%2];
		TYPE *old = (void*) oldb->dev_handle, *newer = (void*) newb->dev_handle;

		/* Shadow data */
		opencl_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);

		/* And perform actual computation */
#ifdef LIFE
		opencl_life_update_host(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
#else
		cl_event event;
		cl_int ret = clEnqueueCopyBuffer(cq, old, newer, 0, 0, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer), 0, NULL, &event);
		if (ret != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(ret);
#endif /* LIFE */
	}

	if (block->bz == 0)
		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
}
#endif /* STARPU_USE_OPENCL */
/*
 * cl_update (CPU version)
 */
void update_func_cpu(void *descr[], void *arg)
{
	unsigned zz;
	starpu_codelet_unpack_args(arg, &zz);
	struct block_description *block = get_block_description(zz);

	int workerid = starpu_worker_get_id_check();
	DEBUG("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
	if (block->bz == 0)
		FPRINTF(stderr,"!!! DO update_func_cpu z %u CPU%d !!!\n", block->bz, workerid);
	else
		DEBUG("!!! DO update_func_cpu z %u CPU%d !!!\n", block->bz, workerid);

#if defined(STARPU_USE_MPI) && !defined(STARPU_SIMGRID) && !defined(STARPU_USE_MPI_MASTER_SLAVE)
	int rank = 0;
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	DEBUG("!!! RANK %d !!!\n", rank);
#endif
	DEBUG("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");

	unsigned block_size_z = get_block_size(block->bz);
	unsigned i;
	update_per_worker[workerid]++;

	record_who_runs_what(block);

	/*
	 * Load neighbours' boundaries : TOP
	 */

	/* The offset along the z axis is (block_size_z + K) */
	load_subblock_from_buffer_cpu(descr[0], descr[2], block_size_z+K);
	load_subblock_from_buffer_cpu(descr[1], descr[3], block_size_z+K);

	/*
	 * Load neighbours' boundaries : BOTTOM
	 */
	load_subblock_from_buffer_cpu(descr[0], descr[4], 0);
	load_subblock_from_buffer_cpu(descr[1], descr[5], 0);

	/*
	 * Stencils ... do the actual work here :) TODO
	 */
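	/*
	 * The two layer buffers alternate roles across the K sub-iterations:
	 * descr[i%2] is read as the old layer while descr[(i+1)%2] is written
	 * as the new one, which implements the costless 0/1 new/old switch
	 * mentioned in the header comment.
	 */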
	for (i=1; i<=K; i++)
	{
		struct starpu_block_interface *oldb = (struct starpu_block_interface *) descr[i%2], *newb = (struct starpu_block_interface *) descr[(i+1)%2];
		TYPE *old = (TYPE*) oldb->ptr, *newer = (TYPE*) newb->ptr;

		/* Shadow data */
		unsigned ldy = oldb->ldy, ldz = oldb->ldz;
		unsigned nx = oldb->nx, ny = oldb->ny, nz = oldb->nz;
		unsigned x, y, z;
		unsigned stepx = 1;
		unsigned stepy = 1;
		unsigned stepz = 1;
		unsigned idx = 0;
		unsigned idy = 0;
		unsigned idz = 0;
		TYPE *ptr = old;

#		include "shadow.h"

		/* And perform actual computation */
#ifdef LIFE
		life_update(block->bz, old, newer, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
#else
		memcpy(newer, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*newer));
#endif /* LIFE */
	}

	if (block->bz == 0)
		starpu_top_update_data_integer(starpu_top_achieved_loop, ++achieved_iter);
}
/* Performance model and codelet structure */
static struct starpu_perfmodel cl_update_model =
{
	.type = STARPU_HISTORY_BASED,
	.symbol = "cl_update"
};

struct starpu_codelet cl_update =
{
	.cpu_funcs = {update_func_cpu},
#ifdef STARPU_USE_CUDA
	.cuda_funcs = {update_func_cuda},
	.cuda_flags = {STARPU_CUDA_ASYNC},
#endif
#ifdef STARPU_USE_OPENCL
	.opencl_funcs = {update_func_opencl},
	.opencl_flags = {STARPU_OPENCL_ASYNC},
#endif
	.model = &cl_update_model,
	.nbuffers = 6,
	.modes = {STARPU_RW, STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R}
};
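
/*
 * Buffer mapping for cl_update, as used by the update functions above:
 * descr[0]/descr[1] are the block's two computation layers (read-write),
 * descr[2]/descr[3] the bottom copies of the neighbour above (loaded into
 * the top ghost zone at z offset block_size_z + K), and descr[4]/descr[5]
 * the top copies of the neighbour below (loaded at z offset 0).
 */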
/*
 * Save the block internal boundaries to give them to our neighbours.
 */

/* CPU version */
static void load_subblock_into_buffer_cpu(void *_block,
					void *_boundary,
					unsigned firstz)
{
	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
	check_load(block, boundary);

	/* We do a contiguous memory transfer */
	size_t boundary_size = K*block->ldz*block->elemsize;

	unsigned offset = firstz*block->ldz;
	TYPE *block_data = (TYPE *)block->ptr;
	TYPE *boundary_data = (TYPE *)boundary->ptr;
	memcpy(boundary_data, &block_data[offset], boundary_size);
}

/* CUDA version */
#ifdef STARPU_USE_CUDA
static void load_subblock_into_buffer_cuda(void *_block,
					void *_boundary,
					unsigned firstz)
{
	struct starpu_block_interface *block = (struct starpu_block_interface *)_block;
	struct starpu_block_interface *boundary = (struct starpu_block_interface *)_boundary;
	check_load(block, boundary);

	/* We do a contiguous memory transfer */
	size_t boundary_size = K*block->ldz*block->elemsize;

	unsigned offset = firstz*block->ldz;
	TYPE *block_data = (TYPE *)block->ptr;
	TYPE *boundary_data = (TYPE *)boundary->ptr;
	cudaMemcpyAsync(boundary_data, &block_data[offset], boundary_size, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
}
#endif /* STARPU_USE_CUDA */

/* OpenCL version */
#ifdef STARPU_USE_OPENCL
static void load_subblock_into_buffer_opencl(struct starpu_block_interface *block,
					struct starpu_block_interface *boundary,
					unsigned firstz)
{
	check_load(block, boundary);

	/* We do a contiguous memory transfer */
	size_t boundary_size = K*block->ldz*block->elemsize;

	/* clEnqueueCopyBuffer() offsets are expressed in bytes, not elements */
	size_t offset = firstz*block->ldz*block->elemsize;
	cl_mem block_data = (cl_mem)block->dev_handle;
	cl_mem boundary_data = (cl_mem)boundary->dev_handle;
	cl_command_queue cq;
	starpu_opencl_get_current_queue(&cq);
	cl_int ret = clEnqueueCopyBuffer(cq, block_data, boundary_data, offset, 0, boundary_size, 0, NULL, NULL);
	if (ret != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(ret);
}
#endif /* STARPU_USE_OPENCL */

/* Record how many top/bottom saves each worker performed */
unsigned top_per_worker[STARPU_NMAXWORKERS];
unsigned bottom_per_worker[STARPU_NMAXWORKERS];

/* top save, CPU version */
void dummy_func_top_cpu(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
{
	unsigned z;
	starpu_codelet_unpack_args(arg, &z);
	struct block_description *block = get_block_description(z);

	int workerid = starpu_worker_get_id_check();
	top_per_worker[workerid]++;

	DEBUG("DO SAVE Top block %u\n", block->bz);

	/* The offset along the z axis is (block_size_z + K) - K */
	unsigned block_size_z = get_block_size(block->bz);

	load_subblock_into_buffer_cpu(descr[0], descr[2], block_size_z);
	load_subblock_into_buffer_cpu(descr[1], descr[3], block_size_z);
}

/* bottom save, CPU version */
void dummy_func_bottom_cpu(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
{
	unsigned z;
	starpu_codelet_unpack_args(arg, &z);
	struct block_description *block = get_block_description(z);
	STARPU_ASSERT(block);

	int workerid = starpu_worker_get_id_check();
	bottom_per_worker[workerid]++;

	DEBUG("DO SAVE Bottom block %u\n", block->bz);

	load_subblock_into_buffer_cpu(descr[0], descr[2], K);
	load_subblock_into_buffer_cpu(descr[1], descr[3], K);
}
/* top save, CUDA version */
#ifdef STARPU_USE_CUDA
static void dummy_func_top_cuda(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
{
	unsigned z;
	starpu_codelet_unpack_args(arg, &z);
	struct block_description *block = get_block_description(z);

	int workerid = starpu_worker_get_id_check();
	top_per_worker[workerid]++;

	DEBUG("DO SAVE Top block %u\n", block->bz);

	/* The offset along the z axis is (block_size_z + K) - K */
	unsigned block_size_z = get_block_size(block->bz);

	load_subblock_into_buffer_cuda(descr[0], descr[2], block_size_z);
	load_subblock_into_buffer_cuda(descr[1], descr[3], block_size_z);
}

/* bottom save, CUDA version */
static void dummy_func_bottom_cuda(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
{
	unsigned z;
	starpu_codelet_unpack_args(arg, &z);
	struct block_description *block = get_block_description(z);
	(void) block;

	int workerid = starpu_worker_get_id_check();
	bottom_per_worker[workerid]++;

	DEBUG("DO SAVE Bottom block %u on CUDA\n", block->bz);

	load_subblock_into_buffer_cuda(descr[0], descr[2], K);
	load_subblock_into_buffer_cuda(descr[1], descr[3], K);
}
#endif /* STARPU_USE_CUDA */

/* top save, OpenCL version */
#ifdef STARPU_USE_OPENCL
static void dummy_func_top_opencl(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
{
	unsigned z;
	starpu_codelet_unpack_args(arg, &z);
	struct block_description *block = get_block_description(z);

	int workerid = starpu_worker_get_id_check();
	top_per_worker[workerid]++;

	DEBUG("DO SAVE Top block %u\n", block->bz);

	/* The offset along the z axis is (block_size_z + K) - K */
	unsigned block_size_z = get_block_size(block->bz);

	load_subblock_into_buffer_opencl(descr[0], descr[2], block_size_z);
	load_subblock_into_buffer_opencl(descr[1], descr[3], block_size_z);
}

/* bottom save, OpenCL version */
static void dummy_func_bottom_opencl(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
{
	unsigned z;
	starpu_codelet_unpack_args(arg, &z);
	struct block_description *block = get_block_description(z);
	(void) block;

	int workerid = starpu_worker_get_id_check();
	bottom_per_worker[workerid]++;

	DEBUG("DO SAVE Bottom block %u on OpenCL\n", block->bz);

	load_subblock_into_buffer_opencl(descr[0], descr[2], K);
	load_subblock_into_buffer_opencl(descr[1], descr[3], K);
}
#endif /* STARPU_USE_OPENCL */
/* Performance models and codelets for save */
static struct starpu_perfmodel save_cl_bottom_model =
{
	.type = STARPU_HISTORY_BASED,
	.symbol = "save_cl_bottom"
};

static struct starpu_perfmodel save_cl_top_model =
{
	.type = STARPU_HISTORY_BASED,
	.symbol = "save_cl_top"
};

struct starpu_codelet save_cl_bottom =
{
	.cpu_funcs = {dummy_func_bottom_cpu},
#ifdef STARPU_USE_CUDA
	.cuda_funcs = {dummy_func_bottom_cuda},
	.cuda_flags = {STARPU_CUDA_ASYNC},
#endif
#ifdef STARPU_USE_OPENCL
	.opencl_funcs = {dummy_func_bottom_opencl},
	.opencl_flags = {STARPU_OPENCL_ASYNC},
#endif
	.model = &save_cl_bottom_model,
	.nbuffers = 4,
	.modes = {STARPU_R, STARPU_R, STARPU_W, STARPU_W}
};

struct starpu_codelet save_cl_top =
{
	.cpu_funcs = {dummy_func_top_cpu},
#ifdef STARPU_USE_CUDA
	.cuda_funcs = {dummy_func_top_cuda},
	.cuda_flags = {STARPU_CUDA_ASYNC},
#endif
#ifdef STARPU_USE_OPENCL
	.opencl_funcs = {dummy_func_top_opencl},
	.opencl_flags = {STARPU_OPENCL_ASYNC},
#endif
	.model = &save_cl_top_model,
	.nbuffers = 4,
	.modes = {STARPU_R, STARPU_R, STARPU_W, STARPU_W}
};
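
/*
 * Buffer mapping for both save codelets: descr[0]/descr[1] are the block's
 * two computation layers (read-only here), and descr[2]/descr[3] the
 * corresponding 0/1 boundary copies that get written.
 */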
/* Memset a block's buffers */
static void memset_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
{
	unsigned sizex, sizey, bz;
	starpu_codelet_unpack_args(arg, &sizex, &sizey, &bz);
	struct block_description *block = get_block_description(bz);
	unsigned size_bz = get_block_size(bz);

	unsigned x,y,z;
	for (x = 0; x < sizex + 2*K; x++)
	{
		for (y = 0; y < sizey + 2*K; y++)
		{
			/* Main blocks */
			for (z = 0; z < size_bz + 2*K; z++)
			{
				block->layers[0][(x)+(y)*(sizex + 2*K)+(z)*(sizex+2*K)*(sizey+2*K)] = 0;
				block->layers[1][(x)+(y)*(sizex + 2*K)+(z)*(sizex+2*K)*(sizey+2*K)] = 0;
			}
			for (z = 0; z < K; z++)
			{
				/* Boundary blocks : Top */
				block->boundaries[T][0][(x)+(y)*(sizex + 2*K)+(z)*(sizex+2*K)*(sizey+2*K)] = 0;
				block->boundaries[T][1][(x)+(y)*(sizex + 2*K)+(z)*(sizex+2*K)*(sizey+2*K)] = 0;

				/* Boundary blocks : Bottom */
				block->boundaries[B][0][(x)+(y)*(sizex + 2*K)+(z)*(sizex+2*K)*(sizey+2*K)] = 0;
				block->boundaries[B][1][(x)+(y)*(sizex + 2*K)+(z)*(sizex+2*K)*(sizey+2*K)] = 0;
			}
		}
	}
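
	/* Note on the memset() calls kept below for reference: as written they
	 * take sizeof() of an array-of-pointers element, i.e. the pointer size
	 * rather than the element size, which is presumably why the explicit
	 * loops above are used instead. */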
	//memset(block->layers[0], 0, (sizex + 2*K)*(sizey + 2*K)*(size_bz + 2*K)*sizeof(block->layers[0]));
	//memset(block->layers[1], 0, (sizex + 2*K)*(sizey + 2*K)*(size_bz + 2*K)*sizeof(block->layers[1]));
	//memset(block->boundaries[T][0], 0, (sizex + 2*K)*(sizey + 2*K)*K*sizeof(block->boundaries[T][0]));
	//memset(block->boundaries[T][1], 0, (sizex + 2*K)*(sizey + 2*K)*K*sizeof(block->boundaries[T][1]));
	//memset(block->boundaries[B][0], 0, (sizex + 2*K)*(sizey + 2*K)*K*sizeof(block->boundaries[B][0]));
	//memset(block->boundaries[B][1], 0, (sizex + 2*K)*(sizey + 2*K)*K*sizeof(block->boundaries[B][1]));
}
static double memset_cost_function(struct starpu_task *task, unsigned nimpl)
{
	(void) task;
	(void) nimpl;
	return 0.000001;
}

static struct starpu_perfmodel memset_model =
{
	.type = STARPU_COMMON,
	.cost_function = memset_cost_function,
	.symbol = "memset"
};

struct starpu_codelet cl_memset =
{
	.cpu_funcs = {memset_func},
	.model = &memset_model,
	.nbuffers = 6,
	.modes = {STARPU_W, STARPU_W, STARPU_W, STARPU_W, STARPU_W, STARPU_W}
};

/* Initialize a block's layer */
static void initlayer_func(void *descr[] STARPU_ATTRIBUTE_UNUSED, void *arg)
{
	unsigned sizex, sizey, bz;
	starpu_codelet_unpack_args(arg, &sizex, &sizey, &bz);
	struct block_description *block = get_block_description(bz);
	unsigned size_bz = get_block_size(bz);

	/* Initialize layer with some random data */
	unsigned x, y, z;
	unsigned sum = 0;
	for (x = 0; x < sizex; x++)
		for (y = 0; y < sizey; y++)
			for (z = 0; z < size_bz; z++)
				sum += block->layers[0][(K+x)+(K+y)*(sizex + 2*K)+(K+z)*(sizex+2*K)*(sizey+2*K)] = (int)((x/7.+y/13.+(bz*size_bz + z)/17.) * 10.) % 2;
}

static double initlayer_cost_function(struct starpu_task *task, unsigned nimpl)
{
	(void) task;
	(void) nimpl;
	return 0.000001;
}

static struct starpu_perfmodel initlayer_model =
{
	.type = STARPU_COMMON,
	.cost_function = initlayer_cost_function,
	.symbol = "initlayer"
};

struct starpu_codelet cl_initlayer =
{
	.cpu_funcs = {initlayer_func},
	.model = &initlayer_model,
	.nbuffers = 1,
	.modes = {STARPU_W}
};