stencil-kernels.c

/* StarPU --- Runtime system for heterogeneous multicore architectures.
 *
 * Copyright (C) 2010 Université de Bordeaux 1
 *
 * StarPU is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * StarPU is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * See the GNU Lesser General Public License in COPYING.LGPL for more details.
 */
#include "stencil.h"
#include <sys/time.h>

#ifndef timersub
#define timersub(x, y, res) \
    do { \
        (res)->tv_sec = (x)->tv_sec - (y)->tv_sec; \
        (res)->tv_usec = (x)->tv_usec - (y)->tv_usec; \
        if ((res)->tv_usec < 0) { \
            (res)->tv_sec--; \
            (res)->tv_usec += 1000000; \
        } \
    } while (0)
#endif

#ifndef timeradd
#define timeradd(x, y, res) \
    do { \
        (res)->tv_sec = (x)->tv_sec + (y)->tv_sec; \
        (res)->tv_usec = (x)->tv_usec + (y)->tv_usec; \
        if ((res)->tv_usec >= 1000000) { \
            (res)->tv_sec++; \
            (res)->tv_usec -= 1000000; \
        } \
    } while (0)
#endif
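
/*
 * Illustration only (not in the original file): these fallback macros
 * behave like the BSD timersub()/timeradd() macros, and the update kernels
 * below use them to compare elapsed time against the global `start`
 * timestamp, roughly as in this minimal sketch:
 *
 *     struct timeval t0, t1, elapsed;
 *     gettimeofday(&t0, NULL);
 *     // ... work ...
 *     gettimeofday(&t1, NULL);
 *     timersub(&t1, &t0, &elapsed);   // elapsed = t1 - t0
 */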
/* Computation Kernels */

/*
 * There are three codelets:
 *
 * - cl_update, which takes a block and the boundaries of its neighbours,
 *   loads the boundaries into the block and performs some update loops:
 *
 *     comp. buffer      save. buffers      comp. buffer      save. buffers      comp. buffer
 *     |     ...     |
 *     |             |   +------------------+      +------------------+
 *     |     #N+1    |   | #N+1 bottom copy====>#N+1 bottom copy |
 *     +-------------+   +------------------+      +------------------+
 *     | #N top copy |   |   #N top copy    |      |                  |
 *     +-------------+   +------------------+      |                  |
 *     |     #N      |
 *           ...
 *     |             |   +----------------+        +----------------------+
 *     |             |   | #N bottom copy |        | block #N bottom copy |
 * ^   +-------------+   +----------------+        +----------------------+
 * |   | #N-1 top copy <====#N-1 top copy |        |     block #N-1       |
 * |   +-------------+   +----------------+        |                      |
 * Z         ...
 *
 * - save_cl_top, which takes a block and its top boundary and saves the top
 *   of the block into the boundary (to be given as the bottom of the
 *   neighbour above this block):
 *
 *     comp. buffer      save. buffers      comp. buffer      save. buffers      comp. buffer
 *     |     ...     |
 *     |             |   +------------------+      +------------------+
 *     |     #N+1    |   | #N+1 bottom copy |      | #N+1 bottom copy |
 *     +-------------+   +------------------+      +------------------+
 *     | #N top copy |   |   #N top copy   <====   |
 *     +-------------+   +------------------+      |..................|
 *     |     #N      |
 *           ...
 *     |             |   +----------------+        +----------------------+
 *     |             |   | #N bottom copy |        | block #N bottom copy |
 * ^   +-------------+   +----------------+        +----------------------+
 * |   | #N-1 top copy |  | #N-1 top copy |        |     block #N-1       |
 * |   +-------------+   +----------------+        |                      |
 * Z         ...
 *
 * - save_cl_bottom, which does the same for the bottom:
 *
 *     comp. buffer      save. buffers      comp. buffer      save. buffers      comp. buffer
 *     |     ...     |
 *     |             |   +------------------+      +------------------+
 *     |     #N+1    |   | #N+1 bottom copy |      | #N+1 bottom copy |
 *     +-------------+   +------------------+      +------------------+
 *     | #N top copy |   |   #N top copy    |      |                  |
 *     +-------------+   +------------------+      |                  |
 *     |     #N      |
 *           ...
 *     |..................| +----------------+     +----------------------+
 *     |              ====>#N bottom copy |        | block #N bottom copy |
 * ^   +------------------+ +----------------+     +----------------------+
 * |   |  #N-1 top copy   | | #N-1 top copy |      |     block #N-1       |
 * |   +------------------+ +----------------+     |                      |
 * Z         ...
 *
 * The idea is that the computation buffers thus never have to move: only
 * their boundaries are copied into buffers that do move (be it CPU/GPU,
 * GPU/GPU, or over MPI).
 *
 * For each of the buffers above there are two copies (0/1), which makes the
 * new/old switch costless.
 */
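
/*
 * Illustration only (not in the original file): a hypothetical naming of
 * the six data descriptors cl_update receives.  The names below are not
 * defined anywhere else; the indices are the ones actually used by the
 * load_subblock_from_buffer_*() calls in the kernels.
 */
#if 0
enum cl_update_buffers
{
    B_COMP0   = 0, /* computation buffer, copy 0 (old/new by iteration parity) */
    B_COMP1   = 1, /* computation buffer, copy 1 */
    B_TOP0    = 2, /* top neighbour's boundary, copy 0 */
    B_TOP1    = 3, /* top neighbour's boundary, copy 1 */
    B_BOTTOM0 = 4, /* bottom neighbour's boundary, copy 0 */
    B_BOTTOM1 = 5, /* bottom neighbour's boundary, copy 1 */
};
#endif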
#if 0
# define DEBUG(fmt, ...) fprintf(stderr,fmt,##__VA_ARGS__)
#else
# define DEBUG(fmt, ...) (void) 0
#endif

/* Record which worker ran which block, for nice pictures */
int who_runs_what_len;
int *who_runs_what;
int *who_runs_what_index;
struct timeval *last_tick;

/* Record how many updates each worker performed */
unsigned update_per_worker[STARPU_NMAXWORKERS];

/*
 * Load a neighbour's boundary into block, CPU version
 */
static void load_subblock_from_buffer_cpu(starpu_block_interface_t *block,
                                          starpu_block_interface_t *boundary,
                                          unsigned firstz)
{
    /* Sanity checks */
    STARPU_ASSERT(block->nx == boundary->nx);
    STARPU_ASSERT(block->ny == boundary->ny);
    STARPU_ASSERT(boundary->nz == K);

    /* NB: this is not fully guaranteed ... but it's *very* likely and that
     * makes our life much simpler */
    STARPU_ASSERT(block->ldy == boundary->ldy);
    STARPU_ASSERT(block->ldz == boundary->ldz);

    /* We do a contiguous memory transfer */
    size_t boundary_size = K*block->ldz*block->elemsize;
    unsigned offset = firstz*block->ldz;
    TYPE *block_data = (TYPE *)block->ptr;
    TYPE *boundary_data = (TYPE *)boundary->ptr;
    memcpy(&block_data[offset], boundary_data, boundary_size);
}
/*
 * Load a neighbour's boundary into block, CUDA version
 */
#ifdef STARPU_USE_CUDA
static void load_subblock_from_buffer_cuda(starpu_block_interface_t *block,
                                           starpu_block_interface_t *boundary,
                                           unsigned firstz)
{
    /* Sanity checks */
    STARPU_ASSERT(block->nx == boundary->nx);
    STARPU_ASSERT(block->ny == boundary->ny);
    STARPU_ASSERT(boundary->nz == K);

    /* NB: this is not fully guaranteed ... but it's *very* likely and that
     * makes our life much simpler */
    STARPU_ASSERT(block->ldy == boundary->ldy);
    STARPU_ASSERT(block->ldz == boundary->ldz);

    /* We do a contiguous memory transfer */
    size_t boundary_size = K*block->ldz*block->elemsize;
    unsigned offset = firstz*block->ldz;
    TYPE *block_data = (TYPE *)block->ptr;
    TYPE *boundary_data = (TYPE *)boundary->ptr;
    cudaMemcpyAsync(&block_data[offset], boundary_data, boundary_size, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
}
/*
 * cl_update (CUDA version)
 */
static void update_func_cuda(void *descr[], void *arg)
{
    struct block_description *block = arg;
    int workerid = starpu_worker_get_id();

    DEBUG("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
    if (block->bz == 0)
        fprintf(stderr, "!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid);
    else
        DEBUG("!!! DO update_func_cuda z %d CUDA%d !!!\n", block->bz, workerid);
#ifdef STARPU_USE_MPI
    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    DEBUG("!!! RANK %d !!!\n", rank);
#endif
    DEBUG("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");

    unsigned block_size_z = get_block_size(block->bz);
    unsigned i;
    update_per_worker[workerid]++;

    struct timeval tv, tv2, diff, delta = {.tv_sec = 0, .tv_usec = get_ticks()*1000};
    gettimeofday(&tv, NULL);
    timersub(&tv, &start, &tv2);
    timersub(&tv2, &last_tick[block->bz], &diff);
    while (timercmp(&diff, &delta, >=)) {
        timeradd(&last_tick[block->bz], &delta, &last_tick[block->bz]);
        timersub(&tv2, &last_tick[block->bz], &diff);
        if (who_runs_what_index[block->bz] < who_runs_what_len)
            who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = -1;
    }

    if (who_runs_what_index[block->bz] < who_runs_what_len)
        who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = global_workerid(workerid);

    /*
     * Load neighbours' boundaries: TOP
     */
    /* The offset along the z axis is (block_size_z + K) */
    load_subblock_from_buffer_cuda(descr[0], descr[2], block_size_z+K);
    load_subblock_from_buffer_cuda(descr[1], descr[3], block_size_z+K);

    /*
     * Load neighbours' boundaries: BOTTOM
     */
    load_subblock_from_buffer_cuda(descr[0], descr[4], 0);
    load_subblock_from_buffer_cuda(descr[1], descr[5], 0);

    /*
     * Stencils ... do the actual work here :) TODO
     */
    for (i=1; i<=K; i++)
    {
        starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
        TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;

        /* Shadow data */
        cuda_shadow_host(block->bz, old, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);

        /* And perform actual computation */
#ifdef LIFE
        cuda_life_update_host(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
#else
        cudaMemcpyAsync(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new), cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
#endif /* LIFE */
    }

    cudaError_t cures;
    if ((cures = cudaStreamSynchronize(starpu_cuda_get_local_stream())) != cudaSuccess)
        STARPU_CUDA_REPORT_ERROR(cures);
}
#endif /* STARPU_USE_CUDA */
/*
 * cl_update (CPU version)
 */
static void update_func_cpu(void *descr[], void *arg)
{
    struct block_description *block = arg;
    int workerid = starpu_worker_get_id();

    DEBUG("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");
    if (block->bz == 0)
        fprintf(stderr, "!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
    else
        DEBUG("!!! DO update_func_cpu z %d CPU%d !!!\n", block->bz, workerid);
#ifdef STARPU_USE_MPI
    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    DEBUG("!!! RANK %d !!!\n", rank);
#endif
    DEBUG("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n");

    unsigned block_size_z = get_block_size(block->bz);
    unsigned i;
    update_per_worker[workerid]++;

    struct timeval tv, tv2, diff, delta = {.tv_sec = 0, .tv_usec = get_ticks() * 1000};
    gettimeofday(&tv, NULL);
    timersub(&tv, &start, &tv2);
    timersub(&tv2, &last_tick[block->bz], &diff);
    while (timercmp(&diff, &delta, >=)) {
        timeradd(&last_tick[block->bz], &delta, &last_tick[block->bz]);
        timersub(&tv2, &last_tick[block->bz], &diff);
        if (who_runs_what_index[block->bz] < who_runs_what_len)
            who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = -1;
    }

    if (who_runs_what_index[block->bz] < who_runs_what_len)
        who_runs_what[block->bz + (who_runs_what_index[block->bz]++) * get_nbz()] = global_workerid(workerid);

    /*
     * Load neighbours' boundaries: TOP
     */
    /* The offset along the z axis is (block_size_z + K) */
    load_subblock_from_buffer_cpu(descr[0], descr[2], block_size_z+K);
    load_subblock_from_buffer_cpu(descr[1], descr[3], block_size_z+K);

    /*
     * Load neighbours' boundaries: BOTTOM
     */
    load_subblock_from_buffer_cpu(descr[0], descr[4], 0);
    load_subblock_from_buffer_cpu(descr[1], descr[5], 0);

    /*
     * Stencils ... do the actual work here :) TODO
     */
    for (i=1; i<=K; i++)
    {
        starpu_block_interface_t *oldb = descr[i%2], *newb = descr[(i+1)%2];
        TYPE *old = (void*) oldb->ptr, *new = (void*) newb->ptr;

        /* Shadow data */
        unsigned ldy = oldb->ldy, ldz = oldb->ldz;
        unsigned nx = oldb->nx, ny = oldb->ny, nz = oldb->nz;
        unsigned x, y, z;
        unsigned stepx = 1;
        unsigned stepy = 1;
        unsigned stepz = 1;
        unsigned idx = 0;
        unsigned idy = 0;
        unsigned idz = 0;
        TYPE *ptr = old;
#       include "shadow.h"

        /* And perform actual computation */
#ifdef LIFE
        life_update(block->bz, old, new, oldb->nx, oldb->ny, oldb->nz, oldb->ldy, oldb->ldz, i);
#else
        memcpy(new, old, oldb->nx * oldb->ny * oldb->nz * sizeof(*new));
#endif /* LIFE */
    }
}
/* Performance model and codelet structure */
static struct starpu_perfmodel_t cl_update_model = {
    .type = STARPU_HISTORY_BASED,
    .symbol = "cl_update"
};

starpu_codelet cl_update = {
    .where =
#ifdef STARPU_USE_CUDA
        STARPU_CUDA|
#endif
        STARPU_CPU,
    .cpu_func = update_func_cpu,
#ifdef STARPU_USE_CUDA
    .cuda_func = update_func_cuda,
#endif
    .model = &cl_update_model,
    .nbuffers = 6
};
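
/*
 * Illustration only (not in the original file): a minimal sketch of how a
 * cl_update task could be submitted with the task API this code generation
 * uses (task->buffers[].handle/.mode).  The handle arrays and the function
 * name are hypothetical, and the access modes are this sketch's assumption
 * (read/write on the computation buffers, read-only on the boundaries).
 */
#if 0
static void submit_update_sketch(struct block_description *block,
                                 starpu_data_handle block_handle[2],
                                 starpu_data_handle top_boundary_handle[2],
                                 starpu_data_handle bottom_boundary_handle[2])
{
    struct starpu_task *task = starpu_task_create();
    unsigned i;

    task->cl = &cl_update;
    task->cl_arg = block;

    /* The six buffers, in the order update_func_cpu/cuda expect them */
    for (i = 0; i < 2; i++)
    {
        task->buffers[i].handle = block_handle[i];              /* descr[0], descr[1] */
        task->buffers[i].mode = STARPU_RW;
        task->buffers[2+i].handle = top_boundary_handle[i];    /* descr[2], descr[3] */
        task->buffers[2+i].mode = STARPU_R;
        task->buffers[4+i].handle = bottom_boundary_handle[i]; /* descr[4], descr[5] */
        task->buffers[4+i].mode = STARPU_R;
    }

    int ret = starpu_task_submit(task);
    STARPU_ASSERT(!ret);
}
#endif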
/*
 * Save the block internal boundaries to give them to our neighbours.
 */

/* CPU version */
static void load_subblock_into_buffer_cpu(starpu_block_interface_t *block,
                                          starpu_block_interface_t *boundary,
                                          unsigned firstz)
{
    /* Sanity checks */
    STARPU_ASSERT(block->nx == boundary->nx);
    STARPU_ASSERT(block->ny == boundary->ny);
    STARPU_ASSERT(boundary->nz == K);

    /* NB: this is not fully guaranteed ... but it's *very* likely and that
     * makes our life much simpler */
    STARPU_ASSERT(block->ldy == boundary->ldy);
    STARPU_ASSERT(block->ldz == boundary->ldz);

    /* We do a contiguous memory transfer */
    size_t boundary_size = K*block->ldz*block->elemsize;
    unsigned offset = firstz*block->ldz;
    TYPE *block_data = (TYPE *)block->ptr;
    TYPE *boundary_data = (TYPE *)boundary->ptr;
    memcpy(boundary_data, &block_data[offset], boundary_size);
}
/* CUDA version */
#ifdef STARPU_USE_CUDA
static void load_subblock_into_buffer_cuda(starpu_block_interface_t *block,
                                           starpu_block_interface_t *boundary,
                                           unsigned firstz)
{
    /* Sanity checks */
    STARPU_ASSERT(block->nx == boundary->nx);
    STARPU_ASSERT(block->ny == boundary->ny);
    STARPU_ASSERT(boundary->nz == K);

    /* NB: this is not fully guaranteed ... but it's *very* likely and that
     * makes our life much simpler */
    STARPU_ASSERT(block->ldy == boundary->ldy);
    STARPU_ASSERT(block->ldz == boundary->ldz);

    /* We do a contiguous memory transfer */
    size_t boundary_size = K*block->ldz*block->elemsize;
    unsigned offset = firstz*block->ldz;
    TYPE *block_data = (TYPE *)block->ptr;
    TYPE *boundary_data = (TYPE *)boundary->ptr;
    cudaMemcpyAsync(boundary_data, &block_data[offset], boundary_size, cudaMemcpyDeviceToDevice, starpu_cuda_get_local_stream());
}
#endif /* STARPU_USE_CUDA */
/* Record how many top/bottom saves each worker performed */
unsigned top_per_worker[STARPU_NMAXWORKERS];
unsigned bottom_per_worker[STARPU_NMAXWORKERS];

/* top save, CPU version */
static void dummy_func_top_cpu(void *descr[] __attribute__((unused)), void *arg)
{
    struct block_description *block = arg;
    int workerid = starpu_worker_get_id();
    top_per_worker[workerid]++;

    DEBUG("DO SAVE Top block %d\n", block->bz);

    /* The offset along the z axis is (block_size_z + K) - K */
    unsigned block_size_z = get_block_size(block->bz);
    load_subblock_into_buffer_cpu(descr[0], descr[2], block_size_z);
    load_subblock_into_buffer_cpu(descr[1], descr[3], block_size_z);
}

/* bottom save, CPU version */
static void dummy_func_bottom_cpu(void *descr[] __attribute__((unused)), void *arg)
{
    struct block_description *block = arg;
    int workerid = starpu_worker_get_id();
    bottom_per_worker[workerid]++;

    DEBUG("DO SAVE Bottom block %d\n", block->bz);

    load_subblock_into_buffer_cpu(descr[0], descr[2], K);
    load_subblock_into_buffer_cpu(descr[1], descr[3], K);
}
/* top save, CUDA version */
#ifdef STARPU_USE_CUDA
static void dummy_func_top_cuda(void *descr[] __attribute__((unused)), void *arg)
{
    struct block_description *block = arg;
    int workerid = starpu_worker_get_id();
    top_per_worker[workerid]++;

    DEBUG("DO SAVE Top block %d\n", block->bz);

    /* The offset along the z axis is (block_size_z + K) - K */
    unsigned block_size_z = get_block_size(block->bz);
    load_subblock_into_buffer_cuda(descr[0], descr[2], block_size_z);
    load_subblock_into_buffer_cuda(descr[1], descr[3], block_size_z);
    cudaStreamSynchronize(starpu_cuda_get_local_stream());
}

/* bottom save, CUDA version */
static void dummy_func_bottom_cuda(void *descr[] __attribute__((unused)), void *arg)
{
    struct block_description *block = arg;
    int workerid = starpu_worker_get_id();
    bottom_per_worker[workerid]++;

    DEBUG("DO SAVE Bottom block %d on CUDA\n", block->bz);

    load_subblock_into_buffer_cuda(descr[0], descr[2], K);
    load_subblock_into_buffer_cuda(descr[1], descr[3], K);
    cudaStreamSynchronize(starpu_cuda_get_local_stream());
}
#endif /* STARPU_USE_CUDA */
/* Performance models and codelets for save */
static struct starpu_perfmodel_t save_cl_bottom_model = {
    .type = STARPU_HISTORY_BASED,
    .symbol = "save_cl_bottom"
};

static struct starpu_perfmodel_t save_cl_top_model = {
    .type = STARPU_HISTORY_BASED,
    .symbol = "save_cl_top"
};

starpu_codelet save_cl_bottom = {
    .where =
#ifdef STARPU_USE_CUDA
        STARPU_CUDA|
#endif
        STARPU_CPU,
    .cpu_func = dummy_func_bottom_cpu,
#ifdef STARPU_USE_CUDA
    .cuda_func = dummy_func_bottom_cuda,
#endif
    .model = &save_cl_bottom_model,
    .nbuffers = 4
};

starpu_codelet save_cl_top = {
    .where =
#ifdef STARPU_USE_CUDA
        STARPU_CUDA|
#endif
        STARPU_CPU,
    .cpu_func = dummy_func_top_cpu,
#ifdef STARPU_USE_CUDA
    .cuda_func = dummy_func_top_cuda,
#endif
    .model = &save_cl_top_model,
    .nbuffers = 4
};
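
/*
 * Illustration only (not in the original file): the save codelets take four
 * buffers (the two copies of the computation block, then the two copies of
 * the boundary buffer, as the dummy_func_* kernels above access them).  A
 * hypothetical submission sketch, mirroring the cl_update one; the handle
 * names and access modes are this sketch's assumptions.
 */
#if 0
static void submit_save_top_sketch(struct block_description *block,
                                   starpu_data_handle block_handle[2],
                                   starpu_data_handle top_boundary_handle[2])
{
    struct starpu_task *task = starpu_task_create();
    unsigned i;

    task->cl = &save_cl_top;
    task->cl_arg = block;

    for (i = 0; i < 2; i++)
    {
        task->buffers[i].handle = block_handle[i];           /* read the block ... */
        task->buffers[i].mode = STARPU_R;
        task->buffers[2+i].handle = top_boundary_handle[i];  /* ... write its top boundary */
        task->buffers[2+i].mode = STARPU_W;
    }

    int ret = starpu_task_submit(task);
    STARPU_ASSERT(!ret);
}
#endif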