stencil-blocks.c 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2010,2011,2013-2017 Université de Bordeaux
  4. * Copyright (C) 2013 Inria
  5. * Copyright (C) 2011,2013,2015-2017,2019 CNRS
  6. *
  7. * StarPU is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU Lesser General Public License as published by
  9. * the Free Software Foundation; either version 2.1 of the License, or (at
  10. * your option) any later version.
  11. *
  12. * StarPU is distributed in the hope that it will be useful, but
  13. * WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  15. *
  16. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  17. */
  18. #include "stencil.h"
  19. #include <math.h>
  20. /* Manage block and tags allocation */
  21. static struct block_description *blocks;
  22. static unsigned sizex, sizey, sizez;
  23. static unsigned nbz;
  24. static unsigned *block_sizes_z;
  25. /*
  26. * Tags for various codelet completion
  27. */
  28. /*
  29. * common tag format:
  30. */
  31. static starpu_tag_t tag_common(int z, int dir, int type)
  32. {
  33. return (((((starpu_tag_t)type) << 4) | ((dir+1)/2)) << 32)|(starpu_tag_t)z;
  34. }
  35. /* Completion of last update tasks */
  36. starpu_tag_t TAG_FINISH(int z)
  37. {
  38. z = (z + nbz)%nbz;
  39. starpu_tag_t tag = tag_common(z, 0, 1);
  40. return tag;
  41. }
  42. /* Completion of the save codelet for MPI send/recv */
  43. starpu_tag_t TAG_START(int z, int dir)
  44. {
  45. z = (z + nbz)%nbz;
  46. starpu_tag_t tag = tag_common(z, dir, 2);
  47. return tag;
  48. }
  49. /*
  50. * common MPI tag format:
  51. * iter is actually not needed for coherency, but it makes debugging easier
  52. */
  53. static int mpi_tag_common(int z, int iter, int dir, int buffer)
  54. {
  55. return (((((iter << 12)|z)<<4) | ((1+dir)/2))<<4)|buffer;
  56. }
  57. int MPI_TAG0(int z, int iter, int dir)
  58. {
  59. z = (z + nbz)%nbz;
  60. int tag = mpi_tag_common(z, iter, dir, 0);
  61. return tag;
  62. }
  63. int MPI_TAG1(int z, int iter, int dir)
  64. {
  65. z = (z + nbz)%nbz;
  66. int tag = mpi_tag_common(z, iter, dir, 1);
  67. return tag;
  68. }
  69. /*
  70. * Block descriptors
  71. */
  72. /* Compute the size of the different blocks */
  73. static void compute_block_sizes(void)
  74. {
  75. block_sizes_z = (unsigned *) malloc(nbz*sizeof(unsigned));
  76. STARPU_ASSERT(block_sizes_z);
  77. /* Perhaps the last chunk is smaller */
  78. unsigned default_block_size = (sizez+nbz-1)/nbz;
  79. unsigned remaining = sizez;
  80. unsigned b;
  81. for (b = 0; b < nbz; b++)
  82. {
  83. block_sizes_z[b] = MIN(default_block_size, remaining);
  84. remaining -= block_sizes_z[b];
  85. }
  86. STARPU_ASSERT(remaining == 0);
  87. }
  88. unsigned get_block_size(int bz)
  89. {
  90. return block_sizes_z[bz];
  91. }
  92. struct block_description *get_block_description(int z)
  93. {
  94. z = (z + nbz)%nbz;
  95. STARPU_ASSERT(&blocks[z]);
  96. return &blocks[z];
  97. }
  98. int get_block_mpi_node(int z)
  99. {
  100. z = (z + nbz)%nbz;
  101. return blocks[z].mpi_node;
  102. }
  103. void create_blocks_array(unsigned _sizex, unsigned _sizey, unsigned _sizez, unsigned _nbz)
  104. {
  105. /* Store the parameters */
  106. nbz = _nbz;
  107. sizex = _sizex;
  108. sizey = _sizey;
  109. sizez = _sizez;
  110. /* Create a grid of block descriptors */
  111. blocks = (struct block_description *) calloc(nbz, sizeof(struct block_description));
  112. STARPU_ASSERT(blocks);
  113. /* What is the size of the different blocks ? */
  114. compute_block_sizes();
  115. unsigned bz;
  116. for (bz = 0; bz < nbz; bz++)
  117. {
  118. struct block_description * block =
  119. get_block_description(bz);
  120. /* Which block is it ? */
  121. block->bz = bz;
  122. /* For simplicity, we store which are the neighbours blocks */
  123. block->boundary_blocks[B] = get_block_description((bz-1+nbz)%nbz);
  124. block->boundary_blocks[T] = get_block_description((bz+1)%nbz);
  125. }
  126. }
  127. void free_blocks_array()
  128. {
  129. free(blocks);
  130. free(block_sizes_z);
  131. }
  132. /*
  133. * Initialization of the blocks
  134. */
  135. void assign_blocks_to_workers(int rank)
  136. {
  137. unsigned bz;
  138. /* NB: perhaps we could count a GPU as multiple workers */
  139. /* how many workers are there ? */
  140. /*unsigned nworkers = starpu_worker_get_count();*/
  141. /* how many blocks are on that MPI node ? */
  142. // unsigned nblocks = 0;
  143. // for (bz = 0; bz < nbz; bz++)
  144. // {
  145. // struct block_description *block =
  146. // get_block_description(bz);
  147. //
  148. // if (block->mpi_node == rank)
  149. // nblocks++;
  150. // }
  151. /* how many blocks per worker ? */
  152. /*unsigned nblocks_per_worker = (nblocks + nworkers - 1)/nworkers;*/
  153. /* we now attribute up to nblocks_per_worker blocks per workers */
  154. unsigned attributed = 0;
  155. for (bz = 0; bz < nbz; bz++)
  156. {
  157. struct block_description *block =
  158. get_block_description(bz);
  159. if (block->mpi_node == rank)
  160. {
  161. unsigned workerid;
  162. /* Manage initial block distribution between CPU and GPU */
  163. #if 0
  164. #if 1
  165. /* GPUs then CPUs */
  166. if (attributed < 3*18)
  167. workerid = attributed / 18;
  168. else
  169. workerid = 3+ (attributed - 3*18) / 2;
  170. #else
  171. /* GPUs interleaved with CPUs */
  172. if ((attributed % 20) <= 1)
  173. workerid = 3 + attributed / 20;
  174. else if (attributed < 60)
  175. workerid = attributed / 20;
  176. else
  177. workerid = (attributed - 60)/2 + 6;
  178. #endif
  179. #else
  180. /* Only GPUS */
  181. workerid = (attributed / 21) % 3;
  182. #endif
  183. /*= attributed/nblocks_per_worker;*/
  184. block->preferred_worker = workerid;
  185. attributed++;
  186. }
  187. }
  188. }
  189. void assign_blocks_to_mpi_nodes(int world_size)
  190. {
  191. unsigned nzblocks_per_process = (nbz + world_size - 1) / world_size;
  192. unsigned bz;
  193. for (bz = 0; bz < nbz; bz++)
  194. {
  195. struct block_description *block =
  196. get_block_description(bz);
  197. block->mpi_node = bz / nzblocks_per_process;
  198. }
  199. }
  200. static size_t allocated = 0;
  201. static void allocate_block_on_node(starpu_data_handle_t *handleptr, unsigned bz, TYPE **ptr, unsigned nx, unsigned ny, unsigned nz)
  202. {
  203. int ret;
  204. size_t block_size = nx*ny*nz*sizeof(TYPE);
  205. /* Allocate memory */
  206. #if 1
  207. ret = starpu_malloc_flags((void **)ptr, block_size, STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
  208. STARPU_ASSERT(ret == 0);
  209. #else
  210. *ptr = malloc(block_size);
  211. STARPU_ASSERT(*ptr);
  212. #endif
  213. allocated += block_size;
  214. #ifndef STARPU_SIMGRID
  215. /* Fill the blocks with 0 */
  216. memset(*ptr, 0, block_size);
  217. #endif
  218. /* Register it to StarPU */
  219. starpu_block_data_register(handleptr, STARPU_MAIN_RAM, (uintptr_t)*ptr, nx, nx*ny, nx, ny, nz, sizeof(TYPE));
  220. starpu_data_set_coordinates(*handleptr, 1, bz);
  221. }
  222. static void free_block_on_node(starpu_data_handle_t handleptr, unsigned nx, unsigned ny, unsigned nz)
  223. {
  224. void *ptr = (void *) starpu_block_get_local_ptr(handleptr);
  225. size_t block_size = nx*ny*nz*sizeof(TYPE);
  226. starpu_data_unregister(handleptr);
  227. starpu_free_flags(ptr, block_size, STARPU_MALLOC_PINNED|STARPU_MALLOC_SIMULATION_FOLDED);
  228. }
  229. void display_memory_consumption(int rank)
  230. {
  231. FPRINTF(stderr, "%lu B of memory were allocated on node %d\n", (unsigned long) allocated, rank);
  232. }
  233. void allocate_memory_on_node(int rank)
  234. {
  235. unsigned bz;
  236. for (bz = 0; bz < nbz; bz++)
  237. {
  238. struct block_description *block = get_block_description(bz);
  239. int node = block->mpi_node;
  240. /* Main blocks */
  241. if (node == rank)
  242. {
  243. unsigned size_bz = block_sizes_z[bz];
  244. allocate_block_on_node(&block->layers_handle[0], bz, &block->layers[0],
  245. (sizex + 2*K), (sizey + 2*K), (size_bz + 2*K));
  246. #ifndef STARPU_SIMGRID
  247. #ifdef LIFE
  248. unsigned x, y, z;
  249. unsigned sum = 0;
  250. for (x = 0; x < sizex; x++)
  251. for (y = 0; y < sizey; y++)
  252. for (z = 0; z < size_bz; z++)
  253. /* Just random data */
  254. sum += block->layers[0][(K+x)+(K+y)*(sizex + 2*K)+(K+z)*(sizex+2*K)*(sizey+2*K)] = (int)((x/7.+y/13.+(bz*size_bz + z)/17.) * 10.) % 2;
  255. /* printf("block %d starts with %d/%d alive\n", bz, sum, sizex*sizey*size_bz);*/
  256. #endif
  257. #endif
  258. allocate_block_on_node(&block->layers_handle[1], bz, &block->layers[1],
  259. (sizex + 2*K), (sizey + 2*K), (size_bz + 2*K));
  260. }
  261. /* Boundary blocks : Top */
  262. int top_node = block->boundary_blocks[T]->mpi_node;
  263. if ((node == rank) || (top_node == rank))
  264. {
  265. allocate_block_on_node(&block->boundaries_handle[T][0], bz, &block->boundaries[T][0],
  266. (sizex + 2*K), (sizey + 2*K), K);
  267. allocate_block_on_node(&block->boundaries_handle[T][1], bz, &block->boundaries[T][1],
  268. (sizex + 2*K), (sizey + 2*K), K);
  269. }
  270. /* Boundary blocks : Bottom */
  271. int bottom_node = block->boundary_blocks[B]->mpi_node;
  272. if ((node == rank) || (bottom_node == rank))
  273. {
  274. allocate_block_on_node(&block->boundaries_handle[B][0], bz, &block->boundaries[B][0],
  275. (sizex + 2*K), (sizey + 2*K), K);
  276. allocate_block_on_node(&block->boundaries_handle[B][1], bz, &block->boundaries[B][1],
  277. (sizex + 2*K), (sizey + 2*K), K);
  278. }
  279. }
  280. }
  281. void free_memory_on_node(int rank)
  282. {
  283. unsigned bz;
  284. for (bz = 0; bz < nbz; bz++)
  285. {
  286. struct block_description *block = get_block_description(bz);
  287. int node = block->mpi_node;
  288. /* Main blocks */
  289. if (node == rank)
  290. {
  291. free_block_on_node(block->layers_handle[0], (sizex + 2*K), (sizey + 2*K), K);
  292. free_block_on_node(block->layers_handle[1], (sizex + 2*K), (sizey + 2*K), K);
  293. }
  294. /* Boundary blocks : Top */
  295. int top_node = block->boundary_blocks[T]->mpi_node;
  296. if ((node == rank) || (top_node == rank))
  297. {
  298. free_block_on_node(block->boundaries_handle[T][0], (sizex + 2*K), (sizey + 2*K), K);
  299. free_block_on_node(block->boundaries_handle[T][1], (sizex + 2*K), (sizey + 2*K), K);
  300. }
  301. /* Boundary blocks : Bottom */
  302. int bottom_node = block->boundary_blocks[B]->mpi_node;
  303. if ((node == rank) || (bottom_node == rank))
  304. {
  305. free_block_on_node(block->boundaries_handle[B][0], (sizex + 2*K), (sizey + 2*K), K);
  306. free_block_on_node(block->boundaries_handle[B][1], (sizex + 2*K), (sizey + 2*K), K);
  307. }
  308. }
  309. }
  310. /* check how many cells are alive */
  311. void check(int rank)
  312. {
  313. unsigned bz;
  314. for (bz = 0; bz < nbz; bz++)
  315. {
  316. struct block_description *block = get_block_description(bz);
  317. int node = block->mpi_node;
  318. /* Main blocks */
  319. if (node == rank)
  320. {
  321. #ifdef LIFE
  322. unsigned size_bz = block_sizes_z[bz];
  323. unsigned x, y, z;
  324. unsigned sum = 0;
  325. for (x = 0; x < sizex; x++)
  326. for (y = 0; y < sizey; y++)
  327. for (z = 0; z < size_bz; z++)
  328. sum += block->layers[0][(K+x)+(K+y)*(sizex + 2*K)+(K+z)*(sizex+2*K)*(sizey+2*K)];
  329. printf("block %u got %u/%u alive\n", bz, sum, sizex*sizey*size_bz);
  330. #endif
  331. }
  332. }
  333. }