stencil-blocks.c 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2010 Université de Bordeaux 1
  4. *
  5. * StarPU is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * StarPU is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. *
  14. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. */
  16. #include "stencil.h"
  17. #include <math.h>
  18. /* Manage block and tags allocation */
  19. static struct block_description *blocks;
  20. static unsigned sizex, sizey, sizez;
  21. static unsigned nbz;
  22. static unsigned *block_sizes_z;
  23. /*
  24. * Tags for various codelet completion
  25. */
  26. /*
  27. * common tag format:
  28. */
  29. static starpu_tag_t tag_common(int z, int dir, int type)
  30. {
  31. return (((((starpu_tag_t)type) << 4) | ((dir+1)/2)) << 32)|(starpu_tag_t)z;
  32. }
  33. /* Completion of last update tasks */
  34. starpu_tag_t TAG_FINISH(int z)
  35. {
  36. z = (z + nbz)%nbz;
  37. starpu_tag_t tag = tag_common(z, 0, 1);
  38. return tag;
  39. }
  40. /* Completion of the save codelet for MPI send/recv */
  41. starpu_tag_t TAG_START(int z, int dir)
  42. {
  43. z = (z + nbz)%nbz;
  44. starpu_tag_t tag = tag_common(z, dir, 2);
  45. return tag;
  46. }
  47. /*
  48. * common MPI tag format:
  49. * iter is actually not needed for coherency, but it makes debugging easier
  50. */
  51. static int mpi_tag_common(int z, int iter, int dir, int buffer)
  52. {
  53. return (((((iter << 12)|z)<<4) | ((1+dir)/2))<<4)|buffer;
  54. }
  55. int MPI_TAG0(int z, int iter, int dir)
  56. {
  57. z = (z + nbz)%nbz;
  58. int tag = mpi_tag_common(z, iter, dir, 0);
  59. return tag;
  60. }
  61. int MPI_TAG1(int z, int iter, int dir)
  62. {
  63. z = (z + nbz)%nbz;
  64. int tag = mpi_tag_common(z, iter, dir, 1);
  65. return tag;
  66. }
  67. /*
  68. * Block descriptors
  69. */
  70. /* Compute the size of the different blocks */
  71. static void compute_block_sizes(void)
  72. {
  73. block_sizes_z = (unsigned *) malloc(nbz*sizeof(unsigned));
  74. STARPU_ASSERT(block_sizes_z);
  75. /* Perhaps the last chunk is smaller */
  76. unsigned default_block_size = (sizez+nbz-1)/nbz;
  77. unsigned remaining = sizez;
  78. unsigned b;
  79. for (b = 0; b < nbz; b++)
  80. {
  81. block_sizes_z[b] = MIN(default_block_size, remaining);
  82. remaining -= block_sizes_z[b];
  83. }
  84. STARPU_ASSERT(remaining == 0);
  85. }
  86. unsigned get_block_size(int bz)
  87. {
  88. return block_sizes_z[bz];
  89. }
  90. struct block_description *get_block_description(int z)
  91. {
  92. z = (z + nbz)%nbz;
  93. STARPU_ASSERT(&blocks[z]);
  94. return &blocks[z];
  95. }
  96. unsigned get_block_mpi_node(int z)
  97. {
  98. z = (z + nbz)%nbz;
  99. return blocks[z].mpi_node;
  100. }
  101. void create_blocks_array(unsigned _sizex, unsigned _sizey, unsigned _sizez, unsigned _nbz)
  102. {
  103. /* Store the parameters */
  104. nbz = _nbz;
  105. sizex = _sizex;
  106. sizey = _sizey;
  107. sizez = _sizez;
  108. /* Create a grid of block descriptors */
  109. blocks = (struct block_description *) calloc(nbz, sizeof(struct block_description));
  110. STARPU_ASSERT(blocks);
  111. /* What is the size of the different blocks ? */
  112. compute_block_sizes();
  113. unsigned bz;
  114. for (bz = 0; bz < nbz; bz++)
  115. {
  116. struct block_description * block =
  117. get_block_description(bz);
  118. /* Which block is it ? */
  119. block->bz = bz;
  120. /* For simplicity, we store which are the neighbours blocks */
  121. block->boundary_blocks[B] = get_block_description((bz-1+nbz)%nbz);
  122. block->boundary_blocks[T] = get_block_description((bz+1)%nbz);
  123. }
  124. }
  125. /*
  126. * Initialization of the blocks
  127. */
  128. void assign_blocks_to_workers(int rank)
  129. {
  130. unsigned bz;
  131. /* NB: perhaps we could count a GPU as multiple workers */
  132. /* how many workers are there ? */
  133. /*unsigned nworkers = starpu_worker_get_count();*/
  134. /* how many blocks are on that MPI node ? */
  135. unsigned nblocks = 0;
  136. for (bz = 0; bz < nbz; bz++)
  137. {
  138. struct block_description *block =
  139. get_block_description(bz);
  140. if (block->mpi_node == rank)
  141. nblocks++;
  142. }
  143. /* how many blocks per worker ? */
  144. /*unsigned nblocks_per_worker = (nblocks + nworkers - 1)/nworkers;*/
  145. /* we now attribute up to nblocks_per_worker blocks per workers */
  146. unsigned attributed = 0;
  147. for (bz = 0; bz < nbz; bz++)
  148. {
  149. struct block_description *block =
  150. get_block_description(bz);
  151. if (block->mpi_node == rank)
  152. {
  153. unsigned workerid;
  154. /* Manage initial block distribution between CPU and GPU */
  155. #if 0
  156. #if 1
  157. /* GPUs then CPUs */
  158. if (attributed < 3*18)
  159. workerid = attributed / 18;
  160. else
  161. workerid = 3+ (attributed - 3*18) / 2;
  162. #else
  163. /* GPUs interleaved with CPUs */
  164. if ((attributed % 20) <= 1)
  165. workerid = 3 + attributed / 20;
  166. else if (attributed < 60)
  167. workerid = attributed / 20;
  168. else
  169. workerid = (attributed - 60)/2 + 6;
  170. #endif
  171. #else
  172. /* Only GPUS */
  173. workerid = (attributed / 21) % 3;
  174. #endif
  175. /*= attributed/nblocks_per_worker;*/
  176. block->preferred_worker = workerid;
  177. attributed++;
  178. }
  179. }
  180. }
  181. void assign_blocks_to_mpi_nodes(int world_size)
  182. {
  183. unsigned nzblocks_per_process = (nbz + world_size - 1) / world_size;
  184. unsigned bz;
  185. for (bz = 0; bz < nbz; bz++)
  186. {
  187. struct block_description *block =
  188. get_block_description(bz);
  189. block->mpi_node = bz / nzblocks_per_process;
  190. }
  191. }
  192. static size_t allocated = 0;
  193. static void allocate_block_on_node(starpu_data_handle_t *handleptr, TYPE **ptr, unsigned nx, unsigned ny, unsigned nz)
  194. {
  195. int ret;
  196. size_t block_size = nx*ny*nz*sizeof(TYPE);
  197. /* Allocate memory */
  198. #if 1
  199. ret = starpu_malloc((void **)ptr, block_size);
  200. STARPU_ASSERT(ret == 0);
  201. #else
  202. *ptr = malloc(block_size);
  203. STARPU_ASSERT(*ptr);
  204. #endif
  205. allocated += block_size;
  206. /* Fill the blocks with 0 */
  207. memset(*ptr, 0, block_size);
  208. /* Register it to StarPU */
  209. starpu_block_data_register(handleptr, 0, (uintptr_t)*ptr, nx, nx*ny, nx, ny, nz, sizeof(TYPE));
  210. }
  211. void display_memory_consumption(int rank)
  212. {
  213. fprintf(stderr, "%lu MB of memory were allocated on node %d\n", allocated/(1024*1024), rank);
  214. }
  215. void allocate_memory_on_node(int rank)
  216. {
  217. unsigned bz;
  218. for (bz = 0; bz < nbz; bz++)
  219. {
  220. struct block_description *block = get_block_description(bz);
  221. unsigned node = block->mpi_node;
  222. unsigned size_bz = block_sizes_z[bz];
  223. /* Main blocks */
  224. if (node == rank)
  225. {
  226. allocate_block_on_node(&block->layers_handle[0], &block->layers[0],
  227. (sizex + 2*K), (sizey + 2*K), (size_bz + 2*K));
  228. #ifdef LIFE
  229. unsigned x, y, z;
  230. unsigned sum = 0;
  231. for (x = 0; x < sizex; x++)
  232. for (y = 0; y < sizey; y++)
  233. for (z = 0; z < size_bz; z++)
  234. /* Just random data */
  235. sum += block->layers[0][(K+x)+(K+y)*(sizex + 2*K)+(K+z)*(sizex+2*K)*(sizey+2*K)] = (int)((x/7.+y/13.+(bz*size_bz + z)/17.) * 10.) % 2;
  236. /* printf("block %d starts with %d/%d alive\n", bz, sum, sizex*sizey*size_bz);*/
  237. #endif
  238. allocate_block_on_node(&block->layers_handle[1], &block->layers[1],
  239. (sizex + 2*K), (sizey + 2*K), (size_bz + 2*K));
  240. }
  241. /* Boundary blocks : Top */
  242. unsigned top_node = block->boundary_blocks[T]->mpi_node;
  243. if ((node == rank) || (top_node == rank))
  244. {
  245. allocate_block_on_node(&block->boundaries_handle[T][0], &block->boundaries[T][0],
  246. (sizex + 2*K), (sizey + 2*K), K);
  247. allocate_block_on_node(&block->boundaries_handle[T][1], &block->boundaries[T][1],
  248. (sizex + 2*K), (sizey + 2*K), K);
  249. }
  250. /* Boundary blocks : Bottom */
  251. unsigned bottom_node = block->boundary_blocks[B]->mpi_node;
  252. if ((node == rank) || (bottom_node == rank))
  253. {
  254. allocate_block_on_node(&block->boundaries_handle[B][0], &block->boundaries[B][0],
  255. (sizex + 2*K), (sizey + 2*K), K);
  256. allocate_block_on_node(&block->boundaries_handle[B][1], &block->boundaries[B][1],
  257. (sizex + 2*K), (sizey + 2*K), K);
  258. }
  259. }
  260. }
  261. /* check how many cells are alive */
  262. void check(int rank)
  263. {
  264. unsigned bz;
  265. for (bz = 0; bz < nbz; bz++)
  266. {
  267. struct block_description *block = get_block_description(bz);
  268. unsigned node = block->mpi_node;
  269. /* Main blocks */
  270. if (node == rank)
  271. {
  272. unsigned size_bz = block_sizes_z[bz];
  273. #ifdef LIFE
  274. unsigned x, y, z;
  275. unsigned sum = 0;
  276. for (x = 0; x < sizex; x++)
  277. for (y = 0; y < sizey; y++)
  278. for (z = 0; z < size_bz; z++)
  279. sum += block->layers[0][(K+x)+(K+y)*(sizex + 2*K)+(K+z)*(sizex+2*K)*(sizey+2*K)];
  280. printf("block %d got %d/%d alive\n", bz, sum, sizex*sizey*size_bz);
  281. #endif
  282. }
  283. }
  284. }