dw_mult_no_stride.c 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401
  1. /*
  2. * StarPU
  3. * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. *
  14. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. */
  16. #include "dw_mult.h"
  17. #ifdef USE_GORDON
  18. #include "gordon/func_sgemm_ibm.h"
  19. #endif
  20. float *A[MAXSLICESY][MAXSLICESZ];
  21. float *B[MAXSLICESZ][MAXSLICESX];
  22. float *C[MAXSLICESY][MAXSLICESX];
  23. starpu_data_handle A_state[MAXSLICESY][MAXSLICESZ];
  24. starpu_data_handle B_state[MAXSLICESZ][MAXSLICESX];
  25. starpu_data_handle C_state[MAXSLICESY][MAXSLICESX];
  26. #define TAG(x,y,z,iter) \
  27. ((starpu_tag_t)((z) + (iter)*nslicesz + (x)*(nslicesz*niter) + (y)*(nslicesx*nslicesz*niter)))
  28. static void submit_new_iter(unsigned x, unsigned y, unsigned iter);
/*
 * This program computes C = A * B
 *
 *   A of size (z,y)
 *   B of size (x,z)
 *   C of size (x,y)
 *
 *              |---------------|
 *            z |       B       |
 *              |---------------|
 *       z              x
 *     |----|   |---------------|
 *     |    |   |               |
 *     |    |   |               |
 *     | A  | y |       C       |
 *     |    |   |               |
 *     |    |   |               |
 *     |----|   |---------------|
 */
  47. #define MEM_ALIGNMENT 16
  48. static void init_problem_data(void)
  49. {
  50. unsigned i,j;
  51. /* Allocate grids of buffer */
  52. /* TODO pin ... */
  53. unsigned z, y, x;
  54. for (y = 0; y < nslicesy; y++)
  55. {
  56. for (z = 0; z < nslicesz; z++)
  57. {
  58. #ifdef HAVE_POSIX_MEMALIGN
  59. posix_memalign((void **)&A[y][z], MEM_ALIGNMENT, BLOCKSIZEZ*BLOCKSIZEY*sizeof(float));
  60. #else
  61. A[y][z] = malloc(BLOCKSIZEZ*BLOCKSIZEY*sizeof(float));
  62. #endif
  63. assert(A[y][z]);
  64. }
  65. }
  66. for (z = 0; z < nslicesz; z++)
  67. {
  68. for (x = 0; x < nslicesx; x++)
  69. {
  70. #ifdef HAVE_POSIX_MEMALIGN
  71. posix_memalign((void **)&B[z][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEZ*sizeof(float));
  72. #else
  73. B[z][x] = malloc(BLOCKSIZEX*BLOCKSIZEZ*sizeof(float));
  74. #endif
  75. assert(B[z][x]);
  76. }
  77. }
  78. for (y = 0; y < nslicesy; y++)
  79. {
  80. for (x = 0; x < nslicesx; x++)
  81. {
  82. #ifdef HAVE_POSIX_MEMALIGN
  83. posix_memalign((void **)&C[y][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEY*sizeof(float));
  84. #else
  85. C[y][x] = malloc(BLOCKSIZEX*BLOCKSIZEY*sizeof(float));
  86. #endif
  87. assert(C[y][x]);
  88. }
  89. }
  90. /* fill the A and B matrices */
  91. unsigned blockx, blocky, blockz;
  92. if (norandom) {
  93. for (blocky = 0; blocky < nslicesy; blocky++)
  94. for (blockz = 0; blockz < nslicesz; blockz++)
  95. for (j = 0; j < BLOCKSIZEY; j++)
  96. for (i = 0; i < BLOCKSIZEZ; i++)
  97. {
  98. A[blocky][blockz][i*BLOCKSIZEY + j] = (float)(1 + blockz + blocky*nslicesz);
  99. }
  100. for (blockz = 0; blockz < nslicesz; blockz++)
  101. for (blockx = 0; blockx < nslicesx; blockx++)
  102. for (j = 0; j < BLOCKSIZEZ; j++)
  103. for (i = 0; i < BLOCKSIZEX; i++)
  104. {
  105. B[blockz][blockx][i*BLOCKSIZEZ + j] = (float)(1 + blockx + blockz*nslicesx);
  106. }
  107. }
  108. else {
  109. for (blocky = 0; blocky < nslicesy; blocky++)
  110. for (blockz = 0; blockz < nslicesz; blockz++)
  111. for (j = 0; j < BLOCKSIZEY; j++)
  112. for (i = 0; i < BLOCKSIZEZ; i++)
  113. {
  114. A[blocky][blockz][i*BLOCKSIZEY + j] = (float)(starpu_drand48());
  115. }
  116. for (blockz = 0; blockz < nslicesz; blockz++)
  117. for (blockx = 0; blockx < nslicesx; blockx++)
  118. for (j = 0; j < BLOCKSIZEZ; j++)
  119. for (i = 0; i < BLOCKSIZEX; i++)
  120. {
  121. B[blockz][blockx][i*BLOCKSIZEZ + j] = (float)(starpu_drand48());
  122. }
  123. }
  124. for (blocky = 0; blocky < nslicesy; blocky++)
  125. for (blockx = 0; blockx < nslicesx; blockx++)
  126. for (j = 0; j < BLOCKSIZEY; j++)
  127. for (i = 0; i < BLOCKSIZEX; i++)
  128. {
  129. C[blocky][blockx][i*BLOCKSIZEY + j] = (float)(blockx + blocky*nslicesx + 1);
  130. }
  131. /* declare the StarPU data to monitor */
  132. for (y = 0; y < nslicesy; y++)
  133. {
  134. for (z = 0; z < nslicesz; z++)
  135. {
  136. starpu_register_blas_data(&A_state[y][z], 0, (uintptr_t)A[y][z],
  137. BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEZ, sizeof(float));
  138. }
  139. }
  140. for (z = 0; z < nslicesz; z++)
  141. {
  142. for (x = 0; x < nslicesx; x++)
  143. {
  144. starpu_register_blas_data(&B_state[z][x], 0, (uintptr_t)B[z][x],
  145. BLOCKSIZEZ, BLOCKSIZEZ, BLOCKSIZEX, sizeof(float));
  146. }
  147. }
  148. for (y = 0; y < nslicesy; y++)
  149. {
  150. for (x = 0; x < nslicesx; x++)
  151. {
  152. starpu_register_blas_data(&C_state[y][x], 0, (uintptr_t)C[y][x],
  153. BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEX, sizeof(float));
  154. }
  155. }
  156. #ifdef USE_GORDON
  157. conf.k = BLOCKSIZEZ;
  158. conf.m = BLOCKSIZEY;
  159. conf.n = BLOCKSIZEX;
  160. #endif
  161. display_memory_consumption();
  162. }
  163. static void cleanup_problem(void)
  164. {
  165. unsigned z, y, x;
  166. for (y = 0; y < nslicesy; y++)
  167. {
  168. for (z = 0; z < nslicesz; z++)
  169. {
  170. // free(A[y][z]);
  171. }
  172. }
  173. for (z = 0; z < nslicesz; z++)
  174. {
  175. for (x = 0; x < nslicesx; x++)
  176. {
  177. // free(B[z][x]);
  178. }
  179. }
  180. for (y = 0; y < nslicesy; y++)
  181. {
  182. for (x = 0; x < nslicesx; x++)
  183. {
  184. // free(C[y][x]);
  185. starpu_tag_remove(TAG(nslicesz - 1, y, x, niter - 1));
  186. }
  187. }
  188. }
  189. struct cb2_s {
  190. unsigned blockx;
  191. unsigned blocky;
  192. unsigned iter;
  193. };
  194. static starpu_codelet cl = {
  195. .cpu_func = cpu_mult,
  196. #ifdef USE_CUDA
  197. .cuda_func = cublas_mult,
  198. #endif
  199. #ifdef USE_GORDON
  200. /* .gordon_func will be set by load_elf_sgemm */
  201. #endif
  202. .model = &sgemm_model,
  203. .where = STARPU_CPU|STARPU_CUDA|STARPU_GORDON,
  204. .nbuffers = 3
  205. };
  206. #ifdef USE_GORDON
  207. static const char *spu_func_sgemm_elf_file = "./gordon/func_sgemm_ibm.spuelf";
  208. static unsigned spu_func_sgemm_elf_id;
  209. static unsigned spu_func_sgemm_ibm_id;
  210. static void load_elf_sgemm(void)
  211. {
  212. spu_func_sgemm_elf_id =
  213. gordon_register_elf_plugin(spu_func_sgemm_elf_file);
  214. spu_func_sgemm_ibm_id = gordon_register_kernel(spu_func_sgemm_elf_id, "func_sgemm_ibm");
  215. gordon_load_plugin_on_all_spu(spu_func_sgemm_elf_id);
  216. gordon_load_kernel_on_all_spu(spu_func_sgemm_ibm_id);
  217. cl.gordon_func = spu_func_sgemm_ibm_id;
  218. }
  219. #endif // USE_GORDON
  220. static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, unsigned iter)
  221. {
  222. struct starpu_task *task = starpu_task_create();
  223. task->cl = &cl;
  224. #ifdef USE_GORDON
  225. task->cl_arg = &conf;
  226. task->cl_arg_size = sizeof(struct ibm_sgemm_block_conf);
  227. #endif
  228. task->use_tag = 1;
  229. task->tag_id = TAG(z, y, x, iter);
  230. task->buffers[0].handle = A_state[y][z];
  231. task->buffers[0].mode = STARPU_R;
  232. task->buffers[1].handle = B_state[z][x];
  233. task->buffers[1].mode = STARPU_R;
  234. task->buffers[2].handle = C_state[y][x];
  235. task->buffers[2].mode = STARPU_RW;
  236. return task;
  237. }
  238. static void callback_func_2(void *arg)
  239. {
  240. /* the argument is a pointer to a counter of the remaining tasks */
  241. struct cb2_s *cb2 = arg;
  242. unsigned x,y,z,iter;
  243. iter = cb2->iter;
  244. x = cb2->blockx;
  245. y = cb2->blocky;
  246. free(cb2);
  247. /* do some accounting */
  248. int id = starpu_get_worker_id();
  249. flop_per_worker[id] += BLAS3_FLOP(BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
  250. ls_per_worker[id] += BLAS3_LS(BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
  251. /* TAG(nslicesz - 1, y, x, iter) remains ... */
  252. for (z = 0; z < nslicesz - 1; z++)
  253. {
  254. starpu_tag_remove(TAG(z, y, x, iter));
  255. }
  256. if (iter > 0)
  257. {
  258. starpu_tag_remove(TAG(nslicesz - 1, y, x, iter-1));
  259. }
  260. if (iter != niter - 1) {
  261. submit_new_iter(x, y, iter+1);
  262. }
  263. }
  264. static void submit_new_iter(unsigned x, unsigned y, unsigned iter)
  265. {
  266. unsigned z;
  267. for (z = 0; z < nslicesz; z++)
  268. {
  269. struct starpu_task *task;
  270. task = construct_task(x, y, z, iter);
  271. if (z != 0) {
  272. starpu_tag_declare_deps(TAG(z, y, x, iter), 1, TAG(z-1, y, x, iter));
  273. }
  274. if (z == nslicesz - 1) {
  275. struct cb2_s *cb2 = malloc(sizeof(struct cb2_s));
  276. cb2->blockx = x;
  277. cb2->blocky = y;
  278. cb2->iter = iter;
  279. task->callback_func = callback_func_2;
  280. task->callback_arg = cb2;
  281. }
  282. starpu_submit_task(task);
  283. }
  284. }
  285. static void launch_codelets(void)
  286. {
  287. #ifdef USE_FXT
  288. fxt_register_thread(0);
  289. #endif
  290. /* partition the work into slices */
  291. unsigned taskx, tasky;
  292. srand(time(NULL));
  293. gettimeofday(&start, NULL);
  294. for (taskx = 0; taskx < nslicesx; taskx++)
  295. for (tasky = 0; tasky < nslicesy; tasky++)
  296. {
  297. submit_new_iter(taskx, tasky, 0);
  298. }
  299. }
  300. int main(__attribute__ ((unused)) int argc,
  301. __attribute__ ((unused)) char **argv)
  302. {
  303. parse_args(argc, argv);
  304. /* start the runtime */
  305. starpu_init(NULL);
  306. starpu_helper_init_cublas();
  307. #ifdef USE_GORDON
  308. load_elf_sgemm();
  309. #endif
  310. init_problem_data();
  311. launch_codelets();
  312. starpu_wait_all_tasks();
  313. gettimeofday(&end, NULL);
  314. double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  315. display_stats(timing);
  316. cleanup_problem();
  317. starpu_helper_shutdown_cublas();
  318. starpu_shutdown();
  319. return 0;
  320. }