dw_mult_no_stride.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009, 2010 Université de Bordeaux 1
  4. * Copyright (C) 2010 Centre National de la Recherche Scientifique
  5. *
  6. * StarPU is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU Lesser General Public License as published by
  8. * the Free Software Foundation; either version 2.1 of the License, or (at
  9. * your option) any later version.
  10. *
  11. * StarPU is distributed in the hope that it will be useful, but
  12. * WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14. *
  15. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  16. */
  17. #include "simple.h"
  18. #include "dw_mult.h"
  19. #ifdef STARPU_USE_GORDON
  20. #include "gordon/func_sgemm_ibm.h"
  21. #endif
  22. #include "xgemm_kernels.c"
  23. TYPE *A[MAXSLICESY][MAXSLICESZ];
  24. TYPE *B[MAXSLICESZ][MAXSLICESX];
  25. TYPE *C[MAXSLICESY][MAXSLICESX];
  26. starpu_data_handle A_state[MAXSLICESY][MAXSLICESZ];
  27. starpu_data_handle B_state[MAXSLICESZ][MAXSLICESX];
  28. starpu_data_handle C_state[MAXSLICESY][MAXSLICESX];
  29. #define TAG(x,y,z,iter) \
  30. ((starpu_tag_t)((z) + (iter)*nslicesz + (x)*(nslicesz*niter) + (y)*(nslicesx*nslicesz*niter)))
  31. static void submit_new_iter(unsigned x, unsigned y, unsigned iter);
  32. /*
  33. * This program computes C = A * B
  34. *
  35. * The difference with xgemm.c is that matrices are here already split in
  36. * blocks, and thus no data partitioning is needed.
  37. *
  38. * A of size (z,y)
  39. * B of size (x,z)
  40. * C of size (x,y)
  41. |---------------|
  42. z | B |
  43. |---------------|
  44. z x
  45. |----| |---------------|
  46. | | | |
  47. | | | |
  48. | A | y | C |
  49. | | | |
  50. | | | |
  51. |----| |---------------|
  52. */
  53. #define MEM_ALIGNMENT 16
  54. static void init_problem_data(void)
  55. {
  56. unsigned i,j;
  57. /* debug ... */
  58. memset(A, 0, MAXSLICESY*MAXSLICESZ*sizeof(TYPE *));
  59. memset(B, 0, MAXSLICESZ*MAXSLICESZ*sizeof(TYPE *));
  60. memset(C, 0, MAXSLICESY*MAXSLICESX*sizeof(TYPE *));
  61. memset(&A_state, 0, MAXSLICESY*MAXSLICESZ*sizeof(starpu_data_handle));
  62. memset(&B_state, 0, MAXSLICESZ*MAXSLICESZ*sizeof(starpu_data_handle));
  63. memset(&C_state, 0, MAXSLICESY*MAXSLICESX*sizeof(starpu_data_handle));
  64. /* Allocate grids of buffer */
  65. /* TODO pin ... */
  66. unsigned z, y, x;
  67. for (y = 0; y < nslicesy; y++)
  68. {
  69. for (z = 0; z < nslicesz; z++)
  70. {
  71. #ifdef STARPU_HAVE_POSIX_MEMALIGN
  72. posix_memalign((void **)&A[y][z], MEM_ALIGNMENT, BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
  73. #else
  74. A[y][z] = malloc(BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
  75. #endif
  76. assert(A[y][z]);
  77. }
  78. }
  79. for (z = 0; z < nslicesz; z++)
  80. {
  81. for (x = 0; x < nslicesx; x++)
  82. {
  83. #ifdef STARPU_HAVE_POSIX_MEMALIGN
  84. posix_memalign((void **)&B[z][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
  85. #else
  86. B[z][x] = malloc(BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
  87. #endif
  88. assert(B[z][x]);
  89. }
  90. }
  91. for (y = 0; y < nslicesy; y++)
  92. {
  93. for (x = 0; x < nslicesx; x++)
  94. {
  95. #ifdef STARPU_HAVE_POSIX_MEMALIGN
  96. posix_memalign((void **)&C[y][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
  97. #else
  98. C[y][x] = malloc(BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
  99. #endif
  100. assert(C[y][x]);
  101. }
  102. }
  103. /* fill the A and B matrices */
  104. unsigned blockx, blocky, blockz;
  105. if (norandom) {
  106. for (blocky = 0; blocky < nslicesy; blocky++)
  107. for (blockz = 0; blockz < nslicesz; blockz++)
  108. for (j = 0; j < BLOCKSIZEY; j++)
  109. for (i = 0; i < BLOCKSIZEZ; i++)
  110. {
  111. A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(1 + blockz + blocky*nslicesz);
  112. }
  113. for (blockz = 0; blockz < nslicesz; blockz++)
  114. for (blockx = 0; blockx < nslicesx; blockx++)
  115. for (j = 0; j < BLOCKSIZEZ; j++)
  116. for (i = 0; i < BLOCKSIZEX; i++)
  117. {
  118. B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(1 + blockx + blockz*nslicesx);
  119. }
  120. }
  121. else {
  122. for (blocky = 0; blocky < nslicesy; blocky++)
  123. for (blockz = 0; blockz < nslicesz; blockz++)
  124. for (j = 0; j < BLOCKSIZEY; j++)
  125. for (i = 0; i < BLOCKSIZEZ; i++)
  126. {
  127. A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(starpu_drand48());
  128. }
  129. for (blockz = 0; blockz < nslicesz; blockz++)
  130. for (blockx = 0; blockx < nslicesx; blockx++)
  131. for (j = 0; j < BLOCKSIZEZ; j++)
  132. for (i = 0; i < BLOCKSIZEX; i++)
  133. {
  134. B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(starpu_drand48());
  135. }
  136. }
  137. for (blocky = 0; blocky < nslicesy; blocky++)
  138. for (blockx = 0; blockx < nslicesx; blockx++)
  139. for (j = 0; j < BLOCKSIZEY; j++)
  140. for (i = 0; i < BLOCKSIZEX; i++)
  141. {
  142. C[blocky][blockx][i*BLOCKSIZEY + j] = (TYPE)(blockx + blocky*nslicesx + 1);
  143. }
  144. /* TODO: aren't we supposed to set data consistency to relaxed, since
  145. * tags are supposed to provide the correct dependencies? */
  146. /* declare the StarPU data to monitor */
  147. for (y = 0; y < nslicesy; y++)
  148. {
  149. for (z = 0; z < nslicesz; z++)
  150. {
  151. starpu_matrix_data_register(&A_state[y][z], 0, (uintptr_t)A[y][z],
  152. BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEZ, sizeof(TYPE));
  153. }
  154. }
  155. for (z = 0; z < nslicesz; z++)
  156. {
  157. for (x = 0; x < nslicesx; x++)
  158. {
  159. starpu_matrix_data_register(&B_state[z][x], 0, (uintptr_t)B[z][x],
  160. BLOCKSIZEZ, BLOCKSIZEZ, BLOCKSIZEX, sizeof(TYPE));
  161. }
  162. }
  163. for (y = 0; y < nslicesy; y++)
  164. {
  165. for (x = 0; x < nslicesx; x++)
  166. {
  167. starpu_matrix_data_register(&C_state[y][x], 0, (uintptr_t)C[y][x],
  168. BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEX, sizeof(TYPE));
  169. }
  170. }
  171. #ifdef STARPU_USE_GORDON
  172. conf.k = BLOCKSIZEZ;
  173. conf.m = BLOCKSIZEY;
  174. conf.n = BLOCKSIZEX;
  175. #endif
  176. fprintf(stderr, "block size : x %d y %d z %d\n", BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
  177. display_memory_consumption();
  178. }
  179. static void cleanup_problem(void)
  180. {
  181. unsigned z, y, x;
  182. #ifdef CHECK_OUTPUT
  183. TYPE maxerr = 0.0;
  184. TYPE err;
  185. fprintf(stderr, "Checking results ....");
  186. for (y = 0; y < nslicesy; y++)
  187. {
  188. for (x = 0; x < nslicesx; x++)
  189. {
  190. for (z = 0; z < nslicesz; z++)
  191. {
  192. SGEMM("N", "N", BLOCKSIZEY, BLOCKSIZEX, BLOCKSIZEZ, -(TYPE)(niter), A[y][z], BLOCKSIZEY, B[z][x], BLOCKSIZEZ, 1.0f, C[y][x], BLOCKSIZEY);
  193. }
  194. /* make sure C - niter AB = 0 */
  195. err = SASUM(BLOCKSIZEX*BLOCKSIZEY, C[y][x], 1);
  196. if (err > BLOCKSIZEX*BLOCKSIZEY*niter*0.001)
  197. fprintf(stderr, "\nerr = %f ( x = %d y = %d ) ... ", err/niter, x, y );
  198. maxerr = STARPU_MAX(err, maxerr);
  199. }
  200. }
  201. if (maxerr > BLOCKSIZEX*BLOCKSIZEY*niter*0.001)
  202. {
  203. fprintf(stderr, " maxerr = %f\n", maxerr/niter);
  204. }
  205. else {
  206. fprintf(stderr, " OK\n");
  207. }
  208. fflush(stderr);
  209. #endif
  210. for (y = 0; y < nslicesy; y++)
  211. {
  212. for (z = 0; z < nslicesz; z++)
  213. {
  214. // free(A[y][z]);
  215. }
  216. }
  217. for (z = 0; z < nslicesz; z++)
  218. {
  219. for (x = 0; x < nslicesx; x++)
  220. {
  221. // free(B[z][x]);
  222. }
  223. }
  224. for (y = 0; y < nslicesy; y++)
  225. {
  226. for (x = 0; x < nslicesx; x++)
  227. {
  228. // free(C[y][x]);
  229. starpu_tag_remove(TAG(nslicesz - 1, y, x, niter - 1));
  230. }
  231. }
  232. }
  233. struct cb2_s {
  234. unsigned blockx;
  235. unsigned blocky;
  236. unsigned iter;
  237. };
  238. static starpu_codelet cl = {
  239. .where = STARPU_CPU|STARPU_CUDA
  240. #ifdef SPU_FUNC_SGEMM
  241. |STARPU_GORDON
  242. #endif
  243. ,
  244. .cpu_func = STARPU_GEMM(cpu_mult),
  245. #ifdef STARPU_USE_CUDA
  246. .cuda_func = STARPU_GEMM(cublas_mult),
  247. #endif
  248. #ifdef STARPU_USE_GORDON
  249. /* .gordon_func will be set by load_elf_sgemm */
  250. #endif
  251. .nbuffers = 3
  252. };
  253. #ifdef STARPU_USE_GORDON
  254. static const char *spu_func_sgemm_elf_file = "./gordon/func_sgemm_ibm.spuelf";
  255. static unsigned spu_func_sgemm_elf_id;
  256. static unsigned spu_func_sgemm_ibm_id;
  257. static void load_elf_sgemm(void)
  258. {
  259. spu_func_sgemm_elf_id =
  260. gordon_register_elf_plugin(spu_func_sgemm_elf_file);
  261. spu_func_sgemm_ibm_id = gordon_register_kernel(spu_func_sgemm_elf_id, "func_sgemm_ibm");
  262. gordon_load_plugin_on_all_spu(spu_func_sgemm_elf_id);
  263. gordon_load_kernel_on_all_spu(spu_func_sgemm_ibm_id);
  264. cl.gordon_func = spu_func_sgemm_ibm_id;
  265. }
  266. #endif // STARPU_USE_GORDON
  267. static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, unsigned iter)
  268. {
  269. /* A B[task] = C[task] */
  270. struct starpu_task *task = starpu_task_create();
  271. task->cl = &cl;
  272. task->use_tag = 1;
  273. task->tag_id = TAG(z, y, x, iter);
  274. task->buffers[0].handle = A_state[y][z];
  275. task->buffers[0].mode = STARPU_R;
  276. task->buffers[1].handle = B_state[z][x];
  277. task->buffers[1].mode = STARPU_R;
  278. task->buffers[2].handle = C_state[y][x];
  279. task->buffers[2].mode = STARPU_RW;
  280. #ifdef STARPU_USE_GORDON
  281. task->cl_arg = &conf;
  282. task->cl_arg_size = sizeof(struct ibm_sgemm_block_conf);
  283. #endif
  284. return task;
  285. }
  286. static void callback_func_2(void *arg)
  287. {
  288. /* the argument is a pointer to a counter of the remaining tasks */
  289. struct cb2_s *cb2 = arg;
  290. unsigned x,y,z,iter;
  291. iter = cb2->iter;
  292. x = cb2->blockx;
  293. y = cb2->blocky;
  294. free(cb2);
  295. /* do some accounting */
  296. int id = starpu_worker_get_id();
  297. flop_per_worker[id] += BLAS3_FLOP(BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
  298. ls_per_worker[id] += BLAS3_LS(BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
  299. /* TAG(nslicesz - 1, y, x, iter) remains ... */
  300. for (z = 0; z < nslicesz - 1; z++)
  301. {
  302. starpu_tag_remove(TAG(z, y, x, iter));
  303. }
  304. if (iter > 0)
  305. {
  306. starpu_tag_remove(TAG(nslicesz - 1, y, x, iter-1));
  307. }
  308. if (iter != niter - 1) {
  309. submit_new_iter(x, y, iter+1);
  310. }
  311. }
  312. static void submit_new_iter(unsigned x, unsigned y, unsigned iter)
  313. {
  314. unsigned z;
  315. for (z = 0; z < nslicesz; z++)
  316. {
  317. struct starpu_task *task;
  318. task = construct_task(x, y, z, iter);
  319. if (z != 0) {
  320. starpu_tag_declare_deps(TAG(z, y, x, iter), 1, TAG(z-1, y, x, iter));
  321. }
  322. if (z == nslicesz - 1) {
  323. struct cb2_s *cb2 = malloc(sizeof(struct cb2_s));
  324. cb2->blockx = x;
  325. cb2->blocky = y;
  326. cb2->iter = iter;
  327. task->callback_func = callback_func_2;
  328. task->callback_arg = cb2;
  329. }
  330. starpu_task_submit(task);
  331. }
  332. }
  333. static void launch_codelets(void)
  334. {
  335. #ifdef STARPU_USE_FXT
  336. _starpu_fxt_register_thread(0);
  337. #endif
  338. /* partition the work into slices */
  339. unsigned taskx, tasky;
  340. srand(time(NULL));
  341. /* should we use a single performance model for all archs and use an
  342. * acceleration factor ? */
  343. if (use_common_model) {
  344. cl.model = &STARPU_GEMM(model_common);
  345. }
  346. else {
  347. cl.model = &STARPU_GEMM(model);
  348. }
  349. for (taskx = 0; taskx < nslicesx; taskx++)
  350. {
  351. for (tasky = 0; tasky < nslicesy; tasky++)
  352. {
  353. submit_new_iter(taskx, tasky, 0);
  354. }
  355. }
  356. }
  357. int main(__attribute__ ((unused)) int argc,
  358. __attribute__ ((unused)) char **argv)
  359. {
  360. parse_args(argc, argv);
  361. /* start the runtime */
  362. starpu_init(NULL);
  363. starpu_helper_cublas_init();
  364. #ifdef STARPU_USE_GORDON
  365. load_elf_sgemm();
  366. #endif
  367. init_problem_data();
  368. gettimeofday(&start, NULL);
  369. launch_codelets();
  370. starpu_task_wait_for_all();
  371. gettimeofday(&end, NULL);
  372. double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  373. display_stats(timing);
  374. cleanup_problem();
  375. starpu_helper_cublas_shutdown();
  376. starpu_shutdown();
  377. return 0;
  378. }