dw_mult_no_stride.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465
  1. /*
  2. * StarPU
  3. * Copyright (C) Université Bordeaux 1, CNRS 2008-2010 (see AUTHORS file)
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. *
  14. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. */
  16. #include "simple.h"
  17. #include "dw_mult.h"
  18. #ifdef STARPU_USE_GORDON
  19. #include "gordon/func_sgemm_ibm.h"
  20. #endif
  21. #include "xgemm_kernels.c"
  22. TYPE *A[MAXSLICESY][MAXSLICESZ];
  23. TYPE *B[MAXSLICESZ][MAXSLICESX];
  24. TYPE *C[MAXSLICESY][MAXSLICESX];
  25. starpu_data_handle A_state[MAXSLICESY][MAXSLICESZ];
  26. starpu_data_handle B_state[MAXSLICESZ][MAXSLICESX];
  27. starpu_data_handle C_state[MAXSLICESY][MAXSLICESX];
  28. #define TAG(x,y,z,iter) \
  29. ((starpu_tag_t)((z) + (iter)*nslicesz + (x)*(nslicesz*niter) + (y)*(nslicesx*nslicesz*niter)))
  30. static void submit_new_iter(unsigned x, unsigned y, unsigned iter);
  31. /*
  32. * This program computes C = A * B
  33. *
  34. * The difference with xgemm.c is that matrices are here already split in
  35. * blocks, and thus no data partitioning is needed.
  36. *
  37. * A of size (z,y)
  38. * B of size (x,z)
  39. * C of size (x,y)
  40. |---------------|
  41. z | B |
  42. |---------------|
  43. z x
  44. |----| |---------------|
  45. | | | |
  46. | | | |
  47. | A | y | C |
  48. | | | |
  49. | | | |
  50. |----| |---------------|
  51. */
  52. #define MEM_ALIGNMENT 16
  53. static void init_problem_data(void)
  54. {
  55. unsigned i,j;
  56. /* debug ... */
  57. memset(A, 0, MAXSLICESY*MAXSLICESZ*sizeof(TYPE *));
  58. memset(B, 0, MAXSLICESZ*MAXSLICESZ*sizeof(TYPE *));
  59. memset(C, 0, MAXSLICESY*MAXSLICESX*sizeof(TYPE *));
  60. memset(&A_state, 0, MAXSLICESY*MAXSLICESZ*sizeof(starpu_data_handle));
  61. memset(&B_state, 0, MAXSLICESZ*MAXSLICESZ*sizeof(starpu_data_handle));
  62. memset(&C_state, 0, MAXSLICESY*MAXSLICESX*sizeof(starpu_data_handle));
  63. /* Allocate grids of buffer */
  64. /* TODO pin ... */
  65. unsigned z, y, x;
  66. for (y = 0; y < nslicesy; y++)
  67. {
  68. for (z = 0; z < nslicesz; z++)
  69. {
  70. #ifdef STARPU_HAVE_POSIX_MEMALIGN
  71. posix_memalign((void **)&A[y][z], MEM_ALIGNMENT, BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
  72. #else
  73. A[y][z] = malloc(BLOCKSIZEZ*BLOCKSIZEY*sizeof(TYPE));
  74. #endif
  75. assert(A[y][z]);
  76. }
  77. }
  78. for (z = 0; z < nslicesz; z++)
  79. {
  80. for (x = 0; x < nslicesx; x++)
  81. {
  82. #ifdef STARPU_HAVE_POSIX_MEMALIGN
  83. posix_memalign((void **)&B[z][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
  84. #else
  85. B[z][x] = malloc(BLOCKSIZEX*BLOCKSIZEZ*sizeof(TYPE));
  86. #endif
  87. assert(B[z][x]);
  88. }
  89. }
  90. for (y = 0; y < nslicesy; y++)
  91. {
  92. for (x = 0; x < nslicesx; x++)
  93. {
  94. #ifdef STARPU_HAVE_POSIX_MEMALIGN
  95. posix_memalign((void **)&C[y][x], MEM_ALIGNMENT, BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
  96. #else
  97. C[y][x] = malloc(BLOCKSIZEX*BLOCKSIZEY*sizeof(TYPE));
  98. #endif
  99. assert(C[y][x]);
  100. }
  101. }
  102. /* fill the A and B matrices */
  103. unsigned blockx, blocky, blockz;
  104. if (norandom) {
  105. for (blocky = 0; blocky < nslicesy; blocky++)
  106. for (blockz = 0; blockz < nslicesz; blockz++)
  107. for (j = 0; j < BLOCKSIZEY; j++)
  108. for (i = 0; i < BLOCKSIZEZ; i++)
  109. {
  110. A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(1 + blockz + blocky*nslicesz);
  111. }
  112. for (blockz = 0; blockz < nslicesz; blockz++)
  113. for (blockx = 0; blockx < nslicesx; blockx++)
  114. for (j = 0; j < BLOCKSIZEZ; j++)
  115. for (i = 0; i < BLOCKSIZEX; i++)
  116. {
  117. B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(1 + blockx + blockz*nslicesx);
  118. }
  119. }
  120. else {
  121. for (blocky = 0; blocky < nslicesy; blocky++)
  122. for (blockz = 0; blockz < nslicesz; blockz++)
  123. for (j = 0; j < BLOCKSIZEY; j++)
  124. for (i = 0; i < BLOCKSIZEZ; i++)
  125. {
  126. A[blocky][blockz][i*BLOCKSIZEY + j] = (TYPE)(starpu_drand48());
  127. }
  128. for (blockz = 0; blockz < nslicesz; blockz++)
  129. for (blockx = 0; blockx < nslicesx; blockx++)
  130. for (j = 0; j < BLOCKSIZEZ; j++)
  131. for (i = 0; i < BLOCKSIZEX; i++)
  132. {
  133. B[blockz][blockx][i*BLOCKSIZEZ + j] = (TYPE)(starpu_drand48());
  134. }
  135. }
  136. for (blocky = 0; blocky < nslicesy; blocky++)
  137. for (blockx = 0; blockx < nslicesx; blockx++)
  138. for (j = 0; j < BLOCKSIZEY; j++)
  139. for (i = 0; i < BLOCKSIZEX; i++)
  140. {
  141. C[blocky][blockx][i*BLOCKSIZEY + j] = (TYPE)(blockx + blocky*nslicesx + 1);
  142. }
  143. /* TODO: aren't we supposed to set data consistency to relaxed, since
  144. * tags are supposed to provide the correct dependencies? */
  145. /* declare the StarPU data to monitor */
  146. for (y = 0; y < nslicesy; y++)
  147. {
  148. for (z = 0; z < nslicesz; z++)
  149. {
  150. starpu_matrix_data_register(&A_state[y][z], 0, (uintptr_t)A[y][z],
  151. BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEZ, sizeof(TYPE));
  152. }
  153. }
  154. for (z = 0; z < nslicesz; z++)
  155. {
  156. for (x = 0; x < nslicesx; x++)
  157. {
  158. starpu_matrix_data_register(&B_state[z][x], 0, (uintptr_t)B[z][x],
  159. BLOCKSIZEZ, BLOCKSIZEZ, BLOCKSIZEX, sizeof(TYPE));
  160. }
  161. }
  162. for (y = 0; y < nslicesy; y++)
  163. {
  164. for (x = 0; x < nslicesx; x++)
  165. {
  166. starpu_matrix_data_register(&C_state[y][x], 0, (uintptr_t)C[y][x],
  167. BLOCKSIZEY, BLOCKSIZEY, BLOCKSIZEX, sizeof(TYPE));
  168. }
  169. }
  170. #ifdef STARPU_USE_GORDON
  171. conf.k = BLOCKSIZEZ;
  172. conf.m = BLOCKSIZEY;
  173. conf.n = BLOCKSIZEX;
  174. #endif
  175. fprintf(stderr, "block size : x %d y %d z %d\n", BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
  176. display_memory_consumption();
  177. }
  178. static void cleanup_problem(void)
  179. {
  180. unsigned z, y, x;
  181. #ifdef CHECK_OUTPUT
  182. TYPE maxerr = 0.0;
  183. TYPE err;
  184. fprintf(stderr, "Checking results ....");
  185. for (y = 0; y < nslicesy; y++)
  186. {
  187. for (x = 0; x < nslicesx; x++)
  188. {
  189. for (z = 0; z < nslicesz; z++)
  190. {
  191. SGEMM("N", "N", BLOCKSIZEY, BLOCKSIZEX, BLOCKSIZEZ, -(TYPE)(niter), A[y][z], BLOCKSIZEY, B[z][x], BLOCKSIZEZ, 1.0f, C[y][x], BLOCKSIZEY);
  192. }
  193. /* make sure C - niter AB = 0 */
  194. err = SASUM(BLOCKSIZEX*BLOCKSIZEY, C[y][x], 1);
  195. if (err > BLOCKSIZEX*BLOCKSIZEY*niter*0.001)
  196. fprintf(stderr, "\nerr = %f ( x = %d y = %d ) ... ", err/niter, x, y );
  197. maxerr = STARPU_MAX(err, maxerr);
  198. }
  199. }
  200. if (maxerr > BLOCKSIZEX*BLOCKSIZEY*niter*0.001)
  201. {
  202. fprintf(stderr, " maxerr = %f\n", maxerr/niter);
  203. }
  204. else {
  205. fprintf(stderr, " OK\n");
  206. }
  207. fflush(stderr);
  208. #endif
  209. for (y = 0; y < nslicesy; y++)
  210. {
  211. for (z = 0; z < nslicesz; z++)
  212. {
  213. // free(A[y][z]);
  214. }
  215. }
  216. for (z = 0; z < nslicesz; z++)
  217. {
  218. for (x = 0; x < nslicesx; x++)
  219. {
  220. // free(B[z][x]);
  221. }
  222. }
  223. for (y = 0; y < nslicesy; y++)
  224. {
  225. for (x = 0; x < nslicesx; x++)
  226. {
  227. // free(C[y][x]);
  228. starpu_tag_remove(TAG(nslicesz - 1, y, x, niter - 1));
  229. }
  230. }
  231. }
  232. struct cb2_s {
  233. unsigned blockx;
  234. unsigned blocky;
  235. unsigned iter;
  236. };
  237. static starpu_codelet cl = {
  238. .where = STARPU_CPU|STARPU_CUDA
  239. #ifdef SPU_FUNC_SGEMM
  240. |STARPU_GORDON
  241. #endif
  242. ,
  243. .cpu_func = STARPU_GEMM(cpu_mult),
  244. #ifdef STARPU_USE_CUDA
  245. .cuda_func = STARPU_GEMM(cublas_mult),
  246. #endif
  247. #ifdef STARPU_USE_GORDON
  248. /* .gordon_func will be set by load_elf_sgemm */
  249. #endif
  250. .nbuffers = 3
  251. };
  252. #ifdef STARPU_USE_GORDON
  253. static const char *spu_func_sgemm_elf_file = "./gordon/func_sgemm_ibm.spuelf";
  254. static unsigned spu_func_sgemm_elf_id;
  255. static unsigned spu_func_sgemm_ibm_id;
  256. static void load_elf_sgemm(void)
  257. {
  258. spu_func_sgemm_elf_id =
  259. gordon_register_elf_plugin(spu_func_sgemm_elf_file);
  260. spu_func_sgemm_ibm_id = gordon_register_kernel(spu_func_sgemm_elf_id, "func_sgemm_ibm");
  261. gordon_load_plugin_on_all_spu(spu_func_sgemm_elf_id);
  262. gordon_load_kernel_on_all_spu(spu_func_sgemm_ibm_id);
  263. cl.gordon_func = spu_func_sgemm_ibm_id;
  264. }
  265. #endif // STARPU_USE_GORDON
  266. static struct starpu_task *construct_task(unsigned x, unsigned y, unsigned z, unsigned iter)
  267. {
  268. /* A B[task] = C[task] */
  269. struct starpu_task *task = starpu_task_create();
  270. task->cl = &cl;
  271. task->use_tag = 1;
  272. task->tag_id = TAG(z, y, x, iter);
  273. task->buffers[0].handle = A_state[y][z];
  274. task->buffers[0].mode = STARPU_R;
  275. task->buffers[1].handle = B_state[z][x];
  276. task->buffers[1].mode = STARPU_R;
  277. task->buffers[2].handle = C_state[y][x];
  278. task->buffers[2].mode = STARPU_RW;
  279. #ifdef STARPU_USE_GORDON
  280. task->cl_arg = &conf;
  281. task->cl_arg_size = sizeof(struct ibm_sgemm_block_conf);
  282. #endif
  283. return task;
  284. }
  285. static void callback_func_2(void *arg)
  286. {
  287. /* the argument is a pointer to a counter of the remaining tasks */
  288. struct cb2_s *cb2 = arg;
  289. unsigned x,y,z,iter;
  290. iter = cb2->iter;
  291. x = cb2->blockx;
  292. y = cb2->blocky;
  293. free(cb2);
  294. /* do some accounting */
  295. int id = starpu_worker_get_id();
  296. flop_per_worker[id] += BLAS3_FLOP(BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
  297. ls_per_worker[id] += BLAS3_LS(BLOCKSIZEX, BLOCKSIZEY, BLOCKSIZEZ);
  298. /* TAG(nslicesz - 1, y, x, iter) remains ... */
  299. for (z = 0; z < nslicesz - 1; z++)
  300. {
  301. starpu_tag_remove(TAG(z, y, x, iter));
  302. }
  303. if (iter > 0)
  304. {
  305. starpu_tag_remove(TAG(nslicesz - 1, y, x, iter-1));
  306. }
  307. if (iter != niter - 1) {
  308. submit_new_iter(x, y, iter+1);
  309. }
  310. }
  311. static void submit_new_iter(unsigned x, unsigned y, unsigned iter)
  312. {
  313. unsigned z;
  314. for (z = 0; z < nslicesz; z++)
  315. {
  316. struct starpu_task *task;
  317. task = construct_task(x, y, z, iter);
  318. if (z != 0) {
  319. starpu_tag_declare_deps(TAG(z, y, x, iter), 1, TAG(z-1, y, x, iter));
  320. }
  321. if (z == nslicesz - 1) {
  322. struct cb2_s *cb2 = malloc(sizeof(struct cb2_s));
  323. cb2->blockx = x;
  324. cb2->blocky = y;
  325. cb2->iter = iter;
  326. task->callback_func = callback_func_2;
  327. task->callback_arg = cb2;
  328. }
  329. starpu_task_submit(task);
  330. }
  331. }
  332. static void launch_codelets(void)
  333. {
  334. #ifdef STARPU_USE_FXT
  335. _starpu_fxt_register_thread(0);
  336. #endif
  337. /* partition the work into slices */
  338. unsigned taskx, tasky;
  339. srand(time(NULL));
  340. /* should we use a single performance model for all archs and use an
  341. * acceleration factor ? */
  342. if (use_common_model) {
  343. cl.model = &STARPU_GEMM(model_common);
  344. }
  345. else {
  346. cl.model = &STARPU_GEMM(model);
  347. }
  348. for (taskx = 0; taskx < nslicesx; taskx++)
  349. {
  350. for (tasky = 0; tasky < nslicesy; tasky++)
  351. {
  352. submit_new_iter(taskx, tasky, 0);
  353. }
  354. }
  355. }
  356. int main(__attribute__ ((unused)) int argc,
  357. __attribute__ ((unused)) char **argv)
  358. {
  359. parse_args(argc, argv);
  360. /* start the runtime */
  361. starpu_init(NULL);
  362. starpu_helper_cublas_init();
  363. #ifdef STARPU_USE_GORDON
  364. load_elf_sgemm();
  365. #endif
  366. init_problem_data();
  367. gettimeofday(&start, NULL);
  368. launch_codelets();
  369. starpu_task_wait_for_all();
  370. gettimeofday(&end, NULL);
  371. double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  372. display_stats(timing);
  373. cleanup_problem();
  374. starpu_helper_cublas_shutdown();
  375. starpu_shutdown();
  376. return 0;
  377. }