cholesky_tile_tag.c 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009-2014 Université de Bordeaux
  4. * Copyright (C) 2010, 2011, 2012, 2013 Centre National de la Recherche Scientifique
  5. *
  6. * StarPU is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU Lesser General Public License as published by
  8. * the Free Software Foundation; either version 2.1 of the License, or (at
  9. * your option) any later version.
  10. *
  11. * StarPU is distributed in the hope that it will be useful, but
  12. * WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14. *
  15. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  16. */
  17. #include "cholesky.h"
  18. #if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_MAGMA)
  19. #include "magma.h"
  20. #endif
  21. /* Tile pointers, indexed as A[y][x]; main() only allocates the entries with x <= y */
  22. float *A[NMAXBLOCKS][NMAXBLOCKS];
  23. starpu_data_handle_t A_state[NMAXBLOCKS][NMAXBLOCKS]; /* StarPU handle registered by main() for the tile A[y][x] */
  24. /*
  25. * Some useful functions
  26. */
  27. static struct starpu_task *create_task(starpu_tag_t id)
  28. {
  29. struct starpu_task *task = starpu_task_create();
  30. task->cl_arg = NULL;
  31. task->use_tag = 1;
  32. task->tag_id = id;
  33. return task;
  34. }
  35. /*
  36. * Create the tasks (one constructor per codelet: cl11, cl21, cl22)
  37. */
  38. static struct starpu_task * create_task_11(unsigned k, unsigned nblocks)
  39. {
  40. /* FPRINTF(stdout, "task 11 k = %d TAG = %llx\n", k, (TAG11(k))); */
  41. struct starpu_task *task = create_task(TAG11(k));
  42. task->cl = &cl11;
  43. /* which sub-data is manipulated ? */
  44. task->handles[0] = A_state[k][k];
  45. /* this is an important task */
  46. task->priority = STARPU_MAX_PRIO;
  47. /* enforce dependencies ... */
  48. if (k > 0)
  49. {
  50. starpu_tag_declare_deps(TAG11(k), 1, TAG22(k-1, k, k));
  51. }
  52. int n = starpu_matrix_get_nx(task->handles[0]);
  53. task->flops = FLOPS_SPOTRF(n);
  54. return task;
  55. }
  56. static int create_task_21(unsigned k, unsigned j)
  57. {
  58. int ret;
  59. struct starpu_task *task = create_task(TAG21(k, j));
  60. task->cl = &cl21;
  61. /* which sub-data is manipulated ? */
  62. task->handles[0] = A_state[k][k];
  63. task->handles[1] = A_state[j][k];
  64. if (j == k+1)
  65. {
  66. task->priority = STARPU_MAX_PRIO;
  67. }
  68. /* enforce dependencies ... */
  69. if (k > 0)
  70. {
  71. starpu_tag_declare_deps(TAG21(k, j), 2, TAG11(k), TAG22(k-1, k, j));
  72. }
  73. else
  74. {
  75. starpu_tag_declare_deps(TAG21(k, j), 1, TAG11(k));
  76. }
  77. int n = starpu_matrix_get_nx(task->handles[0]);
  78. task->flops = FLOPS_STRSM(n, n);
  79. ret = starpu_task_submit(task);
  80. if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
  81. return ret;
  82. }
  83. static int create_task_22(unsigned k, unsigned i, unsigned j)
  84. {
  85. int ret;
  86. /* FPRINTF(stdout, "task 22 k,i,j = %d,%d,%d TAG = %llx\n", k,i,j, TAG22(k,i,j)); */
  87. struct starpu_task *task = create_task(TAG22(k, i, j));
  88. task->cl = &cl22;
  89. /* which sub-data is manipulated ? */
  90. task->handles[0] = A_state[i][k];
  91. task->handles[1] = A_state[j][k];
  92. task->handles[2] = A_state[j][i];
  93. if ( (i == k + 1) && (j == k +1) )
  94. {
  95. task->priority = STARPU_MAX_PRIO;
  96. }
  97. /* enforce dependencies ... */
  98. if (k > 0)
  99. {
  100. starpu_tag_declare_deps(TAG22(k, i, j), 3, TAG22(k-1, i, j), TAG21(k, i), TAG21(k, j));
  101. }
  102. else
  103. {
  104. starpu_tag_declare_deps(TAG22(k, i, j), 2, TAG21(k, i), TAG21(k, j));
  105. }
  106. int n = starpu_matrix_get_nx(task->handles[0]);
  107. task->flops = FLOPS_SGEMM(n, n, n);
  108. ret = starpu_task_submit(task);
  109. if (ret != -ENODEV) STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
  110. return ret;
  111. }
  112. /*
  113. * code to bootstrap the factorization
  114. * and construct the DAG
  115. */
  116. static int cholesky_no_stride(void)
  117. {
  118. int ret;
  119. double start;
  120. double end;
  121. struct starpu_task *entry_task = NULL;
  122. /* create all the DAG nodes */
  123. unsigned i,j,k;
  124. for (k = 0; k < nblocks; k++)
  125. {
  126. struct starpu_task *task = create_task_11(k, nblocks);
  127. /* we defer the launch of the first task */
  128. if (k == 0)
  129. {
  130. entry_task = task;
  131. }
  132. else
  133. {
  134. ret = starpu_task_submit(task);
  135. STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
  136. }
  137. for (j = k+1; j<nblocks; j++)
  138. {
  139. ret = create_task_21(k, j);
  140. if (ret == -ENODEV) return 77;
  141. for (i = k+1; i<nblocks; i++)
  142. {
  143. if (i <= j)
  144. {
  145. ret = create_task_22(k, i, j);
  146. if (ret == -ENODEV) return 77;
  147. }
  148. }
  149. }
  150. }
  151. /* schedule the codelet */
  152. start = starpu_timing_now();
  153. ret = starpu_task_submit(entry_task);
  154. if (ret == -ENODEV) return 77;
  155. STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
  156. /* stall the application until the end of computations */
  157. starpu_tag_wait(TAG11(nblocks-1));
  158. end = starpu_timing_now();
  159. double timing = end - start;
  160. double flop = (1.0f*size*size*size)/3.0f;
  161. PRINTF("# size\tms\tGFlops\n");
  162. PRINTF("%u\t%.0f\t%.1f\n", size, timing/1000, (flop/timing/1000.0f));
  163. return 0;
  164. }
  165. int main(int argc, char **argv)
  166. {
  167. unsigned x, y;
  168. int ret;
  169. parse_args(argc, argv);
  170. assert(nblocks <= NMAXBLOCKS);
  171. FPRINTF(stderr, "BLOCK SIZE = %d\n", size / nblocks);
  172. #ifdef STARPU_HAVE_MAGMA
  173. magma_init();
  174. #endif
  175. ret = starpu_init(NULL);
  176. if (ret == -ENODEV)
  177. return 77;
  178. STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
  179. #ifdef STARPU_USE_CUDA
  180. initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,cuda_chol_task_11_cost);
  181. initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,cuda_chol_task_21_cost);
  182. initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,cuda_chol_task_22_cost);
  183. #else
  184. initialize_chol_model(&chol_model_11,"chol_model_11",cpu_chol_task_11_cost,NULL);
  185. initialize_chol_model(&chol_model_21,"chol_model_21",cpu_chol_task_21_cost,NULL);
  186. initialize_chol_model(&chol_model_22,"chol_model_22",cpu_chol_task_22_cost,NULL);
  187. #endif
  188. /* Disable sequential consistency */
  189. starpu_data_set_default_sequential_consistency_flag(0);
  190. starpu_cublas_init();
  191. #ifndef STARPU_SIMGRID
  192. for (y = 0; y < nblocks; y++)
  193. for (x = 0; x < nblocks; x++)
  194. {
  195. if (x <= y)
  196. {
  197. #ifdef STARPU_HAVE_POSIX_MEMALIGN
  198. posix_memalign((void **)&A[y][x], 128, BLOCKSIZE*BLOCKSIZE*sizeof(float));
  199. #else
  200. A[y][x] = malloc(BLOCKSIZE*BLOCKSIZE*sizeof(float));
  201. #endif
  202. assert(A[y][x]);
  203. }
  204. }
  205. /* create a simple definite positive symetric matrix example
  206. *
  207. * Hilbert matrix : h(i,j) = 1/(i+j+1) ( + n In to make is stable )
  208. * */
  209. for (y = 0; y < nblocks; y++)
  210. for (x = 0; x < nblocks; x++)
  211. if (x <= y)
  212. {
  213. unsigned i, j;
  214. for (i = 0; i < BLOCKSIZE; i++)
  215. for (j = 0; j < BLOCKSIZE; j++)
  216. {
  217. A[y][x][i*BLOCKSIZE + j] =
  218. (float)(1.0f/((float) (1.0+(x*BLOCKSIZE+i)+(y*BLOCKSIZE+j))));
  219. /* make it a little more numerically stable ... ;) */
  220. if ((x == y) && (i == j))
  221. A[y][x][i*BLOCKSIZE + j] += (float)(2*size);
  222. }
  223. }
  224. #endif
  225. for (y = 0; y < nblocks; y++)
  226. for (x = 0; x < nblocks; x++)
  227. {
  228. if (x <= y)
  229. {
  230. starpu_matrix_data_register(&A_state[y][x], STARPU_MAIN_RAM, (uintptr_t)A[y][x],
  231. BLOCKSIZE, BLOCKSIZE, BLOCKSIZE, sizeof(float));
  232. }
  233. }
  234. ret = cholesky_no_stride();
  235. for (y = 0; y < nblocks; y++)
  236. for (x = 0; x < nblocks; x++)
  237. {
  238. if (x <= y)
  239. {
  240. starpu_data_unregister(A_state[y][x]);
  241. free(A[y][x]);
  242. }
  243. }
  244. starpu_cublas_shutdown();
  245. starpu_shutdown();
  246. return ret;
  247. }