dw_spmv.c 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350
  1. /*
  2. * StarPU
  3. * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. *
  14. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. */
  16. /*
  17. * Conjugate gradients for Sparse matrices
  18. */
#include "dw_spmv.h"

/* Wall-clock timestamps bracketing the task set: `start` is taken just
 * before submission, `end` in the last completion callback. */
struct timeval start;
struct timeval end;

/* Number of chunks the matrix and output vector are partitioned into
 * (one StarPU task per chunk); overridable with -nblocks. */
unsigned nblocks = 1;
/* Tasks still in flight.  -1 wraps to UINT_MAX on this unsigned — a
 * deliberate "not started" sentinel until call_spmv_codelet_filters()
 * resets it to nblocks. */
unsigned remainingtasks = -1;

/* First a Matrix-Vector product (SpMV) */
/* CUDA launch geometry (threads per block / blocks per grid),
 * overridable with -block and -grid. */
unsigned blocks = 512;
unsigned grids = 8;
  27. #ifdef USE_CUDA
  28. /* CUDA spmv codelet */
  29. static struct starpu_cuda_module_s cuda_module;
  30. static struct starpu_cuda_function_s cuda_function;
  31. static starpu_cuda_codelet_t cuda_spmv;
  32. void initialize_cuda(void)
  33. {
  34. char module_path[1024];
  35. sprintf(module_path,
  36. "%s/examples/cuda/spmv_cuda.cubin", STARPUDIR);
  37. char *function_symbol = "spmv_kernel_3";
  38. starpu_init_cuda_module(&cuda_module, module_path);
  39. starpu_init_cuda_function(&cuda_function, &cuda_module, function_symbol);
  40. cuda_spmv.func = &cuda_function;
  41. cuda_spmv.stack = NULL;
  42. cuda_spmv.stack_size = 0;
  43. cuda_spmv.gridx = grids;
  44. cuda_spmv.gridy = 1;
  45. cuda_spmv.blockx = blocks;
  46. cuda_spmv.blocky = 1;
  47. cuda_spmv.shmemsize = 60;
  48. }
  49. #endif // USE_CUDA
/* Posted by the last completion callback; main() blocks on it. */
sem_t sem;
/* Matrix dimension (rows == cols) and vector length; -size option. */
uint32_t size = 4194304;

/* StarPU handles for the registered CSR matrix and the two vectors. */
starpu_data_handle sparse_matrix;
starpu_data_handle vector_in, vector_out;

/* Raw buffers backing the handles above (kept so print_results() and
 * the CSR arrays stay reachable after registration). */
float *sparse_matrix_nzval;
uint32_t *sparse_matrix_colind;
uint32_t *sparse_matrix_rowptr;
float *vector_in_ptr;
float *vector_out_ptr;

/* Set by the -cpu flag; not read anywhere else in this file —
 * NOTE(review): tasks are still dispatched with where = CORE|CUDA. */
unsigned usecpu = 0;
  60. void parse_args(int argc, char **argv)
  61. {
  62. int i;
  63. for (i = 1; i < argc; i++) {
  64. if (strcmp(argv[i], "-size") == 0) {
  65. char *argptr;
  66. size = strtol(argv[++i], &argptr, 10);
  67. }
  68. if (strcmp(argv[i], "-block") == 0) {
  69. char *argptr;
  70. blocks = strtol(argv[++i], &argptr, 10);
  71. }
  72. if (strcmp(argv[i], "-grid") == 0) {
  73. char *argptr;
  74. grids = strtol(argv[++i], &argptr, 10);
  75. }
  76. if (strcmp(argv[i], "-nblocks") == 0) {
  77. char *argptr;
  78. nblocks = strtol(argv[++i], &argptr, 10);
  79. }
  80. if (strcmp(argv[i], "-cpu") == 0) {
  81. usecpu = 1;
  82. }
  83. }
  84. }
  85. void core_spmv(starpu_data_interface_t *descr, __attribute__((unused)) void *arg)
  86. {
  87. float *nzval = (float *)descr[0].csr.nzval;
  88. uint32_t *colind = descr[0].csr.colind;
  89. uint32_t *rowptr = descr[0].csr.rowptr;
  90. float *vecin = (float *)descr[1].vector.ptr;
  91. float *vecout = (float *)descr[2].vector.ptr;
  92. uint32_t firstelem = descr[0].csr.firstentry;
  93. uint32_t nnz;
  94. uint32_t nrow;
  95. nnz = descr[0].csr.nnz;
  96. nrow = descr[0].csr.nrow;
  97. //STARPU_ASSERT(nrow == descr[1].vector.nx);
  98. STARPU_ASSERT(nrow == descr[2].vector.nx);
  99. unsigned row;
  100. for (row = 0; row < nrow; row++)
  101. {
  102. float tmp = 0.0f;
  103. unsigned index;
  104. unsigned firstindex = rowptr[row] - firstelem;
  105. unsigned lastindex = rowptr[row+1] - firstelem;
  106. for (index = firstindex; index < lastindex; index++)
  107. {
  108. unsigned col;
  109. col = colind[index];
  110. tmp += nzval[index]*vecin[col];
  111. }
  112. vecout[row] = tmp;
  113. }
  114. }
  115. void create_data(void)
  116. {
  117. /* we need a sparse symetric (definite positive ?) matrix and a "dense" vector */
  118. /* example of 3-band matrix */
  119. float *nzval;
  120. uint32_t nnz;
  121. uint32_t *colind;
  122. uint32_t *rowptr;
  123. nnz = 3*size-2;
  124. nzval = malloc(nnz*sizeof(float));
  125. colind = malloc(nnz*sizeof(uint32_t));
  126. rowptr = malloc((size+1)*sizeof(uint32_t));
  127. assert(nzval);
  128. assert(colind);
  129. assert(rowptr);
  130. /* fill the matrix */
  131. unsigned row;
  132. unsigned pos = 0;
  133. for (row = 0; row < size; row++)
  134. {
  135. rowptr[row] = pos;
  136. if (row > 0) {
  137. nzval[pos] = 1.0f;
  138. colind[pos] = row-1;
  139. pos++;
  140. }
  141. nzval[pos] = 5.0f;
  142. colind[pos] = row;
  143. pos++;
  144. if (row < size - 1) {
  145. nzval[pos] = 1.0f;
  146. colind[pos] = row+1;
  147. pos++;
  148. }
  149. }
  150. STARPU_ASSERT(pos == nnz);
  151. rowptr[size] = nnz;
  152. starpu_monitor_csr_data(&sparse_matrix, 0, nnz, size, (uintptr_t)nzval, colind, rowptr, 0, sizeof(float));
  153. sparse_matrix_nzval = nzval;
  154. sparse_matrix_colind = colind;
  155. sparse_matrix_rowptr = rowptr;
  156. /* initiate the 2 vectors */
  157. float *invec, *outvec;
  158. invec = malloc(size*sizeof(float));
  159. assert(invec);
  160. outvec = malloc(size*sizeof(float));
  161. assert(outvec);
  162. /* fill those */
  163. unsigned ind;
  164. for (ind = 0; ind < size; ind++)
  165. {
  166. invec[ind] = 2.0f;
  167. outvec[ind] = 0.0f;
  168. }
  169. starpu_monitor_vector_data(&vector_in, 0, (uintptr_t)invec, size, sizeof(float));
  170. starpu_monitor_vector_data(&vector_out, 0, (uintptr_t)outvec, size, sizeof(float));
  171. vector_in_ptr = invec;
  172. vector_out_ptr = outvec;
  173. }
  174. void init_problem_callback(void *arg)
  175. {
  176. unsigned *remaining = arg;
  177. unsigned val = STARPU_ATOMIC_ADD(remaining, -1);
  178. printf("callback %d remaining \n", val);
  179. if ( val == 0 )
  180. {
  181. printf("DONE ...\n");
  182. gettimeofday(&end, NULL);
  183. starpu_unpartition_data(sparse_matrix, 0);
  184. starpu_unpartition_data(vector_out, 0);
  185. sem_post(&sem);
  186. }
  187. }
/* Partition the matrix and the output vector into nblocks chunks and
 * submit one SpMV task per chunk.  The input vector is shared, unsplit,
 * by every task.  Completion is reported via init_problem_callback(),
 * which posts `sem` once remainingtasks reaches zero. */
void call_spmv_codelet_filters(void)
{
	/* Reset the in-flight counter before any task can complete. */
	remainingtasks = nblocks;

	/* NOTE(review): malloc result is unchecked, and `cl` is shared by
	 * all submitted tasks and never freed — confirm the codelet must
	 * outlive the tasks and whether other fields need initializing. */
	starpu_codelet *cl = malloc(sizeof(starpu_codelet));

	/* partition the data along a block distribution */
	starpu_filter csr_f, vector_f;

	csr_f.filter_func = starpu_vertical_block_filter_func_csr;
	csr_f.filter_arg = nblocks;

	vector_f.filter_func = starpu_block_filter_func_vector;
	vector_f.filter_arg = nblocks;

	starpu_partition_data(sparse_matrix, &csr_f);
	starpu_partition_data(vector_out, &vector_f);

	/* The codelet may run on either a CPU core or a CUDA device. */
	cl->where = CORE|CUDA;
	cl->core_func = core_spmv;
#ifdef USE_CUDA
	cl->cuda_func = &cuda_spmv;
#endif
	cl->nbuffers = 3;

	/* Timing starts at submission; the callback records `end`. */
	gettimeofday(&start, NULL);

	unsigned part;
	for (part = 0; part < nblocks; part++)
	{
		struct starpu_task *task = starpu_task_create();

		task->callback_func = init_problem_callback;
		task->callback_arg = &remainingtasks;

		task->cl = cl;
		task->cl_arg = NULL;

		/* buffer 0: this chunk's CSR sub-matrix (read-only) */
		task->buffers[0].handle = get_sub_data(sparse_matrix, 1, part);
		task->buffers[0].mode = STARPU_R;
		/* buffer 1: the whole input vector (read-only, shared) */
		task->buffers[1].handle = vector_in;
		task->buffers[1].mode = STARPU_R;
		/* buffer 2: the matching output sub-vector (write-only) */
		task->buffers[2].handle = get_sub_data(vector_out, 1, part);
		task->buffers[2].mode = STARPU_W;

		starpu_submit_task(task);
	}
}
/* Build the test data and submit the SpMV tasks.  Returns as soon as
 * submission is done; completion is signaled asynchronously via `sem`. */
void init_problem(void)
{
	/* create the sparse input matrix */
	create_data();

	/* create a new codelet that will perform a SpMV on it */
	call_spmv_codelet_filters();
}
  231. void print_results(void)
  232. {
  233. unsigned row;
  234. for (row = 0; row < STARPU_MIN(size, 16); row++)
  235. {
  236. printf("%2.2f\t%2.2f\n", vector_in_ptr[row], vector_out_ptr[row]);
  237. }
  238. }
  239. int main(__attribute__ ((unused)) int argc,
  240. __attribute__ ((unused)) char **argv)
  241. {
  242. parse_args(argc, argv);
  243. /* start the runtime */
  244. starpu_init(NULL);
  245. sem_init(&sem, 0, 0U);
  246. #ifdef USE_CUDA
  247. initialize_cuda();
  248. #endif
  249. init_problem();
  250. sem_wait(&sem);
  251. sem_destroy(&sem);
  252. print_results();
  253. double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  254. fprintf(stderr, "Computation took (in ms)\n");
  255. printf("%2.2f\n", timing/1000);
  256. return 0;
  257. }