dot_product.c 6.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2010-2011 Université de Bordeaux 1
  4. *
  5. * StarPU is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * StarPU is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. *
  14. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. */
  16. #include <starpu.h>
  17. #include <assert.h>
  18. #ifdef STARPU_USE_CUDA
  19. #include <cuda.h>
  20. #include <cublas.h>
  21. #include <starpu_cuda.h>
  22. #endif
  23. #define FPRINTF(ofile, fmt, args ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ##args); }} while(0)
  24. static float *x;
  25. static float *y;
  26. static starpu_data_handle_t *x_handles;
  27. static starpu_data_handle_t *y_handles;
  28. static unsigned nblocks = 4096;
  29. static unsigned entries_per_block = 1024;
  30. #define DOT_TYPE double
  31. static DOT_TYPE dot = 0.0f;
  32. static starpu_data_handle_t dot_handle;
  33. static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
  34. {
  35. const struct cudaDeviceProp *props;
  36. if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
  37. return 1;
  38. #ifdef STARPU_USE_CUDA
  39. /* Cuda device */
  40. props = starpu_cuda_get_device_properties(workerid);
  41. if (props->major >= 2 || props->minor >= 3)
  42. /* At least compute capability 1.3, supports doubles */
  43. return 1;
  44. #endif
  45. /* Old card, does not support doubles */
  46. return 0;
  47. }
  48. /*
  49. * Codelet to create a neutral element
  50. */
  51. void init_cpu_func(void *descr[], void *cl_arg)
  52. {
  53. DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
  54. *dot = 0.0f;
  55. }
  56. #ifdef STARPU_USE_CUDA
  57. void init_cuda_func(void *descr[], void *cl_arg)
  58. {
  59. DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
  60. cudaMemset(dot, 0, sizeof(DOT_TYPE));
  61. cudaThreadSynchronize();
  62. }
  63. #endif
  64. static struct starpu_codelet init_codelet =
  65. {
  66. .where = STARPU_CPU|STARPU_CUDA,
  67. .can_execute = can_execute,
  68. .cpu_funcs = {init_cpu_func, NULL},
  69. #ifdef STARPU_USE_CUDA
  70. .cuda_funcs = {init_cuda_func, NULL},
  71. #endif
  72. .nbuffers = 1
  73. };
  74. /*
  75. * Codelet to perform the reduction of two elements
  76. */
  77. void redux_cpu_func(void *descr[], void *cl_arg)
  78. {
  79. DOT_TYPE *dota = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
  80. DOT_TYPE *dotb = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
  81. *dota = *dota + *dotb;
  82. }
  83. #ifdef STARPU_USE_CUDA
  84. extern void redux_cuda_func(void *descr[], void *_args);
  85. #endif
  86. static struct starpu_codelet redux_codelet =
  87. {
  88. .where = STARPU_CPU|STARPU_CUDA,
  89. .can_execute = can_execute,
  90. .cpu_funcs = {redux_cpu_func, NULL},
  91. #ifdef STARPU_USE_CUDA
  92. .cuda_funcs = {redux_cuda_func, NULL},
  93. #endif
  94. .nbuffers = 2
  95. };
  96. /*
  97. * Dot product codelet
  98. */
  99. void dot_cpu_func(void *descr[], void *cl_arg)
  100. {
  101. float *local_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  102. float *local_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  103. DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[2]);
  104. unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  105. DOT_TYPE local_dot = 0.0;
  106. unsigned i;
  107. for (i = 0; i < n; i++)
  108. {
  109. local_dot += (DOT_TYPE)local_x[i]*(DOT_TYPE)local_y[i];
  110. }
  111. *dot = *dot + local_dot;
  112. }
  113. #ifdef STARPU_USE_CUDA
  114. void dot_cuda_func(void *descr[], void *cl_arg)
  115. {
  116. DOT_TYPE current_dot;
  117. DOT_TYPE local_dot;
  118. float *local_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
  119. float *local_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
  120. DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[2]);
  121. unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
  122. cudaMemcpy(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost);
  123. cudaThreadSynchronize();
  124. local_dot = (DOT_TYPE)cublasSdot(n, local_x, 1, local_y, 1);
  125. /* FPRINTF(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot); */
  126. current_dot += local_dot;
  127. cudaThreadSynchronize();
  128. cudaMemcpy(dot, &current_dot, sizeof(DOT_TYPE), cudaMemcpyHostToDevice);
  129. cudaThreadSynchronize();
  130. }
  131. #endif
  132. static struct starpu_codelet dot_codelet =
  133. {
  134. .where = STARPU_CPU|STARPU_CUDA,
  135. .can_execute = can_execute,
  136. .cpu_funcs = {dot_cpu_func, NULL},
  137. #ifdef STARPU_USE_CUDA
  138. .cuda_funcs = {dot_cuda_func, NULL},
  139. #endif
  140. .nbuffers = 3,
  141. .modes = {STARPU_R, STARPU_R, STARPU_REDUX}
  142. };
  143. /*
  144. * Tasks initialization
  145. */
  146. int main(int argc, char **argv)
  147. {
  148. int ret;
  149. ret = starpu_init(NULL);
  150. STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
  151. starpu_helper_cublas_init();
  152. unsigned long nelems = nblocks*entries_per_block;
  153. size_t size = nelems*sizeof(float);
  154. x = (float *) malloc(size);
  155. y = (float *) malloc(size);
  156. x_handles = (starpu_data_handle_t *) calloc(nblocks, sizeof(starpu_data_handle_t));
  157. y_handles = (starpu_data_handle_t *) calloc(nblocks, sizeof(starpu_data_handle_t));
  158. assert(x && y);
  159. starpu_srand48(0);
  160. DOT_TYPE reference_dot = 0.0;
  161. unsigned long i;
  162. for (i = 0; i < nelems; i++)
  163. {
  164. x[i] = (float)starpu_drand48();
  165. y[i] = (float)starpu_drand48();
  166. reference_dot += (DOT_TYPE)x[i]*(DOT_TYPE)y[i];
  167. }
  168. unsigned block;
  169. for (block = 0; block < nblocks; block++)
  170. {
  171. starpu_vector_data_register(&x_handles[block], 0,
  172. (uintptr_t)&x[entries_per_block*block], entries_per_block, sizeof(float));
  173. starpu_vector_data_register(&y_handles[block], 0,
  174. (uintptr_t)&y[entries_per_block*block], entries_per_block, sizeof(float));
  175. }
  176. starpu_variable_data_register(&dot_handle, 0, (uintptr_t)&dot, sizeof(DOT_TYPE));
  177. /*
  178. * Compute dot product with StarPU
  179. */
  180. starpu_data_set_reduction_methods(dot_handle, &redux_codelet, &init_codelet);
  181. for (block = 0; block < nblocks; block++)
  182. {
  183. struct starpu_task *task = starpu_task_create();
  184. task->cl = &dot_codelet;
  185. task->destroy = 1;
  186. task->handles[0] = x_handles[block];
  187. task->handles[1] = y_handles[block];
  188. task->handles[2] = dot_handle;
  189. int ret = starpu_task_submit(task);
  190. if (ret == -ENODEV) goto enodev;
  191. STARPU_ASSERT(!ret);
  192. }
  193. for (block = 0; block < nblocks; block++)
  194. {
  195. starpu_data_unregister(x_handles[block]);
  196. starpu_data_unregister(y_handles[block]);
  197. }
  198. starpu_data_unregister(dot_handle);
  199. FPRINTF(stderr, "Reference : %e vs. %e (Delta %e)\n", reference_dot, dot, reference_dot - dot);
  200. starpu_helper_cublas_shutdown();
  201. starpu_shutdown();
  202. free(x);
  203. free(y);
  204. free(x_handles);
  205. free(y_handles);
  206. return 0;
  207. enodev:
  208. fprintf(stderr, "WARNING: No one can execute this task\n");
  209. /* yes, we do not perform the computation but we did detect that no one
  210. * could perform the kernel, so this is not an error from StarPU */
  211. return 77;
  212. }