dot_product.c

/* StarPU --- Runtime system for heterogeneous multicore architectures.
 *
 * Copyright (C) 2010-2015 Université de Bordeaux
 * Copyright (C) 2012 INRIA
 *
 * StarPU is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * StarPU is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * See the GNU Lesser General Public License in COPYING.LGPL for more details.
 */

#include <starpu.h>
#include <assert.h>
#include <math.h>

#include <reductions/dot_product.h>

#ifdef STARPU_USE_CUDA
#include <cuda.h>
#include <cublas.h>
#endif

#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)

static float *_x;
static float *_y;

static starpu_data_handle_t *_x_handles;
static starpu_data_handle_t *_y_handles;

#ifdef STARPU_USE_OPENCL
static struct starpu_opencl_program _opencl_program;
#endif

#ifdef STARPU_QUICK_CHECK
static unsigned _nblocks = 512;
#else
static unsigned _nblocks = 4096;
#endif
static unsigned _entries_per_block = 1024;

static DOT_TYPE _dot = 0.0f;
static starpu_data_handle_t _dot_handle;

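/* Decide which workers may run the codelets below.  The accumulator type
 * DOT_TYPE comes from reductions/dot_product.h (a double in the stock
 * example), so CUDA workers are only accepted when the device supports
 * double-precision arithmetic. */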
static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
{
	enum starpu_worker_archtype type = starpu_worker_get_type(workerid);
	if (type == STARPU_CPU_WORKER || type == STARPU_OPENCL_WORKER)
		return 1;

#ifdef STARPU_USE_CUDA
#ifdef STARPU_SIMGRID
	/* We don't know, let's assume it can */
	return 1;
#else
	/* Cuda device */
	const struct cudaDeviceProp *props;
	props = starpu_cuda_get_device_properties(workerid);
	if (props->major >= 2 || props->minor >= 3)
		/* At least compute capability 1.3, supports doubles */
		return 1;
#endif
#endif

	/* Old card, does not support doubles */
	return 0;
}

/*
 * Codelet to create a neutral element
 */
void init_cpu_func(void *descr[], void *cl_arg)
{
	DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
	*dot = 0.0f;
}

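/* Device-side initializers: they write the neutral element (zero) directly
 * into the per-worker copy of the reduction variable, in device memory. */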
#ifdef STARPU_USE_CUDA
void init_cuda_func(void *descr[], void *cl_arg)
{
	DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
	cudaMemsetAsync(dot, 0, sizeof(DOT_TYPE), starpu_cuda_get_local_stream());
}
#endif

#ifdef STARPU_USE_OPENCL
void init_opencl_func(void *buffers[], void *args)
{
	cl_int err;
	cl_command_queue queue;
	cl_mem dot = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[0]);

	starpu_opencl_get_current_queue(&queue);

	DOT_TYPE zero = (DOT_TYPE) 0.0;
	err = clEnqueueWriteBuffer(queue,
				   dot,
				   CL_TRUE,
				   0,
				   sizeof(DOT_TYPE),
				   &zero,
				   0,
				   NULL,
				   NULL);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);
}
#endif

static struct starpu_codelet init_codelet =
{
	.can_execute = can_execute,
	.cpu_funcs = {init_cpu_func},
	.cpu_funcs_name = {"init_cpu_func"},
#ifdef STARPU_USE_CUDA
	.cuda_funcs = {init_cuda_func},
	.cuda_flags = {STARPU_CUDA_ASYNC},
#endif
#ifdef STARPU_USE_OPENCL
	.opencl_funcs = {init_opencl_func},
#endif
	.modes = {STARPU_W},
	.nbuffers = 1,
	.name = "init",
};

/*
 * Codelet to perform the reduction of two elements
 */
void redux_cpu_func(void *descr[], void *cl_arg)
{
	DOT_TYPE *dota = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
	DOT_TYPE *dotb = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);

	*dota = *dota + *dotb;
}

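/* The CUDA reduction kernel is only declared here; it is compiled separately
 * (presumably from a companion .cu source file of the example). */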
#ifdef STARPU_USE_CUDA
extern void redux_cuda_func(void *descr[], void *_args);
#endif

#ifdef STARPU_USE_OPENCL
void redux_opencl_func(void *buffers[], void *args)
{
	int id, devid;
	cl_int err;
	cl_kernel kernel;
	cl_command_queue queue;
	cl_event event;

	cl_mem dota = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[0]);
	cl_mem dotb = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[1]);

	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);

	err = starpu_opencl_load_kernel(&kernel, &queue, &_opencl_program, "_redux_opencl", devid);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 0, sizeof(dota), &dota);
	err |= clSetKernelArg(kernel, 1, sizeof(dotb), &dotb);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	{
		size_t global=1;
		size_t local;
		size_t s;
		cl_device_id device;

		starpu_opencl_get_device(devid, &device);

		err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
		if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
		if (local > global)
			local=global;

		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
		if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
	}

	starpu_opencl_release_kernel(kernel);
}
#endif

static struct starpu_codelet redux_codelet =
{
	.can_execute = can_execute,
	.cpu_funcs = {redux_cpu_func},
	.cpu_funcs_name = {"redux_cpu_func"},
#ifdef STARPU_USE_CUDA
	.cuda_funcs = {redux_cuda_func},
	.cuda_flags = {STARPU_CUDA_ASYNC},
#endif
#ifdef STARPU_USE_OPENCL
	.opencl_funcs = {redux_opencl_func},
	.opencl_flags = {STARPU_OPENCL_ASYNC},
#endif
	.modes = {STARPU_RW, STARPU_R},
	.nbuffers = 2,
	.name = "redux"
};

/*
 * Dot product codelet
 */
void dot_cpu_func(void *descr[], void *cl_arg)
{
	float *local_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	float *local_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]);

	DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[2]);

	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	DOT_TYPE local_dot = 0.0;

	unsigned i;
	for (i = 0; i < n; i++)
	{
		local_dot += (DOT_TYPE)local_x[i]*(DOT_TYPE)local_y[i];
	}

	*dot = *dot + local_dot;
}

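/* CUDA variant: CUBLAS computes the partial dot product of the two blocks;
 * the per-worker accumulator (a single DOT_TYPE in device memory) is then
 * copied to the host, updated, and copied back on StarPU's local stream. */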
#ifdef STARPU_USE_CUDA
void dot_cuda_func(void *descr[], void *cl_arg)
{
	DOT_TYPE current_dot;
	DOT_TYPE local_dot;

	float *local_x = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
	float *local_y = (float *)STARPU_VECTOR_GET_PTR(descr[1]);
	DOT_TYPE *dot = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[2]);

	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);

	cudaMemcpyAsync(&current_dot, dot, sizeof(DOT_TYPE), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());

	local_dot = (DOT_TYPE)cublasSdot(n, local_x, 1, local_y, 1);
	/* FPRINTF(stderr, "current_dot %f local dot %f -> %f\n", current_dot, local_dot, current_dot + local_dot); */
	cudaStreamSynchronize(starpu_cuda_get_local_stream());

	current_dot += local_dot;

	cudaMemcpyAsync(dot, &current_dot, sizeof(DOT_TYPE), cudaMemcpyHostToDevice, starpu_cuda_get_local_stream());
}
#endif

#ifdef STARPU_USE_OPENCL
void dot_opencl_func(void *buffers[], void *args)
{
	int id, devid;
	cl_int err;
	cl_kernel kernel;
	cl_command_queue queue;
	cl_event event;

	cl_mem x = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
	cl_mem y = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[1]);
	cl_mem dot = (cl_mem) STARPU_VARIABLE_GET_PTR(buffers[2]);
	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);

	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);

	err = starpu_opencl_load_kernel(&kernel, &queue, &_opencl_program, "_dot_opencl", devid);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	err = clSetKernelArg(kernel, 0, sizeof(x), &x);
	err |= clSetKernelArg(kernel, 1, sizeof(y), &y);
	err |= clSetKernelArg(kernel, 2, sizeof(dot), &dot);
	err |= clSetKernelArg(kernel, 3, sizeof(n), &n);
	if (err != CL_SUCCESS)
		STARPU_OPENCL_REPORT_ERROR(err);

	{
		size_t global=1;
		size_t local;
		size_t s;
		cl_device_id device;

		starpu_opencl_get_device(devid, &device);

		err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
		if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
		if (local > global)
			local=global;

		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
		if (err != CL_SUCCESS)
			STARPU_OPENCL_REPORT_ERROR(err);
	}

	starpu_opencl_release_kernel(kernel);
}
#endif

static struct starpu_codelet dot_codelet =
{
	.can_execute = can_execute,
	.cpu_funcs = {dot_cpu_func},
	.cpu_funcs_name = {"dot_cpu_func"},
#ifdef STARPU_USE_CUDA
	.cuda_funcs = {dot_cuda_func},
	.cuda_flags = {STARPU_CUDA_ASYNC},
#endif
#ifdef STARPU_USE_OPENCL
	.opencl_funcs = {dot_opencl_func},
	.opencl_flags = {STARPU_OPENCL_ASYNC},
#endif
	.nbuffers = 3,
	.modes = {STARPU_R, STARPU_R, STARPU_REDUX},
	.name = "dot"
};

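/* The dot variable is accessed in STARPU_REDUX mode: each worker accumulates
 * into its own copy (created by init_codelet) and the copies are merged with
 * redux_codelet once the handle is accessed outside STARPU_REDUX mode. */
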
/*
 * Tasks initialization
 */
int main(int argc, char **argv)
{
	int ret;

	/* Not supported yet */
	if (starpu_get_env_number_default("STARPU_GLOBAL_ARBITER", 0) > 0)
		return 77;

	ret = starpu_init(NULL);
	if (ret == -ENODEV)
		return 77;
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

#ifdef STARPU_USE_OPENCL
	ret = starpu_opencl_load_opencl_from_file("examples/reductions/dot_product_opencl_kernels.cl",
						   &_opencl_program, NULL);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
#endif

	starpu_cublas_init();

	unsigned long nelems = _nblocks*_entries_per_block;
	size_t size = nelems*sizeof(float);

	_x = (float *) malloc(size);
	_y = (float *) malloc(size);

	_x_handles = (starpu_data_handle_t *) calloc(_nblocks, sizeof(starpu_data_handle_t));
	_y_handles = (starpu_data_handle_t *) calloc(_nblocks, sizeof(starpu_data_handle_t));

	assert(_x && _y);

	starpu_srand48(0);

	DOT_TYPE reference_dot = 0.0;

	unsigned long i;
	for (i = 0; i < nelems; i++)
	{
		_x[i] = (float)starpu_drand48();
		_y[i] = (float)starpu_drand48();

		reference_dot += (DOT_TYPE)_x[i]*(DOT_TYPE)_y[i];
	}

	unsigned block;
	for (block = 0; block < _nblocks; block++)
	{
		starpu_vector_data_register(&_x_handles[block], STARPU_MAIN_RAM,
			(uintptr_t)&_x[_entries_per_block*block], _entries_per_block, sizeof(float));
		starpu_vector_data_register(&_y_handles[block], STARPU_MAIN_RAM,
			(uintptr_t)&_y[_entries_per_block*block], _entries_per_block, sizeof(float));
	}

	starpu_variable_data_register(&_dot_handle, STARPU_MAIN_RAM, (uintptr_t)&_dot, sizeof(DOT_TYPE));

	/*
	 * Compute dot product with StarPU
	 */
	starpu_data_set_reduction_methods(_dot_handle, &redux_codelet, &init_codelet);

	for (block = 0; block < _nblocks; block++)
	{
		struct starpu_task *task = starpu_task_create();

		task->cl = &dot_codelet;
		task->destroy = 1;

		task->handles[0] = _x_handles[block];
		task->handles[1] = _y_handles[block];
		task->handles[2] = _dot_handle;

		ret = starpu_task_submit(task);
		if (ret == -ENODEV) goto enodev;
		STARPU_ASSERT(!ret);
	}

	for (block = 0; block < _nblocks; block++)
	{
		starpu_data_unregister(_x_handles[block]);
		starpu_data_unregister(_y_handles[block]);
	}

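	/* Unregistering the reduction handle forces the pending per-worker
	 * contributions to be reduced and the final value to be written back
	 * into _dot in main memory. */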
	starpu_data_unregister(_dot_handle);

	FPRINTF(stderr, "Reference : %e vs. %e (Delta %e)\n", reference_dot, _dot, reference_dot - _dot);

	starpu_cublas_shutdown();

#ifdef STARPU_USE_OPENCL
	ret = starpu_opencl_unload_opencl(&_opencl_program);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl");
#endif

	starpu_shutdown();

	free(_x);
	free(_y);
	free(_x_handles);
	free(_y_handles);

	if (fabs(reference_dot - _dot) < reference_dot * 1e-6)
		return EXIT_SUCCESS;
	else
		return EXIT_FAILURE;

enodev:
	fprintf(stderr, "WARNING: No one can execute this task\n");
	/* yes, we do not perform the computation but we did detect that no one
	 * could perform the kernel, so this is not an error from StarPU */
	return 77;
}