cl_enqueuendrangekernel.c 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2011-2012 Inria
  4. * Copyright (C) 2012,2014,2016-2017 CNRS
  5. * Copyright (C) 2010-2011,2013,2016-2018 Université de Bordeaux
  6. *
  7. * StarPU is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU Lesser General Public License as published by
  9. * the Free Software Foundation; either version 2.1 of the License, or (at
  10. * your option) any later version.
  11. *
  12. * StarPU is distributed in the hope that it will be useful, but
  13. * WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  15. *
  16. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  17. */
  18. #include "socl.h"
  19. #include "event.h"
  20. void soclEnqueueNDRangeKernel_task(void *descr[], void *args)
  21. {
  22. command_ndrange_kernel cmd = (command_ndrange_kernel)args;
  23. cl_command_queue cq;
  24. int wid;
  25. cl_int err;
  26. cl_event ev = command_event_get(cmd);
  27. ev->prof_start = _socl_nanotime();
  28. gc_entity_release(ev);
  29. wid = starpu_worker_get_id_check();
  30. starpu_opencl_get_queue(wid, &cq);
  31. DEBUG_MSG("[worker %d] [kernel %d] Executing kernel...\n", wid, cmd->kernel->id);
  32. int range = starpu_worker_get_range();
  33. /* Set arguments */
  34. {
  35. unsigned int i;
  36. int buf = 0;
  37. for (i=0; i<cmd->num_args; i++)
  38. {
  39. switch (cmd->arg_types[i])
  40. {
  41. case Null:
  42. err = clSetKernelArg(cmd->kernel->cl_kernels[range], i, cmd->arg_sizes[i], NULL);
  43. break;
  44. case Buffer:
  45. {
  46. cl_mem mem;
  47. mem = (cl_mem)STARPU_VARIABLE_GET_PTR(descr[buf]);
  48. err = clSetKernelArg(cmd->kernel->cl_kernels[range], i, cmd->arg_sizes[i], &mem);
  49. buf++;
  50. }
  51. break;
  52. case Immediate:
  53. err = clSetKernelArg(cmd->kernel->cl_kernels[range], i, cmd->arg_sizes[i], cmd->args[i]);
  54. break;
  55. }
  56. if (err != CL_SUCCESS)
  57. {
  58. DEBUG_CL("clSetKernelArg", err);
  59. DEBUG_ERROR("Aborting\n");
  60. }
  61. }
  62. }
  63. /* Calling Kernel */
  64. cl_event event;
  65. err = clEnqueueNDRangeKernel(cq, cmd->kernel->cl_kernels[range], cmd->work_dim, cmd->global_work_offset, cmd->global_work_size, cmd->local_work_size, 0, NULL, &event);
  66. if (err != CL_SUCCESS)
  67. {
  68. ERROR_MSG("Worker[%d] Unable to Enqueue kernel (error %d)\n", wid, err);
  69. DEBUG_CL("clEnqueueNDRangeKernel", err);
  70. DEBUG_MSG("Workdim %u, global_work_offset %p, global_work_size %p, local_work_size %p\n",
  71. cmd->work_dim, cmd->global_work_offset, cmd->global_work_size, cmd->local_work_size);
  72. DEBUG_MSG("Global work size: %ld %ld %ld\n", (long)cmd->global_work_size[0],
  73. (long)(cmd->work_dim > 1 ? cmd->global_work_size[1] : 1), (long)(cmd->work_dim > 2 ? cmd->global_work_size[2] : 1));
  74. if (cmd->local_work_size != NULL)
  75. DEBUG_MSG("Local work size: %ld %ld %ld\n", (long)cmd->local_work_size[0],
  76. (long)(cmd->work_dim > 1 ? cmd->local_work_size[1] : 1), (long)(cmd->work_dim > 2 ? cmd->local_work_size[2] : 1));
  77. }
  78. else
  79. {
  80. /* Waiting for kernel to terminate */
  81. clWaitForEvents(1, &event);
  82. clReleaseEvent(event);
  83. }
  84. }
  85. /**
  86. * Real kernel enqueuing command
  87. */
  88. cl_int command_ndrange_kernel_submit(command_ndrange_kernel cmd)
  89. {
  90. starpu_task task = task_create();
  91. task->cl = &cmd->codelet;
  92. task->cl->model = cmd->kernel->perfmodel;
  93. task->cl_arg = cmd;
  94. task->cl_arg_size = sizeof(cmd);
  95. /* Execute the task on a specific worker? */
  96. if (cmd->_command.event->cq->device != NULL)
  97. {
  98. task->execute_on_a_specific_worker = 1;
  99. task->workerid = cmd->_command.event->cq->device->worker_id;
  100. }
  101. struct starpu_codelet * codelet = task->cl;
  102. /* We need to detect which parameters are OpenCL's memory objects and
  103. * we retrieve their corresponding StarPU buffers */
  104. cmd->num_buffers = 0;
  105. cmd->buffers = malloc(sizeof(cl_mem) * cmd->num_args);
  106. unsigned int i;
  107. for (i=0; i<cmd->num_args; i++)
  108. {
  109. if (cmd->arg_types[i] == Buffer)
  110. {
  111. cl_mem buf = *(cl_mem*)cmd->args[i];
  112. gc_entity_store(&cmd->buffers[cmd->num_buffers], buf);
  113. task->handles[cmd->num_buffers] = buf->handle;
  114. /* Determine best StarPU buffer access mode */
  115. int mode;
  116. if (buf->mode == CL_MEM_READ_ONLY)
  117. mode = STARPU_R;
  118. else if (buf->mode == CL_MEM_WRITE_ONLY)
  119. {
  120. mode = STARPU_W;
  121. buf->scratch = 0;
  122. }
  123. else if (buf->scratch)
  124. { //RW but never accessed in RW or W mode
  125. mode = STARPU_W;
  126. buf->scratch = 0;
  127. }
  128. else
  129. {
  130. mode = STARPU_RW;
  131. buf->scratch = 0;
  132. }
  133. codelet->modes[cmd->num_buffers] = mode;
  134. cmd->num_buffers += 1;
  135. }
  136. }
  137. codelet->nbuffers = cmd->num_buffers;
  138. task_submit(task, cmd);
  139. return CL_SUCCESS;
  140. }
  141. CL_API_SUFFIX__VERSION_1_1
  142. CL_API_ENTRY cl_int CL_API_CALL
  143. soclEnqueueNDRangeKernel(cl_command_queue cq,
  144. cl_kernel kernel,
  145. cl_uint work_dim,
  146. const size_t * global_work_offset,
  147. const size_t * global_work_size,
  148. const size_t * local_work_size,
  149. cl_uint num_events,
  150. const cl_event * events,
  151. cl_event * event)
  152. {
  153. if (kernel->split_func != NULL && !STARPU_PTHREAD_MUTEX_TRYLOCK(&kernel->split_lock))
  154. {
  155. cl_event beforeEvent, afterEvent, totalEvent;
  156. totalEvent = event_create();
  157. gc_entity_store(&totalEvent->cq, cq);
  158. command_marker cmd = command_marker_create();
  159. beforeEvent = command_event_get(cmd);
  160. command_queue_enqueue(cq, cmd, num_events, events);
  161. cl_uint iter = 1;
  162. cl_uint split_min = CL_UINT_MAX;
  163. cl_uint split_min_iter = 1;
  164. while (iter < kernel->split_space && kernel->split_perfs[iter] != 0)
  165. {
  166. if (kernel->split_perfs[iter] < split_min)
  167. {
  168. split_min = kernel->split_perfs[iter];
  169. split_min_iter = iter;
  170. }
  171. iter++;
  172. }
  173. if (iter == kernel->split_space)
  174. {
  175. iter = split_min_iter;
  176. }
  177. cl_int ret = kernel->split_func(cq, iter, kernel->split_data, beforeEvent, &afterEvent);
  178. if (ret == CL_SUCCESS)
  179. {
  180. //FIXME: blocking call
  181. soclWaitForEvents(1, &afterEvent);
  182. /* Store perf */
  183. cl_ulong start,end;
  184. soclGetEventProfilingInfo(beforeEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &start, NULL);
  185. soclGetEventProfilingInfo(afterEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
  186. soclReleaseEvent(afterEvent);
  187. kernel->split_perfs[iter] = end-start;
  188. STARPU_PTHREAD_MUTEX_UNLOCK(&kernel->split_lock);
  189. event_complete(totalEvent);
  190. totalEvent->prof_start = start;
  191. totalEvent->prof_submit = start;
  192. totalEvent->prof_queued = start;
  193. totalEvent->prof_end = end;
  194. RETURN_EVENT(totalEvent,event);
  195. }
  196. else
  197. {
  198. STARPU_PTHREAD_MUTEX_UNLOCK(&kernel->split_lock);
  199. soclReleaseEvent(totalEvent);
  200. }
  201. return ret;
  202. }
  203. else
  204. {
  205. command_ndrange_kernel cmd = command_ndrange_kernel_create(kernel, work_dim,
  206. global_work_offset, global_work_size, local_work_size);
  207. cl_event ev = command_event_get(cmd);
  208. command_queue_enqueue(cq, cmd, num_events, events);
  209. RETURN_EVENT(ev, event);
  210. }
  211. return CL_SUCCESS;
  212. }