driver_opencl.c 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2010-2016 Université de Bordeaux
  4. * Copyright (C) 2010 Mehdi Juhoor <mjuhoor@gmail.com>
  5. * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016 CNRS
  6. * Copyright (C) 2011 Télécom-SudParis
  7. *
  8. * StarPU is free software; you can redistribute it and/or modify
  9. * it under the terms of the GNU Lesser General Public License as published by
  10. * the Free Software Foundation; either version 2.1 of the License, or (at
  11. * your option) any later version.
  12. *
  13. * StarPU is distributed in the hope that it will be useful, but
  14. * WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  16. *
  17. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  18. */
  19. #include <math.h>
  20. #include <starpu.h>
  21. #include <starpu_profiling.h>
  22. #include <common/config.h>
  23. #include <common/utils.h>
  24. #include <core/debug.h>
  25. #include <starpu_opencl.h>
  26. #include <drivers/driver_common/driver_common.h>
  27. #include "driver_opencl.h"
  28. #include "driver_opencl_utils.h"
  29. #include <common/utils.h>
  30. #include <datawizard/memory_manager.h>
  31. #include <datawizard/memory_nodes.h>
  32. #include <datawizard/malloc.h>
  33. #ifdef STARPU_SIMGRID
  34. #include <core/simgrid.h>
  35. #endif
  36. static int nb_devices = -1;
  37. static int init_done = 0;
  38. static starpu_pthread_mutex_t big_lock = STARPU_PTHREAD_MUTEX_INITIALIZER;
  39. static size_t global_mem[STARPU_MAXOPENCLDEVS];
  40. #ifdef STARPU_USE_OPENCL
  41. static cl_context contexts[STARPU_MAXOPENCLDEVS];
  42. static cl_device_id devices[STARPU_MAXOPENCLDEVS];
  43. static cl_command_queue queues[STARPU_MAXOPENCLDEVS];
  44. static cl_command_queue in_transfer_queues[STARPU_MAXOPENCLDEVS];
  45. static cl_command_queue out_transfer_queues[STARPU_MAXOPENCLDEVS];
  46. static cl_command_queue peer_transfer_queues[STARPU_MAXOPENCLDEVS];
  47. #ifndef STARPU_SIMGRID
  48. static cl_command_queue alloc_queues[STARPU_MAXOPENCLDEVS];
  49. static cl_event task_events[STARPU_MAXOPENCLDEVS][STARPU_MAX_PIPELINE];
  50. #endif /* !STARPU_SIMGRID */
  51. #endif
  52. #ifdef STARPU_SIMGRID
  53. static unsigned task_finished[STARPU_MAXOPENCLDEVS][STARPU_MAX_PIPELINE];
  54. static starpu_pthread_mutex_t task_mutex[STARPU_MAXOPENCLDEVS][STARPU_MAX_PIPELINE];
  55. static starpu_pthread_cond_t task_cond[STARPU_MAXOPENCLDEVS][STARPU_MAX_PIPELINE];
  56. #endif /* STARPU_SIMGRID */
  57. void
  58. _starpu_opencl_discover_devices(struct _starpu_machine_config *config)
  59. {
  60. /* Discover the number of OpenCL devices. Fill the result in CONFIG. */
  61. /* As OpenCL must have been initialized before calling this function,
  62. * `nb_device' is ensured to be correctly set. */
  63. STARPU_ASSERT(init_done == 1);
  64. config->topology.nhwopenclgpus = nb_devices;
  65. }
  66. static void _starpu_opencl_limit_gpu_mem_if_needed(unsigned devid)
  67. {
  68. starpu_ssize_t limit;
  69. size_t STARPU_ATTRIBUTE_UNUSED totalGlobalMem = 0;
  70. size_t STARPU_ATTRIBUTE_UNUSED to_waste = 0;
  71. char name[30];
  72. #ifdef STARPU_SIMGRID
  73. totalGlobalMem = _starpu_simgrid_get_memsize("OpenCL", devid);
  74. #elif defined(STARPU_USE_OPENCL)
  75. /* Request the size of the current device's memory */
  76. cl_int err;
  77. cl_ulong size;
  78. err = clGetDeviceInfo(devices[devid], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size), &size, NULL);
  79. if (STARPU_UNLIKELY(err != CL_SUCCESS))
  80. STARPU_OPENCL_REPORT_ERROR(err);
  81. totalGlobalMem = size;
  82. #endif
  83. limit = starpu_get_env_number("STARPU_LIMIT_OPENCL_MEM");
  84. if (limit == -1)
  85. {
  86. sprintf(name, "STARPU_LIMIT_OPENCL_%u_MEM", devid);
  87. limit = starpu_get_env_number(name);
  88. }
  89. #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
  90. if (limit == -1)
  91. {
  92. /* Use 90% of the available memory by default. */
  93. limit = totalGlobalMem / (1024*1024) * 0.9;
  94. }
  95. #endif
  96. global_mem[devid] = limit * 1024*1024;
  97. #ifdef STARPU_USE_OPENCL
  98. /* How much memory to waste ? */
  99. to_waste = totalGlobalMem - global_mem[devid];
  100. #endif
  101. _STARPU_DEBUG("OpenCL device %d: Wasting %ld MB / Limit %ld MB / Total %ld MB / Remains %ld MB\n",
  102. devid, (long)to_waste/(1024*1024), (long) limit, (long)totalGlobalMem/(1024*1024),
  103. (long)(totalGlobalMem - to_waste)/(1024*1024));
  104. }
  105. #ifdef STARPU_USE_OPENCL
  106. void starpu_opencl_get_context(int devid, cl_context *context)
  107. {
  108. *context = contexts[devid];
  109. }
  110. void starpu_opencl_get_device(int devid, cl_device_id *device)
  111. {
  112. *device = devices[devid];
  113. }
  114. void starpu_opencl_get_queue(int devid, cl_command_queue *queue)
  115. {
  116. *queue = queues[devid];
  117. }
  118. void starpu_opencl_get_current_queue(cl_command_queue *queue)
  119. {
  120. struct _starpu_worker *worker = _starpu_get_local_worker_key();
  121. STARPU_ASSERT(queue);
  122. *queue = queues[worker->devid];
  123. }
  124. void starpu_opencl_get_current_context(cl_context *context)
  125. {
  126. struct _starpu_worker *worker = _starpu_get_local_worker_key();
  127. STARPU_ASSERT(context);
  128. *context = contexts[worker->devid];
  129. }
  130. #endif /* STARPU_USE_OPENCL */
  131. int _starpu_opencl_init_context(int devid)
  132. {
  133. #ifdef STARPU_SIMGRID
  134. int j;
  135. for (j = 0; j < STARPU_MAX_PIPELINE; j++)
  136. {
  137. task_finished[devid][j] = 0;
  138. STARPU_PTHREAD_MUTEX_INIT(&task_mutex[devid][j], NULL);
  139. STARPU_PTHREAD_COND_INIT(&task_cond[devid][j], NULL);
  140. }
  141. #else /* !STARPU_SIMGRID */
  142. cl_int err;
  143. cl_uint uint;
  144. STARPU_PTHREAD_MUTEX_LOCK(&big_lock);
  145. _STARPU_DEBUG("Initialising context for dev %d\n", devid);
  146. // Create a compute context
  147. err = 0;
  148. contexts[devid] = clCreateContext(NULL, 1, &devices[devid], NULL, NULL, &err);
  149. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  150. err = clGetDeviceInfo(devices[devid], CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(uint), &uint, NULL);
  151. if (STARPU_UNLIKELY(err != CL_SUCCESS))
  152. STARPU_OPENCL_REPORT_ERROR(err);
  153. starpu_malloc_set_align(uint/8);
  154. // Create execution queue for the given device
  155. queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], 0, &err);
  156. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  157. // Create transfer queue for the given device
  158. cl_command_queue_properties props;
  159. err = clGetDeviceInfo(devices[devid], CL_DEVICE_QUEUE_PROPERTIES, sizeof(props), &props, NULL);
  160. if (STARPU_UNLIKELY(err != CL_SUCCESS))
  161. STARPU_OPENCL_REPORT_ERROR(err);
  162. props &= ~CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
  163. in_transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
  164. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  165. out_transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
  166. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  167. peer_transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
  168. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  169. alloc_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], 0, &err);
  170. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  171. STARPU_PTHREAD_MUTEX_UNLOCK(&big_lock);
  172. #endif /* !STARPU_SIMGRID */
  173. return 0;
  174. }
  175. int _starpu_opencl_deinit_context(int devid)
  176. {
  177. #ifdef STARPU_SIMGRID
  178. int j;
  179. for (j = 0; j < STARPU_MAX_PIPELINE; j++)
  180. {
  181. task_finished[devid][j] = 0;
  182. STARPU_PTHREAD_MUTEX_DESTROY(&task_mutex[devid][j]);
  183. STARPU_PTHREAD_COND_DESTROY(&task_cond[devid][j]);
  184. }
  185. #else /* !STARPU_SIMGRID */
  186. cl_int err;
  187. STARPU_PTHREAD_MUTEX_LOCK(&big_lock);
  188. _STARPU_DEBUG("De-initialising context for dev %d\n", devid);
  189. err = clFinish(queues[devid]);
  190. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  191. err = clReleaseCommandQueue(queues[devid]);
  192. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  193. err = clFinish(in_transfer_queues[devid]);
  194. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  195. err = clReleaseCommandQueue(in_transfer_queues[devid]);
  196. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  197. err = clFinish(out_transfer_queues[devid]);
  198. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  199. err = clReleaseCommandQueue(out_transfer_queues[devid]);
  200. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  201. err = clFinish(peer_transfer_queues[devid]);
  202. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  203. err = clReleaseCommandQueue(peer_transfer_queues[devid]);
  204. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  205. err = clFinish(alloc_queues[devid]);
  206. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  207. err = clReleaseCommandQueue(alloc_queues[devid]);
  208. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  209. err = clReleaseContext(contexts[devid]);
  210. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  211. contexts[devid] = NULL;
  212. STARPU_PTHREAD_MUTEX_UNLOCK(&big_lock);
  213. #endif
  214. return 0;
  215. }
  216. #ifdef STARPU_USE_OPENCL
  217. cl_int starpu_opencl_allocate_memory(int devid STARPU_ATTRIBUTE_UNUSED, cl_mem *mem STARPU_ATTRIBUTE_UNUSED, size_t size STARPU_ATTRIBUTE_UNUSED, cl_mem_flags flags STARPU_ATTRIBUTE_UNUSED)
  218. {
  219. #ifdef STARPU_SIMGRID
  220. STARPU_ABORT();
  221. #else
  222. cl_int err;
  223. cl_mem memory;
  224. memory = clCreateBuffer(contexts[devid], flags, size, NULL, &err);
  225. if (err == CL_OUT_OF_HOST_MEMORY) return err;
  226. if (err == CL_MEM_OBJECT_ALLOCATION_FAILURE) return err;
  227. if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
  228. /*
  229. * OpenCL uses lazy memory allocation: we will only know if the
  230. * allocation failed when trying to copy data onto the device. But we
  231. * want to know this __now__, so we just perform a dummy copy.
  232. */
  233. char dummy = 0;
  234. cl_event ev;
  235. err = clEnqueueWriteBuffer(alloc_queues[devid], memory, CL_TRUE,
  236. 0, sizeof(dummy), &dummy,
  237. 0, NULL, &ev);
  238. if (err == CL_MEM_OBJECT_ALLOCATION_FAILURE)
  239. return err;
  240. if (err == CL_OUT_OF_RESOURCES)
  241. return err;
  242. if (err != CL_SUCCESS)
  243. STARPU_OPENCL_REPORT_ERROR(err);
  244. clWaitForEvents(1, &ev);
  245. clReleaseEvent(ev);
  246. *mem = memory;
  247. return CL_SUCCESS;
  248. #endif
  249. }
  250. cl_int starpu_opencl_copy_ram_to_opencl(void *ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, cl_mem buffer, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event, int *ret)
  251. {
  252. cl_int err;
  253. struct _starpu_worker *worker = _starpu_get_local_worker_key();
  254. if (event)
  255. _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
  256. cl_event ev;
  257. err = clEnqueueWriteBuffer(in_transfer_queues[worker->devid], buffer, CL_FALSE, offset, size, ptr, 0, NULL, &ev);
  258. if (event)
  259. _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
  260. if (STARPU_LIKELY(err == CL_SUCCESS))
  261. {
  262. if (event == NULL)
  263. {
  264. /* We want a synchronous copy, let's synchronise the queue */
  265. err = clWaitForEvents(1, &ev);
  266. if (STARPU_UNLIKELY(err))
  267. STARPU_OPENCL_REPORT_ERROR(err);
  268. err = clReleaseEvent(ev);
  269. if (STARPU_UNLIKELY(err))
  270. STARPU_OPENCL_REPORT_ERROR(err);
  271. }
  272. else
  273. {
  274. *event = ev;
  275. }
  276. if (ret)
  277. {
  278. *ret = (event == NULL) ? 0 : -EAGAIN;
  279. }
  280. }
  281. return err;
  282. }
  283. cl_int starpu_opencl_copy_opencl_to_ram(cl_mem buffer, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event, int *ret)
  284. {
  285. cl_int err;
  286. struct _starpu_worker *worker = _starpu_get_local_worker_key();
  287. if (event)
  288. _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
  289. cl_event ev;
  290. err = clEnqueueReadBuffer(out_transfer_queues[worker->devid], buffer, CL_FALSE, offset, size, ptr, 0, NULL, &ev);
  291. if (event)
  292. _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
  293. if (STARPU_LIKELY(err == CL_SUCCESS))
  294. {
  295. if (event == NULL)
  296. {
  297. /* We want a synchronous copy, let's synchronise the queue */
  298. err = clWaitForEvents(1, &ev);
  299. if (STARPU_UNLIKELY(err))
  300. STARPU_OPENCL_REPORT_ERROR(err);
  301. err = clReleaseEvent(ev);
  302. if (STARPU_UNLIKELY(err))
  303. STARPU_OPENCL_REPORT_ERROR(err);
  304. }
  305. else
  306. {
  307. *event = ev;
  308. }
  309. if (ret)
  310. {
  311. *ret = (event == NULL) ? 0 : -EAGAIN;
  312. }
  313. }
  314. return err;
  315. }
  316. cl_int starpu_opencl_copy_opencl_to_opencl(cl_mem src, unsigned src_node STARPU_ATTRIBUTE_UNUSED, size_t src_offset, cl_mem dst, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t dst_offset, size_t size, cl_event *event, int *ret)
  317. {
  318. cl_int err;
  319. struct _starpu_worker *worker = _starpu_get_local_worker_key();
  320. if (event)
  321. _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
  322. cl_event ev;
  323. err = clEnqueueCopyBuffer(peer_transfer_queues[worker->devid], src, dst, src_offset, dst_offset, size, 0, NULL, &ev);
  324. if (event)
  325. _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
  326. if (STARPU_LIKELY(err == CL_SUCCESS))
  327. {
  328. if (event == NULL)
  329. {
  330. /* We want a synchronous copy, let's synchronise the queue */
  331. err = clWaitForEvents(1, &ev);
  332. if (STARPU_UNLIKELY(err))
  333. STARPU_OPENCL_REPORT_ERROR(err);
  334. err = clReleaseEvent(ev);
  335. if (STARPU_UNLIKELY(err))
  336. STARPU_OPENCL_REPORT_ERROR(err);
  337. }
  338. else
  339. {
  340. *event = ev;
  341. }
  342. if (ret)
  343. {
  344. *ret = (event == NULL) ? 0 : -EAGAIN;
  345. }
  346. }
  347. return err;
  348. }
  349. cl_int starpu_opencl_copy_async_sync(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, cl_event *event)
  350. {
  351. enum starpu_node_kind src_kind = starpu_node_get_kind(src_node);
  352. enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);
  353. cl_int err;
  354. int ret;
  355. switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind))
  356. {
  357. case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
  358. err = starpu_opencl_copy_opencl_to_ram(
  359. (cl_mem) src, src_node,
  360. (void*) (dst + dst_offset), dst_node,
  361. size, src_offset, event, &ret);
  362. if (STARPU_UNLIKELY(err))
  363. STARPU_OPENCL_REPORT_ERROR(err);
  364. return ret;
  365. case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
  366. err = starpu_opencl_copy_ram_to_opencl(
  367. (void*) (src + src_offset), src_node,
  368. (cl_mem) dst, dst_node,
  369. size, dst_offset, event, &ret);
  370. if (STARPU_UNLIKELY(err))
  371. STARPU_OPENCL_REPORT_ERROR(err);
  372. return ret;
  373. case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_OPENCL_RAM):
  374. err = starpu_opencl_copy_opencl_to_opencl(
  375. (cl_mem) src, src_node, src_offset,
  376. (cl_mem) dst, dst_node, dst_offset,
  377. size, event, &ret);
  378. if (STARPU_UNLIKELY(err))
  379. STARPU_OPENCL_REPORT_ERROR(err);
  380. return ret;
  381. default:
  382. STARPU_ABORT();
  383. break;
  384. }
  385. }
  386. #if 0
  387. cl_int _starpu_opencl_copy_rect_opencl_to_ram(cl_mem buffer, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, const size_t buffer_origin[3], const size_t host_origin[3],
  388. const size_t region[3], size_t buffer_row_pitch, size_t buffer_slice_pitch,
  389. size_t host_row_pitch, size_t host_slice_pitch, cl_event *event)
  390. {
  391. cl_int err;
  392. struct _starpu_worker *worker = _starpu_get_local_worker_key();
  393. cl_bool blocking;
  394. blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
  395. if (event)
  396. _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
  397. err = clEnqueueReadBufferRect(out_transfer_queues[worker->devid], buffer, blocking, buffer_origin, host_origin, region, buffer_row_pitch,
  398. buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, 0, NULL, event);
  399. if (event)
  400. _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
  401. if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
  402. return CL_SUCCESS;
  403. }
  404. cl_int _starpu_opencl_copy_rect_ram_to_opencl(void *ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, cl_mem buffer, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, const size_t buffer_origin[3], const size_t host_origin[3],
  405. const size_t region[3], size_t buffer_row_pitch, size_t buffer_slice_pitch,
  406. size_t host_row_pitch, size_t host_slice_pitch, cl_event *event)
  407. {
  408. cl_int err;
  409. struct _starpu_worker *worker = _starpu_get_local_worker_key();
  410. cl_bool blocking;
  411. blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
  412. if (event)
  413. _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
  414. err = clEnqueueWriteBufferRect(in_transfer_queues[worker->devid], buffer, blocking, buffer_origin, host_origin, region, buffer_row_pitch,
  415. buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, 0, NULL, event);
  416. if (event)
  417. _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
  418. if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
  419. return CL_SUCCESS;
  420. }
  421. #endif
  422. #endif /* STARPU_USE_OPENCL */
  423. static size_t _starpu_opencl_get_global_mem_size(int devid)
  424. {
  425. return global_mem[devid];
  426. }
  427. void _starpu_opencl_init(void)
  428. {
  429. STARPU_PTHREAD_MUTEX_LOCK(&big_lock);
  430. if (!init_done)
  431. {
  432. #ifdef STARPU_SIMGRID
  433. nb_devices = _starpu_simgrid_get_nbhosts("OpenCL");
  434. #else /* STARPU_USE_OPENCL */
  435. cl_platform_id platform_id[_STARPU_OPENCL_PLATFORM_MAX];
  436. cl_uint nb_platforms;
  437. cl_int err;
  438. int i;
  439. cl_device_type device_type = CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR;
  440. _STARPU_DEBUG("Initialising OpenCL\n");
  441. // Get Platforms
  442. if (starpu_get_env_number("STARPU_OPENCL_ON_CPUS") > 0)
  443. device_type |= CL_DEVICE_TYPE_CPU;
  444. if (starpu_get_env_number("STARPU_OPENCL_ONLY_ON_CPUS") > 0)
  445. device_type = CL_DEVICE_TYPE_CPU;
  446. err = clGetPlatformIDs(_STARPU_OPENCL_PLATFORM_MAX, platform_id, &nb_platforms);
  447. if (STARPU_UNLIKELY(err != CL_SUCCESS)) nb_platforms=0;
  448. _STARPU_DEBUG("Platforms detected: %u\n", nb_platforms);
  449. _STARPU_DEBUG("CPU device type: %s\n", device_type&CL_DEVICE_TYPE_CPU?"requested":"not requested");
  450. _STARPU_DEBUG("GPU device type: %s\n", device_type&CL_DEVICE_TYPE_GPU?"requested":"not requested");
  451. _STARPU_DEBUG("Accelerator device type: %s\n", device_type&CL_DEVICE_TYPE_ACCELERATOR?"requested":"not requested");
  452. // Get devices
  453. nb_devices = 0;
  454. {
  455. unsigned j;
  456. for (j=0; j<nb_platforms; j++)
  457. {
  458. cl_uint num;
  459. int platform_valid = 1;
  460. char name[1024], vendor[1024];
  461. err = clGetPlatformInfo(platform_id[j], CL_PLATFORM_NAME, 1024, name, NULL);
  462. if (err != CL_SUCCESS)
  463. {
  464. STARPU_OPENCL_REPORT_ERROR_WITH_MSG("clGetPlatformInfo NAME", err);
  465. platform_valid = 0;
  466. }
  467. else
  468. {
  469. err = clGetPlatformInfo(platform_id[j], CL_PLATFORM_VENDOR, 1024, vendor, NULL);
  470. if (STARPU_UNLIKELY(err != CL_SUCCESS))
  471. {
  472. STARPU_OPENCL_REPORT_ERROR_WITH_MSG("clGetPlatformInfo VENDOR", err);
  473. platform_valid = 0;
  474. }
  475. }
  476. if(strcmp(name, "SOCL Platform") == 0)
  477. {
  478. platform_valid = 0;
  479. _STARPU_DEBUG("Skipping SOCL Platform\n");
  480. }
  481. #ifdef STARPU_VERBOSE
  482. if (platform_valid)
  483. _STARPU_DEBUG("Platform: %s - %s\n", name, vendor);
  484. else
  485. _STARPU_DEBUG("Platform invalid\n");
  486. #endif
  487. if (platform_valid && nb_devices <= STARPU_MAXOPENCLDEVS)
  488. {
  489. err = clGetDeviceIDs(platform_id[j], device_type, STARPU_MAXOPENCLDEVS-nb_devices, STARPU_MAXOPENCLDEVS == nb_devices ? NULL : &devices[nb_devices], &num);
  490. if (err == CL_DEVICE_NOT_FOUND)
  491. {
  492. const cl_device_type all_device_types = CL_DEVICE_TYPE_CPU|CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR;
  493. if (device_type != all_device_types)
  494. {
  495. _STARPU_DEBUG(" No devices of the requested type(s) subset detected on this platform\n");
  496. }
  497. else
  498. {
  499. _STARPU_DEBUG(" No devices detected on this platform\n");
  500. }
  501. }
  502. else
  503. {
  504. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  505. _STARPU_DEBUG(" %u devices detected\n", num);
  506. nb_devices += num;
  507. }
  508. }
  509. }
  510. }
  511. // Get location of OpenCl kernel source files
  512. _starpu_opencl_program_dir = starpu_getenv("STARPU_OPENCL_PROGRAM_DIR");
  513. if (nb_devices > STARPU_MAXOPENCLDEVS)
  514. {
  515. _STARPU_DISP("# Warning: %u OpenCL devices available. Only %d enabled. Use configure option --enable-maxopencldev=xxx to update the maximum value of supported OpenCL devices?\n", nb_devices, STARPU_MAXOPENCLDEVS);
  516. nb_devices = STARPU_MAXOPENCLDEVS;
  517. }
  518. // initialise internal structures
  519. for(i=0 ; i<nb_devices ; i++)
  520. {
  521. contexts[i] = NULL;
  522. queues[i] = NULL;
  523. in_transfer_queues[i] = NULL;
  524. out_transfer_queues[i] = NULL;
  525. peer_transfer_queues[i] = NULL;
  526. alloc_queues[i] = NULL;
  527. }
  528. #endif /* STARPU_USE_OPENCL */
  529. init_done=1;
  530. }
  531. STARPU_PTHREAD_MUTEX_UNLOCK(&big_lock);
  532. }
  /* Forward declarations of file-local helpers used by the driver entry
   * points. */
  static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker *worker, unsigned char pipeline_idx);
  static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker *worker);
  static void _starpu_opencl_execute_job(struct starpu_task *task, struct _starpu_worker *worker);
  #ifndef STARPU_SIMGRID
  /* Only needed when talking to a real device. */
  static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname);
  #endif
  539. int _starpu_opencl_driver_init(struct _starpu_worker *worker)
  540. {
  541. int devid = worker->devid;
  542. _starpu_driver_start(worker, _STARPU_FUT_OPENCL_KEY, 0);
  543. _starpu_opencl_init_context(devid);
  544. /* one more time to avoid hacks from third party lib :) */
  545. _starpu_bind_thread_on_cpu(worker->config, worker->bindid, worker->workerid);
  546. _starpu_opencl_limit_gpu_mem_if_needed(devid);
  547. _starpu_memory_manager_set_global_memory_size(worker->memory_node, _starpu_opencl_get_global_mem_size(devid));
  548. float size = (float) global_mem[devid] / (1<<30);
  549. #ifdef STARPU_SIMGRID
  550. const char *devname = "Simgrid";
  551. #else
  552. /* get the device's name */
  553. char devname[128];
  554. _starpu_opencl_get_device_name(devid, devname, 128);
  555. #endif
  556. snprintf(worker->name, sizeof(worker->name), "OpenCL %d (%s %.1f GiB)", devid, devname, size);
  557. snprintf(worker->short_name, sizeof(worker->short_name), "OpenCL %d", devid);
  558. starpu_pthread_setname(worker->short_name);
  559. worker->pipeline_length = starpu_get_env_number_default("STARPU_OPENCL_PIPELINE", 2);
  560. if (worker->pipeline_length > STARPU_MAX_PIPELINE)
  561. {
  562. _STARPU_DISP("Warning: STARPU_OPENCL_PIPELINE is %u, but STARPU_MAX_PIPELINE is only %u", worker->pipeline_length, STARPU_MAX_PIPELINE);
  563. worker->pipeline_length = STARPU_MAX_PIPELINE;
  564. }
  565. #if !defined(STARPU_SIMGRID) && !defined(STARPU_NON_BLOCKING_DRIVERS)
  566. if (worker->pipeline_length >= 1)
  567. {
  568. /* We need non-blocking drivers, to poll for OPENCL task
  569. * termination */
  570. _STARPU_DISP("Warning: reducing STARPU_OPENCL_PIPELINE to 0 because blocking drivers are enabled (and simgrid is not enabled)\n");
  571. worker->pipeline_length = 0;
  572. }
  573. #endif
  574. _STARPU_DEBUG("OpenCL (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, worker->bindid);
  575. _STARPU_TRACE_WORKER_INIT_END(worker->workerid);
  576. /* tell the main thread that this one is ready */
  577. STARPU_PTHREAD_MUTEX_LOCK(&worker->mutex);
  578. worker->status = STATUS_UNKNOWN;
  579. worker->worker_is_initialized = 1;
  580. STARPU_PTHREAD_COND_SIGNAL(&worker->ready_cond);
  581. STARPU_PTHREAD_MUTEX_UNLOCK(&worker->mutex);
  582. return 0;
  583. }
  584. int _starpu_opencl_driver_run_once(struct _starpu_worker *worker)
  585. {
  586. int workerid = worker->workerid;
  587. unsigned memnode = worker->memory_node;
  588. struct _starpu_job *j;
  589. struct starpu_task *task;
  590. int res;
  591. int idle;
  592. #ifdef STARPU_SIMGRID
  593. starpu_pthread_wait_reset(&worker->wait);
  594. #endif
  595. /* First poll for completed jobs */
  596. idle = 0;
  597. if (worker->ntasks)
  598. {
  599. #ifndef STARPU_SIMGRID
  600. size_t size;
  601. int err;
  602. #endif
  603. /* On-going asynchronous task, check for its termination first */
  604. task = worker->current_tasks[worker->first_task];
  605. #ifdef STARPU_SIMGRID
  606. if (task_finished[worker->devid][worker->first_task])
  607. #else /* !STARPU_SIMGRID */
  608. cl_int status;
  609. err = clGetEventInfo(task_events[worker->devid][worker->first_task], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, &size);
  610. STARPU_ASSERT(size == sizeof(cl_int));
  611. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  612. if (status != CL_COMPLETE)
  613. #endif /* !STARPU_SIMGRID */
  614. {
  615. _STARPU_TRACE_START_EXECUTING();
  616. }
  617. else
  618. {
  619. #ifndef STARPU_SIMGRID
  620. err = clReleaseEvent(task_events[worker->devid][worker->first_task]);
  621. if (STARPU_UNLIKELY(err)) STARPU_OPENCL_REPORT_ERROR(err);
  622. task_events[worker->devid][worker->first_task] = 0;
  623. #endif
  624. /* Asynchronous task completed! */
  625. _starpu_opencl_stop_job(_starpu_get_job_associated_to_task(task), worker);
  626. /* See next task if any */
  627. if (worker->ntasks)
  628. {
  629. task = worker->current_tasks[worker->first_task];
  630. j = _starpu_get_job_associated_to_task(task);
  631. if (task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)
  632. {
  633. /* An asynchronous task, it was already queued,
  634. * it's now running, record its start time. */
  635. _starpu_driver_start_job(worker, j, &worker->perf_arch, &j->cl_start, 0, starpu_profiling_status_get());
  636. }
  637. else
  638. {
  639. /* A synchronous task, we have finished flushing the pipeline, we can now at last execute it. */
  640. _STARPU_TRACE_END_PROGRESS(memnode);
  641. _STARPU_TRACE_EVENT("sync_task");
  642. _starpu_opencl_execute_job(task, worker);
  643. _STARPU_TRACE_EVENT("end_sync_task");
  644. _STARPU_TRACE_START_PROGRESS(memnode);
  645. worker->pipeline_stuck = 0;
  646. }
  647. }
  648. _STARPU_TRACE_END_EXECUTING();
  649. }
  650. }
  651. if (!worker->pipeline_length || worker->ntasks < worker->pipeline_length)
  652. idle++;
  653. #if defined(STARPU_NON_BLOCKING_DRIVERS) && !defined(STARPU_SIMGRID)
  654. if (!idle)
  655. {
  656. /* Not ready yet, no better thing to do than waiting */
  657. __starpu_datawizard_progress(memnode, 1, 0);
  658. __starpu_datawizard_progress(STARPU_MAIN_RAM, 1, 0);
  659. return 0;
  660. }
  661. #endif
  662. res = !idle;
  663. res |= __starpu_datawizard_progress(memnode, 1, 1);
  664. res |= __starpu_datawizard_progress(STARPU_MAIN_RAM, 1, 1);
  665. task = _starpu_get_worker_task(worker, workerid, memnode);
  666. #ifdef STARPU_SIMGRID
  667. if (!res && !task)
  668. starpu_pthread_wait_wait(&worker->wait);
  669. #endif
  670. if (task == NULL)
  671. return 0;
  672. j = _starpu_get_job_associated_to_task(task);
  673. /* can OpenCL do that task ? */
  674. if (!_STARPU_OPENCL_MAY_PERFORM(j))
  675. {
  676. /* this is not a OpenCL task */
  677. _starpu_push_task_to_workers(task);
  678. return 0;
  679. }
  680. worker->current_tasks[(worker->first_task + worker->ntasks)%STARPU_MAX_PIPELINE] = task;
  681. worker->ntasks++;
  682. if (worker->ntasks > 1 && !(task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC))
  683. {
  684. /* We have to execute a non-asynchronous task but we
  685. * still have tasks in the pipeline... Record it to
  686. * prevent more tasks from coming, and do it later */
  687. worker->pipeline_stuck = 1;
  688. return 0;
  689. }
  690. _STARPU_TRACE_END_PROGRESS(memnode);
  691. _starpu_opencl_execute_job(task, worker);
  692. _STARPU_TRACE_START_PROGRESS(memnode);
  693. return 0;
  694. }
  695. int _starpu_opencl_driver_deinit(struct _starpu_worker *worker)
  696. {
  697. _STARPU_TRACE_WORKER_DEINIT_START;
  698. unsigned memnode = worker->memory_node;
  699. _starpu_handle_all_pending_node_data_requests(memnode);
  700. /* In case there remains some memory that was automatically
  701. * allocated by StarPU, we release it now. Note that data
  702. * coherency is not maintained anymore at that point ! */
  703. _starpu_free_all_automatically_allocated_buffers(memnode);
  704. _starpu_malloc_shutdown(memnode);
  705. unsigned devid = worker->devid;
  706. _starpu_opencl_deinit_context(devid);
  707. worker->worker_is_initialized = 0;
  708. _STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_OPENCL_KEY);
  709. return 0;
  710. }
  711. void *_starpu_opencl_worker(void *_arg)
  712. {
  713. struct _starpu_worker* worker = _arg;
  714. _starpu_opencl_driver_init(worker);
  715. _STARPU_TRACE_START_PROGRESS(memnode);
  716. while (_starpu_machine_is_running())
  717. {
  718. _starpu_may_pause();
  719. _starpu_opencl_driver_run_once(worker);
  720. }
  721. _starpu_opencl_driver_deinit(worker);
  722. _STARPU_TRACE_END_PROGRESS(memnode);
  723. return NULL;
  724. }
  #ifdef STARPU_USE_OPENCL
  #ifndef STARPU_SIMGRID
  /*
   * Query the CL_DEVICE_NAME of device number `dev` into `name`, a buffer of
   * `lname` bytes.  The OpenCL layer is initialized first when init_done is
   * not set yet.  A failing query is reported through
   * STARPU_OPENCL_REPORT_ERROR.
   *
   * Returns EXIT_SUCCESS.
   */
  static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname)
  {
      if (!init_done)
          _starpu_opencl_init();

      /* Ask the OpenCL runtime for the device name */
      int status = clGetDeviceInfo(devices[dev], CL_DEVICE_NAME, lname, name, NULL);
      if (STARPU_UNLIKELY(status != CL_SUCCESS))
      {
          STARPU_OPENCL_REPORT_ERROR(status);
      }

      _STARPU_DEBUG("Device %d : [%s]\n", dev, name);

      return EXIT_SUCCESS;
  }
  #endif /* !STARPU_SIMGRID */
  #endif /* STARPU_USE_OPENCL */
  742. unsigned _starpu_opencl_get_device_count(void)
  743. {
  744. if (!init_done)
  745. {
  746. _starpu_opencl_init();
  747. }
  748. return nb_devices;
  749. }
  #ifdef STARPU_USE_OPENCL
  /*
   * Return the CL_DEVICE_TYPE of device number `devid`, as reported by
   * clGetDeviceInfo().  The OpenCL layer is initialized first when init_done
   * is not set yet.  A failing query is reported through
   * STARPU_OPENCL_REPORT_ERROR.
   */
  cl_device_type _starpu_opencl_get_device_type(int devid)
  {
      cl_device_type dev_type;
      int status;

      if (!init_done)
      {
          _starpu_opencl_init();
      }

      status = clGetDeviceInfo(devices[devid], CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, NULL);
      if (STARPU_UNLIKELY(status != CL_SUCCESS))
      {
          STARPU_OPENCL_REPORT_ERROR(status);
      }

      return dev_type;
  }
  #endif /* STARPU_USE_OPENCL */
  763. static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker *worker, unsigned char pipeline_idx STARPU_ATTRIBUTE_UNUSED)
  764. {
  765. int ret;
  766. STARPU_ASSERT(j);
  767. struct starpu_task *task = j->task;
  768. int profiling = starpu_profiling_status_get();
  769. STARPU_ASSERT(task);
  770. struct starpu_codelet *cl = task->cl;
  771. STARPU_ASSERT(cl);
  772. _starpu_set_current_task(j->task);
  773. ret = _starpu_fetch_task_input(j);
  774. if (ret != 0)
  775. {
  776. /* there was not enough memory, so the input of
  777. * the codelet cannot be fetched ... put the
  778. * codelet back, and try it later */
  779. return -EAGAIN;
  780. }
  781. if (worker->ntasks == 1)
  782. {
  783. /* We are alone in the pipeline, the kernel will start now, record it */
  784. _starpu_driver_start_job(worker, j, &worker->perf_arch, &j->cl_start, 0, profiling);
  785. }
  786. starpu_opencl_func_t func = _starpu_task_get_opencl_nth_implementation(cl, j->nimpl);
  787. STARPU_ASSERT_MSG(func, "when STARPU_OPENCL is defined in 'where', opencl_func or opencl_funcs has to be defined");
  788. if (_starpu_get_disable_kernels() <= 0)
  789. {
  790. _STARPU_TRACE_START_EXECUTING();
  791. #ifdef STARPU_SIMGRID
  792. double length = NAN;
  793. int async = task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC;
  794. int simulate = 1;
  795. if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE & !async)
  796. {
  797. /* Actually execute function */
  798. simulate = 0;
  799. func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
  800. #ifdef STARPU_OPENCL_SIMULATOR
  801. #ifndef CL_PROFILING_CLOCK_CYCLE_COUNT
  802. #ifdef CL_PROFILING_COMMAND_SHAVE_CYCLE_COUNT
  803. #define CL_PROFILING_CLOCK_CYCLE_COUNT CL_PROFILING_COMMAND_SHAVE_CYCLE_COUNT
  804. #else
  805. #error The OpenCL simulator must provide CL_PROFILING_CLOCK_CYCLE_COUNT
  806. #endif
  807. #endif
  808. struct starpu_profiling_task_info *profiling_info = task->profiling_info;
  809. STARPU_ASSERT_MSG(profiling_info->used_cycles, "Application kernel must call starpu_opencl_collect_stats to collect simulated time");
  810. #ifdef HAVE_MSG_HOST_GET_SPEED
  811. length = ((double) profiling_info->used_cycles)/MSG_host_get_speed(MSG_host_self());
  812. #else
  813. length = ((double) profiling_info->used_cycles)/MSG_get_host_speed(MSG_host_self());
  814. #endif
  815. /* And give the simulated time to simgrid */
  816. simulate = 1;
  817. #endif
  818. }
  819. if (simulate)
  820. _starpu_simgrid_submit_job(worker->workerid, j, &worker->perf_arch, length,
  821. async ? &task_finished[worker->devid][pipeline_idx] : NULL,
  822. async ? &task_mutex[worker->devid][pipeline_idx] : NULL,
  823. async ? &task_cond[worker->devid][pipeline_idx] : NULL);
  824. #else
  825. func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
  826. #endif
  827. _STARPU_TRACE_END_EXECUTING();
  828. }
  829. return 0;
  830. }
  831. static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker *worker)
  832. {
  833. struct timespec codelet_end;
  834. int profiling = starpu_profiling_status_get();
  835. _starpu_set_current_task(NULL);
  836. if (worker->pipeline_length)
  837. worker->current_tasks[worker->first_task] = NULL;
  838. else
  839. worker->current_task = NULL;
  840. worker->first_task = (worker->first_task + 1) % STARPU_MAX_PIPELINE;
  841. worker->ntasks--;
  842. _starpu_driver_end_job(worker, j, &worker->perf_arch, &codelet_end, 0, profiling);
  843. struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
  844. STARPU_ASSERT_MSG(sched_ctx != NULL, "there should be a worker %d in the ctx of this job \n", worker->workerid);
  845. if(!sched_ctx->sched_policy)
  846. _starpu_driver_update_job_feedback(j, worker, &sched_ctx->perf_arch, &j->cl_start, &codelet_end, profiling);
  847. else
  848. _starpu_driver_update_job_feedback(j, worker, &worker->perf_arch, &j->cl_start, &codelet_end, profiling);
  849. _starpu_push_task_output(j);
  850. _starpu_handle_job_termination(j);
  851. }
  852. static void _starpu_opencl_execute_job(struct starpu_task *task, struct _starpu_worker *worker)
  853. {
  854. int res;
  855. struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
  856. unsigned char pipeline_idx = (worker->first_task + worker->ntasks - 1)%STARPU_MAX_PIPELINE;
  857. res = _starpu_opencl_start_job(j, worker, pipeline_idx);
  858. if (res)
  859. {
  860. switch (res)
  861. {
  862. case -EAGAIN:
  863. _STARPU_DISP("ouch, OpenCL could not actually run task %p, putting it back...\n", task);
  864. _starpu_push_task_to_workers(task);
  865. STARPU_ABORT();
  866. default:
  867. STARPU_ABORT();
  868. }
  869. }
  870. if (task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)
  871. {
  872. /* Record event to synchronize with task termination later */
  873. #ifndef STARPU_SIMGRID
  874. cl_command_queue queue;
  875. starpu_opencl_get_queue(worker->devid, &queue);
  876. #endif
  877. if (worker->pipeline_length == 0)
  878. {
  879. #ifdef STARPU_SIMGRID
  880. _starpu_simgrid_wait_tasks(worker->workerid);
  881. #else
  882. starpu_opencl_get_queue(worker->devid, &queue);
  883. clFinish(queue);
  884. #endif
  885. _starpu_opencl_stop_job(j, worker);
  886. }
  887. else
  888. {
  889. #ifndef STARPU_SIMGRID
  890. int err;
  891. /* the function clEnqueueMarker is deprecated from
  892. * OpenCL version 1.2. We would like to use the new
  893. * function clEnqueueMarkerWithWaitList. We could do
  894. * it by checking its availability through our own
  895. * configure macro HAVE_CLENQUEUEMARKERWITHWAITLIST
  896. * and the OpenCL macro CL_VERSION_1_2. However these
  897. * 2 macros detect the function availability in the
  898. * ICD and not in the device implementation.
  899. */
  900. err = clEnqueueMarker(queue, &task_events[worker->devid][pipeline_idx]);
  901. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  902. #endif
  903. _STARPU_TRACE_START_EXECUTING();
  904. }
  905. }
  906. else
  907. /* Synchronous execution */
  908. {
  909. _starpu_opencl_stop_job(j, worker);
  910. }
  911. }
  #ifdef STARPU_USE_OPENCL
  /*
   * Run the OpenCL worker loop for `workerarg` directly in the calling
   * context.  The worker's `set` pointer and `worker_is_initialized` flag are
   * reset before _starpu_opencl_worker() is entered.
   *
   * Always returns 0, once the worker loop has returned.
   */
  int _starpu_run_opencl(struct _starpu_worker *workerarg)
  {
      _STARPU_DEBUG("Running OpenCL %u from the application\n", workerarg->devid);

      workerarg->worker_is_initialized = 0;
      workerarg->set = NULL;

      /* Let's go ! */
      (void) _starpu_opencl_worker(workerarg);

      return 0;
  }
  #endif /* STARPU_USE_OPENCL */