driver_opencl.c 35 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2010-2017 Université de Bordeaux
  4. * Copyright (C) 2010 Mehdi Juhoor <mjuhoor@gmail.com>
  5. * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 CNRS
  6. * Copyright (C) 2011 Télécom-SudParis
  7. *
  8. * StarPU is free software; you can redistribute it and/or modify
  9. * it under the terms of the GNU Lesser General Public License as published by
  10. * the Free Software Foundation; either version 2.1 of the License, or (at
  11. * your option) any later version.
  12. *
  13. * StarPU is distributed in the hope that it will be useful, but
  14. * WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  16. *
  17. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  18. */
  19. #include <math.h>
  20. #include <starpu.h>
  21. #include <starpu_profiling.h>
  22. #include <common/config.h>
  23. #include <common/utils.h>
  24. #include <core/debug.h>
  25. #include <starpu_opencl.h>
  26. #include <drivers/driver_common/driver_common.h>
  27. #include "driver_opencl.h"
  28. #include "driver_opencl_utils.h"
  29. #include <common/utils.h>
  30. #include <datawizard/memory_manager.h>
  31. #include <datawizard/memory_nodes.h>
  32. #include <datawizard/malloc.h>
  33. #include <core/task.h>
  34. #ifdef STARPU_SIMGRID
  35. #include <core/simgrid.h>
  36. #endif
  37. static int nb_devices = -1;
  38. static int init_done = 0;
  39. static starpu_pthread_mutex_t big_lock = STARPU_PTHREAD_MUTEX_INITIALIZER;
  40. static size_t global_mem[STARPU_MAXOPENCLDEVS];
  41. #ifdef STARPU_USE_OPENCL
  42. static cl_context contexts[STARPU_MAXOPENCLDEVS];
  43. static cl_device_id devices[STARPU_MAXOPENCLDEVS];
  44. static cl_command_queue queues[STARPU_MAXOPENCLDEVS];
  45. static cl_command_queue in_transfer_queues[STARPU_MAXOPENCLDEVS];
  46. static cl_command_queue out_transfer_queues[STARPU_MAXOPENCLDEVS];
  47. static cl_command_queue peer_transfer_queues[STARPU_MAXOPENCLDEVS];
  48. #ifndef STARPU_SIMGRID
  49. static cl_command_queue alloc_queues[STARPU_MAXOPENCLDEVS];
  50. static cl_event task_events[STARPU_MAXOPENCLDEVS][STARPU_MAX_PIPELINE];
  51. #endif /* !STARPU_SIMGRID */
  52. #endif
  53. #ifdef STARPU_SIMGRID
  54. static unsigned task_finished[STARPU_MAXOPENCLDEVS][STARPU_MAX_PIPELINE];
  55. #endif /* STARPU_SIMGRID */
  56. void
  57. _starpu_opencl_discover_devices(struct _starpu_machine_config *config)
  58. {
  59. /* Discover the number of OpenCL devices. Fill the result in CONFIG. */
  60. /* As OpenCL must have been initialized before calling this function,
  61. * `nb_device' is ensured to be correctly set. */
  62. STARPU_ASSERT(init_done == 1);
  63. config->topology.nhwopenclgpus = nb_devices;
  64. }
  65. static void _starpu_opencl_limit_gpu_mem_if_needed(unsigned devid)
  66. {
  67. starpu_ssize_t limit;
  68. size_t STARPU_ATTRIBUTE_UNUSED totalGlobalMem = 0;
  69. size_t STARPU_ATTRIBUTE_UNUSED to_waste = 0;
  70. #ifdef STARPU_SIMGRID
  71. totalGlobalMem = _starpu_simgrid_get_memsize("OpenCL", devid);
  72. #elif defined(STARPU_USE_OPENCL)
  73. /* Request the size of the current device's memory */
  74. cl_int err;
  75. cl_ulong size;
  76. err = clGetDeviceInfo(devices[devid], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size), &size, NULL);
  77. if (STARPU_UNLIKELY(err != CL_SUCCESS))
  78. STARPU_OPENCL_REPORT_ERROR(err);
  79. totalGlobalMem = size;
  80. #endif
  81. limit = starpu_get_env_number("STARPU_LIMIT_OPENCL_MEM");
  82. if (limit == -1)
  83. {
  84. char name[30];
  85. sprintf(name, "STARPU_LIMIT_OPENCL_%u_MEM", devid);
  86. limit = starpu_get_env_number(name);
  87. }
  88. #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
  89. if (limit == -1)
  90. {
  91. /* Use 90% of the available memory by default. */
  92. limit = totalGlobalMem / (1024*1024) * 0.9;
  93. }
  94. #endif
  95. global_mem[devid] = limit * 1024*1024;
  96. #ifdef STARPU_USE_OPENCL
  97. /* How much memory to waste ? */
  98. to_waste = totalGlobalMem - global_mem[devid];
  99. #endif
  100. _STARPU_DEBUG("OpenCL device %d: Wasting %ld MB / Limit %ld MB / Total %ld MB / Remains %ld MB\n",
  101. devid, (long)to_waste/(1024*1024), (long) limit, (long)totalGlobalMem/(1024*1024),
  102. (long)(totalGlobalMem - to_waste)/(1024*1024));
  103. }
  104. #ifdef STARPU_USE_OPENCL
  105. void starpu_opencl_get_context(int devid, cl_context *context)
  106. {
  107. *context = contexts[devid];
  108. }
  109. void starpu_opencl_get_device(int devid, cl_device_id *device)
  110. {
  111. *device = devices[devid];
  112. }
  113. void starpu_opencl_get_queue(int devid, cl_command_queue *queue)
  114. {
  115. *queue = queues[devid];
  116. }
  117. void starpu_opencl_get_current_queue(cl_command_queue *queue)
  118. {
  119. struct _starpu_worker *worker = _starpu_get_local_worker_key();
  120. STARPU_ASSERT(queue);
  121. *queue = queues[worker->devid];
  122. }
  123. void starpu_opencl_get_current_context(cl_context *context)
  124. {
  125. struct _starpu_worker *worker = _starpu_get_local_worker_key();
  126. STARPU_ASSERT(context);
  127. *context = contexts[worker->devid];
  128. }
  129. #endif /* STARPU_USE_OPENCL */
  130. int _starpu_opencl_init_context(int devid)
  131. {
  132. #ifdef STARPU_SIMGRID
  133. int j;
  134. for (j = 0; j < STARPU_MAX_PIPELINE; j++)
  135. task_finished[devid][j] = 0;
  136. #else /* !STARPU_SIMGRID */
  137. cl_int err;
  138. cl_uint uint;
  139. STARPU_PTHREAD_MUTEX_LOCK(&big_lock);
  140. _STARPU_DEBUG("Initialising context for dev %d\n", devid);
  141. // Create a compute context
  142. err = 0;
  143. contexts[devid] = clCreateContext(NULL, 1, &devices[devid], NULL, NULL, &err);
  144. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  145. err = clGetDeviceInfo(devices[devid], CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(uint), &uint, NULL);
  146. if (STARPU_UNLIKELY(err != CL_SUCCESS))
  147. STARPU_OPENCL_REPORT_ERROR(err);
  148. starpu_malloc_set_align(uint/8);
  149. // Create execution queue for the given device
  150. queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], 0, &err);
  151. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  152. // Create transfer queue for the given device
  153. cl_command_queue_properties props;
  154. err = clGetDeviceInfo(devices[devid], CL_DEVICE_QUEUE_PROPERTIES, sizeof(props), &props, NULL);
  155. if (STARPU_UNLIKELY(err != CL_SUCCESS))
  156. STARPU_OPENCL_REPORT_ERROR(err);
  157. props &= ~CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
  158. in_transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
  159. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  160. out_transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
  161. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  162. peer_transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
  163. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  164. alloc_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], 0, &err);
  165. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  166. STARPU_PTHREAD_MUTEX_UNLOCK(&big_lock);
  167. #endif /* !STARPU_SIMGRID */
  168. return 0;
  169. }
  170. int _starpu_opencl_deinit_context(int devid)
  171. {
  172. #ifdef STARPU_SIMGRID
  173. int j;
  174. for (j = 0; j < STARPU_MAX_PIPELINE; j++)
  175. task_finished[devid][j] = 0;
  176. #else /* !STARPU_SIMGRID */
  177. cl_int err;
  178. STARPU_PTHREAD_MUTEX_LOCK(&big_lock);
  179. _STARPU_DEBUG("De-initialising context for dev %d\n", devid);
  180. err = clFinish(queues[devid]);
  181. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  182. err = clReleaseCommandQueue(queues[devid]);
  183. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  184. err = clFinish(in_transfer_queues[devid]);
  185. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  186. err = clReleaseCommandQueue(in_transfer_queues[devid]);
  187. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  188. err = clFinish(out_transfer_queues[devid]);
  189. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  190. err = clReleaseCommandQueue(out_transfer_queues[devid]);
  191. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  192. err = clFinish(peer_transfer_queues[devid]);
  193. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  194. err = clReleaseCommandQueue(peer_transfer_queues[devid]);
  195. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  196. err = clFinish(alloc_queues[devid]);
  197. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  198. err = clReleaseCommandQueue(alloc_queues[devid]);
  199. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  200. err = clReleaseContext(contexts[devid]);
  201. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  202. contexts[devid] = NULL;
  203. STARPU_PTHREAD_MUTEX_UNLOCK(&big_lock);
  204. #endif
  205. return 0;
  206. }
  207. #ifdef STARPU_USE_OPENCL
  208. cl_int starpu_opencl_allocate_memory(int devid STARPU_ATTRIBUTE_UNUSED, cl_mem *mem STARPU_ATTRIBUTE_UNUSED, size_t size STARPU_ATTRIBUTE_UNUSED, cl_mem_flags flags STARPU_ATTRIBUTE_UNUSED)
  209. {
  210. #ifdef STARPU_SIMGRID
  211. STARPU_ABORT();
  212. #else
  213. cl_int err;
  214. cl_mem memory;
  215. memory = clCreateBuffer(contexts[devid], flags, size, NULL, &err);
  216. if (err == CL_OUT_OF_HOST_MEMORY) return err;
  217. if (err == CL_MEM_OBJECT_ALLOCATION_FAILURE) return err;
  218. if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
  219. /*
  220. * OpenCL uses lazy memory allocation: we will only know if the
  221. * allocation failed when trying to copy data onto the device. But we
  222. * want to know this __now__, so we just perform a dummy copy.
  223. */
  224. char dummy = 0;
  225. cl_event ev;
  226. err = clEnqueueWriteBuffer(alloc_queues[devid], memory, CL_TRUE,
  227. 0, sizeof(dummy), &dummy,
  228. 0, NULL, &ev);
  229. if (err == CL_MEM_OBJECT_ALLOCATION_FAILURE)
  230. return err;
  231. if (err == CL_OUT_OF_RESOURCES)
  232. return err;
  233. if (err != CL_SUCCESS)
  234. STARPU_OPENCL_REPORT_ERROR(err);
  235. clWaitForEvents(1, &ev);
  236. clReleaseEvent(ev);
  237. *mem = memory;
  238. return CL_SUCCESS;
  239. #endif
  240. }
  241. cl_int starpu_opencl_copy_ram_to_opencl(void *ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, cl_mem buffer, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event, int *ret)
  242. {
  243. cl_int err;
  244. struct _starpu_worker *worker = _starpu_get_local_worker_key();
  245. if (event)
  246. _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
  247. cl_event ev;
  248. err = clEnqueueWriteBuffer(in_transfer_queues[worker->devid], buffer, CL_FALSE, offset, size, ptr, 0, NULL, &ev);
  249. if (event)
  250. _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
  251. if (STARPU_LIKELY(err == CL_SUCCESS))
  252. {
  253. if (event == NULL)
  254. {
  255. /* We want a synchronous copy, let's synchronise the queue */
  256. err = clWaitForEvents(1, &ev);
  257. if (STARPU_UNLIKELY(err))
  258. STARPU_OPENCL_REPORT_ERROR(err);
  259. err = clReleaseEvent(ev);
  260. if (STARPU_UNLIKELY(err))
  261. STARPU_OPENCL_REPORT_ERROR(err);
  262. }
  263. else
  264. {
  265. *event = ev;
  266. }
  267. if (ret)
  268. {
  269. *ret = (event == NULL) ? 0 : -EAGAIN;
  270. }
  271. }
  272. return err;
  273. }
  274. cl_int starpu_opencl_copy_opencl_to_ram(cl_mem buffer, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event, int *ret)
  275. {
  276. cl_int err;
  277. struct _starpu_worker *worker = _starpu_get_local_worker_key();
  278. if (event)
  279. _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
  280. cl_event ev;
  281. err = clEnqueueReadBuffer(out_transfer_queues[worker->devid], buffer, CL_FALSE, offset, size, ptr, 0, NULL, &ev);
  282. if (event)
  283. _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
  284. if (STARPU_LIKELY(err == CL_SUCCESS))
  285. {
  286. if (event == NULL)
  287. {
  288. /* We want a synchronous copy, let's synchronise the queue */
  289. err = clWaitForEvents(1, &ev);
  290. if (STARPU_UNLIKELY(err))
  291. STARPU_OPENCL_REPORT_ERROR(err);
  292. err = clReleaseEvent(ev);
  293. if (STARPU_UNLIKELY(err))
  294. STARPU_OPENCL_REPORT_ERROR(err);
  295. }
  296. else
  297. {
  298. *event = ev;
  299. }
  300. if (ret)
  301. {
  302. *ret = (event == NULL) ? 0 : -EAGAIN;
  303. }
  304. }
  305. return err;
  306. }
  307. cl_int starpu_opencl_copy_opencl_to_opencl(cl_mem src, unsigned src_node STARPU_ATTRIBUTE_UNUSED, size_t src_offset, cl_mem dst, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t dst_offset, size_t size, cl_event *event, int *ret)
  308. {
  309. cl_int err;
  310. struct _starpu_worker *worker = _starpu_get_local_worker_key();
  311. if (event)
  312. _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
  313. cl_event ev;
  314. err = clEnqueueCopyBuffer(peer_transfer_queues[worker->devid], src, dst, src_offset, dst_offset, size, 0, NULL, &ev);
  315. if (event)
  316. _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
  317. if (STARPU_LIKELY(err == CL_SUCCESS))
  318. {
  319. if (event == NULL)
  320. {
  321. /* We want a synchronous copy, let's synchronise the queue */
  322. err = clWaitForEvents(1, &ev);
  323. if (STARPU_UNLIKELY(err))
  324. STARPU_OPENCL_REPORT_ERROR(err);
  325. err = clReleaseEvent(ev);
  326. if (STARPU_UNLIKELY(err))
  327. STARPU_OPENCL_REPORT_ERROR(err);
  328. }
  329. else
  330. {
  331. *event = ev;
  332. }
  333. if (ret)
  334. {
  335. *ret = (event == NULL) ? 0 : -EAGAIN;
  336. }
  337. }
  338. return err;
  339. }
  340. cl_int starpu_opencl_copy_async_sync(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, cl_event *event)
  341. {
  342. enum starpu_node_kind src_kind = starpu_node_get_kind(src_node);
  343. enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);
  344. cl_int err;
  345. int ret;
  346. switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind))
  347. {
  348. case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
  349. err = starpu_opencl_copy_opencl_to_ram(
  350. (cl_mem) src, src_node,
  351. (void*) (dst + dst_offset), dst_node,
  352. size, src_offset, event, &ret);
  353. if (STARPU_UNLIKELY(err))
  354. STARPU_OPENCL_REPORT_ERROR(err);
  355. return ret;
  356. case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
  357. err = starpu_opencl_copy_ram_to_opencl(
  358. (void*) (src + src_offset), src_node,
  359. (cl_mem) dst, dst_node,
  360. size, dst_offset, event, &ret);
  361. if (STARPU_UNLIKELY(err))
  362. STARPU_OPENCL_REPORT_ERROR(err);
  363. return ret;
  364. case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_OPENCL_RAM):
  365. err = starpu_opencl_copy_opencl_to_opencl(
  366. (cl_mem) src, src_node, src_offset,
  367. (cl_mem) dst, dst_node, dst_offset,
  368. size, event, &ret);
  369. if (STARPU_UNLIKELY(err))
  370. STARPU_OPENCL_REPORT_ERROR(err);
  371. return ret;
  372. default:
  373. STARPU_ABORT();
  374. break;
  375. }
  376. }
  377. #if 0
  378. cl_int _starpu_opencl_copy_rect_opencl_to_ram(cl_mem buffer, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, const size_t buffer_origin[3], const size_t host_origin[3],
  379. const size_t region[3], size_t buffer_row_pitch, size_t buffer_slice_pitch,
  380. size_t host_row_pitch, size_t host_slice_pitch, cl_event *event)
  381. {
  382. cl_int err;
  383. struct _starpu_worker *worker = _starpu_get_local_worker_key();
  384. cl_bool blocking;
  385. blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
  386. if (event)
  387. _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
  388. err = clEnqueueReadBufferRect(out_transfer_queues[worker->devid], buffer, blocking, buffer_origin, host_origin, region, buffer_row_pitch,
  389. buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, 0, NULL, event);
  390. if (event)
  391. _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
  392. if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
  393. return CL_SUCCESS;
  394. }
  395. cl_int _starpu_opencl_copy_rect_ram_to_opencl(void *ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, cl_mem buffer, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, const size_t buffer_origin[3], const size_t host_origin[3],
  396. const size_t region[3], size_t buffer_row_pitch, size_t buffer_slice_pitch,
  397. size_t host_row_pitch, size_t host_slice_pitch, cl_event *event)
  398. {
  399. cl_int err;
  400. struct _starpu_worker *worker = _starpu_get_local_worker_key();
  401. cl_bool blocking;
  402. blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
  403. if (event)
  404. _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
  405. err = clEnqueueWriteBufferRect(in_transfer_queues[worker->devid], buffer, blocking, buffer_origin, host_origin, region, buffer_row_pitch,
  406. buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, 0, NULL, event);
  407. if (event)
  408. _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
  409. if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
  410. return CL_SUCCESS;
  411. }
  412. #endif
  413. #endif /* STARPU_USE_OPENCL */
  414. static size_t _starpu_opencl_get_global_mem_size(int devid)
  415. {
  416. return global_mem[devid];
  417. }
  418. void _starpu_opencl_init(void)
  419. {
  420. STARPU_PTHREAD_MUTEX_LOCK(&big_lock);
  421. if (!init_done)
  422. {
  423. #ifdef STARPU_SIMGRID
  424. nb_devices = _starpu_simgrid_get_nbhosts("OpenCL");
  425. #else /* STARPU_USE_OPENCL */
  426. cl_platform_id platform_id[_STARPU_OPENCL_PLATFORM_MAX];
  427. cl_uint nb_platforms;
  428. cl_int err;
  429. int i;
  430. cl_device_type device_type = CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR;
  431. _STARPU_DEBUG("Initialising OpenCL\n");
  432. // Get Platforms
  433. if (starpu_get_env_number("STARPU_OPENCL_ON_CPUS") > 0)
  434. device_type |= CL_DEVICE_TYPE_CPU;
  435. if (starpu_get_env_number("STARPU_OPENCL_ONLY_ON_CPUS") > 0)
  436. device_type = CL_DEVICE_TYPE_CPU;
  437. err = clGetPlatformIDs(_STARPU_OPENCL_PLATFORM_MAX, platform_id, &nb_platforms);
  438. if (STARPU_UNLIKELY(err != CL_SUCCESS)) nb_platforms=0;
  439. _STARPU_DEBUG("Platforms detected: %u\n", nb_platforms);
  440. _STARPU_DEBUG("CPU device type: %s\n", (device_type&CL_DEVICE_TYPE_CPU)?"requested":"not requested");
  441. _STARPU_DEBUG("GPU device type: %s\n", (device_type&CL_DEVICE_TYPE_GPU)?"requested":"not requested");
  442. _STARPU_DEBUG("Accelerator device type: %s\n", (device_type&CL_DEVICE_TYPE_ACCELERATOR)?"requested":"not requested");
  443. // Get devices
  444. nb_devices = 0;
  445. {
  446. unsigned j;
  447. for (j=0; j<nb_platforms; j++)
  448. {
  449. cl_uint num;
  450. int platform_valid = 1;
  451. char name[1024], vendor[1024];
  452. err = clGetPlatformInfo(platform_id[j], CL_PLATFORM_NAME, 1024, name, NULL);
  453. if (err != CL_SUCCESS)
  454. {
  455. STARPU_OPENCL_REPORT_ERROR_WITH_MSG("clGetPlatformInfo NAME", err);
  456. platform_valid = 0;
  457. }
  458. else
  459. {
  460. err = clGetPlatformInfo(platform_id[j], CL_PLATFORM_VENDOR, 1024, vendor, NULL);
  461. if (STARPU_UNLIKELY(err != CL_SUCCESS))
  462. {
  463. STARPU_OPENCL_REPORT_ERROR_WITH_MSG("clGetPlatformInfo VENDOR", err);
  464. platform_valid = 0;
  465. }
  466. }
  467. if(strcmp(name, "SOCL Platform") == 0)
  468. {
  469. platform_valid = 0;
  470. _STARPU_DEBUG("Skipping SOCL Platform\n");
  471. }
  472. #ifdef STARPU_VERBOSE
  473. if (platform_valid)
  474. _STARPU_DEBUG("Platform: %s - %s\n", name, vendor);
  475. else
  476. _STARPU_DEBUG("Platform invalid: %s - %s\n", name, vendor);
  477. #endif
  478. if (platform_valid && nb_devices <= STARPU_MAXOPENCLDEVS)
  479. {
  480. err = clGetDeviceIDs(platform_id[j], device_type, STARPU_MAXOPENCLDEVS-nb_devices, STARPU_MAXOPENCLDEVS == nb_devices ? NULL : &devices[nb_devices], &num);
  481. if (err == CL_DEVICE_NOT_FOUND)
  482. {
  483. const cl_device_type all_device_types = CL_DEVICE_TYPE_CPU|CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR;
  484. if (device_type != all_device_types)
  485. {
  486. _STARPU_DEBUG(" No devices of the requested type(s) subset detected on this platform\n");
  487. }
  488. else
  489. {
  490. _STARPU_DEBUG(" No devices detected on this platform\n");
  491. }
  492. }
  493. else
  494. {
  495. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  496. _STARPU_DEBUG(" %u devices detected\n", num);
  497. nb_devices += num;
  498. }
  499. }
  500. }
  501. }
  502. // Get location of OpenCl kernel source files
  503. _starpu_opencl_program_dir = starpu_getenv("STARPU_OPENCL_PROGRAM_DIR");
  504. if (nb_devices > STARPU_MAXOPENCLDEVS)
  505. {
  506. _STARPU_DISP("# Warning: %u OpenCL devices available. Only %d enabled. Use configure option --enable-maxopencldev=xxx to update the maximum value of supported OpenCL devices?\n", nb_devices, STARPU_MAXOPENCLDEVS);
  507. nb_devices = STARPU_MAXOPENCLDEVS;
  508. }
  509. // initialise internal structures
  510. for(i=0 ; i<nb_devices ; i++)
  511. {
  512. contexts[i] = NULL;
  513. queues[i] = NULL;
  514. in_transfer_queues[i] = NULL;
  515. out_transfer_queues[i] = NULL;
  516. peer_transfer_queues[i] = NULL;
  517. alloc_queues[i] = NULL;
  518. }
  519. #endif /* STARPU_USE_OPENCL */
  520. init_done=1;
  521. }
  522. STARPU_PTHREAD_MUTEX_UNLOCK(&big_lock);
  523. }
  524. #ifndef STARPU_SIMGRID
  525. static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname);
  526. #endif
  527. static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker *worker, unsigned char pipeline_idx);
  528. static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker *worker);
  529. static void _starpu_opencl_execute_job(struct starpu_task *task, struct _starpu_worker *worker);
  530. int _starpu_opencl_driver_init(struct _starpu_worker *worker)
  531. {
  532. int devid = worker->devid;
  533. _starpu_driver_start(worker, _STARPU_FUT_OPENCL_KEY, 0);
  534. _starpu_opencl_init_context(devid);
  535. /* one more time to avoid hacks from third party lib :) */
  536. _starpu_bind_thread_on_cpu(worker->config, worker->bindid, worker->workerid);
  537. _starpu_opencl_limit_gpu_mem_if_needed(devid);
  538. _starpu_memory_manager_set_global_memory_size(worker->memory_node, _starpu_opencl_get_global_mem_size(devid));
  539. float size = (float) global_mem[devid] / (1<<30);
  540. #ifdef STARPU_SIMGRID
  541. const char *devname = "Simgrid";
  542. #else
  543. /* get the device's name */
  544. char devname[128];
  545. _starpu_opencl_get_device_name(devid, devname, 128);
  546. #endif
  547. snprintf(worker->name, sizeof(worker->name), "OpenCL %d (%s %.1f GiB)", devid, devname, size);
  548. snprintf(worker->short_name, sizeof(worker->short_name), "OpenCL %d", devid);
  549. starpu_pthread_setname(worker->short_name);
  550. worker->pipeline_length = starpu_get_env_number_default("STARPU_OPENCL_PIPELINE", 2);
  551. if (worker->pipeline_length > STARPU_MAX_PIPELINE)
  552. {
  553. _STARPU_DISP("Warning: STARPU_OPENCL_PIPELINE is %u, but STARPU_MAX_PIPELINE is only %u", worker->pipeline_length, STARPU_MAX_PIPELINE);
  554. worker->pipeline_length = STARPU_MAX_PIPELINE;
  555. }
  556. #if !defined(STARPU_SIMGRID) && !defined(STARPU_NON_BLOCKING_DRIVERS)
  557. if (worker->pipeline_length >= 1)
  558. {
  559. /* We need non-blocking drivers, to poll for OPENCL task
  560. * termination */
  561. _STARPU_DISP("Warning: reducing STARPU_OPENCL_PIPELINE to 0 because blocking drivers are enabled (and simgrid is not enabled)\n");
  562. worker->pipeline_length = 0;
  563. }
  564. #endif
  565. _STARPU_DEBUG("OpenCL (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, worker->bindid);
  566. _STARPU_TRACE_WORKER_INIT_END(worker->workerid);
  567. /* tell the main thread that this one is ready */
  568. STARPU_PTHREAD_MUTEX_LOCK(&worker->mutex);
  569. worker->status = STATUS_UNKNOWN;
  570. worker->worker_is_initialized = 1;
  571. STARPU_PTHREAD_COND_SIGNAL(&worker->ready_cond);
  572. STARPU_PTHREAD_MUTEX_UNLOCK(&worker->mutex);
  573. return 0;
  574. }
  575. int _starpu_opencl_driver_run_once(struct _starpu_worker *worker)
  576. {
  577. int workerid = worker->workerid;
  578. unsigned memnode = worker->memory_node;
  579. struct _starpu_job *j;
  580. struct starpu_task *task;
  581. int res;
  582. int idle_tasks, idle_transfers;
  583. #ifdef STARPU_SIMGRID
  584. starpu_pthread_wait_reset(&worker->wait);
  585. #endif
  586. idle_tasks = 0;
  587. idle_transfers = 0;
  588. /* First test for transfers pending for next task */
  589. task = worker->task_transferring;
  590. if (!task)
  591. idle_transfers++;
  592. if (task && worker->nb_buffers_transferred == worker->nb_buffers_totransfer)
  593. {
  594. _STARPU_TRACE_END_PROGRESS(memnode);
  595. j = _starpu_get_job_associated_to_task(task);
  596. _starpu_fetch_task_input_tail(task, j, worker);
  597. /* Reset it */
  598. worker->task_transferring = NULL;
  599. if (worker->ntasks > 1 && !(task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC))
  600. {
  601. /* We have to execute a non-asynchronous task but we
  602. * still have tasks in the pipeline... Record it to
  603. * prevent more tasks from coming, and do it later */
  604. worker->pipeline_stuck = 1;
  605. return 0;
  606. }
  607. _starpu_opencl_execute_job(task, worker);
  608. _STARPU_TRACE_START_PROGRESS(memnode);
  609. }
  610. /* Then poll for completed jobs */
  611. if (worker->ntasks && worker->current_tasks[worker->first_task] != worker->task_transferring)
  612. {
  613. #ifndef STARPU_SIMGRID
  614. size_t size;
  615. int err;
  616. #endif
  617. /* On-going asynchronous task, check for its termination first */
  618. task = worker->current_tasks[worker->first_task];
  619. #ifdef STARPU_SIMGRID
  620. if (!task_finished[worker->devid][worker->first_task])
  621. #else /* !STARPU_SIMGRID */
  622. cl_int status;
  623. err = clGetEventInfo(task_events[worker->devid][worker->first_task], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, &size);
  624. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  625. STARPU_ASSERT(size == sizeof(cl_int));
  626. if (status != CL_COMPLETE)
  627. #endif /* !STARPU_SIMGRID */
  628. {
  629. }
  630. else
  631. {
  632. _STARPU_TRACE_END_PROGRESS(memnode);
  633. #ifndef STARPU_SIMGRID
  634. err = clReleaseEvent(task_events[worker->devid][worker->first_task]);
  635. if (STARPU_UNLIKELY(err)) STARPU_OPENCL_REPORT_ERROR(err);
  636. task_events[worker->devid][worker->first_task] = 0;
  637. #endif
  638. /* Asynchronous task completed! */
  639. _starpu_opencl_stop_job(_starpu_get_job_associated_to_task(task), worker);
  640. /* See next task if any */
  641. if (worker->ntasks && worker->current_tasks[worker->first_task] != worker->task_transferring)
  642. {
  643. task = worker->current_tasks[worker->first_task];
  644. j = _starpu_get_job_associated_to_task(task);
  645. if (task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)
  646. {
  647. /* An asynchronous task, it was already queued,
  648. * it's now running, record its start time. */
  649. _starpu_driver_start_job(worker, j, &worker->perf_arch, &j->cl_start, 0, starpu_profiling_status_get());
  650. }
  651. else
  652. {
  653. /* A synchronous task, we have finished flushing the pipeline, we can now at last execute it. */
  654. _STARPU_TRACE_EVENT("sync_task");
  655. _starpu_opencl_execute_job(task, worker);
  656. _STARPU_TRACE_EVENT("end_sync_task");
  657. worker->pipeline_stuck = 0;
  658. }
  659. }
  660. _STARPU_TRACE_START_PROGRESS(memnode);
  661. }
  662. }
  663. if (!worker->pipeline_length || worker->ntasks < worker->pipeline_length)
  664. idle_tasks++;
  665. #if defined(STARPU_NON_BLOCKING_DRIVERS) && !defined(STARPU_SIMGRID)
  666. if (!idle_tasks)
  667. {
  668. /* No task ready yet, no better thing to do than waiting */
  669. __starpu_datawizard_progress(1, !idle_transfers);
  670. return 0;
  671. }
  672. #endif
  673. res = !idle_tasks || !idle_transfers;
  674. res |= __starpu_datawizard_progress(1, 1);
  675. task = _starpu_get_worker_task(worker, workerid, memnode);
  676. #ifdef STARPU_SIMGRID
  677. if (!res && !task)
  678. starpu_pthread_wait_wait(&worker->wait);
  679. #endif
  680. if (task == NULL)
  681. return 0;
  682. j = _starpu_get_job_associated_to_task(task);
  683. /* can OpenCL do that task ? */
  684. if (!_STARPU_OPENCL_MAY_PERFORM(j))
  685. {
  686. /* this is not a OpenCL task */
  687. _starpu_push_task_to_workers(task);
  688. return 0;
  689. }
  690. _STARPU_TRACE_END_PROGRESS(memnode);
  691. worker->current_tasks[(worker->first_task + worker->ntasks)%STARPU_MAX_PIPELINE] = task;
  692. worker->ntasks++;
  693. /* Fetch data asynchronously */
  694. res = _starpu_fetch_task_input(task, j, 1);
  695. STARPU_ASSERT(res == 0);
  696. _STARPU_TRACE_START_PROGRESS(memnode);
  697. return 0;
  698. }
  699. int _starpu_opencl_driver_deinit(struct _starpu_worker *worker)
  700. {
  701. _STARPU_TRACE_WORKER_DEINIT_START;
  702. unsigned memnode = worker->memory_node;
  703. _starpu_handle_all_pending_node_data_requests(memnode);
  704. /* In case there remains some memory that was automatically
  705. * allocated by StarPU, we release it now. Note that data
  706. * coherency is not maintained anymore at that point ! */
  707. _starpu_free_all_automatically_allocated_buffers(memnode);
  708. _starpu_malloc_shutdown(memnode);
  709. unsigned devid = worker->devid;
  710. _starpu_opencl_deinit_context(devid);
  711. worker->worker_is_initialized = 0;
  712. _STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_OPENCL_KEY);
  713. return 0;
  714. }
  715. void *_starpu_opencl_worker(void *_arg)
  716. {
  717. struct _starpu_worker* worker = _arg;
  718. _starpu_opencl_driver_init(worker);
  719. _STARPU_TRACE_START_PROGRESS(worker->memory_node);
  720. while (_starpu_machine_is_running())
  721. {
  722. _starpu_may_pause();
  723. _starpu_opencl_driver_run_once(worker);
  724. }
  725. _starpu_opencl_driver_deinit(worker);
  726. _STARPU_TRACE_END_PROGRESS(worker->memory_node);
  727. return NULL;
  728. }
  729. #ifdef STARPU_USE_OPENCL
  730. #ifndef STARPU_SIMGRID
  731. static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname)
  732. {
  733. int err;
  734. if (!init_done)
  735. {
  736. _starpu_opencl_init();
  737. }
  738. // Get device name
  739. err = clGetDeviceInfo(devices[dev], CL_DEVICE_NAME, lname, name, NULL);
  740. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  741. _STARPU_DEBUG("Device %d : [%s]\n", dev, name);
  742. return EXIT_SUCCESS;
  743. }
  744. #endif
  745. #endif
  746. unsigned _starpu_opencl_get_device_count(void)
  747. {
  748. if (!init_done)
  749. {
  750. _starpu_opencl_init();
  751. }
  752. return nb_devices;
  753. }
  754. #ifdef STARPU_USE_OPENCL
  755. cl_device_type _starpu_opencl_get_device_type(int devid)
  756. {
  757. int err;
  758. cl_device_type type;
  759. if (!init_done)
  760. _starpu_opencl_init();
  761. err = clGetDeviceInfo(devices[devid], CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL);
  762. if (STARPU_UNLIKELY(err != CL_SUCCESS))
  763. STARPU_OPENCL_REPORT_ERROR(err);
  764. return type;
  765. }
  766. #endif /* STARPU_USE_OPENCL */
  767. static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker *worker, unsigned char pipeline_idx STARPU_ATTRIBUTE_UNUSED)
  768. {
  769. STARPU_ASSERT(j);
  770. struct starpu_task *task = j->task;
  771. int profiling = starpu_profiling_status_get();
  772. STARPU_ASSERT(task);
  773. struct starpu_codelet *cl = task->cl;
  774. STARPU_ASSERT(cl);
  775. _starpu_set_current_task(task);
  776. if (worker->ntasks == 1)
  777. {
  778. /* We are alone in the pipeline, the kernel will start now, record it */
  779. _starpu_driver_start_job(worker, j, &worker->perf_arch, &j->cl_start, 0, profiling);
  780. }
  781. starpu_opencl_func_t func = _starpu_task_get_opencl_nth_implementation(cl, j->nimpl);
  782. STARPU_ASSERT_MSG(func, "when STARPU_OPENCL is defined in 'where', opencl_func or opencl_funcs has to be defined");
  783. if (_starpu_get_disable_kernels() <= 0)
  784. {
  785. _STARPU_TRACE_START_EXECUTING();
  786. #ifdef STARPU_SIMGRID
  787. double length = NAN;
  788. int async = task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC;
  789. int simulate = 1;
  790. if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE && !async)
  791. {
  792. /* Actually execute function */
  793. simulate = 0;
  794. func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
  795. #ifdef STARPU_OPENCL_SIMULATOR
  796. #ifndef CL_PROFILING_CLOCK_CYCLE_COUNT
  797. #ifdef CL_PROFILING_COMMAND_SHAVE_CYCLE_COUNT
  798. #define CL_PROFILING_CLOCK_CYCLE_COUNT CL_PROFILING_COMMAND_SHAVE_CYCLE_COUNT
  799. #else
  800. #error The OpenCL simulator must provide CL_PROFILING_CLOCK_CYCLE_COUNT
  801. #endif
  802. #endif
  803. struct starpu_profiling_task_info *profiling_info = task->profiling_info;
  804. STARPU_ASSERT_MSG(profiling_info->used_cycles, "Application kernel must call starpu_opencl_collect_stats to collect simulated time");
  805. #ifdef HAVE_MSG_HOST_GET_SPEED
  806. length = ((double) profiling_info->used_cycles)/MSG_host_get_speed(MSG_host_self());
  807. #else
  808. length = ((double) profiling_info->used_cycles)/MSG_get_host_speed(MSG_host_self());
  809. #endif
  810. /* And give the simulated time to simgrid */
  811. simulate = 1;
  812. #endif
  813. }
  814. else if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT && !async)
  815. {
  816. _SIMGRID_TIMER_BEGIN(1);
  817. func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
  818. _SIMGRID_TIMER_END;
  819. simulate=0;
  820. }
  821. if (simulate)
  822. _starpu_simgrid_submit_job(worker->workerid, j, &worker->perf_arch, length,
  823. async ? &task_finished[worker->devid][pipeline_idx] : NULL);
  824. #else
  825. func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
  826. #endif
  827. _STARPU_TRACE_END_EXECUTING();
  828. }
  829. return 0;
  830. }
  831. static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker *worker)
  832. {
  833. struct timespec codelet_end;
  834. int profiling = starpu_profiling_status_get();
  835. _starpu_set_current_task(NULL);
  836. if (worker->pipeline_length)
  837. worker->current_tasks[worker->first_task] = NULL;
  838. else
  839. worker->current_task = NULL;
  840. worker->first_task = (worker->first_task + 1) % STARPU_MAX_PIPELINE;
  841. worker->ntasks--;
  842. _starpu_driver_end_job(worker, j, &worker->perf_arch, &codelet_end, 0, profiling);
  843. struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
  844. STARPU_ASSERT_MSG(sched_ctx != NULL, "there should be a worker %d in the ctx of this job \n", worker->workerid);
  845. if(!sched_ctx->sched_policy)
  846. _starpu_driver_update_job_feedback(j, worker, &sched_ctx->perf_arch, &j->cl_start, &codelet_end, profiling);
  847. else
  848. _starpu_driver_update_job_feedback(j, worker, &worker->perf_arch, &j->cl_start, &codelet_end, profiling);
  849. _starpu_push_task_output(j);
  850. _starpu_handle_job_termination(j);
  851. }
  852. static void _starpu_opencl_execute_job(struct starpu_task *task, struct _starpu_worker *worker)
  853. {
  854. int res;
  855. struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
  856. unsigned char pipeline_idx = (worker->first_task + worker->ntasks - 1)%STARPU_MAX_PIPELINE;
  857. res = _starpu_opencl_start_job(j, worker, pipeline_idx);
  858. if (res)
  859. {
  860. switch (res)
  861. {
  862. case -EAGAIN:
  863. _STARPU_DISP("ouch, OpenCL could not actually run task %p, putting it back...\n", task);
  864. _starpu_push_task_to_workers(task);
  865. STARPU_ABORT();
  866. default:
  867. STARPU_ABORT();
  868. }
  869. }
  870. if (task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)
  871. {
  872. /* Record event to synchronize with task termination later */
  873. #ifndef STARPU_SIMGRID
  874. cl_command_queue queue;
  875. starpu_opencl_get_queue(worker->devid, &queue);
  876. #endif
  877. if (worker->pipeline_length == 0)
  878. {
  879. #ifdef STARPU_SIMGRID
  880. _starpu_simgrid_wait_tasks(worker->workerid);
  881. #else
  882. starpu_opencl_get_queue(worker->devid, &queue);
  883. clFinish(queue);
  884. #endif
  885. _starpu_opencl_stop_job(j, worker);
  886. }
  887. else
  888. {
  889. #ifndef STARPU_SIMGRID
  890. int err;
  891. /* the function clEnqueueMarker is deprecated from
  892. * OpenCL version 1.2. We would like to use the new
  893. * function clEnqueueMarkerWithWaitList. We could do
  894. * it by checking its availability through our own
  895. * configure macro HAVE_CLENQUEUEMARKERWITHWAITLIST
  896. * and the OpenCL macro CL_VERSION_1_2. However these
  897. * 2 macros detect the function availability in the
  898. * ICD and not in the device implementation.
  899. */
  900. err = clEnqueueMarker(queue, &task_events[worker->devid][pipeline_idx]);
  901. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  902. #endif
  903. _STARPU_TRACE_START_EXECUTING();
  904. }
  905. }
  906. else
  907. /* Synchronous execution */
  908. {
  909. _starpu_opencl_stop_job(j, worker);
  910. }
  911. }
  912. #ifdef STARPU_USE_OPENCL
  913. int _starpu_run_opencl(struct _starpu_worker *workerarg)
  914. {
  915. _STARPU_DEBUG("Running OpenCL %u from the application\n", workerarg->devid);
  916. workerarg->set = NULL;
  917. workerarg->worker_is_initialized = 0;
  918. /* Let's go ! */
  919. _starpu_opencl_worker(workerarg);
  920. return 0;
  921. }
  922. struct _starpu_driver_ops _starpu_driver_opencl_ops =
  923. {
  924. .init = _starpu_opencl_driver_init,
  925. .run = _starpu_run_opencl,
  926. .run_once = _starpu_opencl_driver_run_once,
  927. .deinit = _starpu_opencl_driver_deinit
  928. };
  929. #endif /* STARPU_USE_OPENCL */