driver_opencl.c

/* StarPU --- Runtime system for heterogeneous multicore architectures.
 *
 * Copyright (C) 2010-2021 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
 * Copyright (C) 2010 Mehdi Juhoor
 * Copyright (C) 2011 Télécom-SudParis
 * Copyright (C) 2013 Thibaut Lambert
 *
 * StarPU is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * StarPU is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * See the GNU Lesser General Public License in COPYING.LGPL for more details.
 */

#include <math.h>
#include <starpu.h>
#include <starpu_profiling.h>
#include <common/config.h>
#include <common/utils.h>
#include <core/debug.h>
#include <starpu_opencl.h>
#include <drivers/driver_common/driver_common.h>
#include <drivers/opencl/driver_opencl.h>
#include <drivers/opencl/driver_opencl_utils.h>
#include <common/utils.h>
#include <datawizard/memory_manager.h>
#include <datawizard/memory_nodes.h>
#include <datawizard/malloc.h>
#include <datawizard/datawizard.h>
#include <core/task.h>
#include <common/knobs.h>
#ifdef STARPU_SIMGRID
#include <core/simgrid.h>
#endif

static int nb_devices = -1;
static int init_done = 0;
static starpu_pthread_mutex_t big_lock = STARPU_PTHREAD_MUTEX_INITIALIZER;

static size_t global_mem[STARPU_MAXOPENCLDEVS];

#ifdef STARPU_USE_OPENCL
static cl_context contexts[STARPU_MAXOPENCLDEVS];
static cl_device_id devices[STARPU_MAXOPENCLDEVS];
static cl_command_queue queues[STARPU_MAXOPENCLDEVS];
static cl_command_queue in_transfer_queues[STARPU_MAXOPENCLDEVS];
static cl_command_queue out_transfer_queues[STARPU_MAXOPENCLDEVS];
static cl_command_queue peer_transfer_queues[STARPU_MAXOPENCLDEVS];
#ifndef STARPU_SIMGRID
static cl_command_queue alloc_queues[STARPU_MAXOPENCLDEVS];
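/* One completion event per in-flight pipelined task slot; the driver polls
 * these to detect kernel termination. */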
static cl_event task_events[STARPU_MAXOPENCLDEVS][STARPU_MAX_PIPELINE];
#endif /* !STARPU_SIMGRID */
#endif

#ifdef STARPU_SIMGRID
static unsigned task_finished[STARPU_MAXOPENCLDEVS][STARPU_MAX_PIPELINE];
static starpu_pthread_mutex_t opencl_alloc_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
#endif /* STARPU_SIMGRID */

#define _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err) do { if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err); } while(0)

void
_starpu_opencl_discover_devices(struct _starpu_machine_config *config)
{
	/* Discover the number of OpenCL devices. Fill the result in CONFIG. */
	/* As OpenCL must have been initialized before calling this function,
	 * `nb_device' is ensured to be correctly set. */
	STARPU_ASSERT(init_done == 1);
	config->topology.nhwdevices[STARPU_OPENCL_WORKER] = nb_devices;
}

static void _starpu_opencl_limit_gpu_mem_if_needed(unsigned devid)
{
	starpu_ssize_t limit;
	size_t STARPU_ATTRIBUTE_UNUSED totalGlobalMem = 0;
	size_t STARPU_ATTRIBUTE_UNUSED to_waste = 0;

#ifdef STARPU_SIMGRID
	totalGlobalMem = _starpu_simgrid_get_memsize("OpenCL", devid);
#elif defined(STARPU_USE_OPENCL)
	/* Request the size of the current device's memory */
	cl_int err;
	cl_ulong size;
	err = clGetDeviceInfo(devices[devid], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size), &size, NULL);
	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
	totalGlobalMem = size;
#endif

	limit = starpu_get_env_number("STARPU_LIMIT_OPENCL_MEM");
	if (limit == -1)
	{
		char name[30];
		snprintf(name, sizeof(name), "STARPU_LIMIT_OPENCL_%u_MEM", devid);
		limit = starpu_get_env_number(name);
	}
#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
	if (limit == -1)
	{
		/* Use 90% of the available memory by default. */
		limit = totalGlobalMem / (1024*1024) * 0.9;
	}
#endif

	global_mem[devid] = limit * 1024*1024;

#ifdef STARPU_USE_OPENCL
	/* How much memory to waste ? */
	to_waste = totalGlobalMem - global_mem[devid];
#endif

	_STARPU_DEBUG("OpenCL device %u: Wasting %ld MB / Limit %ld MB / Total %ld MB / Remains %ld MB\n",
		      devid, (long)to_waste/(1024*1024), (long) limit, (long)totalGlobalMem/(1024*1024),
		      (long)(totalGlobalMem - to_waste)/(1024*1024));
}

#ifdef STARPU_USE_OPENCL
void starpu_opencl_get_context(int devid, cl_context *context)
{
	*context = contexts[devid];
}

void starpu_opencl_get_device(int devid, cl_device_id *device)
{
	*device = devices[devid];
}

void starpu_opencl_get_queue(int devid, cl_command_queue *queue)
{
	*queue = queues[devid];
}

void starpu_opencl_get_current_queue(cl_command_queue *queue)
{
	struct _starpu_worker *worker = _starpu_get_local_worker_key();
	STARPU_ASSERT(queue);
	*queue = queues[worker->devid];
}

void starpu_opencl_get_current_context(cl_context *context)
{
	struct _starpu_worker *worker = _starpu_get_local_worker_key();
	STARPU_ASSERT(context);
	*context = contexts[worker->devid];
}
#endif /* STARPU_USE_OPENCL */

int _starpu_opencl_init_context(int devid)
{
#ifdef STARPU_SIMGRID
	int j;
	for (j = 0; j < STARPU_MAX_PIPELINE; j++)
		task_finished[devid][j] = 0;
#else /* !STARPU_SIMGRID */
	cl_int err;
	cl_uint uint;

	STARPU_PTHREAD_MUTEX_LOCK(&big_lock);

	_STARPU_DEBUG("Initialising context for dev %d\n", devid);

	// Create a compute context
	err = 0;
	contexts[devid] = clCreateContext(NULL, 1, &devices[devid], NULL, NULL, &err);
	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);

	err = clGetDeviceInfo(devices[devid], CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(uint), &uint, NULL);
	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
	starpu_malloc_set_align(uint/8);

	// Create execution queue for the given device
	queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], 0, &err);
	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);

	// Create transfer queue for the given device
	cl_command_queue_properties props;
	err = clGetDeviceInfo(devices[devid], CL_DEVICE_QUEUE_PROPERTIES, sizeof(props), &props, NULL);
	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
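	/* Strip the out-of-order capability bit below so the transfer queues
	 * are created as in-order queues. */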
	props &= ~CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
	in_transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
	out_transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
	peer_transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
	alloc_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], 0, &err);
	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);

	STARPU_PTHREAD_MUTEX_UNLOCK(&big_lock);
#endif /* !STARPU_SIMGRID */
	return 0;
}

int _starpu_opencl_deinit_context(int devid)
{
#ifdef STARPU_SIMGRID
	int j;
	for (j = 0; j < STARPU_MAX_PIPELINE; j++)
		task_finished[devid][j] = 0;
#else /* !STARPU_SIMGRID */
	cl_int err;

	STARPU_PTHREAD_MUTEX_LOCK(&big_lock);

	_STARPU_DEBUG("De-initialising context for dev %d\n", devid);

	err = clFinish(queues[devid]);
	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
	err = clReleaseCommandQueue(queues[devid]);
	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);

	err = clFinish(in_transfer_queues[devid]);
	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
	err = clReleaseCommandQueue(in_transfer_queues[devid]);
	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);

	err = clFinish(out_transfer_queues[devid]);
	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
	err = clReleaseCommandQueue(out_transfer_queues[devid]);
	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);

	err = clFinish(peer_transfer_queues[devid]);
	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
	err = clReleaseCommandQueue(peer_transfer_queues[devid]);
	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);

	err = clFinish(alloc_queues[devid]);
	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
	err = clReleaseCommandQueue(alloc_queues[devid]);
	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);

	err = clReleaseContext(contexts[devid]);
	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
	contexts[devid] = NULL;

	STARPU_PTHREAD_MUTEX_UNLOCK(&big_lock);
#endif
	return 0;
}

#ifdef STARPU_USE_OPENCL
cl_int starpu_opencl_allocate_memory(int devid STARPU_ATTRIBUTE_UNUSED, cl_mem *mem STARPU_ATTRIBUTE_UNUSED, size_t size STARPU_ATTRIBUTE_UNUSED, cl_mem_flags flags STARPU_ATTRIBUTE_UNUSED)
{
#ifdef STARPU_SIMGRID
	STARPU_ABORT();
#else
	cl_int err;
	cl_mem memory;

	memory = clCreateBuffer(contexts[devid], flags, size, NULL, &err);
	if (err == CL_OUT_OF_HOST_MEMORY)
		return err;
	if (err == CL_MEM_OBJECT_ALLOCATION_FAILURE)
		return err;
	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);

	/*
	 * OpenCL uses lazy memory allocation: we will only know if the
	 * allocation failed when trying to copy data onto the device. But we
	 * want to know this __now__, so we just perform a dummy copy.
	 */
	char dummy = 0;
	cl_event ev;
	err = clEnqueueWriteBuffer(alloc_queues[devid], memory, CL_TRUE,
				   0, sizeof(dummy), &dummy,
				   0, NULL, &ev);
	if (err == CL_MEM_OBJECT_ALLOCATION_FAILURE)
		return err;
	if (err == CL_OUT_OF_RESOURCES)
		return err;
	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);

	clWaitForEvents(1, &ev);
	clReleaseEvent(ev);

	*mem = memory;
	return CL_SUCCESS;
#endif
}
cl_int starpu_opencl_copy_ram_to_opencl(void *ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, cl_mem buffer, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event, int *ret)
{
	cl_int err;
	struct _starpu_worker *worker = _starpu_get_local_worker_key();
	double start = 0.;

	if (event)
		starpu_interface_start_driver_copy_async(src_node, dst_node, &start);

	cl_event ev;
	err = clEnqueueWriteBuffer(in_transfer_queues[worker->devid], buffer, CL_FALSE, offset, size, ptr, 0, NULL, &ev);

	if (event)
		starpu_interface_end_driver_copy_async(src_node, dst_node, start);

	if (STARPU_LIKELY(err == CL_SUCCESS))
	{
		if (event == NULL)
		{
			/* We want a synchronous copy, let's synchronise the queue */
			err = clWaitForEvents(1, &ev);
			_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
			err = clReleaseEvent(ev);
			_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
		}
		else
		{
			clFlush(in_transfer_queues[worker->devid]);
			*event = ev;
		}

		if (ret)
		{
			*ret = (event == NULL) ? 0 : -EAGAIN;
		}
	}

	return err;
}

cl_int starpu_opencl_copy_opencl_to_ram(cl_mem buffer, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event, int *ret)
{
	cl_int err;
	struct _starpu_worker *worker = _starpu_get_local_worker_key();
	double start = 0.;

	if (event)
		starpu_interface_start_driver_copy_async(src_node, dst_node, &start);

	cl_event ev;
	err = clEnqueueReadBuffer(out_transfer_queues[worker->devid], buffer, CL_FALSE, offset, size, ptr, 0, NULL, &ev);

	if (event)
		starpu_interface_end_driver_copy_async(src_node, dst_node, start);

	if (STARPU_LIKELY(err == CL_SUCCESS))
	{
		if (event == NULL)
		{
			/* We want a synchronous copy, let's synchronise the queue */
			err = clWaitForEvents(1, &ev);
			_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
			err = clReleaseEvent(ev);
			_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
		}
		else
		{
			clFlush(out_transfer_queues[worker->devid]);
			*event = ev;
		}

		if (ret)
		{
			*ret = (event == NULL) ? 0 : -EAGAIN;
		}
	}

	return err;
}

cl_int starpu_opencl_copy_opencl_to_opencl(cl_mem src, unsigned src_node STARPU_ATTRIBUTE_UNUSED, size_t src_offset, cl_mem dst, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t dst_offset, size_t size, cl_event *event, int *ret)
{
	cl_int err;
	struct _starpu_worker *worker = _starpu_get_local_worker_key();
	double start = 0.;

	if (event)
		starpu_interface_start_driver_copy_async(src_node, dst_node, &start);

	cl_event ev;
	err = clEnqueueCopyBuffer(peer_transfer_queues[worker->devid], src, dst, src_offset, dst_offset, size, 0, NULL, &ev);

	if (event)
		starpu_interface_end_driver_copy_async(src_node, dst_node, start);

	if (STARPU_LIKELY(err == CL_SUCCESS))
	{
		if (event == NULL)
		{
			/* We want a synchronous copy, let's synchronise the queue */
			err = clWaitForEvents(1, &ev);
			_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
			err = clReleaseEvent(ev);
			_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
		}
		else
		{
			clFlush(peer_transfer_queues[worker->devid]);
			*event = ev;
		}

		if (ret)
		{
			*ret = (event == NULL) ? 0 : -EAGAIN;
		}
	}

	return err;
}
cl_int starpu_opencl_copy_async_sync(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, cl_event *event)
{
	enum starpu_node_kind src_kind = starpu_node_get_kind(src_node);
	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);
	cl_int err;
	int ret;

	if (src_kind == STARPU_OPENCL_RAM && dst_kind == STARPU_CPU_RAM)
	{
		err = starpu_opencl_copy_opencl_to_ram((cl_mem) src, src_node,
						       (void*) (dst + dst_offset), dst_node,
						       size, src_offset, event, &ret);
		_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
		return ret;
	}

	if (src_kind == STARPU_CPU_RAM && dst_kind == STARPU_OPENCL_RAM)
	{
		err = starpu_opencl_copy_ram_to_opencl((void*) (src + src_offset), src_node,
						       (cl_mem) dst, dst_node,
						       size, dst_offset, event, &ret);
		_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
		return ret;
	}

	if (src_kind == STARPU_OPENCL_RAM && (dst_kind == STARPU_CPU_RAM || dst_kind == STARPU_OPENCL_RAM))
	{
		err = starpu_opencl_copy_opencl_to_opencl((cl_mem) src, src_node, src_offset,
							  (cl_mem) dst, dst_node, dst_offset,
							  size, event, &ret);
		_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
		return ret;
	}

	STARPU_ABORT();
}

#if 0
cl_int _starpu_opencl_copy_rect_opencl_to_ram(cl_mem buffer, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, const size_t buffer_origin[3], const size_t host_origin[3],
					      const size_t region[3], size_t buffer_row_pitch, size_t buffer_slice_pitch,
					      size_t host_row_pitch, size_t host_slice_pitch, cl_event *event)
{
	cl_int err;
	struct _starpu_worker *worker = _starpu_get_local_worker_key();
	cl_bool blocking;
	double start = 0.;

	blocking = (event == NULL) ? CL_TRUE : CL_FALSE;

	if (event)
		starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
	err = clEnqueueReadBufferRect(out_transfer_queues[worker->devid], buffer, blocking, buffer_origin, host_origin, region, buffer_row_pitch,
				      buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, 0, NULL, event);
	clFlush(out_transfer_queues[worker->devid]);
	if (event)
		starpu_interface_end_driver_copy_async(src_node, dst_node, start);
	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);

	return CL_SUCCESS;
}

cl_int _starpu_opencl_copy_rect_ram_to_opencl(void *ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, cl_mem buffer, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, const size_t buffer_origin[3], const size_t host_origin[3],
					      const size_t region[3], size_t buffer_row_pitch, size_t buffer_slice_pitch,
					      size_t host_row_pitch, size_t host_slice_pitch, cl_event *event)
{
	cl_int err;
	struct _starpu_worker *worker = _starpu_get_local_worker_key();
	cl_bool blocking;
	double start = 0.;

	blocking = (event == NULL) ? CL_TRUE : CL_FALSE;

	if (event)
		starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
	err = clEnqueueWriteBufferRect(in_transfer_queues[worker->devid], buffer, blocking, buffer_origin, host_origin, region, buffer_row_pitch,
				       buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, 0, NULL, event);
	clFlush(in_transfer_queues[worker->devid]);
	if (event)
		starpu_interface_end_driver_copy_async(src_node, dst_node, start);
	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);

	return CL_SUCCESS;
}
#endif
#endif /* STARPU_USE_OPENCL */
static size_t _starpu_opencl_get_global_mem_size(int devid)
{
	return global_mem[devid];
}

void _starpu_opencl_init(void)
{
	STARPU_PTHREAD_MUTEX_LOCK(&big_lock);
	if (!init_done)
	{
#ifdef STARPU_SIMGRID
		nb_devices = _starpu_simgrid_get_nbhosts("OpenCL");
#else /* STARPU_USE_OPENCL */
		cl_platform_id platform_id[_STARPU_OPENCL_PLATFORM_MAX];
		cl_uint nb_platforms;
		cl_int err;
		int i;
		cl_device_type device_type = CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR;

		_STARPU_DEBUG("Initialising OpenCL\n");

		// Get Platforms
		if (starpu_get_env_number("STARPU_OPENCL_ON_CPUS") > 0)
			device_type |= CL_DEVICE_TYPE_CPU;
		if (starpu_get_env_number("STARPU_OPENCL_ONLY_ON_CPUS") > 0)
			device_type = CL_DEVICE_TYPE_CPU;
		err = clGetPlatformIDs(_STARPU_OPENCL_PLATFORM_MAX, platform_id, &nb_platforms);
		if (STARPU_UNLIKELY(err != CL_SUCCESS)) nb_platforms=0;
		_STARPU_DEBUG("Platforms detected: %u\n", nb_platforms);
		_STARPU_DEBUG("CPU device type: %s\n", (device_type&CL_DEVICE_TYPE_CPU)?"requested":"not requested");
		_STARPU_DEBUG("GPU device type: %s\n", (device_type&CL_DEVICE_TYPE_GPU)?"requested":"not requested");
		_STARPU_DEBUG("Accelerator device type: %s\n", (device_type&CL_DEVICE_TYPE_ACCELERATOR)?"requested":"not requested");

		// Get devices
		nb_devices = 0;
		{
			unsigned j;
			for (j=0; j<nb_platforms; j++)
			{
				cl_uint num;
				int platform_valid = 1;
				char name[1024], vendor[1024];

				err = clGetPlatformInfo(platform_id[j], CL_PLATFORM_NAME, 1024, name, NULL);
				if (err != CL_SUCCESS)
				{
					STARPU_OPENCL_REPORT_ERROR_WITH_MSG("clGetPlatformInfo NAME", err);
					platform_valid = 0;
				}
				else
				{
					err = clGetPlatformInfo(platform_id[j], CL_PLATFORM_VENDOR, 1024, vendor, NULL);
					if (STARPU_UNLIKELY(err != CL_SUCCESS))
					{
						STARPU_OPENCL_REPORT_ERROR_WITH_MSG("clGetPlatformInfo VENDOR", err);
						platform_valid = 0;
					}
				}

				if (strcmp(name, "SOCL Platform") == 0)
				{
					platform_valid = 0;
					_STARPU_DEBUG("Skipping SOCL Platform\n");
				}

#ifdef STARPU_VERBOSE
				if (platform_valid)
					_STARPU_DEBUG("Platform: %s - %s\n", name, vendor);
				else
					_STARPU_DEBUG("Platform invalid: %s - %s\n", name, vendor);
#endif

				if (platform_valid && nb_devices <= STARPU_MAXOPENCLDEVS)
				{
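					/* Ask this platform for at most the remaining free
					 * slots of the devices[] array. */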
					err = clGetDeviceIDs(platform_id[j], device_type, STARPU_MAXOPENCLDEVS-nb_devices, STARPU_MAXOPENCLDEVS == nb_devices ? NULL : &devices[nb_devices], &num);
					if (err == CL_DEVICE_NOT_FOUND)
					{
						const cl_device_type all_device_types = CL_DEVICE_TYPE_CPU|CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR;
						if (device_type != all_device_types)
						{
							_STARPU_DEBUG(" No devices of the requested type(s) subset detected on this platform\n");
						}
						else
						{
							_STARPU_DEBUG(" No devices detected on this platform\n");
						}
					}
					else
					{
						_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
						_STARPU_DEBUG(" %u devices detected\n", num);
						nb_devices += num;
					}
				}
			}
		}

		// Get location of OpenCl kernel source files
		_starpu_opencl_program_dir = starpu_getenv("STARPU_OPENCL_PROGRAM_DIR");

		if (nb_devices > STARPU_MAXOPENCLDEVS)
		{
			_STARPU_DISP("# Warning: %d OpenCL devices available. Only %d enabled. Use the configure option --enable-maxopencldev=xxx to raise the maximum number of supported OpenCL devices.\n", nb_devices, STARPU_MAXOPENCLDEVS);
			nb_devices = STARPU_MAXOPENCLDEVS;
		}

		// initialise internal structures
		for(i=0 ; i<nb_devices ; i++)
		{
			contexts[i] = NULL;
			queues[i] = NULL;
			in_transfer_queues[i] = NULL;
			out_transfer_queues[i] = NULL;
			peer_transfer_queues[i] = NULL;
			alloc_queues[i] = NULL;
		}
#endif /* STARPU_USE_OPENCL */
		init_done = 1;
	}
	STARPU_PTHREAD_MUTEX_UNLOCK(&big_lock);
}
#ifndef STARPU_SIMGRID
static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname);
#endif
static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker *worker, unsigned char pipeline_idx);
static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker *worker);
static void _starpu_opencl_execute_job(struct starpu_task *task, struct _starpu_worker *worker);

int _starpu_opencl_driver_init(struct _starpu_worker *worker)
{
	int devid = worker->devid;

	_starpu_driver_start(worker, STARPU_OPENCL_WORKER, 0);
	_starpu_opencl_init_context(devid);

	/* one more time to avoid hacks from third party lib :) */
	_starpu_bind_thread_on_cpu(worker->bindid, worker->workerid, NULL);

	_starpu_opencl_limit_gpu_mem_if_needed(devid);
	_starpu_memory_manager_set_global_memory_size(worker->memory_node, _starpu_opencl_get_global_mem_size(devid));

	float size = (float) global_mem[devid] / (1<<30);
#ifdef STARPU_SIMGRID
	const char *devname = "Simgrid";
#else
	/* get the device's name */
	char devname[64];
	_starpu_opencl_get_device_name(devid, devname, 64);
#endif
	snprintf(worker->name, sizeof(worker->name), "OpenCL %d (%s %.1f GiB)", devid, devname, size);
	snprintf(worker->short_name, sizeof(worker->short_name), "OpenCL %d", devid);
	starpu_pthread_setname(worker->short_name);

	worker->pipeline_length = starpu_get_env_number_default("STARPU_OPENCL_PIPELINE", 2);
	if (worker->pipeline_length > STARPU_MAX_PIPELINE)
	{
		_STARPU_DISP("Warning: STARPU_OPENCL_PIPELINE is %u, but STARPU_MAX_PIPELINE is only %u", worker->pipeline_length, STARPU_MAX_PIPELINE);
		worker->pipeline_length = STARPU_MAX_PIPELINE;
	}
#if !defined(STARPU_SIMGRID) && !defined(STARPU_NON_BLOCKING_DRIVERS)
	if (worker->pipeline_length >= 1)
	{
		/* We need non-blocking drivers, to poll for OPENCL task
		 * termination */
		_STARPU_DISP("Warning: reducing STARPU_OPENCL_PIPELINE to 0 because blocking drivers are enabled (and simgrid is not enabled)\n");
		worker->pipeline_length = 0;
	}
#endif

	_STARPU_DEBUG("OpenCL (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, worker->bindid);

	_STARPU_TRACE_WORKER_INIT_END(worker->workerid);

	/* tell the main thread that this one is ready */
	STARPU_PTHREAD_MUTEX_LOCK(&worker->mutex);
	worker->status = STATUS_UNKNOWN;
	worker->worker_is_initialized = 1;
	STARPU_PTHREAD_COND_SIGNAL(&worker->ready_cond);
	STARPU_PTHREAD_MUTEX_UNLOCK(&worker->mutex);

	return 0;
}

int _starpu_opencl_driver_run_once(struct _starpu_worker *worker)
{
	int workerid = worker->workerid;
	unsigned memnode = worker->memory_node;
	struct _starpu_job *j;
	struct starpu_task *task;
	int res;
	int idle_tasks, idle_transfers;

#ifdef STARPU_SIMGRID
	starpu_pthread_wait_reset(&worker->wait);
#endif

	idle_tasks = 0;
	idle_transfers = 0;

	/* First test for transfers pending for next task */
	task = worker->task_transferring;
	if (!task)
		idle_transfers++;
	if (task && worker->nb_buffers_transferred == worker->nb_buffers_totransfer)
	{
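		/* All input transfers for this task have completed; the read
		 * barrier below makes sure the transferred data is visible
		 * before the task starts using it. */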
		STARPU_RMB();
		_STARPU_TRACE_END_PROGRESS(memnode);
		j = _starpu_get_job_associated_to_task(task);

		_starpu_fetch_task_input_tail(task, j, worker);
		_starpu_set_worker_status(worker, STATUS_UNKNOWN);
		/* Reset it */
		worker->task_transferring = NULL;

		if (worker->ntasks > 1 && !(task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC))
		{
			/* We have to execute a non-asynchronous task but we
			 * still have tasks in the pipeline... Record it to
			 * prevent more tasks from coming, and do it later */
			worker->pipeline_stuck = 1;
			return 0;
		}

		_starpu_opencl_execute_job(task, worker);

		_STARPU_TRACE_START_PROGRESS(memnode);
	}

	/* Then poll for completed jobs */
	if (worker->ntasks && worker->current_tasks[worker->first_task] != worker->task_transferring)
	{
#ifndef STARPU_SIMGRID
		size_t size;
		int err;
#endif

		/* On-going asynchronous task, check for its termination first */
		task = worker->current_tasks[worker->first_task];

#ifdef STARPU_SIMGRID
		if (!task_finished[worker->devid][worker->first_task])
#else /* !STARPU_SIMGRID */
		cl_int status;
		err = clGetEventInfo(task_events[worker->devid][worker->first_task], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, &size);
		_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
		STARPU_ASSERT(size == sizeof(cl_int));
		if (status != CL_COMPLETE)
#endif /* !STARPU_SIMGRID */
		{
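			/* The task at the head of the pipeline has not
			 * completed yet: nothing to do this time around. */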
		}
		else
		{
			_STARPU_TRACE_END_PROGRESS(memnode);
#ifndef STARPU_SIMGRID
			err = clReleaseEvent(task_events[worker->devid][worker->first_task]);
			_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
			task_events[worker->devid][worker->first_task] = 0;
#endif

			/* Asynchronous task completed! */
			_starpu_opencl_stop_job(_starpu_get_job_associated_to_task(task), worker);
			/* See next task if any */
			if (worker->ntasks && worker->current_tasks[worker->first_task] != worker->task_transferring)
			{
				task = worker->current_tasks[worker->first_task];
				j = _starpu_get_job_associated_to_task(task);
				if (task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)
				{
					/* An asynchronous task, it was already queued,
					 * it's now running, record its start time. */
					_starpu_driver_start_job(worker, j, &worker->perf_arch, 0, starpu_profiling_status_get());
				}
				else
				{
					/* A synchronous task, we have finished flushing the pipeline, we can now at last execute it. */
					_STARPU_TRACE_EVENT("sync_task");
					_starpu_opencl_execute_job(task, worker);
					_STARPU_TRACE_EVENT("end_sync_task");
					worker->pipeline_stuck = 0;
				}
			}
			_STARPU_TRACE_START_PROGRESS(memnode);
		}
	}

	if (!worker->pipeline_length || worker->ntasks < worker->pipeline_length)
		idle_tasks++;

#if defined(STARPU_NON_BLOCKING_DRIVERS) && !defined(STARPU_SIMGRID)
	if (!idle_tasks)
	{
		/* No task ready yet, no better thing to do than waiting */
		__starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, !idle_transfers);
		return 0;
	}
#endif

	res = __starpu_datawizard_progress(STARPU_DATAWIZARD_DO_ALLOC, 1);

	task = _starpu_get_worker_task(worker, workerid, memnode);

#ifdef STARPU_SIMGRID
	if (!res && !task)
		starpu_pthread_wait_wait(&worker->wait);
#endif

	if (task == NULL)
		return 0;

	j = _starpu_get_job_associated_to_task(task);
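	/* Append the new task at the tail of the worker's software pipeline
	 * (a circular buffer of STARPU_MAX_PIPELINE slots). */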
	worker->current_tasks[(worker->first_task + worker->ntasks)%STARPU_MAX_PIPELINE] = task;
	worker->ntasks++;
	if (worker->pipeline_length == 0)
		/* _starpu_get_worker_task checks the .current_task field if pipeline_length == 0
		 *
		 * TODO: update driver to not use current_tasks[] when pipeline_length == 0,
		 * as for the cuda driver */
		worker->current_task = task;

	/* can OpenCL do that task? */
	if (!_STARPU_MAY_PERFORM(j, OPENCL))
	{
		/* this is not an OpenCL task */
		_starpu_worker_refuse_task(worker, task);
		return 0;
	}

	_STARPU_TRACE_END_PROGRESS(memnode);
	/* Fetch data asynchronously */
	res = _starpu_fetch_task_input(task, j, 1);
	STARPU_ASSERT(res == 0);
	_STARPU_TRACE_START_PROGRESS(memnode);
	return 0;
}
int _starpu_opencl_driver_deinit(struct _starpu_worker *worker)
{
	_STARPU_TRACE_WORKER_DEINIT_START;

	unsigned memnode = worker->memory_node;

	_starpu_datawizard_handle_all_pending_node_data_requests(memnode);

	/* In case there remains some memory that was automatically
	 * allocated by StarPU, we release it now. Note that data
	 * coherency is not maintained anymore at that point ! */
	_starpu_free_all_automatically_allocated_buffers(memnode);

	_starpu_malloc_shutdown(memnode);

	unsigned devid = worker->devid;
	_starpu_opencl_deinit_context(devid);

	worker->worker_is_initialized = 0;
	_STARPU_TRACE_WORKER_DEINIT_END(STARPU_OPENCL_WORKER);

	return 0;
}

void *_starpu_opencl_worker(void *_arg)
{
	struct _starpu_worker* worker = _arg;

	_starpu_opencl_driver_init(worker);
	_STARPU_TRACE_START_PROGRESS(worker->memory_node);
	while (_starpu_machine_is_running())
	{
		_starpu_may_pause();
		_starpu_opencl_driver_run_once(worker);
	}
	_starpu_opencl_driver_deinit(worker);
	_STARPU_TRACE_END_PROGRESS(worker->memory_node);

	return NULL;
}

#ifdef STARPU_USE_OPENCL
#ifndef STARPU_SIMGRID
static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname)
{
	int err;

	if (!init_done)
	{
		_starpu_opencl_init();
	}

	// Get device name
	err = clGetDeviceInfo(devices[dev], CL_DEVICE_NAME, lname, name, NULL);
	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);

	_STARPU_DEBUG("Device %d : [%s]\n", dev, name);
	return EXIT_SUCCESS;
}
#endif
#endif

unsigned _starpu_opencl_get_device_count(void)
{
	if (!init_done)
	{
		_starpu_opencl_init();
	}
	return nb_devices;
}

#ifdef STARPU_USE_OPENCL
cl_device_type _starpu_opencl_get_device_type(int devid)
{
	int err;
	cl_device_type type;

	if (!init_done)
		_starpu_opencl_init();

	err = clGetDeviceInfo(devices[devid], CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL);
	_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);

	return type;
}
#endif /* STARPU_USE_OPENCL */
static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker *worker, unsigned char pipeline_idx STARPU_ATTRIBUTE_UNUSED)
{
	STARPU_ASSERT(j);
	struct starpu_task *task = j->task;

	int profiling = starpu_profiling_status_get();

	STARPU_ASSERT(task);
	struct starpu_codelet *cl = task->cl;
	STARPU_ASSERT(cl);

	_starpu_set_current_task(task);

	if (worker->ntasks == 1)
	{
		/* We are alone in the pipeline, the kernel will start now, record it */
		_starpu_driver_start_job(worker, j, &worker->perf_arch, 0, profiling);
	}

	starpu_opencl_func_t func = _starpu_task_get_opencl_nth_implementation(cl, j->nimpl);
	STARPU_ASSERT_MSG(func, "when STARPU_OPENCL is defined in 'where', opencl_func or opencl_funcs has to be defined");

	if (_starpu_get_disable_kernels() <= 0)
	{
		_STARPU_TRACE_START_EXECUTING();
#ifdef STARPU_SIMGRID
		double length = NAN;
		double energy = NAN;
		int async = task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC;
		int simulate = 1;
		if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE && !async)
		{
			/* Actually execute function */
			simulate = 0;
			func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
#ifdef STARPU_OPENCL_SIMULATOR
#ifndef CL_PROFILING_CLOCK_CYCLE_COUNT
#ifdef CL_PROFILING_COMMAND_SHAVE_CYCLE_COUNT
#define CL_PROFILING_CLOCK_CYCLE_COUNT CL_PROFILING_COMMAND_SHAVE_CYCLE_COUNT
#else
#error The OpenCL simulator must provide CL_PROFILING_CLOCK_CYCLE_COUNT
#endif
#endif
			struct starpu_profiling_task_info *profiling_info = task->profiling_info;
			STARPU_ASSERT_MSG(profiling_info->used_cycles, "Application kernel must call starpu_opencl_collect_stats to collect simulated time");
#if defined(HAVE_SG_HOST_SPEED) || defined(sg_host_speed)
# if defined(HAVE_SG_HOST_SELF) || defined(sg_host_self)
			length = ((double) profiling_info->used_cycles)/sg_host_speed(sg_host_self());
# else
			length = ((double) profiling_info->used_cycles)/sg_host_speed(MSG_host_self());
# endif
#elif defined HAVE_MSG_HOST_GET_SPEED || defined(MSG_host_get_speed)
			length = ((double) profiling_info->used_cycles)/MSG_host_get_speed(MSG_host_self());
#else
			length = ((double) profiling_info->used_cycles)/MSG_get_host_speed(MSG_host_self());
#endif
			energy = profiling_info->energy_consumed;
			/* And give the simulated time to simgrid */
			simulate = 1;
#endif
		}
		else if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT && !async)
		{
			_SIMGRID_TIMER_BEGIN(1);
			func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
			_SIMGRID_TIMER_END;
			simulate = 0;
		}

		if (simulate)
		{
			struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
			_starpu_simgrid_submit_job(sched_ctx->id, worker->workerid, j, &worker->perf_arch, length, energy,
						   async ? &task_finished[worker->devid][pipeline_idx] : NULL);
		}
#else
		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
		cl_command_queue queue;
		starpu_opencl_get_queue(worker->devid, &queue);
#endif
		_STARPU_TRACE_END_EXECUTING();
	}

	return 0;
}

static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker *worker)
{
	int profiling = starpu_profiling_status_get();

	_starpu_set_current_task(NULL);
	if (worker->pipeline_length)
		worker->current_tasks[worker->first_task] = NULL;
	else
		worker->current_task = NULL;
	worker->first_task = (worker->first_task + 1) % STARPU_MAX_PIPELINE;
	worker->ntasks--;

	_starpu_driver_end_job(worker, j, &worker->perf_arch, 0, profiling);

	struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
	STARPU_ASSERT_MSG(sched_ctx != NULL, "there should be a worker %d in the ctx of this job \n", worker->workerid);
	if (!sched_ctx->sched_policy)
		_starpu_driver_update_job_feedback(j, worker, &sched_ctx->perf_arch, profiling);
	else
		_starpu_driver_update_job_feedback(j, worker, &worker->perf_arch, profiling);

	_starpu_push_task_output(j);

	_starpu_handle_job_termination(j);
}

static void _starpu_opencl_execute_job(struct starpu_task *task, struct _starpu_worker *worker)
{
	int res;

	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
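	/* The task was just appended to the pipeline, so it occupies the last
	 * used slot of the circular buffer. */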
	unsigned char pipeline_idx = (worker->first_task + worker->ntasks - 1)%STARPU_MAX_PIPELINE;

	res = _starpu_opencl_start_job(j, worker, pipeline_idx);

	if (res)
	{
		switch (res)
		{
		case -EAGAIN:
			_STARPU_DISP("ouch, OpenCL could not actually run task %p, putting it back...\n", task);
			_starpu_push_task_to_workers(task);
			STARPU_ABORT();
		default:
			STARPU_ABORT();
		}
	}

	if (task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)
	{
		/* Record event to synchronize with task termination later */
#ifndef STARPU_SIMGRID
		cl_command_queue queue;
		starpu_opencl_get_queue(worker->devid, &queue);
#endif

		if (worker->pipeline_length == 0)
		{
#ifdef STARPU_SIMGRID
			_starpu_simgrid_wait_tasks(worker->workerid);
#else
			starpu_opencl_get_queue(worker->devid, &queue);
			clFinish(queue);
#endif
			_starpu_opencl_stop_job(j, worker);
		}
		else
		{
#ifndef STARPU_SIMGRID
			int err;
			/* the function clEnqueueMarker is deprecated from
			 * OpenCL version 1.2. We would like to use the new
			 * function clEnqueueMarkerWithWaitList. We could do
			 * it by checking its availability through our own
			 * configure macro HAVE_CLENQUEUEMARKERWITHWAITLIST
			 * and the OpenCL macro CL_VERSION_1_2. However these
			 * 2 macros detect the function availability in the
			 * ICD and not in the device implementation.
			 */
			err = clEnqueueMarker(queue, &task_events[worker->devid][pipeline_idx]);
			_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
			clFlush(queue);
#endif
			_STARPU_TRACE_START_EXECUTING();
		}
	}
	else
	{
		/* Synchronous execution */
		_starpu_opencl_stop_job(j, worker);
	}
}
#ifdef STARPU_USE_OPENCL
int _starpu_run_opencl(struct _starpu_worker *workerarg)
{
	_STARPU_DEBUG("Running OpenCL %u from the application\n", workerarg->devid);

	workerarg->set = NULL;
	workerarg->worker_is_initialized = 0;

	/* Let's go ! */
	_starpu_opencl_worker(workerarg);

	return 0;
}

struct _starpu_driver_ops _starpu_driver_opencl_ops =
{
	.init = _starpu_opencl_driver_init,
	.run = _starpu_run_opencl,
	.run_once = _starpu_opencl_driver_run_once,
	.deinit = _starpu_opencl_driver_deinit
};
#endif

#if defined(STARPU_USE_OPENCL)
unsigned _starpu_opencl_test_request_completion(struct _starpu_async_channel *async_channel)
{
	cl_int event_status;
	cl_event opencl_event = (*async_channel).event.opencl_event;

	if (opencl_event == NULL) STARPU_ABORT();
	cl_int err = clGetEventInfo(opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);
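	/* A negative execution status is the OpenCL error code of the command
	 * that terminated abnormally. */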
	if (event_status < 0)
		STARPU_OPENCL_REPORT_ERROR(event_status);
	if (event_status == CL_COMPLETE)
	{
		err = clReleaseEvent(opencl_event);
		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
	}
	return (event_status == CL_COMPLETE);
}

void _starpu_opencl_wait_request_completion(struct _starpu_async_channel *async_channel)
{
	cl_int err;
	if ((*async_channel).event.opencl_event == NULL)
		STARPU_ABORT();
	err = clWaitForEvents(1, &((*async_channel).event.opencl_event));
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);
	err = clReleaseEvent((*async_channel).event.opencl_event);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);
}

int _starpu_opencl_copy_interface_from_opencl_to_opencl(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
{
	int src_kind = starpu_node_get_kind(src_node);
	int dst_kind = starpu_node_get_kind(dst_node);
	STARPU_ASSERT(src_kind == STARPU_OPENCL_RAM && dst_kind == STARPU_OPENCL_RAM);

	int ret = 1;
	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;

	/* STARPU_OPENCL_RAM -> STARPU_OPENCL_RAM */
	STARPU_ASSERT(starpu_worker_get_local_memory_node() == dst_node || starpu_worker_get_local_memory_node() == src_node);
	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() || !(copy_methods->opencl_to_opencl_async || copy_methods->any_to_any))
	{
		STARPU_ASSERT(copy_methods->opencl_to_opencl || copy_methods->any_to_any);
		/* this is not associated to a request so it's synchronous */
		if (copy_methods->opencl_to_opencl)
			copy_methods->opencl_to_opencl(src_interface, src_node, dst_interface, dst_node);
		else
			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
	}
	else
	{
		req->async_channel.node_ops = &_starpu_driver_opencl_node_ops;
		if (copy_methods->opencl_to_opencl_async)
			ret = copy_methods->opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
		else
		{
			STARPU_ASSERT(copy_methods->any_to_any);
			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
		}
	}
	return ret;
}

int _starpu_opencl_copy_interface_from_opencl_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
{
	int src_kind = starpu_node_get_kind(src_node);
	int dst_kind = starpu_node_get_kind(dst_node);
	STARPU_ASSERT(src_kind == STARPU_OPENCL_RAM && dst_kind == STARPU_CPU_RAM);

	int ret = 1;
	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;

	/* OpenCL -> RAM */
	STARPU_ASSERT(starpu_worker_get_local_memory_node() == src_node);
	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() || !(copy_methods->opencl_to_ram_async || copy_methods->any_to_any))
	{
		STARPU_ASSERT(copy_methods->opencl_to_ram || copy_methods->any_to_any);
		/* this is not associated to a request so it's synchronous */
		if (copy_methods->opencl_to_ram)
			copy_methods->opencl_to_ram(src_interface, src_node, dst_interface, dst_node);
		else
			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
	}
	else
	{
		req->async_channel.node_ops = &_starpu_driver_opencl_node_ops;
		if (copy_methods->opencl_to_ram_async)
			ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
		else
		{
			STARPU_ASSERT(copy_methods->any_to_any);
			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
		}
	}
	return ret;
}

int _starpu_opencl_copy_interface_from_cpu_to_opencl(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
{
	int src_kind = starpu_node_get_kind(src_node);
	int dst_kind = starpu_node_get_kind(dst_node);
	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_OPENCL_RAM);

	int ret = 0;
	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;

	/* STARPU_CPU_RAM -> STARPU_OPENCL_RAM */
	STARPU_ASSERT(starpu_worker_get_local_memory_node() == dst_node);
	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() || !(copy_methods->ram_to_opencl_async || copy_methods->any_to_any))
	{
		STARPU_ASSERT(copy_methods->ram_to_opencl || copy_methods->any_to_any);
		/* this is not associated to a request so it's synchronous */
		if (copy_methods->ram_to_opencl)
			copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
		else
			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
	}
	else
	{
		req->async_channel.node_ops = &_starpu_driver_opencl_node_ops;
		if (copy_methods->ram_to_opencl_async)
			ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
		else
		{
			STARPU_ASSERT(copy_methods->any_to_any);
			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
		}
	}
	return ret;
}

int _starpu_opencl_copy_data_from_opencl_to_opencl(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
{
	int src_kind = starpu_node_get_kind(src_node);
	int dst_kind = starpu_node_get_kind(dst_node);
	STARPU_ASSERT(src_kind == STARPU_OPENCL_RAM && dst_kind == STARPU_OPENCL_RAM);

	return starpu_opencl_copy_async_sync(src, src_offset, src_node,
					     dst, dst_offset, dst_node,
					     size,
					     &async_channel->event.opencl_event);
}

int _starpu_opencl_copy_data_from_opencl_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
{
	int src_kind = starpu_node_get_kind(src_node);
	int dst_kind = starpu_node_get_kind(dst_node);
	STARPU_ASSERT(src_kind == STARPU_OPENCL_RAM && dst_kind == STARPU_CPU_RAM);

	return starpu_opencl_copy_async_sync(src, src_offset, src_node,
					     dst, dst_offset, dst_node,
					     size,
					     &async_channel->event.opencl_event);
}

int _starpu_opencl_copy_data_from_cpu_to_opencl(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
{
	int src_kind = starpu_node_get_kind(src_node);
	int dst_kind = starpu_node_get_kind(dst_node);
	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_OPENCL_RAM);

	return starpu_opencl_copy_async_sync(src, src_offset, src_node,
					     dst, dst_offset, dst_node,
					     size,
					     &async_channel->event.opencl_event);
}
#endif

uintptr_t _starpu_opencl_malloc_on_node(unsigned dst_node, size_t size, int flags)
{
	(void)flags;
	uintptr_t addr = 0;
#ifdef STARPU_SIMGRID
	static uintptr_t last[STARPU_MAXNODES];

	/* Sleep for the allocation */
	STARPU_PTHREAD_MUTEX_LOCK(&opencl_alloc_mutex);
	if (_starpu_simgrid_cuda_malloc_cost())
		starpu_sleep(0.000175);
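	/* Hand out fake, monotonically increasing addresses: no real
	 * allocation is performed in simulation mode. */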
	if (!last[dst_node])
		last[dst_node] = 1<<10;
	addr = last[dst_node];
	last[dst_node] += size;
	STARPU_ASSERT(last[dst_node] >= addr);
	STARPU_PTHREAD_MUTEX_UNLOCK(&opencl_alloc_mutex);
#else
	int ret;
	cl_mem ptr;

	ret = starpu_opencl_allocate_memory(starpu_memory_node_get_devid(dst_node), &ptr, size, CL_MEM_READ_WRITE);
	if (ret)
	{
		addr = 0;
	}
	else
	{
		addr = (uintptr_t)ptr;
	}
#endif
	return addr;
}

void _starpu_opencl_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags)
{
	(void)dst_node;
	(void)addr;
	(void)size;
	(void)flags;

#ifdef STARPU_SIMGRID
	STARPU_PTHREAD_MUTEX_LOCK(&opencl_alloc_mutex);
	/* Sleep for the free */
	if (_starpu_simgrid_cuda_malloc_cost())
		starpu_sleep(0.000750);
	STARPU_PTHREAD_MUTEX_UNLOCK(&opencl_alloc_mutex);
#else
	cl_int err;
	err = clReleaseMemObject((void*)addr);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);
#endif
}

int _starpu_opencl_is_direct_access_supported(unsigned node, unsigned handling_node)
{
	(void)node;
	(void)handling_node;
	return 0;
}

#ifdef STARPU_SIMGRID
struct _starpu_node_ops _starpu_driver_opencl_node_ops =
{
	.copy_interface_to[STARPU_CPU_RAM] = NULL,
	.copy_interface_to[STARPU_OPENCL_RAM] = NULL,

	.copy_data_to[STARPU_CPU_RAM] = NULL,
	.copy_data_to[STARPU_OPENCL_RAM] = NULL,

	.wait_request_completion = NULL,
	.test_request_completion = NULL,

	.is_direct_access_supported = _starpu_opencl_is_direct_access_supported,
	.malloc_on_node = _starpu_opencl_malloc_on_node,
	.free_on_node = _starpu_opencl_free_on_node,
	.name = "opencl driver"
};
#else
struct _starpu_node_ops _starpu_driver_opencl_node_ops =
{
	.copy_interface_to[STARPU_CPU_RAM] = _starpu_opencl_copy_interface_from_opencl_to_cpu,
	.copy_interface_to[STARPU_OPENCL_RAM] = _starpu_opencl_copy_interface_from_opencl_to_opencl,

	.copy_data_to[STARPU_CPU_RAM] = _starpu_opencl_copy_data_from_opencl_to_cpu,
	.copy_data_to[STARPU_OPENCL_RAM] = _starpu_opencl_copy_data_from_opencl_to_opencl,

	/* TODO: copy2D/3D? */

	.wait_request_completion = _starpu_opencl_wait_request_completion,
	.test_request_completion = _starpu_opencl_test_request_completion,

	.is_direct_access_supported = _starpu_opencl_is_direct_access_supported,
	.malloc_on_node = _starpu_opencl_malloc_on_node,
	.free_on_node = _starpu_opencl_free_on_node,
	.name = "opencl driver"
};
#endif