driver_opencl.c 34 KB


  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2010-2017 Université de Bordeaux
  4. * Copyright (C) 2010 Mehdi Juhoor
  5. * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 CNRS
  6. * Copyright (C) 2011 Télécom-SudParis
  7. * Copyright (C) 2017 Inria
  8. *
  9. * StarPU is free software; you can redistribute it and/or modify
  10. * it under the terms of the GNU Lesser General Public License as published by
  11. * the Free Software Foundation; either version 2.1 of the License, or (at
  12. * your option) any later version.
  13. *
  14. * StarPU is distributed in the hope that it will be useful, but
  15. * WITHOUT ANY WARRANTY; without even the implied warranty of
  16. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  17. *
  18. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  19. */
  20. #include <math.h>
  21. #include <starpu.h>
  22. #include <starpu_profiling.h>
  23. #include <common/config.h>
  24. #include <common/utils.h>
  25. #include <core/debug.h>
  26. #include <starpu_opencl.h>
  27. #include <drivers/driver_common/driver_common.h>
  28. #include "driver_opencl.h"
  29. #include "driver_opencl_utils.h"
  30. #include <common/utils.h>
  31. #include <datawizard/memory_manager.h>
  32. #include <datawizard/memory_nodes.h>
  33. #include <datawizard/malloc.h>
  34. #include <core/task.h>
  35. #ifdef STARPU_SIMGRID
  36. #include <core/simgrid.h>
  37. #endif
  38. static int nb_devices = -1;
  39. static int init_done = 0;
  40. static starpu_pthread_mutex_t big_lock = STARPU_PTHREAD_MUTEX_INITIALIZER;
  41. static size_t global_mem[STARPU_MAXOPENCLDEVS];
  42. #ifdef STARPU_USE_OPENCL
  43. static cl_context contexts[STARPU_MAXOPENCLDEVS];
  44. static cl_device_id devices[STARPU_MAXOPENCLDEVS];
  45. static cl_command_queue queues[STARPU_MAXOPENCLDEVS];
  46. static cl_command_queue in_transfer_queues[STARPU_MAXOPENCLDEVS];
  47. static cl_command_queue out_transfer_queues[STARPU_MAXOPENCLDEVS];
  48. static cl_command_queue peer_transfer_queues[STARPU_MAXOPENCLDEVS];
  49. #ifndef STARPU_SIMGRID
  50. static cl_command_queue alloc_queues[STARPU_MAXOPENCLDEVS];
  51. static cl_event task_events[STARPU_MAXOPENCLDEVS][STARPU_MAX_PIPELINE];
  52. #endif /* !STARPU_SIMGRID */
  53. #endif
  54. #ifdef STARPU_SIMGRID
  55. static unsigned task_finished[STARPU_MAXOPENCLDEVS][STARPU_MAX_PIPELINE];
  56. #endif /* STARPU_SIMGRID */
  /* Report ERR through STARPU_OPENCL_REPORT_ERROR() unless it equals
   * CL_SUCCESS.  The argument is parenthesised so that an expression such as
   * `flags & mask' is compared as a whole.  ERR is evaluated a second time on
   * the error path, so it must not have side effects. */
  #define _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err) \
      do \
      { \
          if (STARPU_UNLIKELY((err) != CL_SUCCESS)) \
              STARPU_OPENCL_REPORT_ERROR(err); \
      } while (0)
  58. void
  59. _starpu_opencl_discover_devices(struct _starpu_machine_config *config)
  60. {
  61. /* Discover the number of OpenCL devices. Fill the result in CONFIG. */
  62. /* As OpenCL must have been initialized before calling this function,
  63. * `nb_device' is ensured to be correctly set. */
  64. STARPU_ASSERT(init_done == 1);
  65. config->topology.nhwopenclgpus = nb_devices;
  66. }
  67. static void _starpu_opencl_limit_gpu_mem_if_needed(unsigned devid)
  68. {
  69. starpu_ssize_t limit;
  70. size_t STARPU_ATTRIBUTE_UNUSED totalGlobalMem = 0;
  71. size_t STARPU_ATTRIBUTE_UNUSED to_waste = 0;
  72. #ifdef STARPU_SIMGRID
  73. totalGlobalMem = _starpu_simgrid_get_memsize("OpenCL", devid);
  74. #elif defined(STARPU_USE_OPENCL)
  75. /* Request the size of the current device's memory */
  76. cl_int err;
  77. cl_ulong size;
  78. err = clGetDeviceInfo(devices[devid], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size), &size, NULL);
  79. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  80. totalGlobalMem = size;
  81. #endif
  82. limit = starpu_get_env_number("STARPU_LIMIT_OPENCL_MEM");
  83. if (limit == -1)
  84. {
  85. char name[30];
  86. snprintf(name, sizeof(name), "STARPU_LIMIT_OPENCL_%u_MEM", devid);
  87. limit = starpu_get_env_number(name);
  88. }
  89. #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
  90. if (limit == -1)
  91. {
  92. /* Use 90% of the available memory by default. */
  93. limit = totalGlobalMem / (1024*1024) * 0.9;
  94. }
  95. #endif
  96. global_mem[devid] = limit * 1024*1024;
  97. #ifdef STARPU_USE_OPENCL
  98. /* How much memory to waste ? */
  99. to_waste = totalGlobalMem - global_mem[devid];
  100. #endif
  101. _STARPU_DEBUG("OpenCL device %u: Wasting %ld MB / Limit %ld MB / Total %ld MB / Remains %ld MB\n",
  102. devid, (long)to_waste/(1024*1024), (long) limit, (long)totalGlobalMem/(1024*1024),
  103. (long)(totalGlobalMem - to_waste)/(1024*1024));
  104. }
  105. #ifdef STARPU_USE_OPENCL
  106. void starpu_opencl_get_context(int devid, cl_context *context)
  107. {
  108. *context = contexts[devid];
  109. }
  110. void starpu_opencl_get_device(int devid, cl_device_id *device)
  111. {
  112. *device = devices[devid];
  113. }
  114. void starpu_opencl_get_queue(int devid, cl_command_queue *queue)
  115. {
  116. *queue = queues[devid];
  117. }
  118. void starpu_opencl_get_current_queue(cl_command_queue *queue)
  119. {
  120. struct _starpu_worker *worker = _starpu_get_local_worker_key();
  121. STARPU_ASSERT(queue);
  122. *queue = queues[worker->devid];
  123. }
  124. void starpu_opencl_get_current_context(cl_context *context)
  125. {
  126. struct _starpu_worker *worker = _starpu_get_local_worker_key();
  127. STARPU_ASSERT(context);
  128. *context = contexts[worker->devid];
  129. }
  130. #endif /* STARPU_USE_OPENCL */
  131. int _starpu_opencl_init_context(int devid)
  132. {
  133. #ifdef STARPU_SIMGRID
  134. int j;
  135. for (j = 0; j < STARPU_MAX_PIPELINE; j++)
  136. task_finished[devid][j] = 0;
  137. #else /* !STARPU_SIMGRID */
  138. cl_int err;
  139. cl_uint uint;
  140. STARPU_PTHREAD_MUTEX_LOCK(&big_lock);
  141. _STARPU_DEBUG("Initialising context for dev %d\n", devid);
  142. // Create a compute context
  143. err = 0;
  144. contexts[devid] = clCreateContext(NULL, 1, &devices[devid], NULL, NULL, &err);
  145. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  146. err = clGetDeviceInfo(devices[devid], CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(uint), &uint, NULL);
  147. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  148. starpu_malloc_set_align(uint/8);
  149. // Create execution queue for the given device
  150. queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], 0, &err);
  151. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  152. // Create transfer queue for the given device
  153. cl_command_queue_properties props;
  154. err = clGetDeviceInfo(devices[devid], CL_DEVICE_QUEUE_PROPERTIES, sizeof(props), &props, NULL);
  155. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  156. props &= ~CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
  157. in_transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
  158. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  159. out_transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
  160. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  161. peer_transfer_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], props, &err);
  162. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  163. alloc_queues[devid] = clCreateCommandQueue(contexts[devid], devices[devid], 0, &err);
  164. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  165. STARPU_PTHREAD_MUTEX_UNLOCK(&big_lock);
  166. #endif /* !STARPU_SIMGRID */
  167. return 0;
  168. }
  169. int _starpu_opencl_deinit_context(int devid)
  170. {
  171. #ifdef STARPU_SIMGRID
  172. int j;
  173. for (j = 0; j < STARPU_MAX_PIPELINE; j++)
  174. task_finished[devid][j] = 0;
  175. #else /* !STARPU_SIMGRID */
  176. cl_int err;
  177. STARPU_PTHREAD_MUTEX_LOCK(&big_lock);
  178. _STARPU_DEBUG("De-initialising context for dev %d\n", devid);
  179. err = clFinish(queues[devid]);
  180. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  181. err = clReleaseCommandQueue(queues[devid]);
  182. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  183. err = clFinish(in_transfer_queues[devid]);
  184. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  185. err = clReleaseCommandQueue(in_transfer_queues[devid]);
  186. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  187. err = clFinish(out_transfer_queues[devid]);
  188. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  189. err = clReleaseCommandQueue(out_transfer_queues[devid]);
  190. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  191. err = clFinish(peer_transfer_queues[devid]);
  192. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  193. err = clReleaseCommandQueue(peer_transfer_queues[devid]);
  194. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  195. err = clFinish(alloc_queues[devid]);
  196. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  197. err = clReleaseCommandQueue(alloc_queues[devid]);
  198. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  199. err = clReleaseContext(contexts[devid]);
  200. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  201. contexts[devid] = NULL;
  202. STARPU_PTHREAD_MUTEX_UNLOCK(&big_lock);
  203. #endif
  204. return 0;
  205. }
  206. #ifdef STARPU_USE_OPENCL
  207. cl_int starpu_opencl_allocate_memory(int devid STARPU_ATTRIBUTE_UNUSED, cl_mem *mem STARPU_ATTRIBUTE_UNUSED, size_t size STARPU_ATTRIBUTE_UNUSED, cl_mem_flags flags STARPU_ATTRIBUTE_UNUSED)
  208. {
  209. #ifdef STARPU_SIMGRID
  210. STARPU_ABORT();
  211. #else
  212. cl_int err;
  213. cl_mem memory;
  214. memory = clCreateBuffer(contexts[devid], flags, size, NULL, &err);
  215. if (err == CL_OUT_OF_HOST_MEMORY)
  216. return err;
  217. if (err == CL_MEM_OBJECT_ALLOCATION_FAILURE)
  218. return err;
  219. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  220. /*
  221. * OpenCL uses lazy memory allocation: we will only know if the
  222. * allocation failed when trying to copy data onto the device. But we
  223. * want to know this __now__, so we just perform a dummy copy.
  224. */
  225. char dummy = 0;
  226. cl_event ev;
  227. err = clEnqueueWriteBuffer(alloc_queues[devid], memory, CL_TRUE,
  228. 0, sizeof(dummy), &dummy,
  229. 0, NULL, &ev);
  230. if (err == CL_MEM_OBJECT_ALLOCATION_FAILURE)
  231. return err;
  232. if (err == CL_OUT_OF_RESOURCES)
  233. return err;
  234. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  235. clWaitForEvents(1, &ev);
  236. clReleaseEvent(ev);
  237. *mem = memory;
  238. return CL_SUCCESS;
  239. #endif
  240. }
  241. cl_int starpu_opencl_copy_ram_to_opencl(void *ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, cl_mem buffer, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event, int *ret)
  242. {
  243. cl_int err;
  244. struct _starpu_worker *worker = _starpu_get_local_worker_key();
  245. if (event)
  246. _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
  247. cl_event ev;
  248. err = clEnqueueWriteBuffer(in_transfer_queues[worker->devid], buffer, CL_FALSE, offset, size, ptr, 0, NULL, &ev);
  249. if (event)
  250. _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
  251. if (STARPU_LIKELY(err == CL_SUCCESS))
  252. {
  253. if (event == NULL)
  254. {
  255. /* We want a synchronous copy, let's synchronise the queue */
  256. err = clWaitForEvents(1, &ev);
  257. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  258. err = clReleaseEvent(ev);
  259. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  260. }
  261. else
  262. {
  263. *event = ev;
  264. }
  265. if (ret)
  266. {
  267. *ret = (event == NULL) ? 0 : -EAGAIN;
  268. }
  269. }
  270. return err;
  271. }
  272. cl_int starpu_opencl_copy_opencl_to_ram(cl_mem buffer, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, size_t offset, cl_event *event, int *ret)
  273. {
  274. cl_int err;
  275. struct _starpu_worker *worker = _starpu_get_local_worker_key();
  276. if (event)
  277. _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
  278. cl_event ev;
  279. err = clEnqueueReadBuffer(out_transfer_queues[worker->devid], buffer, CL_FALSE, offset, size, ptr, 0, NULL, &ev);
  280. if (event)
  281. _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
  282. if (STARPU_LIKELY(err == CL_SUCCESS))
  283. {
  284. if (event == NULL)
  285. {
  286. /* We want a synchronous copy, let's synchronise the queue */
  287. err = clWaitForEvents(1, &ev);
  288. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  289. err = clReleaseEvent(ev);
  290. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  291. }
  292. else
  293. {
  294. *event = ev;
  295. }
  296. if (ret)
  297. {
  298. *ret = (event == NULL) ? 0 : -EAGAIN;
  299. }
  300. }
  301. return err;
  302. }
  303. cl_int starpu_opencl_copy_opencl_to_opencl(cl_mem src, unsigned src_node STARPU_ATTRIBUTE_UNUSED, size_t src_offset, cl_mem dst, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t dst_offset, size_t size, cl_event *event, int *ret)
  304. {
  305. cl_int err;
  306. struct _starpu_worker *worker = _starpu_get_local_worker_key();
  307. if (event)
  308. _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
  309. cl_event ev;
  310. err = clEnqueueCopyBuffer(peer_transfer_queues[worker->devid], src, dst, src_offset, dst_offset, size, 0, NULL, &ev);
  311. if (event)
  312. _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
  313. if (STARPU_LIKELY(err == CL_SUCCESS))
  314. {
  315. if (event == NULL)
  316. {
  317. /* We want a synchronous copy, let's synchronise the queue */
  318. err = clWaitForEvents(1, &ev);
  319. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  320. err = clReleaseEvent(ev);
  321. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  322. }
  323. else
  324. {
  325. *event = ev;
  326. }
  327. if (ret)
  328. {
  329. *ret = (event == NULL) ? 0 : -EAGAIN;
  330. }
  331. }
  332. return err;
  333. }
  334. cl_int starpu_opencl_copy_async_sync(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, cl_event *event)
  335. {
  336. enum starpu_node_kind src_kind = starpu_node_get_kind(src_node);
  337. enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);
  338. cl_int err;
  339. int ret;
  340. switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind))
  341. {
  342. case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
  343. err = starpu_opencl_copy_opencl_to_ram(
  344. (cl_mem) src, src_node,
  345. (void*) (dst + dst_offset), dst_node,
  346. size, src_offset, event, &ret);
  347. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  348. return ret;
  349. case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
  350. err = starpu_opencl_copy_ram_to_opencl(
  351. (void*) (src + src_offset), src_node,
  352. (cl_mem) dst, dst_node,
  353. size, dst_offset, event, &ret);
  354. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  355. return ret;
  356. case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_OPENCL_RAM):
  357. err = starpu_opencl_copy_opencl_to_opencl(
  358. (cl_mem) src, src_node, src_offset,
  359. (cl_mem) dst, dst_node, dst_offset,
  360. size, event, &ret);
  361. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  362. return ret;
  363. default:
  364. STARPU_ABORT();
  365. break;
  366. }
  367. }
  368. #if 0
  369. cl_int _starpu_opencl_copy_rect_opencl_to_ram(cl_mem buffer, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *ptr, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, const size_t buffer_origin[3], const size_t host_origin[3],
  370. const size_t region[3], size_t buffer_row_pitch, size_t buffer_slice_pitch,
  371. size_t host_row_pitch, size_t host_slice_pitch, cl_event *event)
  372. {
  373. cl_int err;
  374. struct _starpu_worker *worker = _starpu_get_local_worker_key();
  375. cl_bool blocking;
  376. blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
  377. if (event)
  378. _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
  379. err = clEnqueueReadBufferRect(out_transfer_queues[worker->devid], buffer, blocking, buffer_origin, host_origin, region, buffer_row_pitch,
  380. buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, 0, NULL, event);
  381. if (event)
  382. _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
  383. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  384. return CL_SUCCESS;
  385. }
  386. cl_int _starpu_opencl_copy_rect_ram_to_opencl(void *ptr, unsigned src_node STARPU_ATTRIBUTE_UNUSED, cl_mem buffer, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, const size_t buffer_origin[3], const size_t host_origin[3],
  387. const size_t region[3], size_t buffer_row_pitch, size_t buffer_slice_pitch,
  388. size_t host_row_pitch, size_t host_slice_pitch, cl_event *event)
  389. {
  390. cl_int err;
  391. struct _starpu_worker *worker = _starpu_get_local_worker_key();
  392. cl_bool blocking;
  393. blocking = (event == NULL) ? CL_TRUE : CL_FALSE;
  394. if (event)
  395. _STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
  396. err = clEnqueueWriteBufferRect(in_transfer_queues[worker->devid], buffer, blocking, buffer_origin, host_origin, region, buffer_row_pitch,
  397. buffer_slice_pitch, host_row_pitch, host_slice_pitch, ptr, 0, NULL, event);
  398. if (event)
  399. _STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
  400. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  401. return CL_SUCCESS;
  402. }
  403. #endif
  404. #endif /* STARPU_USE_OPENCL */
  405. static size_t _starpu_opencl_get_global_mem_size(int devid)
  406. {
  407. return global_mem[devid];
  408. }
  409. void _starpu_opencl_init(void)
  410. {
  411. STARPU_PTHREAD_MUTEX_LOCK(&big_lock);
  412. if (!init_done)
  413. {
  414. #ifdef STARPU_SIMGRID
  415. nb_devices = _starpu_simgrid_get_nbhosts("OpenCL");
  416. #else /* STARPU_USE_OPENCL */
  417. cl_platform_id platform_id[_STARPU_OPENCL_PLATFORM_MAX];
  418. cl_uint nb_platforms;
  419. cl_int err;
  420. int i;
  421. cl_device_type device_type = CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR;
  422. _STARPU_DEBUG("Initialising OpenCL\n");
  423. // Get Platforms
  424. if (starpu_get_env_number("STARPU_OPENCL_ON_CPUS") > 0)
  425. device_type |= CL_DEVICE_TYPE_CPU;
  426. if (starpu_get_env_number("STARPU_OPENCL_ONLY_ON_CPUS") > 0)
  427. device_type = CL_DEVICE_TYPE_CPU;
  428. err = clGetPlatformIDs(_STARPU_OPENCL_PLATFORM_MAX, platform_id, &nb_platforms);
  429. if (STARPU_UNLIKELY(err != CL_SUCCESS)) nb_platforms=0;
  430. _STARPU_DEBUG("Platforms detected: %u\n", nb_platforms);
  431. _STARPU_DEBUG("CPU device type: %s\n", (device_type&CL_DEVICE_TYPE_CPU)?"requested":"not requested");
  432. _STARPU_DEBUG("GPU device type: %s\n", (device_type&CL_DEVICE_TYPE_GPU)?"requested":"not requested");
  433. _STARPU_DEBUG("Accelerator device type: %s\n", (device_type&CL_DEVICE_TYPE_ACCELERATOR)?"requested":"not requested");
  434. // Get devices
  435. nb_devices = 0;
  436. {
  437. unsigned j;
  438. for (j=0; j<nb_platforms; j++)
  439. {
  440. cl_uint num;
  441. int platform_valid = 1;
  442. char name[1024], vendor[1024];
  443. err = clGetPlatformInfo(platform_id[j], CL_PLATFORM_NAME, 1024, name, NULL);
  444. if (err != CL_SUCCESS)
  445. {
  446. STARPU_OPENCL_REPORT_ERROR_WITH_MSG("clGetPlatformInfo NAME", err);
  447. platform_valid = 0;
  448. }
  449. else
  450. {
  451. err = clGetPlatformInfo(platform_id[j], CL_PLATFORM_VENDOR, 1024, vendor, NULL);
  452. if (STARPU_UNLIKELY(err != CL_SUCCESS))
  453. {
  454. STARPU_OPENCL_REPORT_ERROR_WITH_MSG("clGetPlatformInfo VENDOR", err);
  455. platform_valid = 0;
  456. }
  457. }
  458. if(strcmp(name, "SOCL Platform") == 0)
  459. {
  460. platform_valid = 0;
  461. _STARPU_DEBUG("Skipping SOCL Platform\n");
  462. }
  463. #ifdef STARPU_VERBOSE
  464. if (platform_valid)
  465. _STARPU_DEBUG("Platform: %s - %s\n", name, vendor);
  466. else
  467. _STARPU_DEBUG("Platform invalid: %s - %s\n", name, vendor);
  468. #endif
  469. if (platform_valid && nb_devices <= STARPU_MAXOPENCLDEVS)
  470. {
  471. err = clGetDeviceIDs(platform_id[j], device_type, STARPU_MAXOPENCLDEVS-nb_devices, STARPU_MAXOPENCLDEVS == nb_devices ? NULL : &devices[nb_devices], &num);
  472. if (err == CL_DEVICE_NOT_FOUND)
  473. {
  474. const cl_device_type all_device_types = CL_DEVICE_TYPE_CPU|CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR;
  475. if (device_type != all_device_types)
  476. {
  477. _STARPU_DEBUG(" No devices of the requested type(s) subset detected on this platform\n");
  478. }
  479. else
  480. {
  481. _STARPU_DEBUG(" No devices detected on this platform\n");
  482. }
  483. }
  484. else
  485. {
  486. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  487. _STARPU_DEBUG(" %u devices detected\n", num);
  488. nb_devices += num;
  489. }
  490. }
  491. }
  492. }
  493. // Get location of OpenCl kernel source files
  494. _starpu_opencl_program_dir = starpu_getenv("STARPU_OPENCL_PROGRAM_DIR");
  495. if (nb_devices > STARPU_MAXOPENCLDEVS)
  496. {
  497. _STARPU_DISP("# Warning: %u OpenCL devices available. Only %d enabled. Use configure option --enable-maxopencldev=xxx to update the maximum value of supported OpenCL devices?\n", nb_devices, STARPU_MAXOPENCLDEVS);
  498. nb_devices = STARPU_MAXOPENCLDEVS;
  499. }
  500. // initialise internal structures
  501. for(i=0 ; i<nb_devices ; i++)
  502. {
  503. contexts[i] = NULL;
  504. queues[i] = NULL;
  505. in_transfer_queues[i] = NULL;
  506. out_transfer_queues[i] = NULL;
  507. peer_transfer_queues[i] = NULL;
  508. alloc_queues[i] = NULL;
  509. }
  510. #endif /* STARPU_USE_OPENCL */
  511. init_done=1;
  512. }
  513. STARPU_PTHREAD_MUTEX_UNLOCK(&big_lock);
  514. }
  /* Forward declarations of helpers that are defined further down in this
   * file. */
  #ifndef STARPU_SIMGRID
  /* Only real devices have a name to query. */
  static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname);
  #endif
  static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker *worker, unsigned char pipeline_idx);
  static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker *worker);
  static void _starpu_opencl_execute_job(struct starpu_task *task, struct _starpu_worker *worker);
  521. int _starpu_opencl_driver_init(struct _starpu_worker *worker)
  522. {
  523. int devid = worker->devid;
  524. _starpu_driver_start(worker, _STARPU_FUT_OPENCL_KEY, 0);
  525. _starpu_opencl_init_context(devid);
  526. /* one more time to avoid hacks from third party lib :) */
  527. _starpu_bind_thread_on_cpu(worker->bindid, worker->workerid);
  528. _starpu_opencl_limit_gpu_mem_if_needed(devid);
  529. _starpu_memory_manager_set_global_memory_size(worker->memory_node, _starpu_opencl_get_global_mem_size(devid));
  530. float size = (float) global_mem[devid] / (1<<30);
  531. #ifdef STARPU_SIMGRID
  532. const char *devname = "Simgrid";
  533. #else
  534. /* get the device's name */
  535. char devname[128];
  536. _starpu_opencl_get_device_name(devid, devname, 128);
  537. #endif
  538. snprintf(worker->name, sizeof(worker->name), "OpenCL %d (%s %.1f GiB)", devid, devname, size);
  539. snprintf(worker->short_name, sizeof(worker->short_name), "OpenCL %d", devid);
  540. starpu_pthread_setname(worker->short_name);
  541. worker->pipeline_length = starpu_get_env_number_default("STARPU_OPENCL_PIPELINE", 2);
  542. if (worker->pipeline_length > STARPU_MAX_PIPELINE)
  543. {
  544. _STARPU_DISP("Warning: STARPU_OPENCL_PIPELINE is %u, but STARPU_MAX_PIPELINE is only %u", worker->pipeline_length, STARPU_MAX_PIPELINE);
  545. worker->pipeline_length = STARPU_MAX_PIPELINE;
  546. }
  547. #if !defined(STARPU_SIMGRID) && !defined(STARPU_NON_BLOCKING_DRIVERS)
  548. if (worker->pipeline_length >= 1)
  549. {
  550. /* We need non-blocking drivers, to poll for OPENCL task
  551. * termination */
  552. _STARPU_DISP("Warning: reducing STARPU_OPENCL_PIPELINE to 0 because blocking drivers are enabled (and simgrid is not enabled)\n");
  553. worker->pipeline_length = 0;
  554. }
  555. #endif
  556. _STARPU_DEBUG("OpenCL (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, worker->bindid);
  557. _STARPU_TRACE_WORKER_INIT_END(worker->workerid);
  558. /* tell the main thread that this one is ready */
  559. STARPU_PTHREAD_MUTEX_LOCK(&worker->mutex);
  560. worker->status = STATUS_UNKNOWN;
  561. worker->worker_is_initialized = 1;
  562. STARPU_PTHREAD_COND_SIGNAL(&worker->ready_cond);
  563. STARPU_PTHREAD_MUTEX_UNLOCK(&worker->mutex);
  564. return 0;
  565. }
  566. int _starpu_opencl_driver_run_once(struct _starpu_worker *worker)
  567. {
  568. int workerid = worker->workerid;
  569. unsigned memnode = worker->memory_node;
  570. struct _starpu_job *j;
  571. struct starpu_task *task;
  572. int res;
  573. int idle_tasks, idle_transfers;
  574. #ifdef STARPU_SIMGRID
  575. starpu_pthread_wait_reset(&worker->wait);
  576. #endif
  577. idle_tasks = 0;
  578. idle_transfers = 0;
  579. /* First test for transfers pending for next task */
  580. task = worker->task_transferring;
  581. if (!task)
  582. idle_transfers++;
  583. if (task && worker->nb_buffers_transferred == worker->nb_buffers_totransfer)
  584. {
  585. _STARPU_TRACE_END_PROGRESS(memnode);
  586. j = _starpu_get_job_associated_to_task(task);
  587. _starpu_fetch_task_input_tail(task, j, worker);
  588. _starpu_set_worker_status(worker, STATUS_UNKNOWN);
  589. /* Reset it */
  590. worker->task_transferring = NULL;
  591. if (worker->ntasks > 1 && !(task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC))
  592. {
  593. /* We have to execute a non-asynchronous task but we
  594. * still have tasks in the pipeline... Record it to
  595. * prevent more tasks from coming, and do it later */
  596. worker->pipeline_stuck = 1;
  597. return 0;
  598. }
  599. _starpu_opencl_execute_job(task, worker);
  600. _STARPU_TRACE_START_PROGRESS(memnode);
  601. }
  602. /* Then poll for completed jobs */
  603. if (worker->ntasks && worker->current_tasks[worker->first_task] != worker->task_transferring)
  604. {
  605. #ifndef STARPU_SIMGRID
  606. size_t size;
  607. int err;
  608. #endif
  609. /* On-going asynchronous task, check for its termination first */
  610. task = worker->current_tasks[worker->first_task];
  611. #ifdef STARPU_SIMGRID
  612. if (!task_finished[worker->devid][worker->first_task])
  613. #else /* !STARPU_SIMGRID */
  614. cl_int status;
  615. err = clGetEventInfo(task_events[worker->devid][worker->first_task], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &status, &size);
  616. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  617. STARPU_ASSERT(size == sizeof(cl_int));
  618. if (status != CL_COMPLETE)
  619. #endif /* !STARPU_SIMGRID */
  620. {
  621. }
  622. else
  623. {
  624. _STARPU_TRACE_END_PROGRESS(memnode);
  625. #ifndef STARPU_SIMGRID
  626. err = clReleaseEvent(task_events[worker->devid][worker->first_task]);
  627. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  628. task_events[worker->devid][worker->first_task] = 0;
  629. #endif
  630. /* Asynchronous task completed! */
  631. _starpu_opencl_stop_job(_starpu_get_job_associated_to_task(task), worker);
  632. /* See next task if any */
  633. if (worker->ntasks && worker->current_tasks[worker->first_task] != worker->task_transferring)
  634. {
  635. task = worker->current_tasks[worker->first_task];
  636. j = _starpu_get_job_associated_to_task(task);
  637. if (task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)
  638. {
  639. /* An asynchronous task, it was already queued,
  640. * it's now running, record its start time. */
  641. _starpu_driver_start_job(worker, j, &worker->perf_arch, 0, starpu_profiling_status_get());
  642. }
  643. else
  644. {
  645. /* A synchronous task, we have finished flushing the pipeline, we can now at last execute it. */
  646. _STARPU_TRACE_EVENT("sync_task");
  647. _starpu_opencl_execute_job(task, worker);
  648. _STARPU_TRACE_EVENT("end_sync_task");
  649. worker->pipeline_stuck = 0;
  650. }
  651. }
  652. _STARPU_TRACE_START_PROGRESS(memnode);
  653. }
  654. }
  655. if (!worker->pipeline_length || worker->ntasks < worker->pipeline_length)
  656. idle_tasks++;
  657. #if defined(STARPU_NON_BLOCKING_DRIVERS) && !defined(STARPU_SIMGRID)
  658. if (!idle_tasks)
  659. {
  660. /* No task ready yet, no better thing to do than waiting */
  661. __starpu_datawizard_progress(1, !idle_transfers);
  662. return 0;
  663. }
  664. #endif
  665. res = !idle_tasks || !idle_transfers;
  666. res |= __starpu_datawizard_progress(1, 1);
  667. task = _starpu_get_worker_task(worker, workerid, memnode);
  668. #ifdef STARPU_SIMGRID
  669. if (!res && !task)
  670. starpu_pthread_wait_wait(&worker->wait);
  671. #endif
  672. if (task == NULL)
  673. return 0;
  674. j = _starpu_get_job_associated_to_task(task);
  675. worker->current_tasks[(worker->first_task + worker->ntasks)%STARPU_MAX_PIPELINE] = task;
  676. worker->ntasks++;
  677. if (worker->pipeline_length == 0)
  678. /* _starpu_get_worker_task checks .current_task field if pipeline_length == 0
  679. *
  680. * TODO: update driver to not use current_tasks[] when pipeline_length == 0,
  681. * as for cuda driver */
  682. worker->current_task = task;
  683. /* can OpenCL do that task ? */
  684. if (!_STARPU_OPENCL_MAY_PERFORM(j))
  685. {
  686. /* this is not a OpenCL task */
  687. _starpu_worker_refuse_task(worker, task);
  688. return 0;
  689. }
  690. _STARPU_TRACE_END_PROGRESS(memnode);
  691. /* Fetch data asynchronously */
  692. res = _starpu_fetch_task_input(task, j, 1);
  693. STARPU_ASSERT(res == 0);
  694. _STARPU_TRACE_START_PROGRESS(memnode);
  695. return 0;
  696. }
  697. int _starpu_opencl_driver_deinit(struct _starpu_worker *worker)
  698. {
  699. _STARPU_TRACE_WORKER_DEINIT_START;
  700. unsigned memnode = worker->memory_node;
  701. _starpu_handle_all_pending_node_data_requests(memnode);
  702. /* In case there remains some memory that was automatically
  703. * allocated by StarPU, we release it now. Note that data
  704. * coherency is not maintained anymore at that point ! */
  705. _starpu_free_all_automatically_allocated_buffers(memnode);
  706. _starpu_malloc_shutdown(memnode);
  707. unsigned devid = worker->devid;
  708. _starpu_opencl_deinit_context(devid);
  709. worker->worker_is_initialized = 0;
  710. _STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_OPENCL_KEY);
  711. return 0;
  712. }
  713. void *_starpu_opencl_worker(void *_arg)
  714. {
  715. struct _starpu_worker* worker = _arg;
  716. _starpu_opencl_driver_init(worker);
  717. _STARPU_TRACE_START_PROGRESS(worker->memory_node);
  718. while (_starpu_machine_is_running())
  719. {
  720. _starpu_may_pause();
  721. _starpu_opencl_driver_run_once(worker);
  722. }
  723. _starpu_opencl_driver_deinit(worker);
  724. _STARPU_TRACE_END_PROGRESS(worker->memory_node);
  725. return NULL;
  726. }
  727. #ifdef STARPU_USE_OPENCL
  728. #ifndef STARPU_SIMGRID
  729. static unsigned _starpu_opencl_get_device_name(int dev, char *name, int lname)
  730. {
  731. int err;
  732. if (!init_done)
  733. {
  734. _starpu_opencl_init();
  735. }
  736. // Get device name
  737. err = clGetDeviceInfo(devices[dev], CL_DEVICE_NAME, lname, name, NULL);
  738. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  739. _STARPU_DEBUG("Device %d : [%s]\n", dev, name);
  740. return EXIT_SUCCESS;
  741. }
  742. #endif
  743. #endif
  744. unsigned _starpu_opencl_get_device_count(void)
  745. {
  746. if (!init_done)
  747. {
  748. _starpu_opencl_init();
  749. }
  750. return nb_devices;
  751. }
  752. #ifdef STARPU_USE_OPENCL
  753. cl_device_type _starpu_opencl_get_device_type(int devid)
  754. {
  755. int err;
  756. cl_device_type type;
  757. if (!init_done)
  758. _starpu_opencl_init();
  759. err = clGetDeviceInfo(devices[devid], CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL);
  760. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  761. return type;
  762. }
  763. #endif /* STARPU_USE_OPENCL */
  764. static int _starpu_opencl_start_job(struct _starpu_job *j, struct _starpu_worker *worker, unsigned char pipeline_idx STARPU_ATTRIBUTE_UNUSED)
  765. {
  766. STARPU_ASSERT(j);
  767. struct starpu_task *task = j->task;
  768. int profiling = starpu_profiling_status_get();
  769. STARPU_ASSERT(task);
  770. struct starpu_codelet *cl = task->cl;
  771. STARPU_ASSERT(cl);
  772. _starpu_set_current_task(task);
  773. if (worker->ntasks == 1)
  774. {
  775. /* We are alone in the pipeline, the kernel will start now, record it */
  776. _starpu_driver_start_job(worker, j, &worker->perf_arch, 0, profiling);
  777. }
  778. starpu_opencl_func_t func = _starpu_task_get_opencl_nth_implementation(cl, j->nimpl);
  779. STARPU_ASSERT_MSG(func, "when STARPU_OPENCL is defined in 'where', opencl_func or opencl_funcs has to be defined");
  780. if (_starpu_get_disable_kernels() <= 0)
  781. {
  782. _STARPU_TRACE_START_EXECUTING();
  783. #ifdef STARPU_SIMGRID
  784. double length = NAN;
  785. int async = task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC;
  786. int simulate = 1;
  787. if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE && !async)
  788. {
  789. /* Actually execute function */
  790. simulate = 0;
  791. func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
  792. #ifdef STARPU_OPENCL_SIMULATOR
  793. #ifndef CL_PROFILING_CLOCK_CYCLE_COUNT
  794. #ifdef CL_PROFILING_COMMAND_SHAVE_CYCLE_COUNT
  795. #define CL_PROFILING_CLOCK_CYCLE_COUNT CL_PROFILING_COMMAND_SHAVE_CYCLE_COUNT
  796. #else
  797. #error The OpenCL simulator must provide CL_PROFILING_CLOCK_CYCLE_COUNT
  798. #endif
  799. #endif
  800. struct starpu_profiling_task_info *profiling_info = task->profiling_info;
  801. STARPU_ASSERT_MSG(profiling_info->used_cycles, "Application kernel must call starpu_opencl_collect_stats to collect simulated time");
  802. #ifdef HAVE_MSG_HOST_GET_SPEED
  803. length = ((double) profiling_info->used_cycles)/MSG_host_get_speed(MSG_host_self());
  804. #else
  805. length = ((double) profiling_info->used_cycles)/MSG_get_host_speed(MSG_host_self());
  806. #endif
  807. /* And give the simulated time to simgrid */
  808. simulate = 1;
  809. #endif
  810. }
  811. else if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT && !async)
  812. {
  813. _SIMGRID_TIMER_BEGIN(1);
  814. func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
  815. _SIMGRID_TIMER_END;
  816. simulate=0;
  817. }
  818. if (simulate)
  819. _starpu_simgrid_submit_job(worker->workerid, j, &worker->perf_arch, length,
  820. async ? &task_finished[worker->devid][pipeline_idx] : NULL);
  821. #else
  822. func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
  823. #endif
  824. _STARPU_TRACE_END_EXECUTING();
  825. }
  826. return 0;
  827. }
  828. static void _starpu_opencl_stop_job(struct _starpu_job *j, struct _starpu_worker *worker)
  829. {
  830. int profiling = starpu_profiling_status_get();
  831. _starpu_set_current_task(NULL);
  832. if (worker->pipeline_length)
  833. worker->current_tasks[worker->first_task] = NULL;
  834. else
  835. worker->current_task = NULL;
  836. worker->first_task = (worker->first_task + 1) % STARPU_MAX_PIPELINE;
  837. worker->ntasks--;
  838. _starpu_driver_end_job(worker, j, &worker->perf_arch, 0, profiling);
  839. struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
  840. STARPU_ASSERT_MSG(sched_ctx != NULL, "there should be a worker %d in the ctx of this job \n", worker->workerid);
  841. if(!sched_ctx->sched_policy)
  842. _starpu_driver_update_job_feedback(j, worker, &sched_ctx->perf_arch, profiling);
  843. else
  844. _starpu_driver_update_job_feedback(j, worker, &worker->perf_arch, profiling);
  845. _starpu_push_task_output(j);
  846. _starpu_handle_job_termination(j);
  847. }
  848. static void _starpu_opencl_execute_job(struct starpu_task *task, struct _starpu_worker *worker)
  849. {
  850. int res;
  851. struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
  852. unsigned char pipeline_idx = (worker->first_task + worker->ntasks - 1)%STARPU_MAX_PIPELINE;
  853. res = _starpu_opencl_start_job(j, worker, pipeline_idx);
  854. if (res)
  855. {
  856. switch (res)
  857. {
  858. case -EAGAIN:
  859. _STARPU_DISP("ouch, OpenCL could not actually run task %p, putting it back...\n", task);
  860. _starpu_push_task_to_workers(task);
  861. STARPU_ABORT();
  862. default:
  863. STARPU_ABORT();
  864. }
  865. }
  866. if (task->cl->opencl_flags[j->nimpl] & STARPU_OPENCL_ASYNC)
  867. {
  868. /* Record event to synchronize with task termination later */
  869. #ifndef STARPU_SIMGRID
  870. cl_command_queue queue;
  871. starpu_opencl_get_queue(worker->devid, &queue);
  872. #endif
  873. if (worker->pipeline_length == 0)
  874. {
  875. #ifdef STARPU_SIMGRID
  876. _starpu_simgrid_wait_tasks(worker->workerid);
  877. #else
  878. starpu_opencl_get_queue(worker->devid, &queue);
  879. clFinish(queue);
  880. #endif
  881. _starpu_opencl_stop_job(j, worker);
  882. }
  883. else
  884. {
  885. #ifndef STARPU_SIMGRID
  886. int err;
  887. /* the function clEnqueueMarker is deprecated from
  888. * OpenCL version 1.2. We would like to use the new
  889. * function clEnqueueMarkerWithWaitList. We could do
  890. * it by checking its availability through our own
  891. * configure macro HAVE_CLENQUEUEMARKERWITHWAITLIST
  892. * and the OpenCL macro CL_VERSION_1_2. However these
  893. * 2 macros detect the function availability in the
  894. * ICD and not in the device implementation.
  895. */
  896. err = clEnqueueMarker(queue, &task_events[worker->devid][pipeline_idx]);
  897. _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
  898. #endif
  899. _STARPU_TRACE_START_EXECUTING();
  900. }
  901. }
  902. else
  903. /* Synchronous execution */
  904. {
  905. _starpu_opencl_stop_job(j, worker);
  906. }
  907. }
  908. #ifdef STARPU_USE_OPENCL
  909. int _starpu_run_opencl(struct _starpu_worker *workerarg)
  910. {
  911. _STARPU_DEBUG("Running OpenCL %u from the application\n", workerarg->devid);
  912. workerarg->set = NULL;
  913. workerarg->worker_is_initialized = 0;
  914. /* Let's go ! */
  915. _starpu_opencl_worker(workerarg);
  916. return 0;
  917. }
  918. struct _starpu_driver_ops _starpu_driver_opencl_ops =
  919. {
  920. .init = _starpu_opencl_driver_init,
  921. .run = _starpu_run_opencl,
  922. .run_once = _starpu_opencl_driver_run_once,
  923. .deinit = _starpu_opencl_driver_deinit
  924. };
  925. #endif /* STARPU_USE_OPENCL */