driver_cuda.c

/* StarPU --- Runtime system for heterogeneous multicore architectures.
 *
 * Copyright (C) 2009-2017 Université de Bordeaux
 * Copyright (C) 2010 Mehdi Juhoor <mjuhoor@gmail.com>
 * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017 CNRS
 * Copyright (C) 2011 Télécom-SudParis
 * Copyright (C) 2016 Uppsala University
 *
 * StarPU is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * StarPU is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * See the GNU Lesser General Public License in COPYING.LGPL for more details.
 */

#include <starpu.h>
#include <starpu_cuda.h>
#include <starpu_profiling.h>
#include <common/utils.h>
#include <common/config.h>
#include <core/debug.h>
#include <drivers/driver_common/driver_common.h>
#include "driver_cuda.h"
#include <core/sched_policy.h>
#ifdef HAVE_CUDA_GL_INTEROP_H
#include <cuda_gl_interop.h>
#endif
#include <datawizard/memory_manager.h>
#include <datawizard/memory_nodes.h>
#include <datawizard/malloc.h>
#include <core/task.h>
#ifdef STARPU_SIMGRID
#include <core/simgrid.h>
#endif

#ifdef STARPU_USE_CUDA
#if CUDART_VERSION >= 5000
/* Avoid letting our streams spuriously synchronize with the NULL stream */
#define starpu_cudaStreamCreate(stream) cudaStreamCreateWithFlags(stream, cudaStreamNonBlocking)
#else
#define starpu_cudaStreamCreate(stream) cudaStreamCreate(stream)
#endif
#endif
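
/* Illustrative sketch (not part of the driver; dst/src/size are hypothetical):
 * with CUDA >= 5.0, a stream created through this macro does not implicitly
 * synchronize with work submitted on the legacy NULL stream, e.g.
 *
 *	cudaStream_t s;
 *	starpu_cudaStreamCreate(&s);	// cudaStreamNonBlocking when available
 *	cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice, s);
 *
 * so a third-party kernel launched on stream 0 cannot stall our transfers. */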

/* the number of CUDA devices */
static int ncudagpus = -1;

static size_t global_mem[STARPU_MAXCUDADEVS];
int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+1][STARPU_MAXCUDADEVS+1];
#ifdef STARPU_USE_CUDA
static cudaStream_t streams[STARPU_NMAXWORKERS];
static cudaStream_t out_transfer_streams[STARPU_MAXCUDADEVS];
static cudaStream_t in_transfer_streams[STARPU_MAXCUDADEVS];
/* Note: streams are not thread-safe, so we define them for each CUDA worker
 * emitting a GPU-GPU transfer */
static cudaStream_t in_peer_transfer_streams[STARPU_MAXCUDADEVS][STARPU_MAXCUDADEVS];
static cudaStream_t out_peer_transfer_streams[STARPU_MAXCUDADEVS][STARPU_MAXCUDADEVS];
static struct cudaDeviceProp props[STARPU_MAXCUDADEVS];
#ifndef STARPU_SIMGRID
static cudaEvent_t task_events[STARPU_NMAXWORKERS][STARPU_MAX_PIPELINE];
#endif
#endif /* STARPU_USE_CUDA */
#ifdef STARPU_SIMGRID
static unsigned task_finished[STARPU_NMAXWORKERS][STARPU_MAX_PIPELINE];
#endif /* STARPU_SIMGRID */

static enum initialization cuda_device_init[STARPU_MAXCUDADEVS];
static int cuda_device_users[STARPU_MAXCUDADEVS];
static starpu_pthread_mutex_t cuda_device_init_mutex[STARPU_MAXCUDADEVS];
static starpu_pthread_cond_t cuda_device_init_cond[STARPU_MAXCUDADEVS];

void _starpu_cuda_init(void)
{
	unsigned i;
	for (i = 0; i < STARPU_MAXCUDADEVS; i++)
	{
		STARPU_PTHREAD_MUTEX_INIT(&cuda_device_init_mutex[i], NULL);
		STARPU_PTHREAD_COND_INIT(&cuda_device_init_cond[i], NULL);
	}
}

static size_t _starpu_cuda_get_global_mem_size(unsigned devid)
{
	return global_mem[devid];
}

void
_starpu_cuda_discover_devices (struct _starpu_machine_config *config)
{
	/* Discover the number of CUDA devices. Fill the result in CONFIG. */
#ifdef STARPU_SIMGRID
	config->topology.nhwcudagpus = _starpu_simgrid_get_nbhosts("CUDA");
#else
	int cnt;
	cudaError_t cures;

	cures = cudaGetDeviceCount (&cnt);
	if (STARPU_UNLIKELY(cures != cudaSuccess))
		cnt = 0;
	config->topology.nhwcudagpus = cnt;
#endif
}

/* In case we want to cap the amount of memory available on the GPUs by means
 * of the STARPU_LIMIT_CUDA_MEM environment variable, we decrease the value of
 * global_mem[devid], which is the value returned by
 * _starpu_cuda_get_global_mem_size(), to indicate how much memory can
 * be allocated on the device
 */
static void _starpu_cuda_limit_gpu_mem_if_needed(unsigned devid)
{
	starpu_ssize_t limit;
	size_t STARPU_ATTRIBUTE_UNUSED totalGlobalMem = 0;
	size_t STARPU_ATTRIBUTE_UNUSED to_waste = 0;

#ifdef STARPU_SIMGRID
	totalGlobalMem = _starpu_simgrid_get_memsize("CUDA", devid);
#elif defined(STARPU_USE_CUDA)
	/* Find the size of the memory on the device */
	totalGlobalMem = props[devid].totalGlobalMem;
#endif

	limit = starpu_get_env_number("STARPU_LIMIT_CUDA_MEM");
	if (limit == -1)
	{
		char name[30];
		sprintf(name, "STARPU_LIMIT_CUDA_%u_MEM", devid);
		limit = starpu_get_env_number(name);
	}
#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
	if (limit == -1)
	{
		/* Use 90% of the available memory by default. */
		limit = totalGlobalMem / (1024*1024) * 0.9;
	}
#endif

	global_mem[devid] = limit * 1024*1024;

#ifdef STARPU_USE_CUDA
	/* How much memory do we have to waste? */
	to_waste = totalGlobalMem - global_mem[devid];

	props[devid].totalGlobalMem -= to_waste;
#endif /* STARPU_USE_CUDA */

	_STARPU_DEBUG("CUDA device %u: Wasting %ld MB / Limit %ld MB / Total %ld MB / Remains %ld MB\n",
		      devid, (long) to_waste/(1024*1024), (long) limit, (long) totalGlobalMem/(1024*1024),
		      (long) (totalGlobalMem - to_waste)/(1024*1024));
}
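
/* Example (illustrative): with STARPU_LIMIT_CUDA_MEM=1024, global_mem[devid]
 * ends up as 1024 MiB, i.e. 1024*1024*1024 bytes; with no limit set on a
 * device advertising 12288 MiB, the default is 90% of that, i.e. 11059 MiB. */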

#ifdef STARPU_USE_CUDA
cudaStream_t starpu_cuda_get_local_in_transfer_stream()
{
	int worker = starpu_worker_get_id_check();
	int devid = starpu_worker_get_devid(worker);
	cudaStream_t stream;

	stream = in_transfer_streams[devid];
	STARPU_ASSERT(stream);
	return stream;
}

cudaStream_t starpu_cuda_get_in_transfer_stream(unsigned dst_node)
{
	int dst_devid = _starpu_memory_node_get_devid(dst_node);
	cudaStream_t stream;

	stream = in_transfer_streams[dst_devid];
	STARPU_ASSERT(stream);
	return stream;
}

cudaStream_t starpu_cuda_get_local_out_transfer_stream()
{
	int worker = starpu_worker_get_id_check();
	int devid = starpu_worker_get_devid(worker);
	cudaStream_t stream;

	stream = out_transfer_streams[devid];
	STARPU_ASSERT(stream);
	return stream;
}

cudaStream_t starpu_cuda_get_out_transfer_stream(unsigned src_node)
{
	int src_devid = _starpu_memory_node_get_devid(src_node);
	cudaStream_t stream;

	stream = out_transfer_streams[src_devid];
	STARPU_ASSERT(stream);
	return stream;
}

cudaStream_t starpu_cuda_get_peer_transfer_stream(unsigned src_node, unsigned dst_node)
{
	int src_devid = _starpu_memory_node_get_devid(src_node);
	int dst_devid = _starpu_memory_node_get_devid(dst_node);
	cudaStream_t stream;

	stream = in_peer_transfer_streams[src_devid][dst_devid];
	STARPU_ASSERT(stream);
	return stream;
}

cudaStream_t starpu_cuda_get_local_stream(void)
{
	int worker = starpu_worker_get_id_check();

	return streams[worker];
}

const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid)
{
	struct _starpu_machine_config *config = _starpu_get_machine_config();
	unsigned devid = config->workers[workerid].devid;
	return &props[devid];
}
#endif /* STARPU_USE_CUDA */
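
/* Usage sketch (illustrative; the codelet and kernel names are hypothetical):
 * a CUDA codelet implementation typically launches its kernel on the
 * per-worker stream returned by starpu_cuda_get_local_stream(), e.g.
 *
 *	void scal_cuda_func(void *buffers[], void *cl_arg)
 *	{
 *		...
 *		scal_kernel<<<grid, block, 0, starpu_cuda_get_local_stream()>>>(...);
 *		cudaStreamSynchronize(starpu_cuda_get_local_stream());
 *	}
 *
 * or sets the STARPU_CUDA_ASYNC codelet flag and lets the driver below wait
 * on that stream instead of synchronizing itself. */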

void starpu_cuda_set_device(unsigned devid STARPU_ATTRIBUTE_UNUSED)
{
#ifdef STARPU_SIMGRID
	STARPU_ABORT();
#else
	cudaError_t cures;
	struct starpu_conf *conf = &_starpu_get_machine_config()->conf;
#if !defined(HAVE_CUDA_MEMCPY_PEER) && defined(HAVE_CUDA_GL_INTEROP_H)
	unsigned i;
#endif

#ifdef HAVE_CUDA_MEMCPY_PEER
	if (conf->n_cuda_opengl_interoperability)
	{
		_STARPU_MSG("OpenGL interoperability was requested, but StarPU was built with multithread GPU control support; please reconfigure with --disable-cuda-memcpy-peer, but note that this will disable the memcpy-peer optimizations\n");
		STARPU_ABORT();
	}
#elif !defined(HAVE_CUDA_GL_INTEROP_H)
	if (conf->n_cuda_opengl_interoperability)
	{
		_STARPU_MSG("OpenGL interoperability was requested, but cuda_gl_interop.h could not be compiled; please make sure that the OpenGL headers were available before running ./configure\n");
		STARPU_ABORT();
	}
#else
	for (i = 0; i < conf->n_cuda_opengl_interoperability; i++)
		if (conf->cuda_opengl_interoperability[i] == devid)
		{
			cures = cudaGLSetGLDevice(devid);
			goto done;
		}
#endif

	cures = cudaSetDevice(devid);
#if !defined(HAVE_CUDA_MEMCPY_PEER) && defined(HAVE_CUDA_GL_INTEROP_H)
done:
#endif
	if (STARPU_UNLIKELY(cures
#ifdef STARPU_OPENMP
		/* When StarPU is used as an OpenMP runtime support,
		 * starpu_omp_shutdown() will usually be called from a
		 * destructor, in which case cudaThreadExit() reports a
		 * cudaErrorCudartUnloading here. There should not
		 * be any remaining tasks running at this point so
		 * we can probably ignore it without much consequence. */
		&& cures != cudaErrorCudartUnloading
#endif /* STARPU_OPENMP */
	    ))
		STARPU_CUDA_REPORT_ERROR(cures);
#endif
}

static void init_device_context(unsigned devid, unsigned memnode)
{
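	/* Several workers may drive the same device (STARPU_NWORKER_PER_CUDA):
	 * only the first one to get here performs the actual initialization
	 * below; the others wait on cuda_device_init_cond until the state
	 * switches from CHANGING to INITIALIZED. */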
#ifndef STARPU_SIMGRID
	cudaError_t cures;

	/* TODO: cudaSetDeviceFlag(cudaDeviceMapHost) */
	starpu_cuda_set_device(devid);
#endif /* !STARPU_SIMGRID */

	STARPU_PTHREAD_MUTEX_LOCK(&cuda_device_init_mutex[devid]);
	cuda_device_users[devid]++;
	if (cuda_device_init[devid] == UNINITIALIZED)
		/* Nobody started the initialization yet, do it */
		cuda_device_init[devid] = CHANGING;
	else
	{
		/* Somebody else is doing the initialization, wait for it */
		while (cuda_device_init[devid] != INITIALIZED)
			STARPU_PTHREAD_COND_WAIT(&cuda_device_init_cond[devid], &cuda_device_init_mutex[devid]);
		STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_device_init_mutex[devid]);
		return;
	}
	STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_device_init_mutex[devid]);

#ifndef STARPU_SIMGRID
#ifdef HAVE_CUDA_MEMCPY_PEER
	if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") != 0)
	{
		int nworkers = starpu_worker_get_count();
		int workerid;
		for (workerid = 0; workerid < nworkers; workerid++)
		{
			struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
			if (worker->arch == STARPU_CUDA_WORKER && worker->devid != devid)
			{
				int can;
				cures = cudaDeviceCanAccessPeer(&can, devid, worker->devid);
				if (!cures && can)
				{
					cures = cudaDeviceEnablePeerAccess(worker->devid, 0);
					if (!cures)
					{
						_STARPU_DEBUG("Enabled GPU-Direct %d -> %d\n", worker->devid, devid);
						/* direct copies are made from the destination, see link_supports_direct_transfers */
						starpu_bus_set_direct(_starpu_cuda_bus_ids[worker->devid][devid], 1);
					}
				}
			}
		}
	}
#endif

	/* force CUDA to initialize the context for real */
	cures = cudaFree(0);
	if (STARPU_UNLIKELY(cures))
	{
		if (cures == cudaErrorDevicesUnavailable)
		{
			_STARPU_MSG("All CUDA-capable devices are busy or unavailable\n");
			exit(77);
		}
		STARPU_CUDA_REPORT_ERROR(cures);
	}

	cures = cudaGetDeviceProperties(&props[devid], devid);
	if (STARPU_UNLIKELY(cures))
		STARPU_CUDA_REPORT_ERROR(cures);
#ifdef HAVE_CUDA_MEMCPY_PEER
	if (props[devid].computeMode == cudaComputeModeExclusive)
	{
		_STARPU_MSG("CUDA is in EXCLUSIVE-THREAD mode, but StarPU was built with multithread GPU control support; please either ask your administrator to use EXCLUSIVE-PROCESS mode (which should really be fine), or reconfigure with --disable-cuda-memcpy-peer, but note that this will disable the memcpy-peer optimizations\n");
		STARPU_ABORT();
	}
#endif

	cures = starpu_cudaStreamCreate(&in_transfer_streams[devid]);
	if (STARPU_UNLIKELY(cures))
		STARPU_CUDA_REPORT_ERROR(cures);
	cures = starpu_cudaStreamCreate(&out_transfer_streams[devid]);
	if (STARPU_UNLIKELY(cures))
		STARPU_CUDA_REPORT_ERROR(cures);

	int i;
	for (i = 0; i < ncudagpus; i++)
	{
		cures = starpu_cudaStreamCreate(&in_peer_transfer_streams[i][devid]);
		if (STARPU_UNLIKELY(cures))
			STARPU_CUDA_REPORT_ERROR(cures);
		cures = starpu_cudaStreamCreate(&out_peer_transfer_streams[devid][i]);
		if (STARPU_UNLIKELY(cures))
			STARPU_CUDA_REPORT_ERROR(cures);
	}
#endif /* !STARPU_SIMGRID */

	STARPU_PTHREAD_MUTEX_LOCK(&cuda_device_init_mutex[devid]);
	cuda_device_init[devid] = INITIALIZED;
	STARPU_PTHREAD_COND_BROADCAST(&cuda_device_init_cond[devid]);
	STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_device_init_mutex[devid]);

	_starpu_cuda_limit_gpu_mem_if_needed(devid);
	_starpu_memory_manager_set_global_memory_size(memnode, _starpu_cuda_get_global_mem_size(devid));
}

static void init_worker_context(unsigned workerid, unsigned devid STARPU_ATTRIBUTE_UNUSED)
{
	int j;
#ifdef STARPU_SIMGRID
	for (j = 0; j < STARPU_MAX_PIPELINE; j++)
		task_finished[workerid][j] = 0;
#else /* !STARPU_SIMGRID */
	cudaError_t cures;

	starpu_cuda_set_device(devid);

	for (j = 0; j < STARPU_MAX_PIPELINE; j++)
	{
		cures = cudaEventCreateWithFlags(&task_events[workerid][j], cudaEventDisableTiming);
		if (STARPU_UNLIKELY(cures))
			STARPU_CUDA_REPORT_ERROR(cures);
	}

	cures = starpu_cudaStreamCreate(&streams[workerid]);
	if (STARPU_UNLIKELY(cures))
		STARPU_CUDA_REPORT_ERROR(cures);
#endif /* !STARPU_SIMGRID */
}

#ifndef STARPU_SIMGRID
static void deinit_device_context(unsigned devid)
{
	int i;

	starpu_cuda_set_device(devid);

	cudaStreamDestroy(in_transfer_streams[devid]);
	cudaStreamDestroy(out_transfer_streams[devid]);
	for (i = 0; i < ncudagpus; i++)
	{
		cudaStreamDestroy(in_peer_transfer_streams[i][devid]);
		cudaStreamDestroy(out_peer_transfer_streams[devid][i]);
	}
}
#endif /* !STARPU_SIMGRID */

static void deinit_worker_context(unsigned workerid, unsigned devid STARPU_ATTRIBUTE_UNUSED)
{
	unsigned j;
#ifdef STARPU_SIMGRID
	for (j = 0; j < STARPU_MAX_PIPELINE; j++)
		task_finished[workerid][j] = 0;
#else /* !STARPU_SIMGRID */
	starpu_cuda_set_device(devid);
	for (j = 0; j < STARPU_MAX_PIPELINE; j++)
		cudaEventDestroy(task_events[workerid][j]);
	cudaStreamDestroy(streams[workerid]);
#endif /* !STARPU_SIMGRID */
}

/* Return the number of devices usable in the system.
 * The value returned cannot be greater than STARPU_MAXCUDADEVS */
unsigned _starpu_get_cuda_device_count(void)
{
	int cnt;
#ifdef STARPU_SIMGRID
	cnt = _starpu_simgrid_get_nbhosts("CUDA");
#else
	cudaError_t cures;
	cures = cudaGetDeviceCount(&cnt);
	if (STARPU_UNLIKELY(cures))
		return 0;
#endif

	if (cnt > STARPU_MAXCUDADEVS)
	{
		_STARPU_MSG("# Warning: %d CUDA devices available. Only %d enabled. Use the configure option --enable-maxcudadev=xxx to update the maximum number of supported CUDA devices.\n", cnt, STARPU_MAXCUDADEVS);
		cnt = STARPU_MAXCUDADEVS;
	}
	return (unsigned)cnt;
}

/* This is run from the initialization code to determine the number of CUDA devices */
void _starpu_init_cuda(void)
{
	if (ncudagpus < 0)
	{
		ncudagpus = _starpu_get_cuda_device_count();
		STARPU_ASSERT(ncudagpus <= STARPU_MAXCUDADEVS);
	}
}

static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worker, unsigned char pipeline_idx STARPU_ATTRIBUTE_UNUSED)
{
	STARPU_ASSERT(j);
	struct starpu_task *task = j->task;

	int profiling = starpu_profiling_status_get();

	STARPU_ASSERT(task);
	struct starpu_codelet *cl = task->cl;
	STARPU_ASSERT(cl);

	_starpu_set_local_worker_key(worker);
	_starpu_set_current_task(task);

	if (worker->ntasks == 1)
	{
		/* We are alone in the pipeline, the kernel will start now, record it */
		_starpu_driver_start_job(worker, j, &worker->perf_arch, &j->cl_start, 0, profiling);
	}

#if defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
	/* We make sure we do manipulate the proper device */
	starpu_cuda_set_device(worker->devid);
#endif

	starpu_cuda_func_t func = _starpu_task_get_cuda_nth_implementation(cl, j->nimpl);
	STARPU_ASSERT_MSG(func, "when STARPU_CUDA is defined in 'where', cuda_func or cuda_funcs has to be defined");

	if (_starpu_get_disable_kernels() <= 0)
	{
		_STARPU_TRACE_START_EXECUTING();
#ifdef STARPU_SIMGRID
		int async = task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC;
		unsigned workerid = worker->workerid;
		if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE && !async)
			func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
		else if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT && !async)
		{
			_SIMGRID_TIMER_BEGIN(1);
			func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
			_SIMGRID_TIMER_END;
		}
		else
			_starpu_simgrid_submit_job(workerid, j, &worker->perf_arch, NAN,
					async ? &task_finished[workerid][pipeline_idx] : NULL);
#else
		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
#endif
		_STARPU_TRACE_END_EXECUTING();
	}

	return 0;
}

static void finish_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worker)
{
	struct timespec codelet_end;

	int profiling = starpu_profiling_status_get();

	_starpu_set_current_task(NULL);
	if (worker->pipeline_length)
		worker->current_tasks[worker->first_task] = NULL;
	else
		worker->current_task = NULL;
	worker->first_task = (worker->first_task + 1) % STARPU_MAX_PIPELINE;
	worker->ntasks--;

	_starpu_driver_end_job(worker, j, &worker->perf_arch, &codelet_end, 0, profiling);

	struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
	if (!sched_ctx)
		sched_ctx = _starpu_get_sched_ctx_struct(j->task->sched_ctx);

	if (!sched_ctx->sched_policy)
		_starpu_driver_update_job_feedback(j, worker, &sched_ctx->perf_arch, &j->cl_start, &codelet_end, profiling);
	else
		_starpu_driver_update_job_feedback(j, worker, &worker->perf_arch, &j->cl_start, &codelet_end, profiling);

	_starpu_push_task_output(j);

	_starpu_handle_job_termination(j);
}

/* Execute a job, up to completion for synchronous jobs */
static void execute_job_on_cuda(struct starpu_task *task, struct _starpu_worker *worker)
{
	int workerid = worker->workerid;
	int res;

	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
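	/* Queued tasks form a circular buffer of length STARPU_MAX_PIPELINE:
	 * first_task is the oldest queued task, and the task being started here
	 * occupies slot (first_task + ntasks - 1) mod STARPU_MAX_PIPELINE. */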
	unsigned char pipeline_idx = (worker->first_task + worker->ntasks - 1)%STARPU_MAX_PIPELINE;

	res = start_job_on_cuda(j, worker, pipeline_idx);

	if (res)
	{
		switch (res)
		{
			case -EAGAIN:
				_STARPU_DISP("ouch, CUDA could not actually run task %p, putting it back...\n", task);
				_starpu_push_task_to_workers(task);
				STARPU_ABORT();
			default:
				STARPU_ABORT();
		}
	}

	if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
	{
		if (worker->pipeline_length == 0)
		{
#ifdef STARPU_SIMGRID
			_starpu_simgrid_wait_tasks(workerid);
#else
			/* Forced synchronous execution */
			cudaStreamSynchronize(starpu_cuda_get_local_stream());
#endif
			finish_job_on_cuda(j, worker);
		}
		else
		{
#ifndef STARPU_SIMGRID
			/* Record an event to synchronize with the task termination later */
			cudaEventRecord(task_events[workerid][pipeline_idx], starpu_cuda_get_local_stream());
#endif
#ifdef STARPU_USE_FXT
			int k;
			for (k = 0; k < (int) worker->set->nworkers; k++)
				if (worker->set->workers[k].ntasks == worker->set->workers[k].pipeline_length)
					break;
			if (k == (int) worker->set->nworkers)
				/* Everybody busy */
				_STARPU_TRACE_START_EXECUTING();
#endif
		}
	}
	else
	/* Synchronous execution */
	{
#if !defined(STARPU_SIMGRID)
		STARPU_ASSERT_MSG(cudaStreamQuery(starpu_cuda_get_local_stream()) == cudaSuccess, "Unless the STARPU_CUDA_ASYNC flag is used, CUDA codelets have to wait for the termination of their kernels on the starpu_cuda_get_local_stream() stream");
#endif
		finish_job_on_cuda(j, worker);
	}
}

/* This is run from the driver thread to initialize the driver CUDA context */
int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
{
	struct _starpu_worker *worker0 = &worker_set->workers[0];
	int lastdevid = -1;
	unsigned i;

	_starpu_driver_start(worker0, _STARPU_FUT_CUDA_KEY, 0);
	_starpu_set_local_worker_set_key(worker_set);

#ifdef STARPU_USE_FXT
	for (i = 1; i < worker_set->nworkers; i++)
		_starpu_worker_start(&worker_set->workers[i], _STARPU_FUT_CUDA_KEY, 0);
#endif

	for (i = 0; i < worker_set->nworkers; i++)
	{
		struct _starpu_worker *worker = &worker_set->workers[i];
		unsigned devid = worker->devid;
		unsigned memnode = worker->memory_node;

		if ((int) devid == lastdevid)
		{
#ifdef STARPU_SIMGRID
			STARPU_ASSERT_MSG(0, "Simgrid mode does not support concurrent kernel execution yet\n");
#endif /* STARPU_SIMGRID */

			/* Already initialized */
			continue;
		}

#ifndef STARPU_SIMGRID
		if (worker->config->topology.nworkerpercuda > 1 && props[devid].concurrentKernels == 0)
			_STARPU_DISP("Warning: STARPU_NWORKER_PER_CUDA is %u, but CUDA device %u does not support concurrent kernel execution!\n", worker_set->nworkers, devid);
#endif /* !STARPU_SIMGRID */

		lastdevid = devid;
		init_device_context(devid, memnode);
	}

	/* one more time, to avoid hacks from third-party libs :) */
	_starpu_bind_thread_on_cpu(worker0->config, worker0->bindid, worker0->workerid);

	for (i = 0; i < worker_set->nworkers; i++)
	{
		struct _starpu_worker *worker = &worker_set->workers[i];
		unsigned devid = worker->devid;
		unsigned workerid = worker->workerid;
		unsigned subdev = i % _starpu_get_machine_config()->topology.nworkerpercuda;

		float size = (float) global_mem[devid] / (1<<30);
#ifdef STARPU_SIMGRID
		const char *devname = "Simgrid";
#else
		/* get the device's name */
		char devname[128];
		strncpy(devname, props[devid].name, 127);
		devname[127] = 0;
#endif

#if defined(STARPU_HAVE_BUSID) && !defined(STARPU_SIMGRID)
#if defined(STARPU_HAVE_DOMAINID) && !defined(STARPU_SIMGRID)
		if (props[devid].pciDomainID)
			snprintf(worker->name, sizeof(worker->name), "CUDA %u.%u (%s %.1f GiB %04x:%02x:%02x.0)", devid, subdev, devname, size, props[devid].pciDomainID, props[devid].pciBusID, props[devid].pciDeviceID);
		else
#endif
			snprintf(worker->name, sizeof(worker->name), "CUDA %u.%u (%s %.1f GiB %02x:%02x.0)", devid, subdev, devname, size, props[devid].pciBusID, props[devid].pciDeviceID);
#else
		snprintf(worker->name, sizeof(worker->name), "CUDA %u.%u (%s %.1f GiB)", devid, subdev, devname, size);
#endif
		snprintf(worker->short_name, sizeof(worker->short_name), "CUDA %u.%u", devid, subdev);
		_STARPU_DEBUG("cuda (%s) dev id %u worker %u thread is ready to run on CPU %d!\n", devname, devid, subdev, worker->bindid);

		worker->pipeline_length = starpu_get_env_number_default("STARPU_CUDA_PIPELINE", 2);
		if (worker->pipeline_length > STARPU_MAX_PIPELINE)
		{
			_STARPU_DISP("Warning: STARPU_CUDA_PIPELINE is %u, but STARPU_MAX_PIPELINE is only %u\n", worker->pipeline_length, STARPU_MAX_PIPELINE);
			worker->pipeline_length = STARPU_MAX_PIPELINE;
		}
#if !defined(STARPU_SIMGRID) && !defined(STARPU_NON_BLOCKING_DRIVERS)
		if (worker->pipeline_length >= 1)
		{
			/* We need non-blocking drivers, to poll for CUDA task
			 * termination */
			_STARPU_DISP("Warning: reducing STARPU_CUDA_PIPELINE to 0 because blocking drivers are enabled (and simgrid is not enabled)\n");
			worker->pipeline_length = 0;
		}
#endif
		init_worker_context(workerid, worker->devid);

		_STARPU_TRACE_WORKER_INIT_END(workerid);
	}

	{
		char thread_name[16];
		snprintf(thread_name, sizeof(thread_name), "CUDA %u", worker0->devid);
		starpu_pthread_setname(thread_name);
	}

	/* tell the main thread that this one is ready */
	STARPU_PTHREAD_MUTEX_LOCK(&worker0->mutex);
	worker0->status = STATUS_UNKNOWN;
	worker0->worker_is_initialized = 1;
	STARPU_PTHREAD_COND_SIGNAL(&worker0->ready_cond);
	STARPU_PTHREAD_MUTEX_UNLOCK(&worker0->mutex);

	/* tell the main thread that the whole worker set is ready */
	STARPU_PTHREAD_MUTEX_LOCK(&worker_set->mutex);
	worker_set->set_is_initialized = 1;
	STARPU_PTHREAD_COND_SIGNAL(&worker_set->ready_cond);
	STARPU_PTHREAD_MUTEX_UNLOCK(&worker_set->mutex);

	return 0;
}

int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
{
	struct _starpu_worker *worker0 = &worker_set->workers[0];
	struct starpu_task *tasks[worker_set->nworkers], *task;
	struct _starpu_job *j;
	int i, res;

	int idle_tasks, idle_transfers;

#ifdef STARPU_SIMGRID
	starpu_pthread_wait_reset(&worker0->wait);
#endif

	_starpu_set_local_worker_key(worker0);

	/* First poll for completed jobs */
	idle_tasks = 0;
	idle_transfers = 0;
	for (i = 0; i < (int) worker_set->nworkers; i++)
	{
		struct _starpu_worker *worker = &worker_set->workers[i];
		int workerid = worker->workerid;
		unsigned memnode = worker->memory_node;

		if (!worker->ntasks)
			idle_tasks++;
		if (!worker->task_transferring)
			idle_transfers++;

		if (!worker->ntasks && !worker->task_transferring)
		{
			/* Nothing at all to poll for */
			continue;
		}

		/* First test for transfers pending for the next task */
		task = worker->task_transferring;
		if (task && worker->nb_buffers_transferred == worker->nb_buffers_totransfer)
		{
			_STARPU_TRACE_END_PROGRESS(memnode);
			j = _starpu_get_job_associated_to_task(task);

			_starpu_set_local_worker_key(worker);
			_starpu_fetch_task_input_tail(task, j, worker);
			/* Reset it */
			worker->task_transferring = NULL;

			if (worker->ntasks > 1 && !(task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC))
			{
				/* We have to execute a non-asynchronous task but we
				 * still have tasks in the pipeline... Record it to
				 * prevent more tasks from coming, and do it later */
				worker->pipeline_stuck = 1;
			}
			else
			{
				execute_job_on_cuda(task, worker);
			}
			_STARPU_TRACE_START_PROGRESS(memnode);
		}

		/* Then test for termination of queued tasks */
		if (!worker->ntasks)
			/* No queued task */
			continue;

		if (worker->pipeline_length)
			task = worker->current_tasks[worker->first_task];
		else
			task = worker->current_task;
		if (task == worker->task_transferring)
			/* Next task is still pending transfer */
			continue;

		/* On-going asynchronous task, check for its termination first */
#ifdef STARPU_SIMGRID
		if (task_finished[workerid][worker->first_task])
#else /* !STARPU_SIMGRID */
		cudaError_t cures = cudaEventQuery(task_events[workerid][worker->first_task]);

		if (cures != cudaSuccess)
		{
			STARPU_ASSERT_MSG(cures == cudaErrorNotReady, "CUDA error on task %p, codelet %p (%s): %s (%d)", task, task->cl, _starpu_codelet_get_model_name(task->cl), cudaGetErrorString(cures), cures);
		}
		else
#endif /* !STARPU_SIMGRID */
		{
			_STARPU_TRACE_END_PROGRESS(memnode);
			/* Asynchronous task completed! */
			_starpu_set_local_worker_key(worker);
			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), worker);
			/* See next task if any */
			if (worker->ntasks && worker->current_tasks[worker->first_task] != worker->task_transferring)
			{
				task = worker->current_tasks[worker->first_task];
				j = _starpu_get_job_associated_to_task(task);
				if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
				{
					/* An asynchronous task, it was already
					 * queued, it's now running, record its start time. */
					_starpu_driver_start_job(worker, j, &worker->perf_arch, &j->cl_start, 0, starpu_profiling_status_get());
				}
				else
				{
					/* A synchronous task, we have finished
					 * flushing the pipeline, we can now at
					 * last execute it. */
					_STARPU_TRACE_EVENT("sync_task");
					execute_job_on_cuda(task, worker);
					_STARPU_TRACE_EVENT("end_sync_task");
					worker->pipeline_stuck = 0;
				}
			}
#ifdef STARPU_USE_FXT
			int k;
			for (k = 0; k < (int) worker_set->nworkers; k++)
				if (worker_set->workers[k].ntasks)
					break;
			if (k == (int) worker_set->nworkers)
				/* Everybody busy */
				_STARPU_TRACE_END_EXECUTING();
#endif
			_STARPU_TRACE_START_PROGRESS(memnode);
		}

		if (!worker->pipeline_length || worker->ntasks < worker->pipeline_length)
			idle_tasks++;
	}

#if defined(STARPU_NON_BLOCKING_DRIVERS) && !defined(STARPU_SIMGRID)
	if (!idle_tasks)
	{
		/* No task slot ready yet, nothing better to do than keep the transfers progressing */
		__starpu_datawizard_progress(1, !idle_transfers);
		return 0;
	}
#endif

	/* Something was done, make some progress */
	res = !idle_tasks || !idle_transfers;
	res |= __starpu_datawizard_progress(1, 1);

	/* And pull tasks */
	res |= _starpu_get_multi_worker_task(worker_set->workers, tasks, worker_set->nworkers, worker0->memory_node);

#ifdef STARPU_SIMGRID
	if (!res)
		starpu_pthread_wait_wait(&worker0->wait);
#else
	if (!res)
		return 0;
#endif

	for (i = 0; i < (int) worker_set->nworkers; i++)
	{
		struct _starpu_worker *worker = &worker_set->workers[i];
		unsigned memnode STARPU_ATTRIBUTE_UNUSED = worker->memory_node;

		task = tasks[i];
		if (!task)
			continue;

		j = _starpu_get_job_associated_to_task(task);

		/* can CUDA do that task? */
		if (!_STARPU_CUDA_MAY_PERFORM(j))
		{
			/* this is neither a CUDA nor a CUBLAS task */
			worker->ntasks--;
			_starpu_push_task_to_workers(task);
			continue;
		}

		/* Fetch data asynchronously */
		_STARPU_TRACE_END_PROGRESS(memnode);
		_starpu_set_local_worker_key(worker);
		res = _starpu_fetch_task_input(task, j, 1);
		STARPU_ASSERT(res == 0);
		_STARPU_TRACE_START_PROGRESS(memnode);
	}

	return 0;
}

int _starpu_cuda_driver_deinit(struct _starpu_worker_set *worker_set)
{
	int lastdevid = -1;
	unsigned i;
	_STARPU_TRACE_WORKER_DEINIT_START;

	for (i = 0; i < worker_set->nworkers; i++)
	{
		struct _starpu_worker *worker = &worker_set->workers[i];
		unsigned devid = worker->devid;
		unsigned memnode = worker->memory_node;
		unsigned usersleft;

		if ((int) devid == lastdevid)
			/* Already deinitialized */
			continue;
		lastdevid = devid;

		STARPU_PTHREAD_MUTEX_LOCK(&cuda_device_init_mutex[devid]);
		usersleft = --cuda_device_users[devid];
		STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_device_init_mutex[devid]);

		if (!usersleft)
		{
			/* I'm last, deinitialize device */
			_starpu_handle_all_pending_node_data_requests(memnode);

			/* In case there remains some memory that was automatically
			 * allocated by StarPU, we release it now. Note that data
			 * coherency is not maintained anymore at that point! */
			_starpu_free_all_automatically_allocated_buffers(memnode);

			_starpu_malloc_shutdown(memnode);

#ifndef STARPU_SIMGRID
			deinit_device_context(devid);
#endif /* !STARPU_SIMGRID */
		}
		STARPU_PTHREAD_MUTEX_LOCK(&cuda_device_init_mutex[devid]);
		cuda_device_init[devid] = UNINITIALIZED;
		STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_device_init_mutex[devid]);
	}

	for (i = 0; i < worker_set->nworkers; i++)
	{
		struct _starpu_worker *worker = &worker_set->workers[i];
		unsigned workerid = worker->workerid;

		deinit_worker_context(workerid, worker->devid);
	}

	worker_set->workers[0].worker_is_initialized = 0;
	_STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_CUDA_KEY);

	return 0;
}

void *_starpu_cuda_worker(void *_arg)
{
	struct _starpu_worker_set *worker_set = _arg;
	unsigned i;

	_starpu_cuda_driver_init(worker_set);
	for (i = 0; i < worker_set->nworkers; i++)
		_STARPU_TRACE_START_PROGRESS(worker_set->workers[i].memory_node);
	while (_starpu_machine_is_running())
	{
		_starpu_may_pause();
		_starpu_cuda_driver_run_once(worker_set);
	}
	for (i = 0; i < worker_set->nworkers; i++)
		_STARPU_TRACE_END_PROGRESS(worker_set->workers[i].memory_node);
	_starpu_cuda_driver_deinit(worker_set);

	return NULL;
}

#ifdef STARPU_USE_CUDA
void starpu_cublas_report_error(const char *func, const char *file, int line, int status)
{
	const char *errormsg;
	switch (status)
	{
		case CUBLAS_STATUS_SUCCESS:
			errormsg = "success";
			break;
		case CUBLAS_STATUS_NOT_INITIALIZED:
			errormsg = "not initialized";
			break;
		case CUBLAS_STATUS_ALLOC_FAILED:
			errormsg = "alloc failed";
			break;
		case CUBLAS_STATUS_INVALID_VALUE:
			errormsg = "invalid value";
			break;
		case CUBLAS_STATUS_ARCH_MISMATCH:
			errormsg = "arch mismatch";
			break;
		case CUBLAS_STATUS_EXECUTION_FAILED:
			errormsg = "execution failed";
			break;
		case CUBLAS_STATUS_INTERNAL_ERROR:
			errormsg = "internal error";
			break;
		default:
			errormsg = "unknown error";
			break;
	}
	_STARPU_MSG("oops in %s (%s:%d)... %d: %s\n", func, file, line, status, errormsg);
	STARPU_ABORT();
}

void starpu_cuda_report_error(const char *func, const char *file, int line, cudaError_t status)
{
	const char *errormsg = cudaGetErrorString(status);
	printf("oops in %s (%s:%d)... %d: %s\n", func, file, line, status, errormsg);
	STARPU_ABORT();
}
#endif /* STARPU_USE_CUDA */

#ifdef STARPU_USE_CUDA
int
starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node,
			    void *dst_ptr, unsigned dst_node,
			    size_t ssize, cudaStream_t stream,
			    enum cudaMemcpyKind kind)
{
#ifdef HAVE_CUDA_MEMCPY_PEER
	int peer_copy = 0;
	int src_dev = -1, dst_dev = -1;
#endif
	cudaError_t cures = 0;

	if (kind == cudaMemcpyDeviceToDevice && src_node != dst_node)
	{
#ifdef HAVE_CUDA_MEMCPY_PEER
		peer_copy = 1;
		src_dev = _starpu_memory_node_get_devid(src_node);
		dst_dev = _starpu_memory_node_get_devid(dst_node);
#else
		STARPU_ABORT();
#endif
	}

	if (stream)
	{
		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
#ifdef HAVE_CUDA_MEMCPY_PEER
		if (peer_copy)
		{
			cures = cudaMemcpyPeerAsync((char *) dst_ptr, dst_dev,
						    (char *) src_ptr, src_dev,
						    ssize, stream);
		}
		else
#endif
		{
			cures = cudaMemcpyAsync((char *)dst_ptr, (char *)src_ptr, ssize, kind, stream);
		}
		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
	}

	/* Test if the asynchronous copy has failed or if the caller only asked for a synchronous copy */
	if (stream == NULL || cures)
	{
		/* do it in a synchronous fashion */
#ifdef HAVE_CUDA_MEMCPY_PEER
		if (peer_copy)
		{
			cures = cudaMemcpyPeer((char *) dst_ptr, dst_dev,
					       (char *) src_ptr, src_dev,
					       ssize);
		}
		else
#endif
		{
			cures = cudaMemcpy((char *)dst_ptr, (char *)src_ptr, ssize, kind);
		}

		if (STARPU_UNLIKELY(cures))
			STARPU_CUDA_REPORT_ERROR(cures);

		return 0;
	}

	return -EAGAIN;
}
#endif /* STARPU_USE_CUDA */
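
/* Usage sketch (illustrative; src/dst/size are hypothetical): the return value
 * distinguishes a completed synchronous copy from a submitted asynchronous one:
 *
 *	int ret = starpu_cuda_copy_async_sync(src, src_node, dst, dst_node,
 *					      size, stream, cudaMemcpyDeviceToHost);
 *	if (ret == -EAGAIN)
 *		;	// copy was queued on `stream`; wait for or poll that stream
 *	else
 *		;	// ret == 0: the copy already completed synchronously
 */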

int _starpu_run_cuda(struct _starpu_worker_set *workerarg)
{
	/* Let's go! */
	_starpu_cuda_worker(workerarg);

	return 0;
}

int _starpu_cuda_driver_init_from_worker(struct _starpu_worker *worker)
{
	return _starpu_cuda_driver_init(worker->set);
}

int _starpu_cuda_run_from_worker(struct _starpu_worker *worker)
{
	return _starpu_run_cuda(worker->set);
}

int _starpu_cuda_driver_run_once_from_worker(struct _starpu_worker *worker)
{
	return _starpu_cuda_driver_run_once(worker->set);
}

int _starpu_cuda_driver_deinit_from_worker(struct _starpu_worker *worker)
{
	return _starpu_cuda_driver_deinit(worker->set);
}

struct _starpu_driver_ops _starpu_driver_cuda_ops =
{
	.init = _starpu_cuda_driver_init_from_worker,
	.run = _starpu_cuda_run_from_worker,
	.run_once = _starpu_cuda_driver_run_once_from_worker,
	.deinit = _starpu_cuda_driver_deinit_from_worker
};
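
/* These per-worker hooks are presumably invoked through StarPU's generic
 * driver interface (e.g. starpu_driver_init()/starpu_driver_run_once()),
 * which dispatches on the worker architecture; each one simply forwards to
 * the corresponding worker-set variant above. */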