/* StarPU --- Runtime system for heterogeneous multicore architectures.
 *
 * Copyright (C) 2009-2017  Université de Bordeaux
 * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
 * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
 * Copyright (C) 2011  Télécom-SudParis
 * Copyright (C) 2016  Uppsala University
 * Copyright (C) 2017  Inria
 *
 * StarPU is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * StarPU is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * See the GNU Lesser General Public License in COPYING.LGPL for more details.
 */
#include <starpu.h>
#include <starpu_cuda.h>
#include <starpu_profiling.h>
#include <common/utils.h>
#include <common/config.h>
#include <core/debug.h>
#include <drivers/driver_common/driver_common.h>
#include "driver_cuda.h"
#include <core/sched_policy.h>
#ifdef HAVE_CUDA_GL_INTEROP_H
#include <cuda_gl_interop.h>
#endif
#include <datawizard/memory_manager.h>
#include <datawizard/memory_nodes.h>
#include <datawizard/malloc.h>
#include <core/task.h>
#ifdef STARPU_SIMGRID
#include <core/simgrid.h>
#endif

#ifdef STARPU_USE_CUDA
#if CUDART_VERSION >= 5000
/* Avoid letting our streams spuriously synchronize with the NULL stream */
#define starpu_cudaStreamCreate(stream) cudaStreamCreateWithFlags(stream, cudaStreamNonBlocking)
#else
#define starpu_cudaStreamCreate(stream) cudaStreamCreate(stream)
#endif
#endif

/* the number of CUDA devices */
static int ncudagpus = -1;

static size_t global_mem[STARPU_MAXCUDADEVS];
int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+1][STARPU_MAXCUDADEVS+1];
#ifdef STARPU_USE_CUDA
static cudaStream_t streams[STARPU_NMAXWORKERS];
static cudaStream_t out_transfer_streams[STARPU_MAXCUDADEVS];
static cudaStream_t in_transfer_streams[STARPU_MAXCUDADEVS];
/* Note: streams are not thread-safe, so we define them for each CUDA worker
 * emitting a GPU-GPU transfer */
static cudaStream_t in_peer_transfer_streams[STARPU_MAXCUDADEVS][STARPU_MAXCUDADEVS];
static cudaStream_t out_peer_transfer_streams[STARPU_MAXCUDADEVS][STARPU_MAXCUDADEVS];
static struct cudaDeviceProp props[STARPU_MAXCUDADEVS];
#ifndef STARPU_SIMGRID
static cudaEvent_t task_events[STARPU_NMAXWORKERS][STARPU_MAX_PIPELINE];
#endif
#endif /* STARPU_USE_CUDA */
#ifdef STARPU_SIMGRID
static unsigned task_finished[STARPU_NMAXWORKERS][STARPU_MAX_PIPELINE];
#endif /* STARPU_SIMGRID */

static enum initialization cuda_device_init[STARPU_MAXCUDADEVS];
static int cuda_device_users[STARPU_MAXCUDADEVS];
static starpu_pthread_mutex_t cuda_device_init_mutex[STARPU_MAXCUDADEVS];
static starpu_pthread_cond_t cuda_device_init_cond[STARPU_MAXCUDADEVS];
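
/* Set up the synchronization objects that protect the lazy, shared
 * initialization of each CUDA device (see init_device_context() below). */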
void _starpu_cuda_init(void)
{
	unsigned i;
	for (i = 0; i < STARPU_MAXCUDADEVS; i++)
	{
		STARPU_PTHREAD_MUTEX_INIT(&cuda_device_init_mutex[i], NULL);
		STARPU_PTHREAD_COND_INIT(&cuda_device_init_cond[i], NULL);
	}
}

static size_t _starpu_cuda_get_global_mem_size(unsigned devid)
{
	return global_mem[devid];
}

void
_starpu_cuda_discover_devices (struct _starpu_machine_config *config)
{
	/* Discover the number of CUDA devices. Fill the result in CONFIG. */
#ifdef STARPU_SIMGRID
	config->topology.nhwcudagpus = _starpu_simgrid_get_nbhosts("CUDA");
#else
	int cnt;
	cudaError_t cures;
	cures = cudaGetDeviceCount (&cnt);
	if (STARPU_UNLIKELY(cures != cudaSuccess))
		cnt = 0;
	config->topology.nhwcudagpus = cnt;
#endif
}
/* In case the user wants to cap the amount of memory available on the GPUs
 * by means of the STARPU_LIMIT_CUDA_MEM environment variable, we decrease
 * the value of global_mem[devid], which is the value returned by
 * _starpu_cuda_get_global_mem_size(), to indicate how much memory can
 * be allocated on the device.
 */
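/* For example (values are interpreted in MiB): STARPU_LIMIT_CUDA_MEM=1024
 * caps every CUDA device at 1 GiB, while STARPU_LIMIT_CUDA_1_MEM=512 caps
 * device 1 only; as the code below shows, the per-device variable is
 * consulted only when the global one is unset. */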
static void _starpu_cuda_limit_gpu_mem_if_needed(unsigned devid)
{
	starpu_ssize_t limit;
	size_t STARPU_ATTRIBUTE_UNUSED totalGlobalMem = 0;
	size_t STARPU_ATTRIBUTE_UNUSED to_waste = 0;

#ifdef STARPU_SIMGRID
	totalGlobalMem = _starpu_simgrid_get_memsize("CUDA", devid);
#elif defined(STARPU_USE_CUDA)
	/* Find the size of the memory on the device */
	totalGlobalMem = props[devid].totalGlobalMem;
#endif

	limit = starpu_get_env_number("STARPU_LIMIT_CUDA_MEM");
	if (limit == -1)
	{
		char name[30];
		sprintf(name, "STARPU_LIMIT_CUDA_%u_MEM", devid);
		limit = starpu_get_env_number(name);
	}
#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
	if (limit == -1)
	{
		/* Use 90% of the available memory by default. */
		limit = totalGlobalMem / (1024*1024) * 0.9;
	}
#endif

	global_mem[devid] = limit * 1024*1024;

#ifdef STARPU_USE_CUDA
	/* How much memory to waste? */
	to_waste = totalGlobalMem - global_mem[devid];
	props[devid].totalGlobalMem -= to_waste;
#endif /* STARPU_USE_CUDA */

	_STARPU_DEBUG("CUDA device %u: Wasting %ld MB / Limit %ld MB / Total %ld MB / Remains %ld MB\n",
	              devid, (long) to_waste/(1024*1024), (long) limit, (long) totalGlobalMem/(1024*1024),
	              (long) (totalGlobalMem - to_waste)/(1024*1024));
}
#ifdef STARPU_USE_CUDA
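/* Each device gets dedicated streams for host-to-device and device-to-host
 * transfers, plus per-device-pair streams for peer transfers, so that data
 * movement can overlap with the kernels running on the workers' own streams.
 * The getters below look the right stream up for a given worker or node. */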
cudaStream_t starpu_cuda_get_local_in_transfer_stream()
{
	int worker = starpu_worker_get_id_check();
	int devid = starpu_worker_get_devid(worker);
	cudaStream_t stream;

	stream = in_transfer_streams[devid];
	STARPU_ASSERT(stream);
	return stream;
}

cudaStream_t starpu_cuda_get_in_transfer_stream(unsigned dst_node)
{
	int dst_devid = _starpu_memory_node_get_devid(dst_node);
	cudaStream_t stream;

	stream = in_transfer_streams[dst_devid];
	STARPU_ASSERT(stream);
	return stream;
}

cudaStream_t starpu_cuda_get_local_out_transfer_stream()
{
	int worker = starpu_worker_get_id_check();
	int devid = starpu_worker_get_devid(worker);
	cudaStream_t stream;

	stream = out_transfer_streams[devid];
	STARPU_ASSERT(stream);
	return stream;
}

cudaStream_t starpu_cuda_get_out_transfer_stream(unsigned src_node)
{
	int src_devid = _starpu_memory_node_get_devid(src_node);
	cudaStream_t stream;

	stream = out_transfer_streams[src_devid];
	STARPU_ASSERT(stream);
	return stream;
}

cudaStream_t starpu_cuda_get_peer_transfer_stream(unsigned src_node, unsigned dst_node)
{
	int src_devid = _starpu_memory_node_get_devid(src_node);
	int dst_devid = _starpu_memory_node_get_devid(dst_node);
	cudaStream_t stream;

	stream = in_peer_transfer_streams[src_devid][dst_devid];
	STARPU_ASSERT(stream);
	return stream;
}

cudaStream_t starpu_cuda_get_local_stream(void)
{
	int worker = starpu_worker_get_id_check();

	return streams[worker];
}

const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid)
{
	struct _starpu_machine_config *config = _starpu_get_machine_config();
	unsigned devid = config->workers[workerid].devid;
	return &props[devid];
}
#endif /* STARPU_USE_CUDA */
void starpu_cuda_set_device(unsigned devid STARPU_ATTRIBUTE_UNUSED)
{
#ifdef STARPU_SIMGRID
	STARPU_ABORT();
#else
	cudaError_t cures;
	struct starpu_conf *conf = &_starpu_get_machine_config()->conf;
#if !defined(HAVE_CUDA_MEMCPY_PEER) && defined(HAVE_CUDA_GL_INTEROP_H)
	unsigned i;
#endif

#ifdef HAVE_CUDA_MEMCPY_PEER
	if (conf->n_cuda_opengl_interoperability)
	{
		_STARPU_MSG("OpenGL interoperability was requested, but StarPU was built with multithread GPU control support; please reconfigure with --disable-cuda-memcpy-peer, but note that this will disable the memcpy-peer optimizations\n");
		STARPU_ABORT();
	}
#elif !defined(HAVE_CUDA_GL_INTEROP_H)
	if (conf->n_cuda_opengl_interoperability)
	{
		_STARPU_MSG("OpenGL interoperability was requested, but cuda_gl_interop.h could not be compiled; please make sure that OpenGL headers were available before running ./configure.\n");
		STARPU_ABORT();
	}
#else
	for (i = 0; i < conf->n_cuda_opengl_interoperability; i++)
		if (conf->cuda_opengl_interoperability[i] == devid)
		{
			cures = cudaGLSetGLDevice(devid);
			goto done;
		}
#endif

	cures = cudaSetDevice(devid);

#if !defined(HAVE_CUDA_MEMCPY_PEER) && defined(HAVE_CUDA_GL_INTEROP_H)
done:
#endif
	if (STARPU_UNLIKELY(cures
#ifdef STARPU_OPENMP
		/* When StarPU is used as an OpenMP runtime support,
		 * starpu_omp_shutdown() will usually be called from a
		 * destructor, in which case cudaThreadExit() reports a
		 * cudaErrorCudartUnloading here. There should not
		 * be any remaining tasks running at this point so
		 * we can probably ignore it without much consequence. */
		&& cures != cudaErrorCudartUnloading
#endif /* STARPU_OPENMP */
		))
		STARPU_CUDA_REPORT_ERROR(cures);
#endif
}
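
/* Lazily initialize the CUDA context of a device: the first worker thread to
 * get here performs the initialization, while any other worker driving the
 * same device waits on the condition variable until the state reaches
 * INITIALIZED. */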
static void init_device_context(unsigned devid, unsigned memnode)
{
#ifndef STARPU_SIMGRID
	cudaError_t cures;

	/* TODO: cudaSetDeviceFlag(cudaDeviceMapHost) */
	starpu_cuda_set_device(devid);
#endif /* !STARPU_SIMGRID */

	STARPU_PTHREAD_MUTEX_LOCK(&cuda_device_init_mutex[devid]);
	cuda_device_users[devid]++;
	if (cuda_device_init[devid] == UNINITIALIZED)
		/* Nobody started initialization yet, do it */
		cuda_device_init[devid] = CHANGING;
	else
	{
		/* Somebody else is doing initialization, wait for it */
		while (cuda_device_init[devid] != INITIALIZED)
			STARPU_PTHREAD_COND_WAIT(&cuda_device_init_cond[devid], &cuda_device_init_mutex[devid]);
		STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_device_init_mutex[devid]);
		return;
	}
	STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_device_init_mutex[devid]);

#ifndef STARPU_SIMGRID
#ifdef HAVE_CUDA_MEMCPY_PEER
	if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") != 0)
	{
		int nworkers = starpu_worker_get_count();
		int workerid;
		for (workerid = 0; workerid < nworkers; workerid++)
		{
			struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
			if (worker->arch == STARPU_CUDA_WORKER && worker->devid != devid)
			{
				int can;
				cures = cudaDeviceCanAccessPeer(&can, devid, worker->devid);
				if (!cures && can)
				{
					cures = cudaDeviceEnablePeerAccess(worker->devid, 0);
					if (!cures)
					{
						_STARPU_DEBUG("Enabled GPU-Direct %d -> %d\n", worker->devid, devid);
						/* direct copies are made from the destination, see link_supports_direct_transfers */
						starpu_bus_set_direct(_starpu_cuda_bus_ids[worker->devid][devid], 1);
					}
				}
			}
		}
	}
#endif

	/* force CUDA to initialize the context for real */
	cures = cudaFree(0);
	if (STARPU_UNLIKELY(cures))
	{
		if (cures == cudaErrorDevicesUnavailable)
		{
			_STARPU_MSG("All CUDA-capable devices are busy or unavailable\n");
			exit(77);
		}
		STARPU_CUDA_REPORT_ERROR(cures);
	}

	cures = cudaGetDeviceProperties(&props[devid], devid);
	if (STARPU_UNLIKELY(cures))
		STARPU_CUDA_REPORT_ERROR(cures);
#ifdef HAVE_CUDA_MEMCPY_PEER
	if (props[devid].computeMode == cudaComputeModeExclusive)
	{
		_STARPU_MSG("CUDA is in EXCLUSIVE-THREAD mode, but StarPU was built with multithread GPU control support, please either ask your administrator to use EXCLUSIVE-PROCESS mode (which should really be fine), or reconfigure with --disable-cuda-memcpy-peer but that will disable the memcpy-peer optimizations\n");
		STARPU_ABORT();
	}
#endif

	cures = starpu_cudaStreamCreate(&in_transfer_streams[devid]);
	if (STARPU_UNLIKELY(cures))
		STARPU_CUDA_REPORT_ERROR(cures);
	cures = starpu_cudaStreamCreate(&out_transfer_streams[devid]);
	if (STARPU_UNLIKELY(cures))
		STARPU_CUDA_REPORT_ERROR(cures);

	int i;
	for (i = 0; i < ncudagpus; i++)
	{
		cures = starpu_cudaStreamCreate(&in_peer_transfer_streams[i][devid]);
		if (STARPU_UNLIKELY(cures))
			STARPU_CUDA_REPORT_ERROR(cures);
		cures = starpu_cudaStreamCreate(&out_peer_transfer_streams[devid][i]);
		if (STARPU_UNLIKELY(cures))
			STARPU_CUDA_REPORT_ERROR(cures);
	}
#endif /* !STARPU_SIMGRID */

	STARPU_PTHREAD_MUTEX_LOCK(&cuda_device_init_mutex[devid]);
	cuda_device_init[devid] = INITIALIZED;
	STARPU_PTHREAD_COND_BROADCAST(&cuda_device_init_cond[devid]);
	STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_device_init_mutex[devid]);

	_starpu_cuda_limit_gpu_mem_if_needed(devid);
	_starpu_memory_manager_set_global_memory_size(memnode, _starpu_cuda_get_global_mem_size(devid));
}
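
/* Per-worker setup: one compute stream per worker, plus one event per
 * pipeline slot so that asynchronous task termination can be polled later
 * with cudaEventQuery(). */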
static void init_worker_context(unsigned workerid, unsigned devid STARPU_ATTRIBUTE_UNUSED)
{
	int j;
#ifdef STARPU_SIMGRID
	for (j = 0; j < STARPU_MAX_PIPELINE; j++)
		task_finished[workerid][j] = 0;
#else /* !STARPU_SIMGRID */
	cudaError_t cures;

	starpu_cuda_set_device(devid);

	for (j = 0; j < STARPU_MAX_PIPELINE; j++)
	{
		cures = cudaEventCreateWithFlags(&task_events[workerid][j], cudaEventDisableTiming);
		if (STARPU_UNLIKELY(cures))
			STARPU_CUDA_REPORT_ERROR(cures);
	}

	cures = starpu_cudaStreamCreate(&streams[workerid]);
	if (STARPU_UNLIKELY(cures))
		STARPU_CUDA_REPORT_ERROR(cures);
#endif /* !STARPU_SIMGRID */
}
#ifndef STARPU_SIMGRID
static void deinit_device_context(unsigned devid)
{
	int i;

	starpu_cuda_set_device(devid);

	cudaStreamDestroy(in_transfer_streams[devid]);
	cudaStreamDestroy(out_transfer_streams[devid]);
	for (i = 0; i < ncudagpus; i++)
	{
		cudaStreamDestroy(in_peer_transfer_streams[i][devid]);
		cudaStreamDestroy(out_peer_transfer_streams[devid][i]);
	}
}
#endif /* !STARPU_SIMGRID */

static void deinit_worker_context(unsigned workerid, unsigned devid STARPU_ATTRIBUTE_UNUSED)
{
	unsigned j;
#ifdef STARPU_SIMGRID
	for (j = 0; j < STARPU_MAX_PIPELINE; j++)
		task_finished[workerid][j] = 0;
#else /* STARPU_SIMGRID */
	starpu_cuda_set_device(devid);
	for (j = 0; j < STARPU_MAX_PIPELINE; j++)
		cudaEventDestroy(task_events[workerid][j]);
	cudaStreamDestroy(streams[workerid]);
#endif /* STARPU_SIMGRID */
}
/* Return the number of devices usable in the system.
 * The value returned cannot be greater than MAXCUDADEVS */
unsigned _starpu_get_cuda_device_count(void)
{
	int cnt;
#ifdef STARPU_SIMGRID
	cnt = _starpu_simgrid_get_nbhosts("CUDA");
#else
	cudaError_t cures;
	cures = cudaGetDeviceCount(&cnt);
	if (STARPU_UNLIKELY(cures))
		return 0;
#endif

	if (cnt > STARPU_MAXCUDADEVS)
	{
		_STARPU_MSG("# Warning: %d CUDA devices available. Only %d enabled. Use configure option --enable-maxcudadev=xxx to update the maximum value of supported CUDA devices.\n", cnt, STARPU_MAXCUDADEVS);
		cnt = STARPU_MAXCUDADEVS;
	}
	return (unsigned)cnt;
}

/* This is run from initialize to determine the number of CUDA devices */
void _starpu_init_cuda(void)
{
	if (ncudagpus < 0)
	{
		ncudagpus = _starpu_get_cuda_device_count();
		STARPU_ASSERT(ncudagpus <= STARPU_MAXCUDADEVS);
	}
}
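
/* Launch one task on the current worker: record its start when the pipeline
 * is otherwise empty, select the right device, then call the codelet's CUDA
 * implementation (or hand the job to simgrid in simulation mode). */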
static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worker, unsigned char pipeline_idx STARPU_ATTRIBUTE_UNUSED)
{
	STARPU_ASSERT(j);
	struct starpu_task *task = j->task;

	int profiling = starpu_profiling_status_get();

	STARPU_ASSERT(task);
	struct starpu_codelet *cl = task->cl;
	STARPU_ASSERT(cl);

	_starpu_set_local_worker_key(worker);
	_starpu_set_current_task(task);

	if (worker->ntasks == 1)
	{
		/* We are alone in the pipeline, the kernel will start now, record it */
		_starpu_driver_start_job(worker, j, &worker->perf_arch, 0, profiling);
	}

#if defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
	/* We make sure we do manipulate the proper device */
	starpu_cuda_set_device(worker->devid);
#endif

	starpu_cuda_func_t func = _starpu_task_get_cuda_nth_implementation(cl, j->nimpl);
	STARPU_ASSERT_MSG(func, "when STARPU_CUDA is defined in 'where', cuda_func or cuda_funcs has to be defined");

	if (_starpu_get_disable_kernels() <= 0)
	{
		_STARPU_TRACE_START_EXECUTING();
#ifdef STARPU_SIMGRID
		int async = task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC;
		unsigned workerid = worker->workerid;
		if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE && !async)
			func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
		else if (cl->flags & STARPU_CODELET_SIMGRID_EXECUTE_AND_INJECT && !async)
		{
			_SIMGRID_TIMER_BEGIN(1);
			func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
			_SIMGRID_TIMER_END;
		}
		else
			_starpu_simgrid_submit_job(workerid, j, &worker->perf_arch, NAN,
					async ? &task_finished[workerid][pipeline_idx] : NULL);
#else
		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
#endif
		_STARPU_TRACE_END_EXECUTING();
	}

	return 0;
}
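
/* Tear down the bookkeeping for a completed task: pop it from the worker's
 * pipeline, update profiling feedback, release its output data, and
 * terminate the job. */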
static void finish_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worker)
{
	int profiling = starpu_profiling_status_get();

	_starpu_set_current_task(NULL);
	if (worker->pipeline_length)
		worker->current_tasks[worker->first_task] = NULL;
	else
		worker->current_task = NULL;
	worker->first_task = (worker->first_task + 1) % STARPU_MAX_PIPELINE;
	worker->ntasks--;

	_starpu_driver_end_job(worker, j, &worker->perf_arch, 0, profiling);

	struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
	if (!sched_ctx)
		sched_ctx = _starpu_get_sched_ctx_struct(j->task->sched_ctx);

	if (!sched_ctx->sched_policy)
		_starpu_driver_update_job_feedback(j, worker, &sched_ctx->perf_arch, profiling);
	else
		_starpu_driver_update_job_feedback(j, worker, &worker->perf_arch, profiling);

	_starpu_push_task_output(j);

	_starpu_handle_job_termination(j);
}

/* Execute a job, up to completion for synchronous jobs */
static void execute_job_on_cuda(struct starpu_task *task, struct _starpu_worker *worker)
{
	int workerid = worker->workerid;
	int res;

	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);

	unsigned char pipeline_idx = (worker->first_task + worker->ntasks - 1)%STARPU_MAX_PIPELINE;

	res = start_job_on_cuda(j, worker, pipeline_idx);

	if (res)
	{
		switch (res)
		{
			case -EAGAIN:
				_STARPU_DISP("ouch, CUDA could not actually run task %p, putting it back...\n", task);
				_starpu_push_task_to_workers(task);
				STARPU_ABORT();
			default:
				STARPU_ABORT();
		}
	}

	if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
	{
		if (worker->pipeline_length == 0)
		{
#ifdef STARPU_SIMGRID
			_starpu_simgrid_wait_tasks(workerid);
#else
			/* Forced synchronous execution */
			cudaStreamSynchronize(starpu_cuda_get_local_stream());
#endif
			finish_job_on_cuda(j, worker);
		}
		else
		{
#ifndef STARPU_SIMGRID
			/* Record event to synchronize with task termination later */
			cudaEventRecord(task_events[workerid][pipeline_idx], starpu_cuda_get_local_stream());
#endif
#ifdef STARPU_USE_FXT
			int k;
			for (k = 0; k < (int) worker->set->nworkers; k++)
				if (worker->set->workers[k].ntasks == worker->set->workers[k].pipeline_length)
					break;
			if (k == (int) worker->set->nworkers)
				/* Everybody busy */
				_STARPU_TRACE_START_EXECUTING();
#endif
		}
	}
	else
	/* Synchronous execution */
	{
#if !defined(STARPU_SIMGRID)
		STARPU_ASSERT_MSG(cudaStreamQuery(starpu_cuda_get_local_stream()) == cudaSuccess, "Unless using the STARPU_CUDA_ASYNC flag, CUDA codelets have to wait for termination of their kernels on the starpu_cuda_get_local_stream() stream");
#endif
		finish_job_on_cuda(j, worker);
	}
}
/* This is run from the driver to initialize the driver CUDA context */
int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
{
	struct _starpu_worker *worker0 = &worker_set->workers[0];
	int lastdevid = -1;
	unsigned i;

	_starpu_driver_start(worker0, _STARPU_FUT_CUDA_KEY, 0);
	_starpu_set_local_worker_set_key(worker_set);

#ifdef STARPU_USE_FXT
	for (i = 1; i < worker_set->nworkers; i++)
		_starpu_worker_start(&worker_set->workers[i], _STARPU_FUT_CUDA_KEY, 0);
#endif

	for (i = 0; i < worker_set->nworkers; i++)
	{
		struct _starpu_worker *worker = &worker_set->workers[i];
		unsigned devid = worker->devid;
		unsigned memnode = worker->memory_node;

		if ((int) devid == lastdevid)
		{
#ifdef STARPU_SIMGRID
			STARPU_ASSERT_MSG(0, "Simgrid mode does not support concurrent kernel execution yet\n");
#endif /* !STARPU_SIMGRID */
			/* Already initialized */
			continue;
		}
		lastdevid = devid;

		init_device_context(devid, memnode);

#ifndef STARPU_SIMGRID
		if (worker->config->topology.nworkerpercuda > 1 && props[devid].concurrentKernels == 0)
			_STARPU_DISP("Warning: STARPU_NWORKER_PER_CUDA is %u, but CUDA device %u does not support concurrent kernel execution!\n", worker_set->nworkers, devid);
#endif /* !STARPU_SIMGRID */
	}

	/* one more time to avoid hacks from third party lib :) */
	_starpu_bind_thread_on_cpu(worker0->config, worker0->bindid, worker0->workerid);

	for (i = 0; i < worker_set->nworkers; i++)
	{
		struct _starpu_worker *worker = &worker_set->workers[i];
		unsigned devid = worker->devid;
		unsigned workerid = worker->workerid;
		unsigned subdev = i % _starpu_get_machine_config()->topology.nworkerpercuda;

		float size = (float) global_mem[devid] / (1<<30);
#ifdef STARPU_SIMGRID
		const char *devname = "Simgrid";
#else
		/* get the device's name */
		char devname[128];
		strncpy(devname, props[devid].name, 127);
		devname[127] = 0;
#endif

#if defined(STARPU_HAVE_BUSID) && !defined(STARPU_SIMGRID)
#if defined(STARPU_HAVE_DOMAINID) && !defined(STARPU_SIMGRID)
		if (props[devid].pciDomainID)
			snprintf(worker->name, sizeof(worker->name), "CUDA %u.%u (%s %.1f GiB %04x:%02x:%02x.0)", devid, subdev, devname, size, props[devid].pciDomainID, props[devid].pciBusID, props[devid].pciDeviceID);
		else
#endif
			snprintf(worker->name, sizeof(worker->name), "CUDA %u.%u (%s %.1f GiB %02x:%02x.0)", devid, subdev, devname, size, props[devid].pciBusID, props[devid].pciDeviceID);
#else
		snprintf(worker->name, sizeof(worker->name), "CUDA %u.%u (%s %.1f GiB)", devid, subdev, devname, size);
#endif
		snprintf(worker->short_name, sizeof(worker->short_name), "CUDA %u.%u", devid, subdev);
		_STARPU_DEBUG("cuda (%s) dev id %u worker %u thread is ready to run on CPU %d !\n", devname, devid, subdev, worker->bindid);

		worker->pipeline_length = starpu_get_env_number_default("STARPU_CUDA_PIPELINE", 2);
		if (worker->pipeline_length > STARPU_MAX_PIPELINE)
		{
			_STARPU_DISP("Warning: STARPU_CUDA_PIPELINE is %u, but STARPU_MAX_PIPELINE is only %u", worker->pipeline_length, STARPU_MAX_PIPELINE);
			worker->pipeline_length = STARPU_MAX_PIPELINE;
		}
#if !defined(STARPU_SIMGRID) && !defined(STARPU_NON_BLOCKING_DRIVERS)
		if (worker->pipeline_length >= 1)
		{
			/* We need non-blocking drivers, to poll for CUDA task
			 * termination */
			_STARPU_DISP("Warning: reducing STARPU_CUDA_PIPELINE to 0 because blocking drivers are enabled (and simgrid is not enabled)\n");
			worker->pipeline_length = 0;
		}
#endif
		init_worker_context(workerid, worker->devid);

		_STARPU_TRACE_WORKER_INIT_END(workerid);
	}
	{
		char thread_name[16];
		snprintf(thread_name, sizeof(thread_name), "CUDA %u", worker0->devid);
		starpu_pthread_setname(thread_name);
	}

	/* tell the main thread that this one is ready */
	STARPU_PTHREAD_MUTEX_LOCK(&worker0->mutex);
	worker0->status = STATUS_UNKNOWN;
	worker0->worker_is_initialized = 1;
	STARPU_PTHREAD_COND_SIGNAL(&worker0->ready_cond);
	STARPU_PTHREAD_MUTEX_UNLOCK(&worker0->mutex);

	/* tell the main thread that this one is ready */
	STARPU_PTHREAD_MUTEX_LOCK(&worker_set->mutex);
	worker_set->set_is_initialized = 1;
	STARPU_PTHREAD_COND_SIGNAL(&worker_set->ready_cond);
	STARPU_PTHREAD_MUTEX_UNLOCK(&worker_set->mutex);

	return 0;
}
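
/* One iteration of the driver loop: poll pending input transfers and queued
 * kernels for completion on each worker of the set, make datawizard progress,
 * then try to pull and start new tasks. Always returns 0; the only blocking
 * point is starpu_pthread_wait_wait() in simgrid mode. */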
int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
{
	struct _starpu_worker *worker0 = &worker_set->workers[0];
	struct starpu_task *tasks[worker_set->nworkers], *task;
	struct _starpu_job *j;
	int i, res;

	int idle_tasks, idle_transfers;

#ifdef STARPU_SIMGRID
	starpu_pthread_wait_reset(&worker0->wait);
#endif

	_starpu_set_local_worker_key(worker0);

	/* First poll for completed jobs */
	idle_tasks = 0;
	idle_transfers = 0;
	for (i = 0; i < (int) worker_set->nworkers; i++)
	{
		struct _starpu_worker *worker = &worker_set->workers[i];
		int workerid = worker->workerid;
		unsigned memnode = worker->memory_node;

		if (!worker->ntasks)
			idle_tasks++;
		if (!worker->task_transferring)
			idle_transfers++;

		if (!worker->ntasks && !worker->task_transferring)
		{
			/* Even nothing to test */
			continue;
		}

		/* First test for transfers pending for next task */
		task = worker->task_transferring;
		if (task && worker->nb_buffers_transferred == worker->nb_buffers_totransfer)
		{
			_STARPU_TRACE_END_PROGRESS(memnode);
			j = _starpu_get_job_associated_to_task(task);

			_starpu_set_local_worker_key(worker);
			_starpu_fetch_task_input_tail(task, j, worker);
			/* Reset it */
			worker->task_transferring = NULL;

			if (worker->ntasks > 1 && !(task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC))
			{
				/* We have to execute a non-asynchronous task but we
				 * still have tasks in the pipeline...  Record it to
				 * prevent more tasks from coming, and do it later */
				worker->pipeline_stuck = 1;
			}
			else
			{
				execute_job_on_cuda(task, worker);
			}
			_STARPU_TRACE_START_PROGRESS(memnode);
		}

		/* Then test for termination of queued tasks */
		if (!worker->ntasks)
			/* No queued task */
			continue;

		if (worker->pipeline_length)
			task = worker->current_tasks[worker->first_task];
		else
			task = worker->current_task;
		if (task == worker->task_transferring)
			/* Next task is still pending transfer */
			continue;

		/* On-going asynchronous task, check for its termination first */
#ifdef STARPU_SIMGRID
		if (task_finished[workerid][worker->first_task])
#else /* !STARPU_SIMGRID */
		cudaError_t cures = cudaEventQuery(task_events[workerid][worker->first_task]);

		if (cures != cudaSuccess)
		{
			STARPU_ASSERT_MSG(cures == cudaErrorNotReady, "CUDA error on task %p, codelet %p (%s): %s (%d)", task, task->cl, _starpu_codelet_get_model_name(task->cl), cudaGetErrorString(cures), cures);
		}
		else
#endif /* !STARPU_SIMGRID */
		{
			_STARPU_TRACE_END_PROGRESS(memnode);
			/* Asynchronous task completed! */
			_starpu_set_local_worker_key(worker);
			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), worker);
			/* See next task if any */
			if (worker->ntasks && worker->current_tasks[worker->first_task] != worker->task_transferring)
			{
				task = worker->current_tasks[worker->first_task];
				j = _starpu_get_job_associated_to_task(task);
				if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
				{
					/* An asynchronous task, it was already
					 * queued, it's now running, record its start time. */
					_starpu_driver_start_job(worker, j, &worker->perf_arch, 0, starpu_profiling_status_get());
				}
				else
				{
					/* A synchronous task, we have finished
					 * flushing the pipeline, we can now at
					 * last execute it. */
					_STARPU_TRACE_EVENT("sync_task");
					execute_job_on_cuda(task, worker);
					_STARPU_TRACE_EVENT("end_sync_task");
					worker->pipeline_stuck = 0;
				}
			}
#ifdef STARPU_USE_FXT
			int k;
			for (k = 0; k < (int) worker_set->nworkers; k++)
				if (worker_set->workers[k].ntasks)
					break;
			if (k == (int) worker_set->nworkers)
				/* Everybody busy */
				_STARPU_TRACE_END_EXECUTING()
#endif
			_STARPU_TRACE_START_PROGRESS(memnode);
		}

		if (!worker->pipeline_length || worker->ntasks < worker->pipeline_length)
			idle_tasks++;
	}

#if defined(STARPU_NON_BLOCKING_DRIVERS) && !defined(STARPU_SIMGRID)
	if (!idle_tasks)
	{
		/* No task ready yet, no better thing to do than waiting */
		__starpu_datawizard_progress(1, !idle_transfers);
		return 0;
	}
#endif

	/* Something done, make some progress */
	res = !idle_tasks || !idle_transfers;
	res |= __starpu_datawizard_progress(1, 1);

	/* And pull tasks */
	res |= _starpu_get_multi_worker_task(worker_set->workers, tasks, worker_set->nworkers, worker0->memory_node);

#ifdef STARPU_SIMGRID
	if (!res)
		starpu_pthread_wait_wait(&worker0->wait);
#else
	if (!res)
		return 0;
#endif

	for (i = 0; i < (int) worker_set->nworkers; i++)
	{
		struct _starpu_worker *worker = &worker_set->workers[i];
		unsigned memnode STARPU_ATTRIBUTE_UNUSED = worker->memory_node;

		task = tasks[i];
		if (!task)
			continue;

		j = _starpu_get_job_associated_to_task(task);

		/* can CUDA do that task ? */
		if (!_STARPU_CUDA_MAY_PERFORM(j))
		{
			/* this is neither a cuda nor a cublas task */
			_starpu_worker_refuse_task(worker, task);
#if 0
			if (worker->pipeline_length)
			{
				int j;
				for (j = 0; j < worker->ntasks; j++)
				{
					const int j_mod = (j+worker->first_task)%STARPU_MAX_PIPELINE;
					if (task == worker->current_tasks[j_mod])
					{
						worker->current_tasks[j_mod] = NULL;
						if (j == 0)
						{
							worker->first_task = (worker->first_task + 1) % STARPU_MAX_PIPELINE;
							_starpu_set_current_task(NULL);
						}
						break;
					}
				}
				STARPU_ASSERT(j<worker->ntasks);
			}
			else
			{
				worker->current_task = NULL;
				_starpu_set_current_task(NULL);
			}
			worker->ntasks--;
			int res = _starpu_push_task_to_workers(task);
			STARPU_ASSERT_MSG(res == 0, "_starpu_push_task_to_workers() unexpectedly returned = %d\n", res);
#endif
			continue;
		}

		/* Fetch data asynchronously */
		_STARPU_TRACE_END_PROGRESS(memnode);
		_starpu_set_local_worker_key(worker);
		res = _starpu_fetch_task_input(task, j, 1);
		STARPU_ASSERT(res == 0);
		_STARPU_TRACE_START_PROGRESS(memnode);
	}

	return 0;
}
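
/* Driver teardown: the last worker of a device flushes its pending data
 * requests, frees the buffers StarPU allocated on the device, and destroys
 * the device context; every worker then destroys its own events and stream. */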
int _starpu_cuda_driver_deinit(struct _starpu_worker_set *worker_set)
{
	int lastdevid = -1;
	unsigned i;
	_STARPU_TRACE_WORKER_DEINIT_START;

	for (i = 0; i < worker_set->nworkers; i++)
	{
		struct _starpu_worker *worker = &worker_set->workers[i];
		unsigned devid = worker->devid;
		unsigned memnode = worker->memory_node;
		unsigned usersleft;

		if ((int) devid == lastdevid)
			/* Already initialized */
			continue;
		lastdevid = devid;

		STARPU_PTHREAD_MUTEX_LOCK(&cuda_device_init_mutex[devid]);
		usersleft = --cuda_device_users[devid];
		STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_device_init_mutex[devid]);

		if (!usersleft)
		{
			/* I'm last, deinitialize device */
			_starpu_handle_all_pending_node_data_requests(memnode);

			/* In case there remains some memory that was automatically
			 * allocated by StarPU, we release it now. Note that data
			 * coherency is not maintained anymore at that point ! */
			_starpu_free_all_automatically_allocated_buffers(memnode);

			_starpu_malloc_shutdown(memnode);

#ifndef STARPU_SIMGRID
			deinit_device_context(devid);
#endif /* !STARPU_SIMGRID */
		}
		STARPU_PTHREAD_MUTEX_LOCK(&cuda_device_init_mutex[devid]);
		cuda_device_init[devid] = UNINITIALIZED;
		STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_device_init_mutex[devid]);
	}

	for (i = 0; i < worker_set->nworkers; i++)
	{
		struct _starpu_worker *worker = &worker_set->workers[i];
		unsigned workerid = worker->workerid;

		deinit_worker_context(workerid, worker->devid);
	}

	worker_set->workers[0].worker_is_initialized = 0;
	_STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_CUDA_KEY);

	return 0;
}
void *_starpu_cuda_worker(void *_arg)
{
	struct _starpu_worker_set *worker_set = _arg;
	unsigned i;

	_starpu_cuda_driver_init(worker_set);
	for (i = 0; i < worker_set->nworkers; i++)
		_STARPU_TRACE_START_PROGRESS(worker_set->workers[i].memory_node);
	while (_starpu_machine_is_running())
	{
		_starpu_may_pause();
		_starpu_cuda_driver_run_once(worker_set);
	}
	for (i = 0; i < worker_set->nworkers; i++)
		_STARPU_TRACE_END_PROGRESS(worker_set->workers[i].memory_node);
	_starpu_cuda_driver_deinit(worker_set);

	return NULL;
}
#ifdef STARPU_USE_CUDA
void starpu_cublas_report_error(const char *func, const char *file, int line, int status)
{
	char *errormsg;
	switch (status)
	{
		case CUBLAS_STATUS_SUCCESS:
			errormsg = "success";
			break;
		case CUBLAS_STATUS_NOT_INITIALIZED:
			errormsg = "not initialized";
			break;
		case CUBLAS_STATUS_ALLOC_FAILED:
			errormsg = "alloc failed";
			break;
		case CUBLAS_STATUS_INVALID_VALUE:
			errormsg = "invalid value";
			break;
		case CUBLAS_STATUS_ARCH_MISMATCH:
			errormsg = "arch mismatch";
			break;
		case CUBLAS_STATUS_EXECUTION_FAILED:
			errormsg = "execution failed";
			break;
		case CUBLAS_STATUS_INTERNAL_ERROR:
			errormsg = "internal error";
			break;
		default:
			errormsg = "unknown error";
			break;
	}
	_STARPU_MSG("oops in %s (%s:%d)... %d: %s \n", func, file, line, status, errormsg);
	STARPU_ABORT();
}

void starpu_cuda_report_error(const char *func, const char *file, int line, cudaError_t status)
{
	const char *errormsg = cudaGetErrorString(status);
	_STARPU_ERROR("oops in %s (%s:%d)... %d: %s \n", func, file, line, status, errormsg);
}
#endif /* STARPU_USE_CUDA */
#ifdef STARPU_USE_CUDA
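/* Copy ssize bytes with the appropriate primitive: a peer copy when source
 * and destination nodes are distinct devices, a plain cudaMemcpy* otherwise.
 * When a stream is given, the asynchronous variant is attempted first and
 * -EAGAIN is returned if it was successfully queued; the synchronous
 * fallback returns 0 on completion. */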
int
starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node,
			    void *dst_ptr, unsigned dst_node,
			    size_t ssize, cudaStream_t stream,
			    enum cudaMemcpyKind kind)
{
#ifdef HAVE_CUDA_MEMCPY_PEER
	int peer_copy = 0;
	int src_dev = -1, dst_dev = -1;
#endif
	cudaError_t cures = 0;

	if (kind == cudaMemcpyDeviceToDevice && src_node != dst_node)
	{
#ifdef HAVE_CUDA_MEMCPY_PEER
		peer_copy = 1;
		src_dev = _starpu_memory_node_get_devid(src_node);
		dst_dev = _starpu_memory_node_get_devid(dst_node);
#else
		STARPU_ABORT();
#endif
	}

	if (stream)
	{
		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
#ifdef HAVE_CUDA_MEMCPY_PEER
		if (peer_copy)
		{
			cures = cudaMemcpyPeerAsync((char *) dst_ptr, dst_dev,
						    (char *) src_ptr, src_dev,
						    ssize, stream);
		}
		else
#endif
		{
			cures = cudaMemcpyAsync((char *)dst_ptr, (char *)src_ptr, ssize, kind, stream);
		}
		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
	}

	/* Test if the asynchronous copy has failed or if the caller only asked for a synchronous copy */
	if (stream == NULL || cures)
	{
		/* do it in a synchronous fashion */
#ifdef HAVE_CUDA_MEMCPY_PEER
		if (peer_copy)
		{
			cures = cudaMemcpyPeer((char *) dst_ptr, dst_dev,
					       (char *) src_ptr, src_dev,
					       ssize);
		}
		else
#endif
		{
			cures = cudaMemcpy((char *)dst_ptr, (char *)src_ptr, ssize, kind);
		}

		if (STARPU_UNLIKELY(cures))
			STARPU_CUDA_REPORT_ERROR(cures);

		return 0;
	}

	return -EAGAIN;
}
#endif /* STARPU_USE_CUDA */
int _starpu_run_cuda(struct _starpu_worker_set *workerarg)
{
	/* Let's go ! */
	_starpu_cuda_worker(workerarg);

	return 0;
}

int _starpu_cuda_driver_init_from_worker(struct _starpu_worker *worker)
{
	return _starpu_cuda_driver_init(worker->set);
}

int _starpu_cuda_run_from_worker(struct _starpu_worker *worker)
{
	return _starpu_run_cuda(worker->set);
}

int _starpu_cuda_driver_run_once_from_worker(struct _starpu_worker *worker)
{
	return _starpu_cuda_driver_run_once(worker->set);
}

int _starpu_cuda_driver_deinit_from_worker(struct _starpu_worker *worker)
{
	return _starpu_cuda_driver_deinit(worker->set);
}

struct _starpu_driver_ops _starpu_driver_cuda_ops =
{
	.init = _starpu_cuda_driver_init_from_worker,
	.run = _starpu_cuda_run_from_worker,
	.run_once = _starpu_cuda_driver_run_once_from_worker,
	.deinit = _starpu_cuda_driver_deinit_from_worker
};