/* driver_cuda.c */
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009-2015 Université de Bordeaux
  4. * Copyright (C) 2010 Mehdi Juhoor <mjuhoor@gmail.com>
  5. * Copyright (C) 2010, 2011, 2012, 2013, 2014 CNRS
  6. * Copyright (C) 2011 Télécom-SudParis
  7. *
  8. * StarPU is free software; you can redistribute it and/or modify
  9. * it under the terms of the GNU Lesser General Public License as published by
  10. * the Free Software Foundation; either version 2.1 of the License, or (at
  11. * your option) any later version.
  12. *
  13. * StarPU is distributed in the hope that it will be useful, but
  14. * WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  16. *
  17. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  18. */
  19. #include <starpu.h>
  20. #include <starpu_cuda.h>
  21. #include <starpu_profiling.h>
  22. #include <common/utils.h>
  23. #include <common/config.h>
  24. #include <core/debug.h>
  25. #include <drivers/driver_common/driver_common.h>
  26. #include "driver_cuda.h"
  27. #include <core/sched_policy.h>
  28. #ifdef HAVE_CUDA_GL_INTEROP_H
  29. #include <cuda_gl_interop.h>
  30. #endif
  31. #include <datawizard/memory_manager.h>
  32. #include <datawizard/memory_nodes.h>
  33. #include <datawizard/malloc.h>
  34. #ifdef STARPU_SIMGRID
  35. #include <core/simgrid.h>
  36. #endif
  37. #ifdef STARPU_USE_CUDA
  38. #if CUDART_VERSION >= 5000
  39. /* Avoid letting our streams spuriously synchonize with the NULL stream */
  40. #define starpu_cudaStreamCreate(stream) cudaStreamCreateWithFlags(stream, cudaStreamNonBlocking)
  41. #else
  42. #define starpu_cudaStreamCreate(stream) cudaStreamCreate(stream)
  43. #endif
  44. #endif
  45. /* the number of CUDA devices */
  46. static unsigned ncudagpus;
  47. static size_t global_mem[STARPU_MAXCUDADEVS];
  48. #ifdef STARPU_USE_CUDA
  49. static cudaStream_t streams[STARPU_NMAXWORKERS];
  50. static cudaStream_t out_transfer_streams[STARPU_MAXCUDADEVS];
  51. static cudaStream_t in_transfer_streams[STARPU_MAXCUDADEVS];
  52. /* Note: streams are not thread-safe, so we define them for each CUDA worker
  53. * emitting a GPU-GPU transfer */
  54. static cudaStream_t in_peer_transfer_streams[STARPU_MAXCUDADEVS][STARPU_MAXCUDADEVS];
  55. static cudaStream_t out_peer_transfer_streams[STARPU_MAXCUDADEVS][STARPU_MAXCUDADEVS];
  56. static struct cudaDeviceProp props[STARPU_MAXCUDADEVS];
  57. #ifndef STARPU_SIMGRID
  58. static cudaEvent_t task_events[STARPU_NMAXWORKERS][STARPU_MAX_PIPELINE];
  59. #endif
  60. #endif /* STARPU_USE_CUDA */
  61. #ifdef STARPU_SIMGRID
  62. static unsigned task_finished[STARPU_NMAXWORKERS][STARPU_MAX_PIPELINE];
  63. static starpu_pthread_mutex_t task_mutex[STARPU_NMAXWORKERS][STARPU_MAX_PIPELINE];
  64. static starpu_pthread_cond_t task_cond[STARPU_NMAXWORKERS][STARPU_MAX_PIPELINE];
  65. #endif /* STARPU_SIMGRID */
  66. void
  67. _starpu_cuda_discover_devices (struct _starpu_machine_config *config)
  68. {
  69. /* Discover the number of CUDA devices. Fill the result in CONFIG. */
  70. #ifdef STARPU_SIMGRID
  71. config->topology.nhwcudagpus = _starpu_simgrid_get_nbhosts("CUDA");
  72. #else
  73. int cnt;
  74. cudaError_t cures;
  75. cures = cudaGetDeviceCount (&cnt);
  76. if (STARPU_UNLIKELY(cures != cudaSuccess))
  77. cnt = 0;
  78. config->topology.nhwcudagpus = cnt;
  79. #endif
  80. }
  81. /* In case we want to cap the amount of memory available on the GPUs by the
  82. * mean of the STARPU_LIMIT_CUDA_MEM, we decrease the value of
  83. * global_mem[devid] which is the value returned by
  84. * _starpu_cuda_get_global_mem_size() to indicate how much memory can
  85. * be allocated on the device
  86. */
  87. static void _starpu_cuda_limit_gpu_mem_if_needed(unsigned devid)
  88. {
  89. starpu_ssize_t limit;
  90. size_t STARPU_ATTRIBUTE_UNUSED totalGlobalMem = 0;
  91. size_t STARPU_ATTRIBUTE_UNUSED to_waste = 0;
  92. char name[30];
  93. #ifdef STARPU_SIMGRID
  94. totalGlobalMem = _starpu_simgrid_get_memsize("CUDA", devid);
  95. #elif defined(STARPU_USE_CUDA)
  96. /* Find the size of the memory on the device */
  97. totalGlobalMem = props[devid].totalGlobalMem;
  98. #endif
  99. limit = starpu_get_env_number("STARPU_LIMIT_CUDA_MEM");
  100. if (limit == -1)
  101. {
  102. sprintf(name, "STARPU_LIMIT_CUDA_%u_MEM", devid);
  103. limit = starpu_get_env_number(name);
  104. }
  105. #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
  106. if (limit == -1)
  107. {
  108. /* Use 90% of the available memory by default. */
  109. limit = totalGlobalMem / (1024*1024) * 0.9;
  110. }
  111. #endif
  112. global_mem[devid] = limit * 1024*1024;
  113. #ifdef STARPU_USE_CUDA
  114. /* How much memory to waste ? */
  115. to_waste = totalGlobalMem - global_mem[devid];
  116. props[devid].totalGlobalMem -= to_waste;
  117. #endif /* STARPU_USE_CUDA */
  118. _STARPU_DEBUG("CUDA device %u: Wasting %ld MB / Limit %ld MB / Total %ld MB / Remains %ld MB\n",
  119. devid, (long) to_waste/(1024*1024), (long) limit, (long) totalGlobalMem/(1024*1024),
  120. (long) (totalGlobalMem - to_waste)/(1024*1024));
  121. }
  122. #ifdef STARPU_USE_CUDA
  123. cudaStream_t starpu_cuda_get_local_in_transfer_stream()
  124. {
  125. int worker = starpu_worker_get_id();
  126. int devid = starpu_worker_get_devid(worker);
  127. cudaStream_t stream;
  128. stream = in_transfer_streams[devid];
  129. STARPU_ASSERT(stream);
  130. return stream;
  131. }
  132. cudaStream_t starpu_cuda_get_local_out_transfer_stream()
  133. {
  134. int worker = starpu_worker_get_id();
  135. int devid = starpu_worker_get_devid(worker);
  136. cudaStream_t stream;
  137. stream = out_transfer_streams[devid];
  138. STARPU_ASSERT(stream);
  139. return stream;
  140. }
  141. cudaStream_t starpu_cuda_get_peer_transfer_stream(unsigned src_node, unsigned dst_node)
  142. {
  143. int worker = starpu_worker_get_id();
  144. int devid = starpu_worker_get_devid(worker);
  145. int src_devid = _starpu_memory_node_get_devid(src_node);
  146. int dst_devid = _starpu_memory_node_get_devid(dst_node);
  147. cudaStream_t stream;
  148. STARPU_ASSERT(devid == src_devid || devid == dst_devid);
  149. if (devid == dst_devid)
  150. stream = in_peer_transfer_streams[src_devid][dst_devid];
  151. else
  152. stream = out_peer_transfer_streams[src_devid][dst_devid];
  153. STARPU_ASSERT(stream);
  154. return stream;
  155. }
  156. cudaStream_t starpu_cuda_get_local_stream(void)
  157. {
  158. int worker = starpu_worker_get_id();
  159. return streams[worker];
  160. }
  161. const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid)
  162. {
  163. struct _starpu_machine_config *config = _starpu_get_machine_config();
  164. unsigned devid = config->workers[workerid].devid;
  165. return &props[devid];
  166. }
  167. #endif /* STARPU_USE_CUDA */
/* Bind the calling thread to CUDA device DEVID, using cudaGLSetGLDevice()
 * instead of cudaSetDevice() when OpenGL interoperability was requested for
 * that device.  Aborts outright in simgrid mode (no real device exists),
 * and also when interoperability was requested but cannot be honoured. */
void starpu_cuda_set_device(unsigned devid STARPU_ATTRIBUTE_UNUSED)
{
#ifdef STARPU_SIMGRID
	STARPU_ABORT();
#else
	cudaError_t cures;
	struct starpu_conf *conf = &_starpu_get_machine_config()->conf;
#if !defined(HAVE_CUDA_MEMCPY_PEER) && defined(HAVE_CUDA_GL_INTEROP_H)
	unsigned i;
#endif

#ifdef HAVE_CUDA_MEMCPY_PEER
	/* memcpy-peer support drives devices from several threads, which is
	 * incompatible with GL interoperability: refuse the combination. */
	if (conf->n_cuda_opengl_interoperability)
	{
		fprintf(stderr, "OpenGL interoperability was requested, but StarPU was built with multithread GPU control support, please reconfigure with --disable-cuda-memcpy-peer but that will disable the memcpy-peer optimizations\n");
		STARPU_ABORT();
	}
#elif !defined(HAVE_CUDA_GL_INTEROP_H)
	/* Interoperability requested but the GL interop header was unusable
	 * at configure time. */
	if (conf->n_cuda_opengl_interoperability)
	{
		fprintf(stderr,"OpenGL interoperability was requested, but cuda_gl_interop.h could not be compiled, please make sure that OpenGL headers were available before ./configure run.");
		STARPU_ABORT();
	}
#else
	/* Devices listed for interoperability must be selected through the
	 * GL variant of the device-setting call. */
	for (i = 0; i < conf->n_cuda_opengl_interoperability; i++)
		if (conf->cuda_opengl_interoperability[i] == devid)
		{
			cures = cudaGLSetGLDevice(devid);
			goto done;
		}
#endif

	cures = cudaSetDevice(devid);

#if !defined(HAVE_CUDA_MEMCPY_PEER) && defined(HAVE_CUDA_GL_INTEROP_H)
done:
#endif
	if (STARPU_UNLIKELY(cures
#ifdef STARPU_OPENMP
		/* When StarPU is used as Open Runtime support,
		 * starpu_omp_shutdown() will usually be called from a
		 * destructor, in which case cudaThreadExit() reports a
		 * cudaErrorCudartUnloading here. There should not
		 * be any remaining tasks running at this point so
		 * we can probably ignore it without much consequences. */
		&& cures != cudaErrorCudartUnloading
#endif /* STARPU_OPENMP */
		))
		STARPU_CUDA_REPORT_ERROR(cures);
#endif
}
  216. #ifndef STARPU_SIMGRID
/* One-time initialization of device DEVID: select it, optionally enable
 * GPU-GPU direct access towards every other CUDA device, force context
 * creation, cache the device properties, and create the in/out and peer
 * transfer streams.  Any CUDA failure (other than busy devices, which
 * exits with code 77) aborts via STARPU_CUDA_REPORT_ERROR. */
static void init_device_context(unsigned devid)
{
	int workerid;
	unsigned i;

	cudaError_t cures;

	/* TODO: cudaSetDeviceFlag(cudaDeviceMapHost) */

	starpu_cuda_set_device(devid);

#ifdef HAVE_CUDA_MEMCPY_PEER
	/* Enabled by default: starpu_get_env_number returns -1 when the
	 * variable is unset, which is != 0. */
	if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") != 0)
	{
		int nworkers = starpu_worker_get_count();
		/* Try to enable peer access from every other CUDA device to
		 * this one; failures are silently ignored (transfer will then
		 * go through the host). */
		for (workerid = 0; workerid < nworkers; workerid++)
		{
			struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
			if (worker->arch == STARPU_CUDA_WORKER && worker->devid != devid)
			{
				int can;
				cures = cudaDeviceCanAccessPeer(&can, devid, worker->devid);
				if (!cures && can)
				{
					cures = cudaDeviceEnablePeerAccess(worker->devid, 0);
					if (!cures)
						_STARPU_DEBUG("Enabled GPU-Direct %d -> %d\n", worker->devid, devid);
				}
			}
		}
	}
#endif

	/* force CUDA to initialize the context for real */
	cures = cudaFree(0);
	if (STARPU_UNLIKELY(cures))
	{
		if (cures == cudaErrorDevicesUnavailable)
		{
			fprintf(stderr,"All CUDA-capable devices are busy or unavailable\n");
			/* 77 is the automake "skipped test" exit code. */
			exit(77);
		}
		STARPU_CUDA_REPORT_ERROR(cures);
	}

	cures = cudaGetDeviceProperties(&props[devid], devid);
	if (STARPU_UNLIKELY(cures))
		STARPU_CUDA_REPORT_ERROR(cures);
#ifdef HAVE_CUDA_MEMCPY_PEER
	/* Multithread GPU control needs several threads sharing the device,
	 * which EXCLUSIVE-THREAD compute mode forbids. */
	if (props[devid].computeMode == cudaComputeModeExclusive)
	{
		fprintf(stderr, "CUDA is in EXCLUSIVE-THREAD mode, but StarPU was built with multithread GPU control support, please either ask your administrator to use EXCLUSIVE-PROCESS mode (which should really be fine), or reconfigure with --disable-cuda-memcpy-peer but that will disable the memcpy-peer optimizations\n");
		STARPU_ABORT();
	}
#endif

	/* Dedicated streams for host<->device transfers... */
	cures = starpu_cudaStreamCreate(&in_transfer_streams[devid]);
	if (STARPU_UNLIKELY(cures))
		STARPU_CUDA_REPORT_ERROR(cures);

	cures = starpu_cudaStreamCreate(&out_transfer_streams[devid]);
	if (STARPU_UNLIKELY(cures))
		STARPU_CUDA_REPORT_ERROR(cures);

	/* ...and one per (peer device, direction) pair for GPU-GPU transfers. */
	for (i = 0; i < ncudagpus; i++)
	{
		cures = starpu_cudaStreamCreate(&in_peer_transfer_streams[i][devid]);
		if (STARPU_UNLIKELY(cures))
			STARPU_CUDA_REPORT_ERROR(cures);
		cures = starpu_cudaStreamCreate(&out_peer_transfer_streams[devid][i]);
		if (STARPU_UNLIKELY(cures))
			STARPU_CUDA_REPORT_ERROR(cures);
	}
}
  282. #endif /* !STARPU_SIMGRID */
  283. static void init_worker_context(unsigned workerid)
  284. {
  285. int j;
  286. #ifdef STARPU_SIMGRID
  287. for (j = 0; j < STARPU_MAX_PIPELINE; j++)
  288. {
  289. task_finished[workerid][j] = 0;
  290. STARPU_PTHREAD_MUTEX_INIT(&task_mutex[workerid][j], NULL);
  291. STARPU_PTHREAD_COND_INIT(&task_cond[workerid][j], NULL);
  292. }
  293. #else /* !STARPU_SIMGRID */
  294. cudaError_t cures;
  295. for (j = 0; j < STARPU_MAX_PIPELINE; j++)
  296. {
  297. cures = cudaEventCreateWithFlags(&task_events[workerid][j], cudaEventDisableTiming);
  298. if (STARPU_UNLIKELY(cures))
  299. STARPU_CUDA_REPORT_ERROR(cures);
  300. }
  301. cures = starpu_cudaStreamCreate(&streams[workerid]);
  302. if (STARPU_UNLIKELY(cures))
  303. STARPU_CUDA_REPORT_ERROR(cures);
  304. #endif /* !STARPU_SIMGRID */
  305. }
  306. #ifndef STARPU_SIMGRID
  307. static void deinit_device_context(unsigned devid)
  308. {
  309. unsigned i;
  310. cudaStreamDestroy(in_transfer_streams[devid]);
  311. cudaStreamDestroy(out_transfer_streams[devid]);
  312. for (i = 0; i < ncudagpus; i++)
  313. {
  314. cudaStreamDestroy(in_peer_transfer_streams[i][devid]);
  315. cudaStreamDestroy(out_peer_transfer_streams[devid][i]);
  316. }
  317. }
  318. #endif /* !STARPU_SIMGRID */
  319. static void deinit_worker_context(unsigned workerid)
  320. {
  321. unsigned j;
  322. #ifdef STARPU_SIMGRID
  323. for (j = 0; j < STARPU_MAX_PIPELINE; j++)
  324. {
  325. STARPU_PTHREAD_MUTEX_DESTROY(&task_mutex[workerid][j]);
  326. STARPU_PTHREAD_COND_DESTROY(&task_cond[workerid][j]);
  327. }
  328. #else /* STARPU_SIMGRID */
  329. for (j = 0; j < STARPU_MAX_PIPELINE; j++)
  330. cudaEventDestroy(task_events[workerid][j]);
  331. cudaStreamDestroy(streams[workerid]);
  332. #endif /* STARPU_SIMGRID */
  333. }
  334. static size_t _starpu_cuda_get_global_mem_size(unsigned devid)
  335. {
  336. return global_mem[devid];
  337. }
  338. /* Return the number of devices usable in the system.
  339. * The value returned cannot be greater than MAXCUDADEVS */
  340. unsigned _starpu_get_cuda_device_count(void)
  341. {
  342. int cnt;
  343. #ifdef STARPU_SIMGRID
  344. cnt = _starpu_simgrid_get_nbhosts("CUDA");
  345. #else
  346. cudaError_t cures;
  347. cures = cudaGetDeviceCount(&cnt);
  348. if (STARPU_UNLIKELY(cures))
  349. return 0;
  350. #endif
  351. if (cnt > STARPU_MAXCUDADEVS)
  352. {
  353. fprintf(stderr, "# Warning: %d CUDA devices available. Only %d enabled. Use configure option --enable-maxcudadev=xxx to update the maximum value of supported CUDA devices.\n", cnt, STARPU_MAXCUDADEVS);
  354. cnt = STARPU_MAXCUDADEVS;
  355. }
  356. return (unsigned)cnt;
  357. }
  358. void _starpu_init_cuda(void)
  359. {
  360. ncudagpus = _starpu_get_cuda_device_count();
  361. STARPU_ASSERT(ncudagpus <= STARPU_MAXCUDADEVS);
  362. }
/* Start job J on WORKER: fetch its input data, record the start time when
 * this is the only task in the pipeline, then launch the kernel (or submit
 * the simulated job in simgrid mode).
 * Returns -EAGAIN when inputs could not be fetched (caller should requeue
 * the task), 0 on success. */
static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worker, unsigned char pipeline_idx STARPU_ATTRIBUTE_UNUSED)
{
	int ret;

	STARPU_ASSERT(j);
	struct starpu_task *task = j->task;

	int profiling = starpu_profiling_status_get();

	STARPU_ASSERT(task);
	struct starpu_codelet *cl = task->cl;
	STARPU_ASSERT(cl);

	_starpu_set_current_task(task);

	ret = _starpu_fetch_task_input(j);
	if (ret != 0)
	{
		/* there was not enough memory, so the input of
		 * the codelet cannot be fetched ... put the
		 * codelet back, and try it later */
		return -EAGAIN;
	}

	if (worker->ntasks == 1)
	{
		/* We are alone in the pipeline, the kernel will start now, record it */
		_starpu_driver_start_job(worker, j, &worker->perf_arch, &j->cl_start, 0, profiling);
	}

#if defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
	/* We make sure we do manipulate the proper device */
	starpu_cuda_set_device(worker->devid);
#endif

	starpu_cuda_func_t func = _starpu_task_get_cuda_nth_implementation(cl, j->nimpl);
	STARPU_ASSERT_MSG(func, "when STARPU_CUDA is defined in 'where', cuda_func or cuda_funcs has to be defined");

	if (_starpu_get_disable_kernels() <= 0)
	{
		_STARPU_TRACE_START_EXECUTING();
#ifdef STARPU_SIMGRID
		/* Hand the job to the simulator; asynchronous completion is
		 * signalled through this pipeline slot's flag/mutex/cond. */
		int async = task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC;
		unsigned workerid = worker->workerid;
		_starpu_simgrid_submit_job(workerid, j, &worker->perf_arch, NAN,
				async ? &task_finished[workerid][pipeline_idx] : NULL,
				async ? &task_mutex[workerid][pipeline_idx] : NULL,
				async ? &task_cond[workerid][pipeline_idx] : NULL);
#else
		/* Launch the codelet's CUDA implementation. */
		func(_STARPU_TASK_GET_INTERFACES(task), task->cl_arg);
#endif
		_STARPU_TRACE_END_EXECUTING();
	}

	return 0;
}
/* Bookkeeping once job J has completed on WORKER: pop it from the software
 * pipeline, record the end time and performance feedback, push its output
 * data and terminate the job. */
static void finish_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worker)
{
	struct timespec codelet_end;

	int profiling = starpu_profiling_status_get();

	_starpu_set_current_task(NULL);
	/* Clear the task slot: pipelined workers track a window of tasks,
	 * non-pipelined ones a single current task. */
	if (worker->pipeline_length)
		worker->current_tasks[worker->first_task] = NULL;
	else
		worker->current_task = NULL;
	/* Advance the circular pipeline window. */
	worker->first_task = (worker->first_task + 1) % STARPU_MAX_PIPELINE;
	worker->ntasks--;

	_starpu_driver_end_job(worker, j, &worker->perf_arch, &codelet_end, 0, profiling);

	/* Pick the arch to credit feedback to: the scheduling context's arch
	 * when it has no scheduling policy of its own, the worker's otherwise. */
	struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
	if(!sched_ctx)
		sched_ctx = _starpu_get_sched_ctx_struct(j->task->sched_ctx);

	if(!sched_ctx->sched_policy)
		_starpu_driver_update_job_feedback(j, worker, &sched_ctx->perf_arch, &j->cl_start, &codelet_end, profiling);
	else
		_starpu_driver_update_job_feedback(j, worker, &worker->perf_arch, &j->cl_start, &codelet_end, profiling);

	_starpu_push_task_output(j);

	_starpu_handle_job_termination(j);
}
/* Execute a job, up to completion for synchronous jobs.
 * Asynchronous jobs are only launched here: with a non-zero pipeline length
 * a completion event is recorded for later polling by the driver loop;
 * with pipelining disabled the stream is synchronized immediately. */
static void execute_job_on_cuda(struct starpu_task *task, struct _starpu_worker *worker)
{
	int workerid = worker->workerid;
	int res;

	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);

	/* Slot this task occupies in the circular pipeline window. */
	unsigned char pipeline_idx = (worker->first_task + worker->ntasks - 1)%STARPU_MAX_PIPELINE;

	res = start_job_on_cuda(j, worker, pipeline_idx);

	if (res)
	{
		switch (res)
		{
			case -EAGAIN:
				/* Inputs could not be fetched: requeue the task.
				 * NOTE(review): the STARPU_ABORT() after the
				 * requeue looks intentional here (this path is
				 * apparently not expected at this point). */
				_STARPU_DISP("ouch, CUDA could not actually run task %p, putting it back...\n", task);
				_starpu_push_task_to_workers(task);
				STARPU_ABORT();
			default:
				STARPU_ABORT();
		}
	}

	if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
	{
		if (worker->pipeline_length == 0)
		{
#ifdef STARPU_SIMGRID
			_starpu_simgrid_wait_tasks(workerid);
#else
			/* Forced synchronous execution */
			cudaStreamSynchronize(starpu_cuda_get_local_stream());
#endif
			finish_job_on_cuda(j, worker);
		}
		else
		{
#ifndef STARPU_SIMGRID
			/* Record event to synchronize with task termination later */
			cudaEventRecord(task_events[workerid][pipeline_idx], starpu_cuda_get_local_stream());
#endif
#ifdef STARPU_USE_FXT
			/* Trace "executing" only once every worker of the set
			 * has a full pipeline. */
			int k;
			for (k = 0; k < (int) worker->set->nworkers; k++)
				if (worker->set->workers[k].ntasks == worker->set->workers[k].pipeline_length)
					break;
			if (k == (int) worker->set->nworkers)
				/* Everybody busy */
				_STARPU_TRACE_START_EXECUTING();
#endif
		}
	}
	else
	/* Synchronous execution */
	{
#if !defined(STARPU_SIMGRID)
		STARPU_ASSERT_MSG(cudaStreamQuery(starpu_cuda_get_local_stream()) == cudaSuccess, "Unless when using the STARPU_CUDA_ASYNC flag, CUDA codelets have to wait for termination of their kernels on the starpu_cuda_get_local_stream() stream");
#endif
		finish_job_on_cuda(j, worker);
	}
}
/* XXX Should this be merged with _starpu_init_cuda ? */
/* Driver-thread initialization for a set of CUDA workers sharing a device:
 * start the driver, initialize each distinct device (context, streams,
 * memory limit, memory node), name the workers, configure the pipeline
 * length, create per-worker resources, and finally signal the main thread
 * that the worker set is ready.  Always returns 0. */
int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
{
	struct _starpu_worker *worker0 = &worker_set->workers[0];
	int lastdevid = -1;
	unsigned i;

	_starpu_driver_start(worker0, _STARPU_FUT_CUDA_KEY, 0);
	_starpu_set_local_worker_set_key(worker_set);

#ifdef STARPU_USE_FXT
	/* Worker 0 was registered by _starpu_driver_start above. */
	for (i = 1; i < worker_set->nworkers; i++)
		_starpu_worker_start(&worker_set->workers[i], _STARPU_FUT_CUDA_KEY, 0);
#endif

	/* Per-device setup; workers of the set are assumed sorted by device,
	 * so comparing against the previous devid skips duplicates. */
	for (i = 0; i < worker_set->nworkers; i++)
	{
		struct _starpu_worker *worker = &worker_set->workers[i];
		unsigned devid = worker->devid;
		unsigned memnode = worker->memory_node;
		if ((int) devid == lastdevid)
			/* Already initialized */
			continue;
		lastdevid = devid;
#ifndef STARPU_SIMGRID
		init_device_context(devid);
#endif

#ifdef STARPU_SIMGRID
		STARPU_ASSERT_MSG(worker_set->nworkers == 1, "Simgrid mode does not support concurrent kernel execution yet\n");
#else /* !STARPU_SIMGRID */
		if (worker_set->nworkers > 1 && props[devid].concurrentKernels == 0)
			_STARPU_DISP("Warning: STARPU_NWORKER_PER_CUDA is %u, but the device does not support concurrent kernel execution!\n", worker_set->nworkers);
#endif /* !STARPU_SIMGRID */

		_starpu_cuda_limit_gpu_mem_if_needed(devid);
		_starpu_memory_manager_set_global_memory_size(memnode, _starpu_cuda_get_global_mem_size(devid));

		_starpu_malloc_init(memnode);
	}

	/* one more time to avoid hacks from third party lib :) */
	_starpu_bind_thread_on_cpu(worker0->config, worker0->bindid);

	/* Per-worker setup: naming, pipeline length, events/streams. */
	for (i = 0; i < worker_set->nworkers; i++)
	{
		struct _starpu_worker *worker = &worker_set->workers[i];
		unsigned devid = worker->devid;
		unsigned workerid = worker->workerid;

		float size = (float) global_mem[devid] / (1<<30);
#ifdef STARPU_SIMGRID
		const char *devname = "Simgrid";
#else
		/* get the device's name */
		char devname[128];
		strncpy(devname, props[devid].name, 128);
#endif

#if defined(STARPU_HAVE_BUSID) && !defined(STARPU_SIMGRID)
#if defined(STARPU_HAVE_DOMAINID) && !defined(STARPU_SIMGRID)
		if (props[devid].pciDomainID)
			snprintf(worker->name, sizeof(worker->name), "CUDA %u.%u (%s %.1f GiB %04x:%02x:%02x.0)", devid, i, devname, size, props[devid].pciDomainID, props[devid].pciBusID, props[devid].pciDeviceID);
		else
#endif
			snprintf(worker->name, sizeof(worker->name), "CUDA %u.%u (%s %.1f GiB %02x:%02x.0)", devid, i, devname, size, props[devid].pciBusID, props[devid].pciDeviceID);
#else
		snprintf(worker->name, sizeof(worker->name), "CUDA %u.%u (%s %.1f GiB)", devid, i, devname, size);
#endif
		snprintf(worker->short_name, sizeof(worker->short_name), "CUDA %u.%u", devid, i);
		_STARPU_DEBUG("cuda (%s) dev id %u worker %u thread is ready to run on CPU %d !\n", devname, devid, i, worker->bindid);

		worker->pipeline_length = starpu_get_env_number_default("STARPU_CUDA_PIPELINE", 2);
		if (worker->pipeline_length > STARPU_MAX_PIPELINE)
		{
			_STARPU_DISP("Warning: STARPU_CUDA_PIPELINE is %u, but STARPU_MAX_PIPELINE is only %u", worker->pipeline_length, STARPU_MAX_PIPELINE);
			worker->pipeline_length = STARPU_MAX_PIPELINE;
		}
#if defined(STARPU_SIMGRID) && defined(STARPU_NON_BLOCKING_DRIVERS)
		if (worker->pipeline_length >= 1)
		{
			/* We need blocking drivers, otherwise idle drivers
			 * would keep consuming real CPU time while just
			 * polling for task termination */
			_STARPU_DISP("Warning: reducing STARPU_CUDA_PIPELINE to 0 because simgrid is enabled and blocking drivers are not enabled\n");
			worker->pipeline_length = 0;
		}
#endif
#if !defined(STARPU_SIMGRID) && !defined(STARPU_NON_BLOCKING_DRIVERS)
		if (worker->pipeline_length >= 1)
		{
			/* We need non-blocking drivers, to poll for CUDA task
			 * termination */
			_STARPU_DISP("Warning: reducing STARPU_CUDA_PIPELINE to 0 because blocking drivers are enabled (and simgrid is not enabled)\n");
			worker->pipeline_length = 0;
		}
#endif
		init_worker_context(workerid);

		_STARPU_TRACE_WORKER_INIT_END(workerid);
	}

	/* tell the main thread that this one is ready */
	STARPU_PTHREAD_MUTEX_LOCK(&worker0->mutex);
	worker0->status = STATUS_UNKNOWN;
	worker0->worker_is_initialized = 1;
	STARPU_PTHREAD_COND_SIGNAL(&worker0->ready_cond);
	STARPU_PTHREAD_MUTEX_UNLOCK(&worker0->mutex);

	/* tell the main thread that this one is ready */
	STARPU_PTHREAD_MUTEX_LOCK(&worker_set->mutex);
	worker_set->set_is_initialized = 1;
	STARPU_PTHREAD_COND_SIGNAL(&worker_set->ready_cond);
	STARPU_PTHREAD_MUTEX_UNLOCK(&worker_set->mutex);

	return 0;
}
/* One iteration of the CUDA driver loop: poll each worker of the set for
 * completed asynchronous tasks, make datawizard progress, then pull new
 * tasks and execute those the pipeline can accept.  Always returns 0. */
int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
{
	struct _starpu_worker *worker0 = &worker_set->workers[0];
	/* All workers of a set share the same device, hence the same node. */
	unsigned memnode = worker0->memory_node;
	struct starpu_task *tasks[worker_set->nworkers], *task;
	struct _starpu_job *j;
	int i, res;

	int idle;

	/* First poll for completed jobs */
	idle = 0;
	for (i = 0; i < (int) worker_set->nworkers; i++)
	{
		struct _starpu_worker *worker = &worker_set->workers[i];
		int workerid = worker->workerid;

		if (!worker->ntasks)
		{
			idle++;
			/* Even nothing to test */
			continue;
		}

		/* Oldest in-flight task is at the head of the window. */
		task = worker->current_tasks[worker->first_task];

		/* On-going asynchronous task, check for its termination first */
#ifdef STARPU_SIMGRID
		if (task_finished[workerid][worker->first_task])
#else /* !STARPU_SIMGRID */
		cudaError_t cures = cudaEventQuery(task_events[workerid][worker->first_task]);

		if (cures != cudaSuccess)
		{
			/* cudaErrorNotReady just means "still running";
			 * anything else is a real execution error. */
			STARPU_ASSERT_MSG(cures == cudaErrorNotReady, "CUDA error on task %p, codelet %p (%s): %s (%d)", task, task->cl, _starpu_codelet_get_model_name(task->cl), cudaGetErrorString(cures), cures);
		}
		else
#endif /* !STARPU_SIMGRID */
		{
			/* Asynchronous task completed! */
			_starpu_set_local_worker_key(worker);
			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), worker);
			/* See next task if any */
			if (worker->ntasks)
			{
				task = worker->current_tasks[worker->first_task];
				j = _starpu_get_job_associated_to_task(task);
				if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
				{
					/* An asynchronous task, it was already
					 * queued, it's now running, record its start time.  */
					_starpu_driver_start_job(worker, j, &worker->perf_arch, &j->cl_start, 0, starpu_profiling_status_get());
				}
				else
				{
					/* A synchronous task, we have finished
					 * flushing the pipeline, we can now at
					 * last execute it.  */

					_STARPU_TRACE_END_PROGRESS(memnode);
					_STARPU_TRACE_EVENT("sync_task");
					execute_job_on_cuda(task, worker);
					_STARPU_TRACE_EVENT("end_sync_task");
					_STARPU_TRACE_START_PROGRESS(memnode);
					worker->pipeline_stuck = 0;
				}
			}
#ifdef STARPU_USE_FXT
			int k;
			for (k = 0; k < (int) worker_set->nworkers; k++)
				if (worker_set->workers[k].ntasks)
					break;
			if (k == (int) worker_set->nworkers)
				/* Everybody busy */
				_STARPU_TRACE_END_EXECUTING()
#endif
		}

		/* A worker with room left in its pipeline counts as idle:
		 * it could accept more work. */
		if (worker->ntasks < worker->pipeline_length)
			idle++;
	}

#ifdef STARPU_NON_BLOCKING_DRIVERS
	if (!idle)
	{
		/* Nothing ready yet, no better thing to do than waiting */
		__starpu_datawizard_progress(memnode, 1, 0);
		return 0;
	}
#endif

	/* Something done, make some progress */
	__starpu_datawizard_progress(memnode, 1, 1);

	/* And pull tasks */
	res = _starpu_get_multi_worker_task(worker_set->workers, tasks, worker_set->nworkers, memnode);

	if (!res)
		return 0;

	for (i = 0; i < (int) worker_set->nworkers; i++)
	{
		struct _starpu_worker *worker = &worker_set->workers[i];

		task = tasks[i];
		if (!task)
			continue;

		j = _starpu_get_job_associated_to_task(task);

		/* can CUDA do that task ? */
		if (!_STARPU_CUDA_MAY_PERFORM(j))
		{
			/* this is neither a cuda or a cublas task */
			worker->ntasks--;
			_starpu_push_task_to_workers(task);
			continue;
		}

		if (worker->ntasks > 1 && !(task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC))
		{
			/* We have to execute a non-asynchronous task but we
			 * still have tasks in the pipeline...  Record it to
			 * prevent more tasks from coming, and do it later */
			worker->pipeline_stuck = 1;
			continue;
		}

		_starpu_set_local_worker_key(worker);

		_STARPU_TRACE_END_PROGRESS(memnode);
		execute_job_on_cuda(task, worker);
		_STARPU_TRACE_START_PROGRESS(memnode);
	}

	return 0;
}
/* Driver-thread teardown for a CUDA worker set: flush pending data requests
 * and release memory per distinct device, then destroy each worker's
 * events/streams.  Always returns 0. */
int _starpu_cuda_driver_deinit(struct _starpu_worker_set *worker_set)
{
	int lastdevid = -1;
	unsigned i;
	_STARPU_TRACE_WORKER_DEINIT_START;

	/* Mirror of the init loop: workers sharing a device are deinitialized
	 * only once, by skipping repeated devids. */
	for (i = 0; i < worker_set->nworkers; i++)
	{
		struct _starpu_worker *worker = &worker_set->workers[i];
		unsigned devid = worker->devid;
		unsigned memnode = worker->memory_node;
		if ((int) devid == lastdevid)
			/* Already deinitialized */
			continue;
		lastdevid = devid;

		_starpu_handle_all_pending_node_data_requests(memnode);

		/* In case there remains some memory that was automatically
		 * allocated by StarPU, we release it now. Note that data
		 * coherency is not maintained anymore at that point ! */
		_starpu_free_all_automatically_allocated_buffers(memnode);

		_starpu_malloc_shutdown(memnode);

#ifndef STARPU_SIMGRID
		deinit_device_context(devid);
#endif /* !STARPU_SIMGRID */
	}

	for (i = 0; i < worker_set->nworkers; i++)
	{
		unsigned workerid = worker_set->workers[i].workerid;

		deinit_worker_context(workerid);
	}

	worker_set->workers[0].worker_is_initialized = 0;
	_STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_CUDA_KEY);

	return 0;
}
  741. void *_starpu_cuda_worker(void *_arg)
  742. {
  743. struct _starpu_worker_set* worker = _arg;
  744. _starpu_cuda_driver_init(worker);
  745. _STARPU_TRACE_START_PROGRESS(memnode);
  746. while (_starpu_machine_is_running())
  747. {
  748. _starpu_may_pause();
  749. _starpu_cuda_driver_run_once(worker);
  750. }
  751. _STARPU_TRACE_END_PROGRESS(memnode);
  752. _starpu_cuda_driver_deinit(worker);
  753. return NULL;
  754. }
  755. #ifdef STARPU_USE_CUDA
  756. void starpu_cublas_report_error(const char *func, const char *file, int line, int status)
  757. {
  758. char *errormsg;
  759. switch (status)
  760. {
  761. case CUBLAS_STATUS_SUCCESS:
  762. errormsg = "success";
  763. break;
  764. case CUBLAS_STATUS_NOT_INITIALIZED:
  765. errormsg = "not initialized";
  766. break;
  767. case CUBLAS_STATUS_ALLOC_FAILED:
  768. errormsg = "alloc failed";
  769. break;
  770. case CUBLAS_STATUS_INVALID_VALUE:
  771. errormsg = "invalid value";
  772. break;
  773. case CUBLAS_STATUS_ARCH_MISMATCH:
  774. errormsg = "arch mismatch";
  775. break;
  776. case CUBLAS_STATUS_EXECUTION_FAILED:
  777. errormsg = "execution failed";
  778. break;
  779. case CUBLAS_STATUS_INTERNAL_ERROR:
  780. errormsg = "internal error";
  781. break;
  782. default:
  783. errormsg = "unknown error";
  784. break;
  785. }
  786. fprintf(stderr, "oops in %s (%s:%d)... %d: %s \n", func, file, line, status, errormsg);
  787. STARPU_ABORT();
  788. }
  789. void starpu_cuda_report_error(const char *func, const char *file, int line, cudaError_t status)
  790. {
  791. const char *errormsg = cudaGetErrorString(status);
  792. printf("oops in %s (%s:%d)... %d: %s \n", func, file, line, status, errormsg);
  793. STARPU_ABORT();
  794. }
  795. #endif /* STARPU_USE_CUDA */
  796. #ifdef STARPU_USE_CUDA
/* Copy \p ssize bytes from \p src_ptr (on memory node \p src_node) to
 * \p dst_ptr (on memory node \p dst_node), in direction \p kind.
 *
 * If \p stream is non-NULL, an asynchronous copy is attempted first on that
 * stream; when its submission succeeds, -EAGAIN is returned and the caller
 * is expected to wait for the stream to complete.  If \p stream is NULL, or
 * the asynchronous submission failed, a synchronous copy is performed
 * instead and 0 is returned.  A failing synchronous copy aborts through
 * STARPU_CUDA_REPORT_ERROR. */
int
starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node,
			void *dst_ptr, unsigned dst_node,
			size_t ssize, cudaStream_t stream,
			enum cudaMemcpyKind kind)
{
#ifdef HAVE_CUDA_MEMCPY_PEER
	int peer_copy = 0;
	int src_dev = -1, dst_dev = -1;
#endif
	cudaError_t cures = 0;

	/* A device-to-device copy between two different memory nodes needs
	 * the CUDA peer-to-peer API; without it the transfer is impossible. */
	if (kind == cudaMemcpyDeviceToDevice && src_node != dst_node)
	{
#ifdef HAVE_CUDA_MEMCPY_PEER
		peer_copy = 1;
		src_dev = _starpu_memory_node_get_devid(src_node);
		dst_dev = _starpu_memory_node_get_devid(dst_node);
#else
		STARPU_ABORT();
#endif
	}

	if (stream)
	{
		/* First try an asynchronous copy on the provided stream. */
		_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
#ifdef HAVE_CUDA_MEMCPY_PEER
		if (peer_copy)
		{
			cures = cudaMemcpyPeerAsync((char *) dst_ptr, dst_dev,
						(char *) src_ptr, src_dev,
						ssize, stream);
		}
		else
#endif
		{
			cures = cudaMemcpyAsync((char *)dst_ptr, (char *)src_ptr, ssize, kind, stream);
		}
		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
	}

	/* Test if the asynchronous copy has failed or if the caller only asked for a synchronous copy */
	if (stream == NULL || cures)
	{
		/* do it in a synchronous fashion */
#ifdef HAVE_CUDA_MEMCPY_PEER
		if (peer_copy)
		{
			cures = cudaMemcpyPeer((char *) dst_ptr, dst_dev,
					(char *) src_ptr, src_dev,
					ssize);
		}
		else
#endif
		{
			cures = cudaMemcpy((char *)dst_ptr, (char *)src_ptr, ssize, kind);
		}

		/* The synchronous copy is the fallback of last resort: a
		 * failure here is fatal. */
		if (STARPU_UNLIKELY(cures))
			STARPU_CUDA_REPORT_ERROR(cures);

		return 0;
	}

	/* Asynchronous copy successfully submitted. */
	return -EAGAIN;
}
  857. #endif /* STARPU_USE_CUDA */
  858. int _starpu_run_cuda(struct _starpu_worker_set *workerarg)
  859. {
  860. /* Let's go ! */
  861. _starpu_cuda_worker(workerarg);
  862. return 0;
  863. }