driver_cuda.c

/* StarPU --- Runtime system for heterogeneous multicore architectures.
 *
 * Copyright (C) 2009, 2010, 2011  Université de Bordeaux 1
 * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
 * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
 * Copyright (C) 2011  Télécom-SudParis
 *
 * StarPU is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * StarPU is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * See the GNU Lesser General Public License in COPYING.LGPL for more details.
 */

#include <starpu.h>
#include <starpu_cuda.h>
#include <common/utils.h>
#include <common/config.h>
#include <core/debug.h>
#include <drivers/driver_common/driver_common.h>
#include "driver_cuda.h"
#include <core/sched_policy.h>

double _starpu_task_get_conversion_time(struct starpu_task *task);

/* the number of CUDA devices */
static int ncudagpus;

static cudaStream_t streams[STARPU_NMAXWORKERS];
static cudaStream_t transfer_streams[STARPU_NMAXWORKERS];

/* In case we want to cap the amount of memory available on the GPUs by means
 * of the STARPU_LIMIT_GPU_MEM environment variable, we allocate a big buffer
 * when the driver is launched. */
static char *wasted_memory[STARPU_NMAXWORKERS];

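/* Allocate a dummy buffer so that only STARPU_LIMIT_GPU_MEM megabytes remain
 * usable on the device: for instance, STARPU_LIMIT_GPU_MEM=512 should leave
 * roughly 512 MB available. Does nothing when the variable is not set. */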
static void limit_gpu_mem_if_needed(int devid)
{
	cudaError_t cures;

	int limit = starpu_get_env_number("STARPU_LIMIT_GPU_MEM");
	if (limit == -1)
	{
		wasted_memory[devid] = NULL;
		return;
	}

	/* Find the size of the memory on the device */
	struct cudaDeviceProp prop;
	cures = cudaGetDeviceProperties(&prop, devid);
	if (STARPU_UNLIKELY(cures))
		STARPU_CUDA_REPORT_ERROR(cures);

	size_t totalGlobalMem = prop.totalGlobalMem;

	/* How much memory to waste? */
	size_t to_waste = totalGlobalMem - (size_t)limit*1024*1024;

	_STARPU_DEBUG("CUDA device %d: Wasting %zu MB / Limit %zu MB / Total %zu MB / Remains %zu MB\n",
			devid, (size_t)to_waste/(1024*1024), (size_t)limit, (size_t)totalGlobalMem/(1024*1024),
			(size_t)(totalGlobalMem - to_waste)/(1024*1024));

	/* Allocate a large buffer to waste memory and constrain the amount of available memory. */
	cures = cudaMalloc((void **)&wasted_memory[devid], to_waste);
	if (STARPU_UNLIKELY(cures))
		STARPU_CUDA_REPORT_ERROR(cures);
}

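/* Free the buffer allocated by limit_gpu_mem_if_needed(), if any, so that the
 * full device memory becomes available again. */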
static void unlimit_gpu_mem_if_needed(int devid)
{
	cudaError_t cures;

	if (wasted_memory[devid])
	{
		cures = cudaFree(wasted_memory[devid]);
		if (STARPU_UNLIKELY(cures))
			STARPU_CUDA_REPORT_ERROR(cures);

		wasted_memory[devid] = NULL;
	}
}

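/* Return the total amount of global memory (in bytes) available on the device. */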
size_t starpu_cuda_get_global_mem_size(int devid)
{
	cudaError_t cures;
	struct cudaDeviceProp prop;

	/* Find the size of the memory on the device */
	cures = cudaGetDeviceProperties(&prop, devid);
	if (STARPU_UNLIKELY(cures))
		STARPU_CUDA_REPORT_ERROR(cures);

	return (size_t)prop.totalGlobalMem;
}

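/* Each CUDA worker owns two streams: one on which codelets can launch their
 * kernels, and a separate one used for asynchronous data transfers. The two
 * helpers below return the streams of the calling worker. */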
cudaStream_t starpu_cuda_get_local_transfer_stream(void)
{
	int worker = starpu_worker_get_id();

	return transfer_streams[worker];
}

cudaStream_t starpu_cuda_get_local_stream(void)
{
	int worker = starpu_worker_get_id();

	return streams[worker];
}

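/* Attach the worker to its device: select the device, force the creation of
 * the CUDA context, apply the STARPU_LIMIT_GPU_MEM cap if requested, and
 * create the kernel and transfer streams. */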
static void init_context(int devid)
{
	cudaError_t cures;
	int workerid = starpu_worker_get_id();

	cures = cudaSetDevice(devid);
	if (STARPU_UNLIKELY(cures))
		STARPU_CUDA_REPORT_ERROR(cures);

	/* force CUDA to initialize the context for real */
	cudaFree(0);

	limit_gpu_mem_if_needed(devid);

	cures = cudaStreamCreate(&streams[workerid]);
	if (STARPU_UNLIKELY(cures))
		STARPU_CUDA_REPORT_ERROR(cures);

	cures = cudaStreamCreate(&transfer_streams[workerid]);
	if (STARPU_UNLIKELY(cures))
		STARPU_CUDA_REPORT_ERROR(cures);
}

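/* Undo init_context(): destroy the worker's streams, release the memory-cap
 * buffer and tear down the CUDA context. */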
static void deinit_context(int workerid, int devid)
{
	cudaError_t cures;

	cudaStreamDestroy(streams[workerid]);
	cudaStreamDestroy(transfer_streams[workerid]);

	unlimit_gpu_mem_if_needed(devid);

	/* clean up the runtime API internal state (which CUBLAS is using) */
	cures = cudaThreadExit();
	if (cures)
		STARPU_CUDA_REPORT_ERROR(cures);
}

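/* Return the number of CUDA devices that StarPU may use, clamped to
 * STARPU_MAXCUDADEVS, or 0 if the devices cannot be queried. */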
unsigned _starpu_get_cuda_device_count(void)
{
	int cnt;
	cudaError_t cures;

	cures = cudaGetDeviceCount(&cnt);
	if (STARPU_UNLIKELY(cures))
		return 0;

	if (cnt > STARPU_MAXCUDADEVS)
	{
		fprintf(stderr, "# Warning: %d CUDA devices available. Only %d enabled. Use configure option --enable-maxcudadev=xxx to update the maximum value of supported CUDA devices.\n", cnt, STARPU_MAXCUDADEVS);
		cnt = STARPU_MAXCUDADEVS;
	}

	return (unsigned)cnt;
}

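/* Record how many CUDA devices StarPU may use. */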
void _starpu_init_cuda(void)
{
	ncudagpus = _starpu_get_cuda_device_count();
	assert(ncudagpus <= STARPU_MAXCUDADEVS);
}

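/* Run one job on the device: fetch the input data, run the CUDA (or CUBLAS)
 * implementation of the codelet, record timing feedback and push the output
 * data back. Return -EAGAIN when the inputs could not be fetched (typically
 * for lack of memory), in which case the caller puts the job back and retries
 * it later. */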
static int execute_job_on_cuda(starpu_job_t j, struct starpu_worker_s *args)
{
	int ret;
	uint32_t mask = 0;
	cudaError_t cures;

	STARPU_ASSERT(j);
	struct starpu_task *task = j->task;

	struct timespec codelet_start, codelet_end;

	unsigned calibrate_model = 0;

	STARPU_ASSERT(task);
	struct starpu_codelet *cl = task->cl;
	STARPU_ASSERT(cl);

	if (cl->model && cl->model->benchmarking)
		calibrate_model = 1;

	ret = _starpu_fetch_task_input(task, mask);
	if (ret != 0)
	{
		/* there was not enough memory, so the input of
		 * the codelet cannot be fetched ... put the
		 * codelet back, and try it later */
		return -EAGAIN;
	}

	double conversion_time = _starpu_task_get_conversion_time(task);

	if (calibrate_model)
	{
		cures = cudaStreamSynchronize(starpu_cuda_get_local_transfer_stream());
		if (STARPU_UNLIKELY(cures))
			STARPU_CUDA_REPORT_ERROR(cures);
	}

	_starpu_driver_start_job(args, j, &codelet_start, 0);

#ifdef HAVE_CUDA_MEMCPY_PEER
	/* Make sure we are manipulating the proper device */
	cures = cudaSetDevice(args->devid);
	if (STARPU_UNLIKELY(cures != cudaSuccess))
		STARPU_CUDA_REPORT_ERROR(cures);
#endif

	if (cl->cuda_func != STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS) {
		cl_func func = cl->cuda_func;
		STARPU_ASSERT(func);
		func(task->interfaces, task->cl_arg);
	}
	else {
		/* _STARPU_DEBUG("Cuda driver : running kernel * (%d)\n", j->nimpl); */
		cl_func func = cl->cuda_funcs[j->nimpl];
		STARPU_ASSERT(func);
		func(task->interfaces, task->cl_arg);
	}

	_starpu_driver_end_job(args, j, &codelet_end, 0);

	_starpu_driver_update_job_feedback(j, args, args->perf_arch, &codelet_start, &codelet_end, conversion_time);

	_starpu_push_task_output(task, mask);

	return 0;
}

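/* Body of a CUDA worker thread: bind to the assigned CPU, initialize the
 * device context, then loop, making the data management layer progress,
 * popping tasks from the scheduler and executing them, until StarPU shuts
 * down. */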
void *_starpu_cuda_worker(void *arg)
{
	struct starpu_worker_s* args = arg;

	int devid = args->devid;
	int workerid = args->workerid;
	unsigned memnode = args->memory_node;

#ifdef STARPU_USE_FXT
	_starpu_fxt_register_thread(args->bindid);
#endif
	STARPU_TRACE_WORKER_INIT_START(STARPU_FUT_CUDA_KEY, devid, memnode);

	_starpu_bind_thread_on_cpu(args->config, args->bindid);

	_starpu_set_local_memory_node_key(&memnode);

	_starpu_set_local_worker_key(args);

	init_context(devid);

	/* one more time to avoid hacks from third party lib :) */
	_starpu_bind_thread_on_cpu(args->config, args->bindid);

	args->status = STATUS_UNKNOWN;

	/* get the device's name */
	char devname[128];
	struct cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, devid);
	strncpy(devname, prop.name, 128);
	float size = (float) prop.totalGlobalMem / (1<<30);

#if CUDA_VERSION >= 3020
	snprintf(args->name, sizeof(args->name), "CUDA %d (%s %.1f GiB %02x:%02x.0)", args->devid, devname, size, prop.pciBusID, prop.pciDeviceID);
#else
	snprintf(args->name, sizeof(args->name), "CUDA %d (%s %.1f GiB)", args->devid, devname, size);
#endif
	snprintf(args->short_name, sizeof(args->short_name), "CUDA %d", args->devid);

	_STARPU_DEBUG("cuda (%s) dev id %d thread is ready to run on CPU %d !\n", devname, devid, args->bindid);

	STARPU_TRACE_WORKER_INIT_END

	/* tell the main thread that this one is ready */
	_STARPU_PTHREAD_MUTEX_LOCK(&args->mutex);
	args->worker_is_initialized = 1;
	_STARPU_PTHREAD_COND_SIGNAL(&args->ready_cond);
	_STARPU_PTHREAD_MUTEX_UNLOCK(&args->mutex);

	struct starpu_job_s * j;
	struct starpu_task *task;

	int res;

	while (_starpu_machine_is_running())
	{
		STARPU_TRACE_START_PROGRESS(memnode);
		_starpu_datawizard_progress(memnode, 1);
		STARPU_TRACE_END_PROGRESS(memnode);

		_STARPU_PTHREAD_MUTEX_LOCK(args->sched_mutex);

		task = _starpu_pop_task(args);

		if (task == NULL)
		{
			if (_starpu_worker_can_block(memnode))
				_starpu_block_worker(workerid, args->sched_cond, args->sched_mutex);

			_STARPU_PTHREAD_MUTEX_UNLOCK(args->sched_mutex);

			continue;
		}

		_STARPU_PTHREAD_MUTEX_UNLOCK(args->sched_mutex);

		STARPU_ASSERT(task);
		j = _starpu_get_job_associated_to_task(task);

		/* can CUDA do that task ? */
		if (!STARPU_CUDA_MAY_PERFORM(j))
		{
			/* this is neither a CUDA nor a CUBLAS task */
			_starpu_push_task(j, 0);
			continue;
		}

		_starpu_set_current_task(task);

		res = execute_job_on_cuda(j, args);

		_starpu_set_current_task(NULL);

		if (res) {
			switch (res) {
				case -EAGAIN:
					_STARPU_DISP("ouch, put the codelet %p back ... \n", j);
					_starpu_push_task(j, 0);
					STARPU_ABORT();
					continue;
				default:
					assert(0);
			}
		}

		_starpu_handle_job_termination(j, 0);
	}

	STARPU_TRACE_WORKER_DEINIT_START

	_starpu_handle_all_pending_node_data_requests(memnode);

	/* In case there remains some memory that was automatically
	 * allocated by StarPU, we release it now. Note that data
	 * coherency is not maintained anymore at that point ! */
	_starpu_free_all_automatically_allocated_buffers(memnode);

	deinit_context(args->workerid, args->devid);

	STARPU_TRACE_WORKER_DEINIT_END(STARPU_FUT_CUDA_KEY);

	pthread_exit(NULL);

	return NULL;
}

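/* Decode a CUBLAS status code into a human-readable message, print it together
 * with the calling function, file and line, then abort. */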
void starpu_cublas_report_error(const char *func, const char *file, int line, cublasStatus status)
{
	const char *errormsg;

	switch (status) {
		case CUBLAS_STATUS_SUCCESS:
			errormsg = "success";
			break;
		case CUBLAS_STATUS_NOT_INITIALIZED:
			errormsg = "not initialized";
			break;
		case CUBLAS_STATUS_ALLOC_FAILED:
			errormsg = "alloc failed";
			break;
		case CUBLAS_STATUS_INVALID_VALUE:
			errormsg = "invalid value";
			break;
		case CUBLAS_STATUS_ARCH_MISMATCH:
			errormsg = "arch mismatch";
			break;
		case CUBLAS_STATUS_EXECUTION_FAILED:
			errormsg = "execution failed";
			break;
		case CUBLAS_STATUS_INTERNAL_ERROR:
			errormsg = "internal error";
			break;
		default:
			errormsg = "unknown error";
			break;
	}

	printf("oops in %s (%s:%d)... %s \n", func, file, line, errormsg);
	assert(0);
}

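/* Same as above for a CUDA error code; this is where the STARPU_CUDA_REPORT_ERROR()
 * calls used throughout this file end up. */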
void starpu_cuda_report_error(const char *func, const char *file, int line, cudaError_t status)
{
	const char *errormsg = cudaGetErrorString(status);
	printf("oops in %s (%s:%d)... %s \n", func, file, line, errormsg);
	assert(0);
}