/* StarPU --- Runtime system for heterogeneous multicore architectures.
 *
 * Copyright (C) 2009-2010, 2012-2016 Université de Bordeaux
 * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016 CNRS
 *
 * StarPU is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * StarPU is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * See the GNU Lesser General Public License in COPYING.LGPL for more details.
 */

#include <errno.h>

#include <core/workers.h>
#include <core/disk.h>
#include <common/config.h>
#include <common/fxt.h>
#include <starpu.h>
#include <drivers/opencl/driver_opencl.h>
#include <datawizard/memory_manager.h>
#include <datawizard/memory_nodes.h>
#include <datawizard/malloc.h>
#include <core/simgrid.h>

#ifdef STARPU_SIMGRID
#include <sys/mman.h>
#include <fcntl.h>
#endif

#ifndef O_BINARY
#define O_BINARY 0
#endif

#ifndef MAP_POPULATE
#define MAP_POPULATE 0
#endif

static size_t _malloc_align = sizeof(void*);
static int disable_pinning;
static int malloc_on_node_default_flags[STARPU_MAXNODES];

/* This file is used for implementing "folded" allocation */
#ifdef STARPU_SIMGRID
static int bogusfile = -1;
static unsigned long _starpu_malloc_simulation_fold;
#endif
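
/* Illustration (not part of the implementation): with the default
 * STARPU_MALLOC_SIMULATION_FOLD of 1 (i.e. a 1MiB fold, see
 * _starpu_malloc_init() below), a simulated 256MiB allocation first reserves
 * 256MiB of anonymous virtual memory, then maps the same 1MiB bogus file over
 * it 256 times, so the area can be read and written while only roughly the
 * fold size of physical memory is ever consumed. */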

void starpu_malloc_set_align(size_t align)
{
	STARPU_ASSERT_MSG(!(align & (align - 1)), "Alignment given to starpu_malloc_set_align (%lu) must be a power of two", (unsigned long) align);
	if (_malloc_align < align)
		_malloc_align = align;
}
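
/* Hypothetical usage sketch: an application that wants page-aligned buffers
 * could raise the alignment once, before allocating:
 *
 *	starpu_malloc_set_align(4096);	// must be a power of two
 *	void *buf;
 *	starpu_malloc(&buf, 1024*1024);
 *
 * The alignment is only ever raised, never lowered, so several libraries can
 * each request their own minimum. It takes effect on the
 * posix_memalign()/memalign() paths below; the pinned CUDA path returns
 * whatever alignment cudaHostAlloc() provides. */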

#if (defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER))// || defined(STARPU_USE_OPENCL)
struct malloc_pinned_codelet_struct
{
	void **ptr;
	size_t dim;
};
#endif

/* Would be difficult to do it this way, we need to remember the cl_mem to be able to free it later... */

//#ifdef STARPU_USE_OPENCL
//static void malloc_pinned_opencl_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED, void *arg)
//{
//	struct malloc_pinned_codelet_struct *s = arg;
//	// *(s->ptr) = malloc(s->dim);
//	starpu_opencl_allocate_memory(devid, (void **)(s->ptr), s->dim, CL_MEM_READ_WRITE|CL_MEM_ALLOC_HOST_PTR);
//}
//#endif

#if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
static void malloc_pinned_cuda_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED, void *arg)
{
	struct malloc_pinned_codelet_struct *s = arg;

	cudaError_t cures;
	cures = cudaHostAlloc((void **)(s->ptr), s->dim, cudaHostAllocPortable);
	if (STARPU_UNLIKELY(cures))
		STARPU_CUDA_REPORT_ERROR(cures);
}
#endif

#if (defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER)) && !defined(STARPU_SIMGRID)// || defined(STARPU_USE_OPENCL)
static struct starpu_perfmodel malloc_pinned_model =
{
	.type = STARPU_HISTORY_BASED,
	.symbol = "malloc_pinned"
};

static struct starpu_codelet malloc_pinned_cl =
{
	.cuda_funcs = {malloc_pinned_cuda_codelet},
//#ifdef STARPU_USE_OPENCL
//	.opencl_funcs = {malloc_pinned_opencl_codelet},
//#endif
	.nbuffers = 0,
	.model = &malloc_pinned_model
};
#endif

int starpu_malloc_flags(void **A, size_t dim, int flags)
{
	int ret=0;

	STARPU_ASSERT(A);

	if (flags & STARPU_MALLOC_COUNT)
	{
		if (!(flags & STARPU_MALLOC_NORECLAIM))
			while (starpu_memory_allocate(STARPU_MAIN_RAM, dim, flags) != 0)
			{
				size_t freed;
				size_t reclaim = 2 * dim;
				_STARPU_DEBUG("There is not enough memory left, we are going to reclaim %ld\n", reclaim);
				_STARPU_TRACE_START_MEMRECLAIM(STARPU_MAIN_RAM,0);
				freed = _starpu_memory_reclaim_generic(STARPU_MAIN_RAM, 0, reclaim);
				_STARPU_TRACE_END_MEMRECLAIM(STARPU_MAIN_RAM,0);
				if (freed < dim && !(flags & STARPU_MEMORY_WAIT))
				{
					// We could not reclaim enough memory
					*A = NULL;
					return -ENOMEM;
				}
			}
		else if (flags & STARPU_MEMORY_WAIT)
			starpu_memory_allocate(STARPU_MAIN_RAM, dim, flags);
		else
			starpu_memory_allocate(STARPU_MAIN_RAM, dim, flags | STARPU_MEMORY_OVERFLOW);
	}

	struct _starpu_machine_config *config = _starpu_get_machine_config();
	if (flags & STARPU_MALLOC_PINNED && disable_pinning <= 0 && STARPU_RUNNING_ON_VALGRIND == 0 && config->conf.ncuda != 0)
	{
#ifdef STARPU_SIMGRID
		/* FIXME: CUDA seems to be taking 650µs every 1MiB.
		 * Ideally we would simulate this batching in 1MiB requests
		 * instead of computing an average value.
		 */
		if (_starpu_simgrid_cuda_malloc_cost())
			MSG_process_sleep((float) dim * 0.000650 / 1048576.);
#else /* STARPU_SIMGRID */
		if (_starpu_can_submit_cuda_task())
		{
#ifdef STARPU_USE_CUDA
#ifdef HAVE_CUDA_MEMCPY_PEER
			cudaError_t cures;
			cures = cudaHostAlloc(A, dim, cudaHostAllocPortable);
			if (STARPU_UNLIKELY(cures))
			{
				STARPU_CUDA_REPORT_ERROR(cures);
				ret = -ENOMEM;
			}
			goto end;
#else
			int push_res;

			/* Old versions of CUDA are not thread-safe, we have to
			 * run cudaHostAlloc from CUDA workers */
			STARPU_ASSERT_MSG(_starpu_worker_may_perform_blocking_calls(), "without CUDA peer allocation support, pinned allocation must not be done from task or callback");

			struct malloc_pinned_codelet_struct s =
			{
				.ptr = A,
				.dim = dim
			};

			malloc_pinned_cl.where = STARPU_CUDA;
			struct starpu_task *task = starpu_task_create();
			task->name = "cuda_malloc_pinned";
			task->callback_func = NULL;
			task->cl = &malloc_pinned_cl;
			task->cl_arg = &s;
			task->synchronous = 1;

			_starpu_exclude_task_from_dag(task);

			push_res = _starpu_task_submit_internally(task);
			STARPU_ASSERT(push_res != -ENODEV);
			goto end;
#endif /* HAVE_CUDA_MEMCPY_PEER */
#endif /* STARPU_USE_CUDA */
		}
//		else if (_starpu_can_submit_opencl_task())
//		{
//#ifdef STARPU_USE_OPENCL
//			int push_res;
//
//			STARPU_ASSERT_MSG(_starpu_worker_may_perform_blocking_calls(), "pinned OpenCL allocation must not be done from task or callback");
//
//			struct malloc_pinned_codelet_struct s =
//			{
//				.ptr = A,
//				.dim = dim
//			};
//
//			malloc_pinned_cl.where = STARPU_OPENCL;
//			struct starpu_task *task = starpu_task_create();
//			task->name = "opencl_malloc_pinned";
//			task->callback_func = NULL;
//			task->cl = &malloc_pinned_cl;
//			task->cl_arg = &s;
//			task->synchronous = 1;
//
//			_starpu_exclude_task_from_dag(task);
//
//			push_res = _starpu_task_submit_internally(task);
//			STARPU_ASSERT(push_res != -ENODEV);
//			goto end;
//#endif /* STARPU_USE_OPENCL */
//		}
#endif /* STARPU_SIMGRID */
	}

#ifdef STARPU_SIMGRID
	if (flags & STARPU_MALLOC_SIMULATION_FOLDED)
	{
		/* Use "folded" allocation: the same file is mapped several
		 * times contiguously, to get a memory area one can read/write,
		 * without consuming memory */

		/* First reserve memory area */
		void *buf = mmap(NULL, dim, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
		unsigned i;
		if (buf == MAP_FAILED)
		{
			_STARPU_DISP("Warning: could not allocate %luMiB of memory, you need to run \"sysctl vm.overcommit_memory=1\" as root to allow such big allocations\n", (unsigned long) (dim >> 20));
			ret = -ENOMEM;
			*A = NULL;
		}
		else
		{
			if (bogusfile == -1)
			{
				char *path = starpu_getenv("TMPDIR");
				if (!path)
					path = "/tmp";
				/* Create bogus file if not done already */
				char *name = _starpu_mktemp(path, O_RDWR | O_BINARY, &bogusfile);
				char *dumb;
				if (!name)
				{
					ret = errno;
					munmap(buf, dim);
					*A = NULL;
					goto end;
				}
				unlink(name);
				free(name);
				_STARPU_CALLOC(dumb, 1, _starpu_malloc_simulation_fold);
				write(bogusfile, dumb, _starpu_malloc_simulation_fold);
				free(dumb);
			}
			/* Map the bogus file in place of the anonymous memory */
			for (i = 0; i < dim / _starpu_malloc_simulation_fold; i++)
			{
				void *pos = (void*) ((unsigned long) buf + i * _starpu_malloc_simulation_fold);
				void *res = mmap(pos, _starpu_malloc_simulation_fold, PROT_READ|PROT_WRITE, MAP_FIXED|MAP_SHARED|MAP_POPULATE, bogusfile, 0);
				STARPU_ASSERT_MSG(res == pos, "Could not map folded virtual memory (%s). Do you perhaps need to increase the STARPU_MALLOC_SIMULATION_FOLD environment variable or the sysctl vm.max_map_count?", strerror(errno));
			}
			if (dim % _starpu_malloc_simulation_fold)
			{
				void *pos = (void*) ((unsigned long) buf + i * _starpu_malloc_simulation_fold);
				void *res = mmap(pos, dim % _starpu_malloc_simulation_fold, PROT_READ|PROT_WRITE, MAP_FIXED|MAP_SHARED|MAP_POPULATE, bogusfile, 0);
				STARPU_ASSERT_MSG(res == pos, "Could not map folded virtual memory (%s). Do you perhaps need to increase the STARPU_MALLOC_SIMULATION_FOLD environment variable or the sysctl vm.max_map_count?", strerror(errno));
			}
			*A = buf;
		}
	}
	else
#endif
	if (_starpu_can_submit_scc_task())
	{
#ifdef STARPU_USE_SCC
		_starpu_scc_allocate_shared_memory(A, dim);
#endif
	}
	else
#ifdef STARPU_HAVE_POSIX_MEMALIGN
	if (_malloc_align != sizeof(void*))
	{
		if (posix_memalign(A, _malloc_align, dim))
		{
			ret = -ENOMEM;
			*A = NULL;
		}
	}
	else
#elif defined(STARPU_HAVE_MEMALIGN)
	if (_malloc_align != sizeof(void*))
	{
		*A = memalign(_malloc_align, dim);
		if (!*A)
			ret = -ENOMEM;
	}
	else
#endif /* STARPU_HAVE_POSIX_MEMALIGN */
	{
		*A = malloc(dim);
		if (!*A)
			ret = -ENOMEM;
	}

#if defined(STARPU_SIMGRID) || defined(STARPU_USE_CUDA)
end:
#endif
	if (ret == 0)
	{
		STARPU_ASSERT_MSG(*A, "Failed to allocate memory of size %lu b\n", (unsigned long)dim);
	}
	else if (flags & STARPU_MALLOC_COUNT)
	{
		starpu_memory_deallocate(STARPU_MAIN_RAM, dim);
	}

	return ret;
}

int starpu_malloc(void **A, size_t dim)
{
	return starpu_malloc_flags(A, dim, STARPU_MALLOC_PINNED);
}

#if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
static void free_pinned_cuda_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED, void *arg)
{
	cudaError_t cures;
	cures = cudaFreeHost(arg);
	if (STARPU_UNLIKELY(cures))
		STARPU_CUDA_REPORT_ERROR(cures);
}
#endif

//#ifdef STARPU_USE_OPENCL
//static void free_pinned_opencl_codelet(void *buffers[] STARPU_ATTRIBUTE_UNUSED, void *arg)
//{
//	// free(arg);
//	int err = clReleaseMemObject(arg);
//	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
//}
//#endif

#if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID) // || defined(STARPU_USE_OPENCL)
static struct starpu_perfmodel free_pinned_model =
{
	.type = STARPU_HISTORY_BASED,
	.symbol = "free_pinned"
};

static struct starpu_codelet free_pinned_cl =
{
	.cuda_funcs = {free_pinned_cuda_codelet},
//#ifdef STARPU_USE_OPENCL
//	.opencl_funcs = {free_pinned_opencl_codelet},
//#endif
	.nbuffers = 0,
	.model = &free_pinned_model
};
#endif

int starpu_free_flags(void *A, size_t dim, int flags)
{
#ifndef STARPU_SIMGRID
	if (flags & STARPU_MALLOC_PINNED && disable_pinning <= 0 && STARPU_RUNNING_ON_VALGRIND == 0)
	{
		if (_starpu_can_submit_cuda_task())
		{
#ifdef STARPU_USE_CUDA
#ifndef HAVE_CUDA_MEMCPY_PEER
			if (!_starpu_is_initialized())
			{
#endif
				/* This is especially useful when starpu_free is called from
				 * the GCC-plugin. starpu_shutdown will probably have already
				 * been called, so we will not be able to submit a task. */
				cudaError_t err = cudaFreeHost(A);
				if (STARPU_UNLIKELY(err))
					STARPU_CUDA_REPORT_ERROR(err);
				goto out;
#ifndef HAVE_CUDA_MEMCPY_PEER
			}
			else
			{
				int push_res;

				STARPU_ASSERT_MSG(_starpu_worker_may_perform_blocking_calls(), "without CUDA peer allocation support, pinned deallocation must not be done from task or callback");

				free_pinned_cl.where = STARPU_CUDA;
				struct starpu_task *task = starpu_task_create();
				task->name = "cuda_free_pinned";
				task->callback_func = NULL;
				task->cl = &free_pinned_cl;
				task->cl_arg = A;
				task->synchronous = 1;

				_starpu_exclude_task_from_dag(task);

				push_res = _starpu_task_submit_internally(task);
				STARPU_ASSERT(push_res != -ENODEV);
				goto out;
			}
#endif /* HAVE_CUDA_MEMCPY_PEER */
#endif /* STARPU_USE_CUDA */
		}
//		else if (_starpu_can_submit_opencl_task())
//		{
//#ifdef STARPU_USE_OPENCL
//			int push_res;
//
//			STARPU_ASSERT_MSG(_starpu_worker_may_perform_blocking_calls(), "pinned OpenCL deallocation must not be done from task or callback");
//
//			free_pinned_cl.where = STARPU_OPENCL;
//			struct starpu_task *task = starpu_task_create();
//			task->name = "opencl_free_pinned";
//			task->callback_func = NULL;
//			task->cl = &free_pinned_cl;
//			task->cl_arg = A;
//			task->synchronous = 1;
//
//			_starpu_exclude_task_from_dag(task);
//
//			push_res = starpu_task_submit(task);
//			STARPU_ASSERT(push_res != -ENODEV);
//			goto out;
//		}
//#endif
	}
#endif /* STARPU_SIMGRID */

#ifdef STARPU_SIMGRID
	if (flags & STARPU_MALLOC_SIMULATION_FOLDED)
	{
		munmap(A, dim);
	}
	else
#endif
	if (_starpu_can_submit_scc_task())
	{
#ifdef STARPU_USE_SCC
		_starpu_scc_free_shared_memory(A);
#endif
	}
	else
		free(A);

#if !defined(STARPU_SIMGRID) && defined(STARPU_USE_CUDA)
out:
#endif
	if (flags & STARPU_MALLOC_COUNT)
	{
		starpu_memory_deallocate(STARPU_MAIN_RAM, dim);
	}

	return 0;
}

int starpu_free(void *A)
{
	return starpu_free_flags(A, 0, STARPU_MALLOC_PINNED);
}
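
/* Usage sketch (illustrative only, not exercised in this file): a buffer that
 * should be both accounted against the RAM budget and pinned for faster GPU
 * transfers could be handled as follows, where N stands for a hypothetical
 * element count:
 *
 *	float *v;
 *	int ret = starpu_malloc_flags((void**) &v, N * sizeof(*v),
 *				      STARPU_MALLOC_PINNED | STARPU_MALLOC_COUNT);
 *	if (ret == -ENOMEM) { ... }	// the budget could not be reclaimed
 *	...
 *	starpu_free_flags(v, N * sizeof(*v), STARPU_MALLOC_PINNED | STARPU_MALLOC_COUNT);
 *
 * starpu_malloc()/starpu_free() above are shorthands that pass only
 * STARPU_MALLOC_PINNED. */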

#ifdef STARPU_SIMGRID
static starpu_pthread_mutex_t cuda_alloc_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
static starpu_pthread_mutex_t opencl_alloc_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
#endif

static uintptr_t
_starpu_malloc_on_node(unsigned dst_node, size_t size, int flags)
{
	uintptr_t addr = 0;

#if defined(STARPU_USE_CUDA) && !defined(STARPU_SIMGRID)
	cudaError_t status;
#endif

	/* Handle count first */
	if (flags & STARPU_MALLOC_COUNT)
	{
		if (starpu_memory_allocate(dst_node, size, flags) != 0)
			return 0;
		/* And prevent double-count in starpu_malloc_flags */
		flags &= ~STARPU_MALLOC_COUNT;
	}

	switch(starpu_node_get_kind(dst_node))
	{
		case STARPU_CPU_RAM:
		{
			starpu_malloc_flags((void**) &addr, size,
#if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
				/* without memcpy_peer, we cannot
				 * allocate pinned memory, since it
				 * requires waiting for a task, and we
				 * may be called with a spinlock held
				 */
				flags & ~STARPU_MALLOC_PINNED
#else
				flags
#endif
				);
			break;
		}
#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
		case STARPU_CUDA_RAM:
		{
#ifdef STARPU_SIMGRID
			static uintptr_t last[STARPU_MAXNODES];
#ifdef STARPU_DEVEL
#warning TODO: record used memory, using a simgrid property to know the available memory
#endif
			/* Sleep for the allocation */
			STARPU_PTHREAD_MUTEX_LOCK(&cuda_alloc_mutex);
			if (_starpu_simgrid_cuda_malloc_cost())
				MSG_process_sleep(0.000175);
			if (!last[dst_node])
				last[dst_node] = 1<<10;
			addr = last[dst_node];
			last[dst_node] += size;
			STARPU_ASSERT(last[dst_node] >= addr);
			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
#else
			struct _starpu_worker *worker = _starpu_get_local_worker_key();
			unsigned devid = _starpu_memory_node_get_devid(dst_node);
			if (!worker || worker->arch != STARPU_CUDA_WORKER || worker->devid != devid)
#if defined(HAVE_CUDA_MEMCPY_PEER)
				starpu_cuda_set_device(devid);
#else
				STARPU_ASSERT_MSG(0, "CUDA peer access is not available with this version of CUDA");
#endif
			status = cudaMalloc((void **)&addr, size);
			if (!addr || (status != cudaSuccess))
			{
				if (STARPU_UNLIKELY(status != cudaErrorMemoryAllocation))
					STARPU_CUDA_REPORT_ERROR(status);
				addr = 0;
			}
#endif
			break;
		}
#endif
#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
		case STARPU_OPENCL_RAM:
		{
#ifdef STARPU_SIMGRID
			static uintptr_t last[STARPU_MAXNODES];
			/* Sleep for the allocation */
			STARPU_PTHREAD_MUTEX_LOCK(&opencl_alloc_mutex);
			if (_starpu_simgrid_cuda_malloc_cost())
				MSG_process_sleep(0.000175);
			if (!last[dst_node])
				last[dst_node] = 1<<10;
			addr = last[dst_node];
			last[dst_node] += size;
			STARPU_ASSERT(last[dst_node] >= addr);
			STARPU_PTHREAD_MUTEX_UNLOCK(&opencl_alloc_mutex);
#else
			int ret;
			cl_mem ptr;

			ret = starpu_opencl_allocate_memory(_starpu_memory_node_get_devid(dst_node), &ptr, size, CL_MEM_READ_WRITE);
			if (ret)
			{
				addr = 0;
			}
			else
			{
				addr = (uintptr_t)ptr;
			}
			break;
#endif
		}
#endif
		case STARPU_DISK_RAM:
		{
			addr = (uintptr_t) _starpu_disk_alloc(dst_node, size);
			break;
		}
#ifdef STARPU_USE_MIC
		case STARPU_MIC_RAM:
			if (_starpu_mic_allocate_memory((void **)(&addr), size, dst_node))
				addr = 0;
			break;
#endif
#ifdef STARPU_USE_MPI_MASTER_SLAVE
		case STARPU_MPI_MS_RAM:
			if (_starpu_mpi_src_allocate_memory((void **)(&addr), size, dst_node))
				addr = 0;
			break;
#endif
#ifdef STARPU_USE_SCC
		case STARPU_SCC_RAM:
			if (_starpu_scc_allocate_memory((void **)(&addr), size, dst_node))
				addr = 0;
			break;
#endif
		default:
			STARPU_ABORT();
	}

	if (addr == 0)
	{
		// Allocation failed, give the memory back to the memory manager
		_STARPU_TRACE_MEMORY_FULL(size);
		starpu_memory_deallocate(dst_node, size);
	}

	return addr;
}

void
_starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int flags)
{
	int count = flags & STARPU_MALLOC_COUNT;
	flags &= ~STARPU_MALLOC_COUNT;

	enum starpu_node_kind kind = starpu_node_get_kind(dst_node);
	switch(kind)
	{
		case STARPU_CPU_RAM:
			starpu_free_flags((void*)addr, size,
#if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
				flags & ~STARPU_MALLOC_PINNED
#else
				flags
#endif
				);
			break;
#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
		case STARPU_CUDA_RAM:
		{
#ifdef STARPU_SIMGRID
			STARPU_PTHREAD_MUTEX_LOCK(&cuda_alloc_mutex);
			/* Sleep for the free */
			if (_starpu_simgrid_cuda_malloc_cost())
				MSG_process_sleep(0.000750);
			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
#else
			cudaError_t err;
			struct _starpu_worker *worker = _starpu_get_local_worker_key();
			unsigned devid = _starpu_memory_node_get_devid(dst_node);
			if (!worker || worker->arch != STARPU_CUDA_WORKER || worker->devid != devid)
#if defined(HAVE_CUDA_MEMCPY_PEER)
				starpu_cuda_set_device(devid);
#else
				STARPU_ASSERT_MSG(0, "CUDA peer access is not available with this version of CUDA");
#endif
			err = cudaFree((void*)addr);
			if (STARPU_UNLIKELY(err != cudaSuccess
#ifdef STARPU_OPENMP
				/* When StarPU is used as an OpenMP runtime support,
				 * starpu_omp_shutdown() will usually be called from a
				 * destructor, in which case cudaThreadExit() reports a
				 * cudaErrorCudartUnloading here. There should not
				 * be any remaining tasks running at this point so
				 * we can probably ignore it without much consequence. */
				&& err != cudaErrorCudartUnloading
#endif /* STARPU_OPENMP */
				))
				STARPU_CUDA_REPORT_ERROR(err);
#endif
			break;
		}
#endif
#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
		case STARPU_OPENCL_RAM:
		{
#ifdef STARPU_SIMGRID
			STARPU_PTHREAD_MUTEX_LOCK(&opencl_alloc_mutex);
			/* Sleep for the free */
			if (_starpu_simgrid_cuda_malloc_cost())
				MSG_process_sleep(0.000750);
			STARPU_PTHREAD_MUTEX_UNLOCK(&opencl_alloc_mutex);
#else
			cl_int err;
			err = clReleaseMemObject((void*)addr);
			if (STARPU_UNLIKELY(err != CL_SUCCESS))
				STARPU_OPENCL_REPORT_ERROR(err);
#endif
			break;
		}
#endif
		case STARPU_DISK_RAM:
		{
			_starpu_disk_free(dst_node, (void *) addr, size);
			break;
		}
#ifdef STARPU_USE_MIC
		case STARPU_MIC_RAM:
			_starpu_mic_free_memory((void*) addr, size, dst_node);
			break;
#endif
#ifdef STARPU_USE_MPI_MASTER_SLAVE
		case STARPU_MPI_MS_RAM:
			_starpu_mpi_source_free_memory((void*) addr, dst_node);
			break;
#endif
#ifdef STARPU_USE_SCC
		case STARPU_SCC_RAM:
			_starpu_scc_free_memory((void *) addr, dst_node);
			break;
#endif
		default:
			STARPU_ABORT();
	}

	if (count)
		starpu_memory_deallocate(dst_node, size);
}

int
starpu_memory_pin(void *addr STARPU_ATTRIBUTE_UNUSED, size_t size STARPU_ATTRIBUTE_UNUSED)
{
	if (STARPU_MALLOC_PINNED && disable_pinning <= 0 && STARPU_RUNNING_ON_VALGRIND == 0)
	{
#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER)
		if (cudaHostRegister(addr, size, cudaHostRegisterPortable) != cudaSuccess)
			return -1;
#endif
	}
	return 0;
}

int
starpu_memory_unpin(void *addr STARPU_ATTRIBUTE_UNUSED, size_t size STARPU_ATTRIBUTE_UNUSED)
{
	if (STARPU_MALLOC_PINNED && disable_pinning <= 0 && STARPU_RUNNING_ON_VALGRIND == 0)
	{
#if defined(STARPU_USE_CUDA) && defined(HAVE_CUDA_MEMCPY_PEER)
		if (cudaHostUnregister(addr) != cudaSuccess)
			return -1;
#endif
	}
	return 0;
}
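
/* Usage sketch (illustrative, not part of the implementation): a buffer that
 * was allocated outside of starpu_malloc() can still be pinned so that GPU
 * transfers can be done asynchronously, where N is a hypothetical element
 * count:
 *
 *	float *v = malloc(N * sizeof(*v));
 *	starpu_memory_pin(v, N * sizeof(*v));
 *	... register and use the buffer with StarPU ...
 *	starpu_memory_unpin(v, N * sizeof(*v));
 *	free(v);
 *
 * Both calls are no-ops when CUDA peer support is not compiled in, when
 * pinning is disabled through STARPU_DISABLE_PINNING, or under Valgrind. */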

/*
 * On CUDA, where malloc is very expensive, we handle small sizes by
 * allocating big chunks divided into blocks, and actually hand out segments
 * of consecutive blocks.
 *
 * We try to keep the list of chunks sorted by increasing occupancy, so we can
 * quickly find free segments to allocate.
 */

/* Size of each chunk, 32MiB granularity brings 128 chunks to be allocated in
 * order to fill a 4GiB GPU. */
#define CHUNK_SIZE (32*1024*1024)

/* Maximum segment size we will allocate in chunks */
#define CHUNK_ALLOC_MAX (CHUNK_SIZE / 8)

/* Granularity of allocation, i.e. block size, StarPU will never allocate less
 * than this.
 * 16KiB (i.e. 64x64 float) granularity eats 2MiB RAM for managing a 4GiB GPU.
 */
#define CHUNK_ALLOC_MIN (16*1024)

/* Number of blocks */
#define CHUNK_NBLOCKS (CHUNK_SIZE/CHUNK_ALLOC_MIN)
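
/* Worked example (illustrative only): with the values above, each chunk holds
 * CHUNK_NBLOCKS = 32MiB/16KiB = 2048 blocks. A request for, say, 100KiB is
 * rounded up to 7 blocks (112KiB) and carved out of an existing chunk, while
 * any request larger than CHUNK_ALLOC_MAX (4MiB) bypasses this sub-allocator
 * and goes straight to _starpu_malloc_on_node(). The sub-allocator is only
 * used for STARPU_CUDA_RAM nodes. */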

/* Linked list for available segments */
struct block
{
	int length;	/* Number of consecutive free blocks */
	int next;	/* next free segment */
};

/* One chunk */
LIST_TYPE(_starpu_chunk,
	uintptr_t base;

	/* Available number of blocks, for debugging */
	int available;

	/* Overestimation of the maximum size of available segments in this chunk */
	int available_max;

	/* Bitmap describing availability of the block */
	/* Block 0 is always empty, and is just the head of the free segments list */
	struct block bitmap[CHUNK_NBLOCKS+1];
)

/* One list of chunks per node */
static struct _starpu_chunk_list chunks[STARPU_MAXNODES];
/* Number of completely free chunks */
static int nfreechunks[STARPU_MAXNODES];
/* This protects chunks and nfreechunks */
static starpu_pthread_mutex_t chunk_mutex[STARPU_MAXNODES];

void
_starpu_malloc_init(unsigned dst_node)
{
	_starpu_chunk_list_init(&chunks[dst_node]);
	nfreechunks[dst_node] = 0;
	STARPU_PTHREAD_MUTEX_INIT(&chunk_mutex[dst_node], NULL);
	disable_pinning = starpu_get_env_number("STARPU_DISABLE_PINNING");
	malloc_on_node_default_flags[dst_node] = STARPU_MALLOC_PINNED | STARPU_MALLOC_COUNT;
#ifdef STARPU_SIMGRID
	/* Reasonably "costless" */
	_starpu_malloc_simulation_fold = starpu_get_env_number_default("STARPU_MALLOC_SIMULATION_FOLD", 1) << 20;
#endif
}

void
_starpu_malloc_shutdown(unsigned dst_node)
{
	struct _starpu_chunk *chunk, *next_chunk;

	STARPU_PTHREAD_MUTEX_LOCK(&chunk_mutex[dst_node]);
	for (chunk = _starpu_chunk_list_begin(&chunks[dst_node]);
	     chunk != _starpu_chunk_list_end(&chunks[dst_node]);
	     chunk = next_chunk)
	{
		next_chunk = _starpu_chunk_list_next(chunk);
		_starpu_free_on_node_flags(dst_node, chunk->base, CHUNK_SIZE, malloc_on_node_default_flags[dst_node]);
		_starpu_chunk_list_erase(&chunks[dst_node], chunk);
		free(chunk);
	}
	STARPU_PTHREAD_MUTEX_UNLOCK(&chunk_mutex[dst_node]);
	STARPU_PTHREAD_MUTEX_DESTROY(&chunk_mutex[dst_node]);
}

/* Create a new chunk */
static struct _starpu_chunk *_starpu_new_chunk(unsigned dst_node, int flags)
{
	struct _starpu_chunk *chunk;
	uintptr_t base = _starpu_malloc_on_node(dst_node, CHUNK_SIZE, flags);

	if (!base)
		return NULL;

	/* Create a new chunk */
	chunk = _starpu_chunk_new();
	chunk->base = base;

	/* First block is just a fake block pointing to the free segments list */
	chunk->bitmap[0].length = 0;
	chunk->bitmap[0].next = 1;

	/* At first we have only one big segment for the whole chunk */
	chunk->bitmap[1].length = CHUNK_NBLOCKS;
	chunk->bitmap[1].next = -1;

	chunk->available_max = CHUNK_NBLOCKS;
	chunk->available = CHUNK_NBLOCKS;
	return chunk;
}

uintptr_t
starpu_malloc_on_node_flags(unsigned dst_node, size_t size, int flags)
{
	/* Big allocation, allocate normally */
	if (size > CHUNK_ALLOC_MAX || starpu_node_get_kind(dst_node) != STARPU_CUDA_RAM)
		return _starpu_malloc_on_node(dst_node, size, flags);

	/* Round up allocation to block size */
	int nblocks = (size + CHUNK_ALLOC_MIN - 1) / CHUNK_ALLOC_MIN;

	struct _starpu_chunk *chunk;
	int prevblock, block;
	int available_max;
	struct block *bitmap;

	STARPU_PTHREAD_MUTEX_LOCK(&chunk_mutex[dst_node]);

	/* Try to find a big enough segment among the chunks */
	for (chunk = _starpu_chunk_list_begin(&chunks[dst_node]);
	     chunk != _starpu_chunk_list_end(&chunks[dst_node]);
	     chunk = _starpu_chunk_list_next(chunk))
	{
		if (chunk->available_max < nblocks)
			continue;

		bitmap = chunk->bitmap;
		available_max = 0;
		for (prevblock = block = 0;
		     block != -1;
		     prevblock = block, block = bitmap[prevblock].next)
		{
			STARPU_ASSERT(block >= 0 && block <= CHUNK_NBLOCKS);
			int length = bitmap[block].length;
			if (length >= nblocks)
			{
				if (length >= 2*nblocks)
				{
					/* This one has quite some room,
					 * put it in front, to make finding it
					 * easier next time. */
					_starpu_chunk_list_erase(&chunks[dst_node], chunk);
					_starpu_chunk_list_push_front(&chunks[dst_node], chunk);
				}
				if (chunk->available == CHUNK_NBLOCKS)
					/* This one was empty, it's not empty any more */
					nfreechunks[dst_node]--;
				goto found;
			}
			if (length > available_max)
				available_max = length;
		}

		/* Didn't find a big enough segment in this chunk, its
		 * available_max is out of date */
		chunk->available_max = available_max;
	}

	/* Didn't find a big enough segment, create another chunk. */
	chunk = _starpu_new_chunk(dst_node, flags);
	if (!chunk)
	{
		/* Really no memory any more, fail */
		STARPU_PTHREAD_MUTEX_UNLOCK(&chunk_mutex[dst_node]);
		errno = ENOMEM;
		return 0;
	}

	/* And make it easy to find. */
	_starpu_chunk_list_push_front(&chunks[dst_node], chunk);
	bitmap = chunk->bitmap;
	prevblock = 0;
	block = 1;

found:

	chunk->available -= nblocks;

	STARPU_ASSERT(bitmap[block].length >= nblocks);
	STARPU_ASSERT(block <= CHUNK_NBLOCKS);
	if (bitmap[block].length == nblocks)
	{
		/* Fits exactly, drop this segment from the skip list */
		bitmap[prevblock].next = bitmap[block].next;
	}
	else
	{
		/* Still some room */
		STARPU_ASSERT(block + nblocks <= CHUNK_NBLOCKS);
		bitmap[prevblock].next = block + nblocks;
		bitmap[block + nblocks].length = bitmap[block].length - nblocks;
		bitmap[block + nblocks].next = bitmap[block].next;
	}

	STARPU_PTHREAD_MUTEX_UNLOCK(&chunk_mutex[dst_node]);

	return chunk->base + (block-1) * CHUNK_ALLOC_MIN;
}

void
starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int flags)
{
	/* Big allocation, deallocate normally */
	if (size > CHUNK_ALLOC_MAX || starpu_node_get_kind(dst_node) != STARPU_CUDA_RAM)
	{
		_starpu_free_on_node_flags(dst_node, addr, size, flags);
		return;
	}

	struct _starpu_chunk *chunk;

	/* Round up allocation to block size */
	int nblocks = (size + CHUNK_ALLOC_MIN - 1) / CHUNK_ALLOC_MIN;

	STARPU_PTHREAD_MUTEX_LOCK(&chunk_mutex[dst_node]);
	for (chunk = _starpu_chunk_list_begin(&chunks[dst_node]);
	     chunk != _starpu_chunk_list_end(&chunks[dst_node]);
	     chunk = _starpu_chunk_list_next(chunk))
		if (addr >= chunk->base && addr < chunk->base + CHUNK_SIZE)
			break;
	STARPU_ASSERT(chunk != _starpu_chunk_list_end(&chunks[dst_node]));

	struct block *bitmap = chunk->bitmap;
	int block = ((addr - chunk->base) / CHUNK_ALLOC_MIN) + 1, prevblock, nextblock;

	/* Look for free segment just before this one */
	for (prevblock = 0;
	     prevblock != -1;
	     prevblock = nextblock)
	{
		STARPU_ASSERT(prevblock >= 0 && prevblock <= CHUNK_NBLOCKS);
		nextblock = bitmap[prevblock].next;
		STARPU_ASSERT_MSG(nextblock != block, "It seems data 0x%lx (size %u) on node %u is being freed a second time\n", (unsigned long) addr, (unsigned) size, dst_node);
		if (nextblock > block || nextblock == -1)
			break;
	}
	STARPU_ASSERT(prevblock != -1);

	chunk->available += nblocks;

	/* Insert in free segments list */
	bitmap[block].next = nextblock;
	bitmap[prevblock].next = block;
	bitmap[block].length = nblocks;

	STARPU_ASSERT(nextblock >= -1 && nextblock <= CHUNK_NBLOCKS);
	if (nextblock == block + nblocks)
	{
		/* This freed segment is just before a free segment, merge them */
		bitmap[block].next = bitmap[nextblock].next;
		bitmap[block].length += bitmap[nextblock].length;

		if (bitmap[block].length > chunk->available_max)
			chunk->available_max = bitmap[block].length;
	}

	if (prevblock > 0 && prevblock + bitmap[prevblock].length == block)
	{
		/* This free segment is just after a free segment, merge them */
		bitmap[prevblock].next = bitmap[block].next;
		bitmap[prevblock].length += bitmap[block].length;

		if (bitmap[prevblock].length > chunk->available_max)
			chunk->available_max = bitmap[prevblock].length;

		block = prevblock;
	}

	if (chunk->available == CHUNK_NBLOCKS)
	{
		/* This chunk is now empty, but avoid chunk free/alloc
		 * ping-pong by keeping some of these. */
		if (nfreechunks[dst_node] >= 1)
		{
			/* We already have free chunks, release this one */
			_starpu_free_on_node_flags(dst_node, chunk->base, CHUNK_SIZE, flags);
			_starpu_chunk_list_erase(&chunks[dst_node], chunk);
			free(chunk);
		}
		else
			nfreechunks[dst_node]++;
	}
	else
	{
		/* Freed some room, put this first in chunks list */
		_starpu_chunk_list_erase(&chunks[dst_node], chunk);
		_starpu_chunk_list_push_front(&chunks[dst_node], chunk);
	}
	STARPU_PTHREAD_MUTEX_UNLOCK(&chunk_mutex[dst_node]);
}

void starpu_malloc_on_node_set_default_flags(unsigned node, int flags)
{
	STARPU_ASSERT_MSG(node < STARPU_MAXNODES, "bogus node value %u given to starpu_malloc_on_node_set_default_flags\n", node);
	malloc_on_node_default_flags[node] = flags;
}

uintptr_t
starpu_malloc_on_node(unsigned dst_node, size_t size)
{
	return starpu_malloc_on_node_flags(dst_node, size, malloc_on_node_default_flags[dst_node]);
}

void
starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
{
	starpu_free_on_node_flags(dst_node, addr, size, malloc_on_node_default_flags[dst_node]);
}
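
/* Usage sketch (illustrative only): drivers and advanced applications can
 * allocate directly on a given memory node; the default flags set up by
 * _starpu_malloc_init() make this both pinned and accounted:
 *
 *	unsigned node = STARPU_MAIN_RAM;	// or e.g. a node returned by starpu_worker_get_memory_node()
 *	uintptr_t ptr = starpu_malloc_on_node(node, 1024*1024);
 *	if (!ptr) { ... }	// allocation failed even after reclaiming
 *	...
 *	starpu_free_on_node(node, ptr, 1024*1024);
 *
 * starpu_malloc_on_node_flags()/starpu_free_on_node_flags() take explicit
 * flags instead; the size and flags passed at free time should match those
 * used for the allocation. */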