coherency.c 45 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2008-2021 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  4. * Copyright (C) 2018,2021 Federal University of Rio Grande do Sul (UFRGS)
  5. *
  6. * StarPU is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU Lesser General Public License as published by
  8. * the Free Software Foundation; either version 2.1 of the License, or (at
  9. * your option) any later version.
  10. *
  11. * StarPU is distributed in the hope that it will be useful, but
  12. * WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14. *
  15. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  16. */
  17. #include <limits.h>
  18. #include <math.h>
  19. #include <common/config.h>
  20. #include <datawizard/coherency.h>
  21. #include <datawizard/copy_driver.h>
  22. #include <datawizard/write_back.h>
  23. #include <datawizard/memory_nodes.h>
  24. #include <core/dependencies/data_concurrency.h>
  25. #include <core/disk.h>
  26. #include <profiling/profiling.h>
  27. #include <core/task.h>
  28. #include <starpu_scheduler.h>
  29. #include <core/workers.h>
  30. #ifdef STARPU_SIMGRID
  31. #include <core/simgrid.h>
  32. #endif
  33. static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned src_node, unsigned dst_node, unsigned *handling_node);
  34. int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
  35. {
  36. int src_node = -1;
  37. unsigned i;
  38. unsigned nnodes = starpu_memory_nodes_get_count();
  39. /* first find a valid copy, either a STARPU_OWNER or a STARPU_SHARED */
  40. unsigned node;
  41. size_t size = _starpu_data_get_size(handle);
  42. double cost = INFINITY;
  43. unsigned src_node_mask = 0;
  44. for (node = 0; node < nnodes; node++)
  45. {
  46. if (handle->per_node[node].state != STARPU_INVALID)
  47. {
  48. /* we found a copy ! */
  49. src_node_mask |= (1<<node);
  50. }
  51. }
  52. if (src_node_mask == 0 && handle->init_cl)
  53. {
  54. /* No copy yet, but applicationg told us how to build it. */
  55. return -1;
  56. }
  57. /* we should have found at least one copy ! */
  58. STARPU_ASSERT_MSG(src_node_mask != 0, "The data for the handle %p is requested, but the handle does not have a valid value. Perhaps some initialization task is missing?", handle);
  59. /* Without knowing the size, we won't know the cost */
  60. if (!size)
  61. cost = 0;
  62. /* Check whether we have transfer cost for all nodes, if so, take the minimum */
  63. if (cost)
  64. for (i = 0; i < nnodes; i++)
  65. {
  66. if (src_node_mask & (1<<i))
  67. {
  68. double time = starpu_transfer_predict(i, destination, size);
  69. unsigned handling_node;
  70. /* Avoid indirect transfers */
  71. /* TODO: but with NVLink, that might be better than a "direct" transfer that actually goes through the Host! */
  72. if (!link_supports_direct_transfers(handle, i, destination, &handling_node))
  73. continue;
  74. if (_STARPU_IS_ZERO(time))
  75. {
  76. /* No estimation, will have to revert to dumb strategy */
  77. cost = 0.0;
  78. break;
  79. }
  80. else if (time < cost)
  81. {
  82. cost = time;
  83. src_node = i;
  84. }
  85. }
  86. }
  87. if (cost && src_node != -1)
  88. {
  89. /* Could estimate through cost, return that */
  90. STARPU_ASSERT(handle->per_node[src_node].allocated);
  91. STARPU_ASSERT(handle->per_node[src_node].initialized);
  92. return src_node;
  93. }
  94. int i_ram = -1;
  95. int i_gpu = -1;
  96. int i_disk = -1;
  97. /* Revert to dumb strategy: take RAM unless only a GPU has it */
  98. for (i = 0; i < nnodes; i++)
  99. {
  100. if (src_node_mask & (1<<i))
  101. {
  102. int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node) = handle->ops->copy_methods->can_copy;
  103. /* Avoid transfers which the interface does not want */
  104. if (can_copy)
  105. {
  106. void *src_interface = handle->per_node[i].data_interface;
  107. void *dst_interface = handle->per_node[destination].data_interface;
  108. unsigned handling_node;
  109. if (!link_supports_direct_transfers(handle, i, destination, &handling_node))
  110. {
  111. /* Avoid through RAM if the interface does not want it */
  112. void *ram_interface = handle->per_node[STARPU_MAIN_RAM].data_interface;
  113. if ((!can_copy(src_interface, i, ram_interface, STARPU_MAIN_RAM, i)
  114. && !can_copy(src_interface, i, ram_interface, STARPU_MAIN_RAM, STARPU_MAIN_RAM))
  115. || (!can_copy(ram_interface, STARPU_MAIN_RAM, dst_interface, destination, STARPU_MAIN_RAM)
  116. && !can_copy(ram_interface, STARPU_MAIN_RAM, dst_interface, destination, destination)))
  117. continue;
  118. }
  119. }
  120. /* however GPU are expensive sources, really !
  121. * Unless peer transfer is supported (and it would then have been selected above).
  122. * Other should be ok */
  123. if (starpu_node_get_kind(i) == STARPU_CPU_RAM ||
  124. starpu_node_get_kind(i) == STARPU_MPI_MS_RAM)
  125. i_ram = i;
  126. else if (starpu_node_get_kind(i) == STARPU_DISK_RAM)
  127. i_disk = i;
  128. else
  129. i_gpu = i;
  130. }
  131. }
  132. /* we have to use cpu_ram in first */
  133. if (i_ram != -1)
  134. src_node = i_ram;
  135. else if (i_gpu != -1)
  136. /* otherwise a gpu */
  137. src_node = i_gpu;
  138. else
  139. /* no luck we have to use the disk memory */
  140. src_node = i_disk;
  141. STARPU_ASSERT(src_node != -1);
  142. STARPU_ASSERT(handle->per_node[src_node].allocated);
  143. STARPU_ASSERT(handle->per_node[src_node].initialized);
  144. return src_node;
  145. }
  146. /* this may be called once the data is fetched with header and STARPU_RW-lock hold */
  147. void _starpu_update_data_state(starpu_data_handle_t handle,
  148. struct _starpu_data_replicate *requesting_replicate,
  149. enum starpu_data_access_mode mode)
  150. {
  151. /* There is nothing to do for relaxed coherency modes (scratch or
  152. * reductions) */
  153. if (!(mode & STARPU_RW))
  154. return;
  155. unsigned nnodes = starpu_memory_nodes_get_count();
  156. /* the data is present now */
  157. unsigned requesting_node = requesting_replicate->memory_node;
  158. if (mode & STARPU_W)
  159. {
  160. /* the requesting node now has the only valid copy */
  161. unsigned node;
  162. for (node = 0; node < nnodes; node++)
  163. {
  164. if (handle->per_node[node].state != STARPU_INVALID)
  165. _STARPU_TRACE_DATA_STATE_INVALID(handle, node);
  166. handle->per_node[node].state = STARPU_INVALID;
  167. }
  168. if (requesting_replicate->state != STARPU_OWNER)
  169. _STARPU_TRACE_DATA_STATE_OWNER(handle, requesting_node);
  170. requesting_replicate->state = STARPU_OWNER;
  171. if (handle->home_node != -1 && handle->per_node[handle->home_node].state == STARPU_INVALID)
  172. /* Notify that this MC is now dirty */
  173. _starpu_memchunk_dirty(requesting_replicate->mc, requesting_replicate->memory_node);
  174. }
  175. else
  176. {
  177. /* read only */
  178. if (requesting_replicate->state != STARPU_OWNER)
  179. {
  180. /* there was at least another copy of the data */
  181. unsigned node;
  182. for (node = 0; node < nnodes; node++)
  183. {
  184. struct _starpu_data_replicate *replicate = &handle->per_node[node];
  185. if (replicate->state != STARPU_INVALID)
  186. {
  187. if (replicate->state != STARPU_SHARED)
  188. _STARPU_TRACE_DATA_STATE_SHARED(handle, node);
  189. replicate->state = STARPU_SHARED;
  190. }
  191. }
  192. if (requesting_replicate->state != STARPU_SHARED)
  193. _STARPU_TRACE_DATA_STATE_SHARED(handle, requesting_node);
  194. requesting_replicate->state = STARPU_SHARED;
  195. }
  196. }
  197. }
  198. static int worker_supports_direct_access(unsigned node, unsigned handling_node)
  199. {
  200. if (node == handling_node)
  201. return 1;
  202. if (!_starpu_memory_node_get_nworkers(handling_node))
  203. /* No worker to process the request from that node */
  204. return 0;
  205. struct _starpu_node_ops *node_ops = _starpu_memory_node_get_node_ops(node);
  206. if (node_ops && node_ops->is_direct_access_supported)
  207. return node_ops->is_direct_access_supported(node, handling_node);
  208. else
  209. return 0;
  210. }
  211. static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned src_node, unsigned dst_node, unsigned *handling_node)
  212. {
  213. int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node) = handle->ops->copy_methods->can_copy;
  214. void *src_interface = handle->per_node[src_node].data_interface;
  215. void *dst_interface = handle->per_node[dst_node].data_interface;
  216. /* XXX That's a hack until we fix cudaMemcpy3DPeerAsync in the block interface
  217. * Perhaps not all data interface provide a direct GPU-GPU transfer
  218. * method ! */
  219. #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
  220. if (src_node != dst_node && starpu_node_get_kind(src_node) == STARPU_CUDA_RAM && starpu_node_get_kind(dst_node) == STARPU_CUDA_RAM)
  221. {
  222. const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
  223. if (!copy_methods->cuda_to_cuda_async && !copy_methods->any_to_any)
  224. return 0;
  225. }
  226. #endif
  227. /* Note: with CUDA, performance seems a bit better when issuing the transfer from the destination (tested without GPUDirect, but GPUDirect probably behave the same) */
  228. if (worker_supports_direct_access(src_node, dst_node) && (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, dst_node)))
  229. {
  230. *handling_node = dst_node;
  231. return 1;
  232. }
  233. if (worker_supports_direct_access(dst_node, src_node) && (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, src_node)))
  234. {
  235. *handling_node = src_node;
  236. return 1;
  237. }
  238. return 0;
  239. }
  240. /* Now, we use slowness/bandwidth to compare numa nodes, is it better to use latency ? */
  241. static unsigned chose_best_numa_between_src_and_dest(int src, int dst)
  242. {
  243. double timing_best;
  244. int best_numa = -1;
  245. unsigned numa;
  246. const unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
  247. for(numa = 0; numa < nb_numa_nodes; numa++)
  248. {
  249. double actual = 1.0/starpu_transfer_bandwidth(src, numa) + 1.0/starpu_transfer_bandwidth(numa, dst);
  250. /* Compare slowness : take the lowest */
  251. if (best_numa < 0 || actual < timing_best)
  252. {
  253. best_numa = numa;
  254. timing_best = actual;
  255. }
  256. }
  257. STARPU_ASSERT(best_numa >= 0);
  258. return best_numa;
  259. }
  260. /* Determines the path of a request : each hop is defined by (src,dst) and the
  261. * node that handles the hop. The returned value indicates the number of hops,
  262. * and the max_len is the maximum number of hops (ie. the size of the
  263. * src_nodes, dst_nodes and handling_nodes arrays. */
  264. int _starpu_determine_request_path(starpu_data_handle_t handle,
  265. int src_node, int dst_node,
  266. enum starpu_data_access_mode mode, int max_len,
  267. unsigned *src_nodes, unsigned *dst_nodes,
  268. unsigned *handling_nodes, unsigned write_invalidation)
  269. {
  270. if (src_node == dst_node || !(mode & STARPU_R))
  271. {
  272. if (dst_node == -1 || starpu_node_get_kind(dst_node) == STARPU_DISK_RAM)
  273. handling_nodes[0] = src_node;
  274. else
  275. handling_nodes[0] = dst_node;
  276. if (write_invalidation)
  277. /* The invalidation request will be enough */
  278. return 0;
  279. /* The destination node should only allocate the data, no transfer is required */
  280. STARPU_ASSERT(max_len >= 1);
  281. src_nodes[0] = STARPU_MAIN_RAM; // ignored
  282. dst_nodes[0] = dst_node;
  283. return 1;
  284. }
  285. if (src_node < 0)
  286. {
  287. /* Will just initialize the destination */
  288. STARPU_ASSERT(max_len >= 1);
  289. src_nodes[0] = src_node; // ignored
  290. dst_nodes[0] = dst_node;
  291. return 1;
  292. }
  293. unsigned handling_node;
  294. int link_is_valid = link_supports_direct_transfers(handle, src_node, dst_node, &handling_node);
  295. if (!link_is_valid)
  296. {
  297. int (*can_copy)(void *, unsigned, void *, unsigned, unsigned) = handle->ops->copy_methods->can_copy;
  298. void *src_interface = handle->per_node[src_node].data_interface;
  299. void *dst_interface = handle->per_node[dst_node].data_interface;
  300. /* We need an intermediate hop to implement data staging
  301. * through main memory. */
  302. STARPU_ASSERT(max_len >= 2);
  303. STARPU_ASSERT(src_node >= 0);
  304. unsigned numa = chose_best_numa_between_src_and_dest(src_node, dst_node);
  305. /* GPU -> RAM */
  306. src_nodes[0] = src_node;
  307. dst_nodes[0] = numa;
  308. if (starpu_node_get_kind(src_node) == STARPU_DISK_RAM)
  309. /* Disks don't have their own driver thread */
  310. handling_nodes[0] = dst_node;
  311. else if (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, src_node))
  312. {
  313. handling_nodes[0] = src_node;
  314. }
  315. else
  316. {
  317. STARPU_ASSERT_MSG(can_copy(src_interface, src_node, dst_interface, dst_node, dst_node), "interface %d refuses all kinds of transfers from node %d to node %d\n", handle->ops->interfaceid, src_node, dst_node);
  318. handling_nodes[0] = dst_node;
  319. }
  320. /* RAM -> GPU */
  321. src_nodes[1] = numa;
  322. dst_nodes[1] = dst_node;
  323. if (starpu_node_get_kind(dst_node) == STARPU_DISK_RAM)
  324. /* Disks don't have their own driver thread */
  325. handling_nodes[1] = src_node;
  326. else if (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, dst_node))
  327. {
  328. handling_nodes[1] = dst_node;
  329. }
  330. else
  331. {
  332. STARPU_ASSERT_MSG(can_copy(src_interface, src_node, dst_interface, dst_node, src_node), "interface %d refuses all kinds of transfers from node %d to node %d\n", handle->ops->interfaceid, src_node, dst_node);
  333. handling_nodes[1] = src_node;
  334. }
  335. return 2;
  336. }
  337. else
  338. {
  339. STARPU_ASSERT(max_len >= 1);
  340. src_nodes[0] = src_node;
  341. dst_nodes[0] = dst_node;
  342. handling_nodes[0] = handling_node;
  343. #if !defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
  344. STARPU_ASSERT(!(mode & STARPU_R) || starpu_node_get_kind(src_node) != STARPU_CUDA_RAM || starpu_node_get_kind(dst_node) != STARPU_CUDA_RAM);
  345. #endif
  346. return 1;
  347. }
  348. }
  349. /* handle->lock should be taken. r is returned locked. The node parameter
  350. * indicate either the source of the request, or the destination for a
  351. * write-only request. */
  352. static struct _starpu_data_request *_starpu_search_existing_data_request(struct _starpu_data_replicate *replicate, unsigned node, enum starpu_data_access_mode mode, struct starpu_task *task, enum starpu_is_prefetch is_prefetch)
  353. {
  354. struct _starpu_data_request *r;
  355. for (r = replicate->request[node]; r; r = r->next_same_req)
  356. {
  357. _starpu_spin_checklocked(&r->handle->header_lock);
  358. if (task && r->task && task != r->task)
  359. /* Do not collapse requests for different tasks */
  360. continue;
  361. _starpu_spin_lock(&r->lock);
  362. /* perhaps we need to "upgrade" the request */
  363. if (is_prefetch < r->prefetch)
  364. _starpu_update_prefetch_status(r, is_prefetch);
  365. if (mode & STARPU_R)
  366. {
  367. /* in case the exisiting request did not imply a memory
  368. * transfer yet, we have to take a second refcnt now
  369. * for the source, in addition to the refcnt for the
  370. * destination
  371. * (so that the source remains valid) */
  372. if (!(r->mode & STARPU_R))
  373. {
  374. replicate->refcnt++;
  375. replicate->handle->busy_count++;
  376. }
  377. r->mode = (enum starpu_data_access_mode) ((int) r->mode | (int) STARPU_R);
  378. }
  379. if (mode & STARPU_W)
  380. r->mode = (enum starpu_data_access_mode) ((int) r->mode | (int) STARPU_W);
  381. /* We collapse with this request */
  382. return r;
  383. }
  384. return NULL;
  385. }
  386. /*
  387. * This function is called when the data is needed on the local node, this
  388. * returns a pointer to the local copy
  389. *
  390. * R STARPU_W STARPU_RW
  391. * Owner OK OK OK
  392. * Shared OK 1 1
  393. * Invalid 2 3 4
  394. *
  395. * case 1 : shared + (read)write :
  396. * no data copy but shared->Invalid/Owner
  397. * case 2 : invalid + read :
  398. * data copy + invalid->shared + owner->shared (STARPU_ASSERT(there is a valid))
  399. * case 3 : invalid + write :
  400. * no data copy + invalid->owner + (owner,shared)->invalid
  401. * case 4 : invalid + R/STARPU_W :
  402. * data copy + if (STARPU_W) (invalid->owner + owner->invalid)
  403. * else (invalid,owner->shared)
  404. */
  405. struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
  406. struct _starpu_data_replicate *dst_replicate,
  407. enum starpu_data_access_mode mode,
  408. struct starpu_task *task,
  409. enum starpu_is_prefetch is_prefetch,
  410. unsigned async,
  411. void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
  412. {
  413. /* We don't care about commuting for data requests, that was handled before. */
  414. mode &= ~STARPU_COMMUTE;
  415. /* This function is called with handle's header lock taken */
  416. _starpu_spin_checklocked(&handle->header_lock);
  417. int requesting_node = dst_replicate ? dst_replicate->memory_node : -1;
  418. unsigned nwait = 0;
  419. if (mode & STARPU_W)
  420. {
  421. /* We will write to the buffer. We will have to wait for all
  422. * existing requests before the last request which will
  423. * invalidate all their results (which were possibly spurious,
  424. * e.g. too aggressive eviction).
  425. */
  426. unsigned i, j;
  427. unsigned nnodes = starpu_memory_nodes_get_count();
  428. for (i = 0; i < nnodes; i++)
  429. for (j = 0; j < nnodes; j++)
  430. {
  431. struct _starpu_data_request *r;
  432. for (r = handle->per_node[i].request[j]; r; r = r->next_same_req)
  433. nwait++;
  434. }
  435. /* If the request is not detached (i.e. the caller really wants
  436. * proper ownership), no new requests will appear because a
  437. * reference will be kept on the dst replicate, which will
  438. * notably prevent data reclaiming.
  439. */
  440. }
  441. if ((!dst_replicate || dst_replicate->state != STARPU_INVALID) && (!nwait || is_prefetch))
  442. {
  443. if (dst_replicate)
  444. {
  445. #ifdef STARPU_MEMORY_STATS
  446. enum _starpu_cache_state old_state = dst_replicate->state;
  447. #endif
  448. /* the data is already available and we don't have to wait for
  449. * any request, so we can stop */
  450. _starpu_update_data_state(handle, dst_replicate, mode);
  451. _starpu_msi_cache_hit(requesting_node);
  452. #ifdef STARPU_MEMORY_STATS
  453. _starpu_memory_handle_stats_cache_hit(handle, requesting_node);
  454. /* XXX Broken ? */
  455. if (old_state == STARPU_SHARED
  456. && dst_replicate->state == STARPU_OWNER)
  457. _starpu_memory_handle_stats_shared_to_owner(handle, requesting_node);
  458. #endif
  459. if (dst_replicate->mc)
  460. {
  461. if (is_prefetch == STARPU_TASK_PREFETCH)
  462. /* Make sure it stays there */
  463. dst_replicate->nb_tasks_prefetch++;
  464. _starpu_memchunk_recently_used(dst_replicate->mc, requesting_node);
  465. }
  466. if (task)
  467. {
  468. unsigned j;
  469. unsigned nnodes = starpu_memory_nodes_get_count();
  470. /* Cancel any existing (prefetch) request */
  471. struct _starpu_data_request *r2;
  472. for (j = 0; j < nnodes; j++)
  473. {
  474. for (r2 = dst_replicate->request[j]; r2; r2 = r2->next_same_req)
  475. {
  476. if (r2->task && r2->task == task)
  477. {
  478. r2->canceled = 1;
  479. break;
  480. }
  481. }
  482. }
  483. }
  484. }
  485. _starpu_spin_unlock(&handle->header_lock);
  486. if (callback_func)
  487. callback_func(callback_arg);
  488. _STARPU_LOG_OUT_TAG("data available");
  489. return NULL;
  490. }
  491. if (dst_replicate)
  492. _starpu_msi_cache_miss(requesting_node);
  493. /* the only remaining situation is that the local copy was invalid */
  494. STARPU_ASSERT((dst_replicate && dst_replicate->state == STARPU_INVALID) || nwait);
  495. /* find someone who already has the data */
  496. int src_node = -1;
  497. if (dst_replicate && mode & STARPU_R)
  498. {
  499. if (dst_replicate->state == STARPU_INVALID)
  500. src_node = _starpu_select_src_node(handle, requesting_node);
  501. else
  502. src_node = requesting_node;
  503. if (src_node < 0)
  504. {
  505. /* We will create it, no need to read an existing value */
  506. mode &= ~STARPU_R;
  507. }
  508. }
  509. else if (dst_replicate)
  510. {
  511. /* if the data is in write only mode (and not SCRATCH or REDUX), there is no need for a source, data will be initialized by the task itself */
  512. if (mode & STARPU_W)
  513. dst_replicate->initialized = 1;
  514. if (starpu_node_get_kind(requesting_node) == STARPU_CPU_RAM && !nwait
  515. && !_starpu_malloc_willpin_on_node(requesting_node))
  516. {
  517. /* And this is the main RAM without pinning, really no need for a
  518. * request, just quickly allocate and be done */
  519. if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch, 0) == 0)
  520. {
  521. _starpu_update_data_state(handle, dst_replicate, mode);
  522. if (dst_replicate->mc)
  523. {
  524. if (is_prefetch == STARPU_TASK_PREFETCH)
  525. /* Make sure it stays there */
  526. dst_replicate->nb_tasks_prefetch++;
  527. _starpu_memchunk_recently_used(dst_replicate->mc, requesting_node);
  528. }
  529. _starpu_spin_unlock(&handle->header_lock);
  530. if (callback_func)
  531. callback_func(callback_arg);
  532. _STARPU_LOG_OUT_TAG("data immediately allocated");
  533. return NULL;
  534. }
  535. }
  536. }
  537. #define MAX_REQUESTS 4
  538. /* We can safely assume that there won't be more than 2 hops in the
  539. * current implementation */
  540. unsigned src_nodes[MAX_REQUESTS], dst_nodes[MAX_REQUESTS], handling_nodes[MAX_REQUESTS];
  541. /* keep one slot for the last W request, if any */
  542. int write_invalidation = (mode & STARPU_W) && nwait && !is_prefetch;
  543. int nhops = _starpu_determine_request_path(handle, src_node, requesting_node, mode, MAX_REQUESTS,
  544. src_nodes, dst_nodes, handling_nodes, write_invalidation);
  545. STARPU_ASSERT(nhops >= 0 && nhops <= MAX_REQUESTS-1);
  546. struct _starpu_data_request *requests[nhops + write_invalidation];
  547. /* Did we reuse a request for that hop ? */
  548. int reused_requests[nhops + write_invalidation];
  549. /* Construct an array with a list of requests, possibly reusing existing requests */
  550. int hop;
  551. for (hop = 0; hop < nhops; hop++)
  552. {
  553. struct _starpu_data_request *r;
  554. unsigned hop_src_node = src_nodes[hop];
  555. unsigned hop_dst_node = dst_nodes[hop];
  556. unsigned hop_handling_node = handling_nodes[hop];
  557. struct _starpu_data_replicate *hop_src_replicate;
  558. struct _starpu_data_replicate *hop_dst_replicate;
  559. /* Only the first request is independant */
  560. unsigned ndeps = (hop == 0)?0:1;
  561. hop_src_replicate = &handle->per_node[hop_src_node];
  562. hop_dst_replicate = (hop != nhops - 1)?&handle->per_node[hop_dst_node]:dst_replicate;
  563. /* Try to reuse a request if possible */
  564. #ifdef STARPU_DEVEL
  565. #warning We do not actually want to reuse an existing request when our request is for a task with low priority, that will get executed much later. We don t want to wire down the data in between, at worse that could hog the complete gpu memory...
  566. #endif
  567. r = _starpu_search_existing_data_request(hop_dst_replicate,
  568. (mode & STARPU_R)?hop_src_node:hop_dst_node,
  569. mode, task, is_prefetch);
  570. reused_requests[hop] = !!r;
  571. if (!r)
  572. {
  573. /* Create a new request if there was no request to reuse */
  574. r = _starpu_create_data_request(handle, hop_src_replicate,
  575. hop_dst_replicate, hop_handling_node,
  576. mode, ndeps, task, is_prefetch, prio, 0, origin);
  577. nwait++;
  578. }
  579. requests[hop] = r;
  580. }
  581. /* Chain these requests */
  582. for (hop = 0; hop < nhops; hop++)
  583. {
  584. struct _starpu_data_request *r;
  585. r = requests[hop];
  586. if (hop != nhops - 1)
  587. {
  588. if (!reused_requests[hop + 1])
  589. {
  590. r->next_req[r->next_req_count++] = requests[hop + 1];
  591. STARPU_ASSERT(r->next_req_count <= STARPU_MAXNODES);
  592. }
  593. }
  594. else
  595. {
  596. if (is_prefetch == STARPU_TASK_PREFETCH)
  597. /* Make last request add the prefetch count on the mc to keep the data
  598. * there until the task gets to execute. */
  599. r->nb_tasks_prefetch++;
  600. if (!write_invalidation)
  601. /* The last request will perform the callback after termination */
  602. _starpu_data_request_append_callback(r, callback_func, callback_arg);
  603. }
  604. if (reused_requests[hop])
  605. _starpu_spin_unlock(&r->lock);
  606. }
  607. if (write_invalidation)
  608. {
  609. /* Some requests were still pending, we have to add yet another
  610. * request, depending on them, which will invalidate their
  611. * result.
  612. */
  613. struct _starpu_data_request *r = _starpu_create_data_request(handle, dst_replicate,
  614. dst_replicate, requesting_node,
  615. STARPU_W, nwait, task, is_prefetch, prio, 1, origin);
  616. /* and perform the callback after termination */
  617. _starpu_data_request_append_callback(r, callback_func, callback_arg);
  618. /* We will write to the buffer. We will have to wait for all
  619. * existing requests before the last request which will
  620. * invalidate all their results (which were possibly spurious,
  621. * e.g. too aggressive eviction).
  622. */
  623. unsigned i, j;
  624. unsigned nnodes = starpu_memory_nodes_get_count();
  625. for (i = 0; i < nnodes; i++)
  626. for (j = 0; j < nnodes; j++)
  627. {
  628. struct _starpu_data_request *r2;
  629. for (r2 = handle->per_node[i].request[j]; r2; r2 = r2->next_same_req)
  630. {
  631. _starpu_spin_lock(&r2->lock);
  632. if (is_prefetch < r2->prefetch)
  633. /* Hasten the request we will have to wait for */
  634. _starpu_update_prefetch_status(r2, is_prefetch);
  635. r2->next_req[r2->next_req_count++] = r;
  636. STARPU_ASSERT(r2->next_req_count <= STARPU_MAXNODES + 1);
  637. _starpu_spin_unlock(&r2->lock);
  638. nwait--;
  639. }
  640. }
  641. STARPU_ASSERT(nwait == 0);
  642. nhops++;
  643. requests[nhops - 1] = r;
  644. /* existing requests will post this one */
  645. reused_requests[nhops - 1] = 1;
  646. }
  647. STARPU_ASSERT(nhops);
  648. if (!async)
  649. requests[nhops - 1]->refcnt++;
  650. /* we only submit the first request, the remaining will be
  651. * automatically submitted afterward */
  652. if (!reused_requests[0])
  653. _starpu_post_data_request(requests[0]);
  654. return requests[nhops - 1];
  655. }
  656. int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *dst_replicate,
  657. enum starpu_data_access_mode mode, unsigned detached,
  658. struct starpu_task *task, enum starpu_is_prefetch is_prefetch, unsigned async,
  659. void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
  660. {
  661. _STARPU_LOG_IN();
  662. _starpu_spin_lock(&handle->header_lock);
  663. if (mode & STARPU_R && is_prefetch > STARPU_FETCH)
  664. {
  665. unsigned src_node_mask = 0;
  666. unsigned nnodes = starpu_memory_nodes_get_count();
  667. unsigned n;
  668. for (n = 0; n < nnodes; n++)
  669. {
  670. if (handle->per_node[n].state != STARPU_INVALID)
  671. {
  672. /* we found a copy ! */
  673. src_node_mask |= (1<<n);
  674. }
  675. }
  676. if (src_node_mask == 0)
  677. {
  678. /* no valid copy, nothing to prefetch */
  679. STARPU_ASSERT_MSG(handle->init_cl, "Could not find a valid copy of the data, and no handle initialization function");
  680. _starpu_spin_unlock(&handle->header_lock);
  681. return 0;
  682. }
  683. }
  684. if (!detached)
  685. {
  686. /* Take references which will be released by _starpu_release_data_on_node */
  687. if (dst_replicate)
  688. dst_replicate->refcnt++;
  689. else if (node == STARPU_ACQUIRE_NO_NODE_LOCK_ALL)
  690. {
  691. int i;
  692. for (i = 0; i < STARPU_MAXNODES; i++)
  693. handle->per_node[i].refcnt++;
  694. }
  695. handle->busy_count++;
  696. }
  697. struct _starpu_data_request *r;
  698. r = _starpu_create_request_to_fetch_data(handle, dst_replicate, mode,
  699. task, is_prefetch, async, callback_func, callback_arg, prio, origin);
  700. /* If no request was created, the handle was already up-to-date on the
  701. * node. In this case, _starpu_create_request_to_fetch_data has already
  702. * unlocked the header. */
  703. if (!r)
  704. return 0;
  705. _starpu_spin_unlock(&handle->header_lock);
  706. int ret = async?0:_starpu_wait_data_request_completion(r, 1);
  707. _STARPU_LOG_OUT();
  708. return ret;
  709. }
  710. static int idle_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, struct starpu_task *task, int prio)
  711. {
  712. return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, task, STARPU_IDLEFETCH, 1, NULL, NULL, prio, "idle_prefetch_data_on_node");
  713. }
  714. static int task_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, struct starpu_task *task, int prio)
  715. {
  716. return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, task, STARPU_TASK_PREFETCH, 1, NULL, NULL, prio, "task_prefetch_data_on_node");
  717. }
  718. static int STARPU_ATTRIBUTE_UNUSED prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, struct starpu_task *task, int prio)
  719. {
  720. return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, task, STARPU_PREFETCH, 1, NULL, NULL, prio, "prefetch_data_on_node");
  721. }
  722. static int fetch_data(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, struct starpu_task *task, int prio)
  723. {
  724. return _starpu_fetch_data_on_node(handle, node, replicate, mode, 0, task, STARPU_FETCH, 0, NULL, NULL, prio, "fetch_data");
  725. }
  726. uint32_t _starpu_get_data_refcnt(starpu_data_handle_t handle, unsigned node)
  727. {
  728. return handle->per_node[node].refcnt;
  729. }
  730. size_t _starpu_data_get_size(starpu_data_handle_t handle)
  731. {
  732. return handle->ops->get_size(handle);
  733. }
  734. size_t _starpu_data_get_alloc_size(starpu_data_handle_t handle)
  735. {
  736. if (handle->ops->get_alloc_size)
  737. return handle->ops->get_alloc_size(handle);
  738. else
  739. return handle->ops->get_size(handle);
  740. }
  741. starpu_ssize_t _starpu_data_get_max_size(starpu_data_handle_t handle)
  742. {
  743. if (handle->ops->get_max_size)
  744. return handle->ops->get_max_size(handle);
  745. else
  746. return -1;
  747. }
  748. uint32_t _starpu_data_get_footprint(starpu_data_handle_t handle)
  749. {
  750. return handle->footprint;
  751. }
  752. /* in case the data was accessed on a write mode, do not forget to
  753. * make it accessible again once it is possible ! */
  754. void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_wt_mask, enum starpu_data_access_mode down_to_mode, struct _starpu_data_replicate *replicate)
  755. {
  756. uint32_t wt_mask;
  757. size_t max_wt_mask = sizeof(wt_mask) * 8;
  758. unsigned wt_count = starpu_memory_nodes_get_count();
  759. if (max_wt_mask > STARPU_MAXNODES)
  760. max_wt_mask = STARPU_MAXNODES;
  761. if (wt_count > max_wt_mask)
  762. wt_count = max_wt_mask;
  763. wt_mask = default_wt_mask | handle->wt_mask;
  764. wt_mask &= (1ULL<<max_wt_mask)-1;
  765. /* Note that it is possible that there is no valid copy of the data (if
  766. * starpu_data_invalidate was called for instance). In that case, we do
  767. * not enforce any write-through mechanism. */
  768. unsigned memory_node = replicate->memory_node;
  769. if (replicate->state != STARPU_INVALID && handle->current_mode & STARPU_W)
  770. if (wt_mask && (memory_node >= max_wt_mask || wt_mask & ~(1<<memory_node)))
  771. _starpu_write_through_data(handle, memory_node, wt_mask);
  772. int cpt = 0;
  773. while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
  774. {
  775. cpt++;
  776. _starpu_datawizard_progress(_STARPU_DATAWIZARD_DO_ALLOC);
  777. }
  778. if (cpt == STARPU_SPIN_MAXTRY)
  779. _starpu_spin_lock(&handle->header_lock);
  780. if (down_to_mode == STARPU_NONE)
  781. {
  782. /* Release refcnt taken by fetch_data_on_node */
  783. replicate->refcnt--;
  784. STARPU_ASSERT_MSG(replicate->refcnt >= 0, "handle %p released too many times", handle);
  785. STARPU_ASSERT_MSG(handle->busy_count > 0, "handle %p released too many times", handle);
  786. handle->busy_count--;
  787. }
  788. if (!_starpu_notify_data_dependencies(handle, down_to_mode))
  789. _starpu_spin_unlock(&handle->header_lock);
  790. }
  791. int _starpu_prefetch_task_input_prio(struct starpu_task *task, int target_node, int worker, int prio, enum starpu_is_prefetch prefetch)
  792. {
  793. #ifdef STARPU_OPENMP
  794. struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
  795. /* do not attempt to prefetch task input if this is an OpenMP task resuming after blocking */
  796. if (j->discontinuous != 0)
  797. return 0;
  798. #endif
  799. STARPU_ASSERT_MSG(prefetch != STARPU_PREFETCH || !task->prefetched, "Prefetching was already requested for this task! Did you set 'prefetches' to 1 in the starpu_sched_policy structure?");
  800. unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
  801. unsigned index;
  802. for (index = 0; index < nbuffers; index++)
  803. {
  804. starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, index);
  805. enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(task, index);
  806. if (mode & (STARPU_SCRATCH|STARPU_REDUX))
  807. continue;
  808. int node;
  809. if (target_node >= 0)
  810. node = _starpu_task_data_get_node_on_node(task, index, target_node);
  811. else
  812. node = _starpu_task_data_get_node_on_worker(task, index, worker);
  813. struct _starpu_data_replicate *replicate = &handle->per_node[node];
  814. if (prefetch == STARPU_PREFETCH)
  815. task_prefetch_data_on_node(handle, node, replicate, mode, task, prio);
  816. else
  817. idle_prefetch_data_on_node(handle, node, replicate, mode, task, prio);
  818. }
  819. if (prefetch == STARPU_PREFETCH)
  820. task->prefetched = 1;
  821. return 0;
  822. }
  823. int starpu_prefetch_task_input_prio(struct starpu_task *task, int target_node, int worker, int prio)
  824. {
  825. return _starpu_prefetch_task_input_prio(task, target_node, worker, prio, STARPU_PREFETCH);
  826. }
  827. int starpu_prefetch_task_input_on_node_prio(struct starpu_task *task, unsigned target_node, int prio)
  828. {
  829. return starpu_prefetch_task_input_prio(task, target_node, -1, prio);
  830. }
  831. int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node)
  832. {
  833. int prio = task->priority;
  834. if (task->workerorder)
  835. prio = INT_MAX - task->workerorder;
  836. return starpu_prefetch_task_input_on_node_prio(task, node, prio);
  837. }
  838. int starpu_idle_prefetch_task_input_prio(struct starpu_task *task, int target_node, int worker, int prio)
  839. {
  840. return _starpu_prefetch_task_input_prio(task, target_node, worker, prio, STARPU_IDLEFETCH);
  841. }
  842. int starpu_idle_prefetch_task_input_on_node_prio(struct starpu_task *task, unsigned target_node, int prio)
  843. {
  844. return starpu_idle_prefetch_task_input_prio(task, target_node, -1, prio);
  845. }
  846. int starpu_idle_prefetch_task_input_on_node(struct starpu_task *task, unsigned node)
  847. {
  848. int prio = task->priority;
  849. if (task->workerorder)
  850. prio = INT_MAX - task->workerorder;
  851. return starpu_idle_prefetch_task_input_on_node_prio(task, node, prio);
  852. }
  853. int starpu_prefetch_task_input_for_prio(struct starpu_task *task, unsigned worker, int prio)
  854. {
  855. return starpu_prefetch_task_input_prio(task, -1, worker, prio);
  856. }
  857. int starpu_prefetch_task_input_for(struct starpu_task *task, unsigned worker)
  858. {
  859. int prio = task->priority;
  860. if (task->workerorder)
  861. prio = INT_MAX - task->workerorder;
  862. return starpu_prefetch_task_input_for_prio(task, worker, prio);
  863. }
  864. int starpu_idle_prefetch_task_input_for_prio(struct starpu_task *task, unsigned worker, int prio)
  865. {
  866. return starpu_idle_prefetch_task_input_prio(task, -1, worker, prio);
  867. }
  868. int starpu_idle_prefetch_task_input_for(struct starpu_task *task, unsigned worker)
  869. {
  870. int prio = task->priority;
  871. if (task->workerorder)
  872. prio = INT_MAX - task->workerorder;
  873. return starpu_idle_prefetch_task_input_for_prio(task, worker, prio);
  874. }
  875. static struct _starpu_data_replicate *get_replicate(starpu_data_handle_t handle, enum starpu_data_access_mode mode, int workerid, unsigned node)
  876. {
  877. if (mode & (STARPU_SCRATCH|STARPU_REDUX))
  878. {
  879. STARPU_ASSERT(workerid >= 0);
  880. if (!handle->per_worker)
  881. {
  882. _starpu_spin_lock(&handle->header_lock);
  883. if (!handle->per_worker)
  884. _starpu_data_initialize_per_worker(handle);
  885. _starpu_spin_unlock(&handle->header_lock);
  886. }
  887. return &handle->per_worker[workerid];
  888. }
  889. else
  890. /* That's a "normal" buffer (R/W) */
  891. return &handle->per_node[node];
  892. }
  893. /* Callback used when a buffer is send asynchronously to the sink */
  894. static void _starpu_fetch_task_input_cb(void *arg)
  895. {
  896. struct _starpu_worker * worker = (struct _starpu_worker *) arg;
  897. /* increase the number of buffer received */
  898. STARPU_WMB();
  899. (void)STARPU_ATOMIC_ADD(&worker->nb_buffers_transferred, 1);
  900. #ifdef STARPU_SIMGRID
  901. starpu_pthread_queue_broadcast(&_starpu_simgrid_transfer_queue[worker->memory_node]);
  902. #endif
  903. }
  904. /* Synchronously or asynchronously fetch data for a given task (if it's not there already)
  905. * Returns the number of data acquired here. */
  906. /* _starpu_fetch_task_input must be called before
  907. * executing the task. __starpu_push_task_output but be called after the
  908. * execution of the task. */
  909. /* The driver can either just call _starpu_fetch_task_input with async==0,
  910. * or to improve overlapping, it can call _starpu_fetch_task_input with
  911. * async==1, then wait for transfers to complete, then call
  912. * _starpu_fetch_task_input_tail to complete the fetch. */
  913. int _starpu_fetch_task_input(struct starpu_task *task, struct _starpu_job *j, int async)
  914. {
  915. struct _starpu_worker *worker = _starpu_get_local_worker_key();
  916. int workerid = worker->workerid;
  917. if (async)
  918. {
  919. worker->task_transferring = task;
  920. worker->nb_buffers_transferred = 0;
  921. if (worker->ntasks <= 1)
  922. _STARPU_TRACE_WORKER_START_FETCH_INPUT(NULL, workerid);
  923. }
  924. else
  925. _STARPU_TRACE_START_FETCH_INPUT(NULL);
  926. int profiling = starpu_profiling_status_get();
  927. if (profiling && task->profiling_info)
  928. _starpu_clock_gettime(&task->profiling_info->acquire_data_start_time);
  929. struct _starpu_data_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
  930. unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
  931. unsigned nacquires;
  932. unsigned index;
  933. nacquires = 0;
  934. for (index = 0; index < nbuffers; index++)
  935. {
  936. int ret;
  937. starpu_data_handle_t handle = descrs[index].handle;
  938. enum starpu_data_access_mode mode = descrs[index].mode;
  939. int node = _starpu_task_data_get_node_on_worker(task, descrs[index].index, workerid);
  940. /* We set this here for coherency with __starpu_push_task_output */
  941. descrs[index].node = node;
  942. if (mode == STARPU_NONE ||
  943. (mode & ((1<<STARPU_MODE_SHIFT) - 1)) >= STARPU_ACCESS_MODE_MAX ||
  944. (mode >> STARPU_MODE_SHIFT) >= (STARPU_SHIFTED_MODE_MAX >> STARPU_MODE_SHIFT))
  945. STARPU_ASSERT_MSG(0, "mode %d (0x%x) is bogus\n", mode, mode);
  946. struct _starpu_data_replicate *local_replicate;
  947. if (index && descrs[index-1].handle == descrs[index].handle)
  948. /* We have already took this data, skip it. This
  949. * depends on ordering putting writes before reads, see
  950. * _starpu_compar_handles */
  951. continue;
  952. local_replicate = get_replicate(handle, mode, workerid, node);
  953. if (async)
  954. {
  955. ret = _starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, task, STARPU_FETCH, 1,
  956. _starpu_fetch_task_input_cb, worker, task->priority, "_starpu_fetch_task_input");
  957. #ifdef STARPU_SIMGRID
  958. if (_starpu_simgrid_fetching_input_cost())
  959. starpu_sleep(0.000001);
  960. #endif
  961. if (STARPU_UNLIKELY(ret))
  962. {
  963. /* Ooops, not enough memory, make worker wait for these for now, and the synchronous call will finish by forcing eviction*/
  964. worker->nb_buffers_totransfer = nacquires;
  965. _starpu_set_worker_status(worker, STATUS_WAITING);
  966. return 0;
  967. }
  968. }
  969. else
  970. {
  971. ret = fetch_data(handle, node, local_replicate, mode, task, task->priority);
  972. #ifdef STARPU_SIMGRID
  973. if (_starpu_simgrid_fetching_input_cost())
  974. starpu_sleep(0.000001);
  975. #endif
  976. if (STARPU_UNLIKELY(ret))
  977. goto enomem;
  978. }
  979. nacquires++;
  980. }
  981. if (async)
  982. {
  983. worker->nb_buffers_totransfer = nacquires;
  984. _starpu_set_worker_status(worker, STATUS_WAITING);
  985. return 0;
  986. }
  987. _starpu_fetch_task_input_tail(task, j, worker);
  988. return 0;
  989. enomem:
  990. _STARPU_TRACE_END_FETCH_INPUT(NULL);
  991. _STARPU_DISP("something went wrong with buffer %u\n", index);
  992. /* try to unreference all the input that were successfully taken */
  993. unsigned index2;
  994. for (index2 = 0; index2 < index; index2++)
  995. {
  996. starpu_data_handle_t handle = descrs[index2].handle;
  997. enum starpu_data_access_mode mode = descrs[index2].mode;
  998. int node = descrs[index].node;
  999. struct _starpu_data_replicate *local_replicate;
  1000. if (index2 && descrs[index2-1].handle == descrs[index2].handle)
  1001. /* We have already released this data, skip it. This
  1002. * depends on ordering putting writes before reads, see
  1003. * _starpu_compar_handles */
  1004. continue;
  1005. local_replicate = get_replicate(handle, mode, workerid, node);
  1006. _starpu_release_data_on_node(handle, 0, STARPU_NONE, local_replicate);
  1007. }
  1008. return -1;
  1009. }
  1010. /* Now that we have taken the data locks in locking order, fill the codelet interfaces in function order. */
  1011. void _starpu_fetch_task_input_tail(struct starpu_task *task, struct _starpu_job *j, struct _starpu_worker *worker)
  1012. {
  1013. int workerid = worker->workerid;
  1014. int profiling = starpu_profiling_status_get();
  1015. unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
  1016. struct _starpu_data_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
  1017. unsigned index;
  1018. unsigned long total_size = 0;
  1019. for (index = 0; index < nbuffers; index++)
  1020. {
  1021. starpu_data_handle_t handle = descrs[index].handle;
  1022. enum starpu_data_access_mode mode = descrs[index].mode;
  1023. int node = descrs[index].node;
  1024. struct _starpu_data_replicate *local_replicate;
  1025. int needs_init;
  1026. local_replicate = get_replicate(handle, mode, workerid, node);
  1027. _starpu_spin_lock(&handle->header_lock);
  1028. if (local_replicate->mc)
  1029. {
  1030. if (task->prefetched && local_replicate->initialized &&
  1031. /* See prefetch conditions in
  1032. * starpu_prefetch_task_input_on_node_prio and alike */
  1033. !(mode & (STARPU_SCRATCH|STARPU_REDUX)) &&
  1034. (mode & STARPU_R))
  1035. {
  1036. /* Allocations or transfer prefetchs should have been done by now and marked
  1037. * this mc as needed for us.
  1038. * Now that we added a reference for the task, we can relieve that. */
  1039. /* Note: the replicate might have been evicted in between, thus not 100% sure
  1040. * that our prefetch request is still recorded here. */
  1041. if (local_replicate->nb_tasks_prefetch > 0)
  1042. local_replicate->nb_tasks_prefetch--;
  1043. }
  1044. }
  1045. needs_init = !local_replicate->initialized;
  1046. _starpu_spin_unlock(&handle->header_lock);
  1047. _STARPU_TASK_SET_INTERFACE(task , local_replicate->data_interface, descrs[index].index);
  1048. /* If the replicate was not initialized yet, we have to do it now */
  1049. if (!(mode & STARPU_SCRATCH) && needs_init)
  1050. _starpu_redux_init_data_replicate(handle, local_replicate, workerid);
  1051. #ifdef STARPU_USE_FXT
  1052. if (fut_active)
  1053. total_size += _starpu_data_get_size(handle);
  1054. #endif
  1055. }
  1056. _STARPU_TRACE_DATA_LOAD(workerid,total_size);
  1057. if (profiling && task->profiling_info)
  1058. _starpu_clock_gettime(&task->profiling_info->acquire_data_end_time);
  1059. _STARPU_TRACE_END_FETCH_INPUT(NULL);
  1060. }
  1061. /* Release task data dependencies */
  1062. void __starpu_push_task_output(struct _starpu_job *j)
  1063. {
  1064. #ifdef STARPU_OPENMP
  1065. STARPU_ASSERT(!j->continuation);
  1066. #endif
  1067. int profiling = starpu_profiling_status_get();
  1068. struct starpu_task *task = j->task;
  1069. if (profiling && task->profiling_info)
  1070. _starpu_clock_gettime(&task->profiling_info->release_data_start_time);
  1071. struct _starpu_data_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
  1072. unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
  1073. int workerid = starpu_worker_get_id();
  1074. unsigned index;
  1075. for (index = 0; index < nbuffers; index++)
  1076. {
  1077. starpu_data_handle_t handle = descrs[index].handle;
  1078. enum starpu_data_access_mode mode = descrs[index].mode;
  1079. int node = descrs[index].node;
  1080. struct _starpu_data_replicate *local_replicate = NULL;
  1081. if (index && descrs[index-1].handle == descrs[index].handle)
  1082. /* We have already released this data, skip it. This
  1083. * depends on ordering putting writes before reads, see
  1084. * _starpu_compar_handles */
  1085. continue;
  1086. if (node != -1)
  1087. local_replicate = get_replicate(handle, mode, workerid, node);
  1088. /* Keep a reference for future
  1089. * _starpu_release_task_enforce_sequential_consistency call */
  1090. _starpu_spin_lock(&handle->header_lock);
  1091. handle->busy_count++;
  1092. if (node == -1)
  1093. {
  1094. /* NOWHERE case, just notify dependencies */
  1095. if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
  1096. _starpu_spin_unlock(&handle->header_lock);
  1097. }
  1098. else
  1099. {
  1100. _starpu_spin_unlock(&handle->header_lock);
  1101. _starpu_release_data_on_node(handle, 0, STARPU_NONE, local_replicate);
  1102. }
  1103. }
  1104. if (profiling && task->profiling_info)
  1105. _starpu_clock_gettime(&task->profiling_info->release_data_end_time);
  1106. }
  1107. /* Version for a driver running on a worker: we show the driver state in the trace */
  1108. void _starpu_push_task_output(struct _starpu_job *j)
  1109. {
  1110. _STARPU_TRACE_START_PUSH_OUTPUT(NULL);
  1111. __starpu_push_task_output(j);
  1112. _STARPU_TRACE_END_PUSH_OUTPUT(NULL);
  1113. }
  1114. struct fetch_nowhere_wrapper
  1115. {
  1116. struct _starpu_job *j;
  1117. unsigned pending;
  1118. };
  1119. static void _starpu_fetch_nowhere_task_input_cb(void *arg);
  1120. /* Asynchronously fetch data for a task which will have no content */
  1121. void _starpu_fetch_nowhere_task_input(struct _starpu_job *j)
  1122. {
  1123. int profiling = starpu_profiling_status_get();
  1124. struct starpu_task *task = j->task;
  1125. if (profiling && task->profiling_info)
  1126. _starpu_clock_gettime(&task->profiling_info->acquire_data_start_time);
  1127. struct _starpu_data_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
  1128. unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
  1129. unsigned nfetchbuffers = 0;
  1130. struct fetch_nowhere_wrapper *wrapper;
  1131. unsigned index;
  1132. for (index = 0; index < nbuffers; index++)
  1133. {
  1134. /* Note here we just follow what was requested, and not use _starpu_task_data_get_node* */
  1135. int node = -1;
  1136. if (task->cl->specific_nodes)
  1137. node = STARPU_CODELET_GET_NODE(task->cl, descrs[index].index);
  1138. descrs[index].node = node;
  1139. if (node != -1)
  1140. nfetchbuffers++;
  1141. }
  1142. if (!nfetchbuffers)
  1143. {
  1144. /* Nothing to fetch actually, already finished! */
  1145. __starpu_push_task_output(j);
  1146. _starpu_handle_job_termination(j);
  1147. _STARPU_LOG_OUT_TAG("handle_job_termination");
  1148. return;
  1149. }
  1150. _STARPU_MALLOC(wrapper, (sizeof(*wrapper)));
  1151. wrapper->j = j;
  1152. /* +1 for the call below */
  1153. wrapper->pending = nfetchbuffers + 1;
  1154. for (index = 0; index < nbuffers; index++)
  1155. {
  1156. starpu_data_handle_t handle = descrs[index].handle;
  1157. enum starpu_data_access_mode mode = descrs[index].mode;
  1158. int node = descrs[index].node;
  1159. if (node == -1)
  1160. continue;
  1161. if (mode == STARPU_NONE ||
  1162. (mode & ((1<<STARPU_MODE_SHIFT) - 1)) >= STARPU_ACCESS_MODE_MAX ||
  1163. (mode >> STARPU_MODE_SHIFT) >= (STARPU_SHIFTED_MODE_MAX >> STARPU_MODE_SHIFT))
  1164. STARPU_ASSERT_MSG(0, "mode %d (0x%x) is bogus\n", mode, mode);
  1165. STARPU_ASSERT(mode != STARPU_SCRATCH && mode != STARPU_REDUX);
  1166. struct _starpu_data_replicate *local_replicate;
  1167. local_replicate = get_replicate(handle, mode, -1, node);
  1168. _starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, task, STARPU_FETCH, 1, _starpu_fetch_nowhere_task_input_cb, wrapper, 0, "_starpu_fetch_nowhere_task_input");
  1169. }
  1170. if (profiling && task->profiling_info)
  1171. _starpu_clock_gettime(&task->profiling_info->acquire_data_end_time);
  1172. /* Finished working with the task, release our reference */
  1173. _starpu_fetch_nowhere_task_input_cb(wrapper);
  1174. }
  1175. static void _starpu_fetch_nowhere_task_input_cb(void *arg)
  1176. {
  1177. /* One more transfer finished */
  1178. struct fetch_nowhere_wrapper *wrapper = arg;
  1179. unsigned pending = STARPU_ATOMIC_ADD(&wrapper->pending, -1);
  1180. ANNOTATE_HAPPENS_BEFORE(&wrapper->pending);
  1181. if (pending == 0)
  1182. {
  1183. ANNOTATE_HAPPENS_AFTER(&wrapper->pending);
  1184. /* Finished transferring, task is over */
  1185. struct _starpu_job *j = wrapper->j;
  1186. free(wrapper);
  1187. __starpu_push_task_output(j);
  1188. _starpu_handle_job_termination(j);
  1189. _STARPU_LOG_OUT_TAG("handle_job_termination");
  1190. }
  1191. }
  1192. /* NB : this value can only be an indication of the status of a data
  1193. at some point, but there is no strong garantee ! */
  1194. unsigned starpu_data_is_on_node(starpu_data_handle_t handle, unsigned node)
  1195. {
  1196. unsigned ret = 0;
  1197. // XXX : this is just a hint, so we don't take the lock ...
  1198. // STARPU_PTHREAD_SPIN_LOCK(&handle->header_lock);
  1199. if (handle->per_node[node].state != STARPU_INVALID)
  1200. {
  1201. ret = 1;
  1202. }
  1203. else
  1204. {
  1205. unsigned i;
  1206. unsigned nnodes = starpu_memory_nodes_get_count();
  1207. for (i = 0; i < nnodes; i++)
  1208. {
  1209. if (handle->per_node[node].request[i])
  1210. {
  1211. ret = 1;
  1212. break;
  1213. }
  1214. }
  1215. }
  1216. // STARPU_PTHREAD_SPIN_UNLOCK(&handle->header_lock);
  1217. return ret;
  1218. }