coherency.c
/* StarPU --- Runtime system for heterogeneous multicore architectures.
 *
 * Copyright (C) 2008-2021 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
 * Copyright (C) 2018 Federal University of Rio Grande do Sul (UFRGS)
 *
 * StarPU is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * StarPU is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * See the GNU Lesser General Public License in COPYING.LGPL for more details.
 */

#include <limits.h>
#include <math.h>
#include <common/config.h>
#include <datawizard/coherency.h>
#include <datawizard/copy_driver.h>
#include <datawizard/write_back.h>
#include <datawizard/memory_nodes.h>
#include <core/dependencies/data_concurrency.h>
#include <core/disk.h>
#include <profiling/profiling.h>
#include <core/task.h>
#include <starpu_scheduler.h>
#include <core/workers.h>

#ifdef STARPU_SIMGRID
#include <core/simgrid.h>
#endif

static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned src_node, unsigned dst_node, unsigned *handling_node);
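
/* Pick a source node holding a valid copy of the handle for a transfer
 * towards 'destination': prefer the copy with the cheapest predicted direct
 * transfer, and otherwise fall back to RAM, then a GPU, then a disk node. */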
int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
{
	int src_node = -1;
	unsigned i;

	unsigned nnodes = starpu_memory_nodes_get_count();

	/* first find a valid copy, either a STARPU_OWNER or a STARPU_SHARED */
	unsigned node;

	size_t size = _starpu_data_get_size(handle);
	double cost = INFINITY;
	unsigned src_node_mask = 0;

	for (node = 0; node < nnodes; node++)
	{
		if (handle->per_node[node].state != STARPU_INVALID)
		{
			/* we found a copy ! */
			src_node_mask |= (1<<node);
		}
	}

	if (src_node_mask == 0 && handle->init_cl)
	{
		/* No copy yet, but the application told us how to build it. */
		return -1;
	}

	/* we should have found at least one copy ! */
	STARPU_ASSERT_MSG(src_node_mask != 0, "The data for the handle %p is requested, but the handle does not have a valid value. Perhaps some initialization task is missing?", handle);

	/* Without knowing the size, we won't know the cost */
	if (!size)
		cost = 0;

	/* Check whether we have transfer cost for all nodes, if so, take the minimum */
	if (cost)
		for (i = 0; i < nnodes; i++)
		{
			if (src_node_mask & (1<<i))
			{
				double time = starpu_transfer_predict(i, destination, size);
				unsigned handling_node;

				/* Avoid indirect transfers */
				if (!link_supports_direct_transfers(handle, i, destination, &handling_node))
					continue;

				if (_STARPU_IS_ZERO(time))
				{
					/* No estimation, will have to revert to dumb strategy */
					cost = 0.0;
					break;
				}
				else if (time < cost)
				{
					cost = time;
					src_node = i;
				}
			}
		}

	if (cost && src_node != -1)
	{
		/* Could estimate through cost, return that */
		STARPU_ASSERT(handle->per_node[src_node].allocated);
		STARPU_ASSERT(handle->per_node[src_node].initialized);
		return src_node;
	}

	int i_ram = -1;
	int i_gpu = -1;
	int i_disk = -1;

	/* Revert to dumb strategy: take RAM unless only a GPU has it */
	for (i = 0; i < nnodes; i++)
	{
		if (src_node_mask & (1<<i))
		{
			int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node) = handle->ops->copy_methods->can_copy;
			/* Avoid transfers which the interface does not want */
			if (can_copy)
			{
				void *src_interface = handle->per_node[i].data_interface;
				void *dst_interface = handle->per_node[destination].data_interface;
				unsigned handling_node;

				if (!link_supports_direct_transfers(handle, i, destination, &handling_node))
				{
					/* Avoid going through RAM if the interface does not want it */
					void *ram_interface = handle->per_node[STARPU_MAIN_RAM].data_interface;
					if ((!can_copy(src_interface, i, ram_interface, STARPU_MAIN_RAM, i)
					     && !can_copy(src_interface, i, ram_interface, STARPU_MAIN_RAM, STARPU_MAIN_RAM))
					    || (!can_copy(ram_interface, STARPU_MAIN_RAM, dst_interface, destination, STARPU_MAIN_RAM)
						&& !can_copy(ram_interface, STARPU_MAIN_RAM, dst_interface, destination, destination)))
						continue;
				}
			}

			/* However, GPUs are really expensive sources!
			 * Unless peer transfer is supported (and it would then have been selected above).
			 * Others should be ok. */
			if (starpu_node_get_kind(i) == STARPU_CPU_RAM ||
			    starpu_node_get_kind(i) == STARPU_MPI_MS_RAM)
				i_ram = i;
			else if (starpu_node_get_kind(i) == STARPU_DISK_RAM)
				i_disk = i;
			else
				i_gpu = i;
		}
	}

	/* we have to use cpu_ram first */
	if (i_ram != -1)
		src_node = i_ram;
	else if (i_gpu != -1)
		/* otherwise a gpu */
		src_node = i_gpu;
	else
		/* no luck, we have to use the disk memory */
		src_node = i_disk;

	STARPU_ASSERT(src_node != -1);
	STARPU_ASSERT(handle->per_node[src_node].allocated);
	STARPU_ASSERT(handle->per_node[src_node].initialized);
	return src_node;
}
/* This may be called once the data is fetched, with the header and STARPU_RW-lock held */
void _starpu_update_data_state(starpu_data_handle_t handle,
			       struct _starpu_data_replicate *requesting_replicate,
			       enum starpu_data_access_mode mode)
{
	/* There is nothing to do for relaxed coherency modes (scratch or
	 * reductions) */
	if (!(mode & STARPU_RW))
		return;

	unsigned nnodes = starpu_memory_nodes_get_count();

	/* the data is present now */
	unsigned requesting_node = requesting_replicate->memory_node;

	if (mode & STARPU_W)
	{
		/* the requesting node now has the only valid copy */
		unsigned node;
		for (node = 0; node < nnodes; node++)
		{
			if (handle->per_node[node].state != STARPU_INVALID)
				_STARPU_TRACE_DATA_STATE_INVALID(handle, node);
			handle->per_node[node].state = STARPU_INVALID;
		}

		if (requesting_replicate->state != STARPU_OWNER)
			_STARPU_TRACE_DATA_STATE_OWNER(handle, requesting_node);
		requesting_replicate->state = STARPU_OWNER;

		if (handle->home_node != -1 && handle->per_node[handle->home_node].state == STARPU_INVALID)
			/* Notify that this MC is now dirty */
			_starpu_memchunk_dirty(requesting_replicate->mc, requesting_replicate->memory_node);
	}
	else
	{
		/* read only */
		if (requesting_replicate->state != STARPU_OWNER)
		{
			/* there was at least another copy of the data */
			unsigned node;
			for (node = 0; node < nnodes; node++)
			{
				struct _starpu_data_replicate *replicate = &handle->per_node[node];
				if (replicate->state != STARPU_INVALID)
				{
					if (replicate->state != STARPU_SHARED)
						_STARPU_TRACE_DATA_STATE_SHARED(handle, node);
					replicate->state = STARPU_SHARED;
				}
			}

			if (requesting_replicate->state != STARPU_SHARED)
				_STARPU_TRACE_DATA_STATE_SHARED(handle, requesting_node);
			requesting_replicate->state = STARPU_SHARED;
		}
	}
}
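
/* Return whether a transfer involving 'node' can be driven from
 * 'handling_node': trivially yes when both are the same node, no if the
 * handling node has no worker to process the request, and otherwise what the
 * memory node operations report through is_direct_access_supported. */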
static int worker_supports_direct_access(unsigned node, unsigned handling_node)
{
	if (node == handling_node)
		return 1;

	if (!_starpu_memory_node_get_nworkers(handling_node))
		/* No worker to process the request from that node */
		return 0;

	struct _starpu_node_ops *node_ops = _starpu_memory_node_get_node_ops(node);
	if (node_ops && node_ops->is_direct_access_supported)
		return node_ops->is_direct_access_supported(node, handling_node);
	else
	{
		STARPU_ABORT_MSG("Node %s does not define the operation 'is_direct_access_supported'", _starpu_node_get_prefix(starpu_node_get_kind(node)));
		return 1;
	}
}
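
/* Check whether the data can be transferred directly from src_node to
 * dst_node, without staging through main memory. If so, record in
 * *handling_node which node should drive the transfer and return 1. */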
static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned src_node, unsigned dst_node, unsigned *handling_node)
{
	int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node) = handle->ops->copy_methods->can_copy;
	void *src_interface = handle->per_node[src_node].data_interface;
	void *dst_interface = handle->per_node[dst_node].data_interface;

	/* XXX That's a hack until we fix cudaMemcpy3DPeerAsync in the block interface.
	 * Perhaps not all data interfaces provide a direct GPU-GPU transfer
	 * method! */
#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
	if (src_node != dst_node && starpu_node_get_kind(src_node) == STARPU_CUDA_RAM && starpu_node_get_kind(dst_node) == STARPU_CUDA_RAM)
	{
		const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
		if (!copy_methods->cuda_to_cuda_async && !copy_methods->any_to_any)
			return 0;
	}
#endif

	/* Note: with CUDA, performance seems a bit better when issuing the transfer from the destination (tested without GPUDirect, but GPUDirect probably behaves the same) */
	if (worker_supports_direct_access(src_node, dst_node) && (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, dst_node)))
	{
		*handling_node = dst_node;
		return 1;
	}

	if (worker_supports_direct_access(dst_node, src_node) && (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, src_node)))
	{
		*handling_node = src_node;
		return 1;
	}

	return 0;
}
/* For now, we use slowness (1/bandwidth) to compare NUMA nodes; would it be better to use latency? */
static unsigned chose_best_numa_between_src_and_dest(int src, int dst)
{
	double timing_best;
	int best_numa = -1;
	unsigned numa;
	const unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
	for (numa = 0; numa < nb_numa_nodes; numa++)
	{
		double actual = 1.0/starpu_transfer_bandwidth(src, numa) + 1.0/starpu_transfer_bandwidth(numa, dst);

		/* Compare slowness: take the lowest */
		if (best_numa < 0 || actual < timing_best)
		{
			best_numa = numa;
			timing_best = actual;
		}
	}
	STARPU_ASSERT(best_numa >= 0);

	return best_numa;
}
/* Determines the path of a request: each hop is defined by (src,dst) and the
 * node that handles the hop. The returned value indicates the number of hops,
 * and max_len is the maximum number of hops (i.e. the size of the
 * src_nodes, dst_nodes and handling_nodes arrays). */
int _starpu_determine_request_path(starpu_data_handle_t handle,
				   int src_node, int dst_node,
				   enum starpu_data_access_mode mode, int max_len,
				   unsigned *src_nodes, unsigned *dst_nodes,
				   unsigned *handling_nodes, unsigned write_invalidation)
{
	if (src_node == dst_node || !(mode & STARPU_R))
	{
		if (dst_node == -1 || starpu_node_get_kind(dst_node) == STARPU_DISK_RAM)
			handling_nodes[0] = src_node;
		else
			handling_nodes[0] = dst_node;

		if (write_invalidation)
			/* The invalidation request will be enough */
			return 0;

		/* The destination node should only allocate the data, no transfer is required */
		STARPU_ASSERT(max_len >= 1);

		src_nodes[0] = STARPU_MAIN_RAM; // ignored
		dst_nodes[0] = dst_node;
		return 1;
	}

	if (src_node < 0)
	{
		/* Will just initialize the destination */
		STARPU_ASSERT(max_len >= 1);

		src_nodes[0] = src_node; // ignored
		dst_nodes[0] = dst_node;
		return 1;
	}

	unsigned handling_node;
	int link_is_valid = link_supports_direct_transfers(handle, src_node, dst_node, &handling_node);

	if (!link_is_valid)
	{
		int (*can_copy)(void *, unsigned, void *, unsigned, unsigned) = handle->ops->copy_methods->can_copy;
		void *src_interface = handle->per_node[src_node].data_interface;
		void *dst_interface = handle->per_node[dst_node].data_interface;

		/* We need an intermediate hop to implement data staging
		 * through main memory. */
		STARPU_ASSERT(max_len >= 2);
		STARPU_ASSERT(src_node >= 0);

		unsigned numa = chose_best_numa_between_src_and_dest(src_node, dst_node);

		/* GPU -> RAM */
		src_nodes[0] = src_node;
		dst_nodes[0] = numa;

		if (starpu_node_get_kind(src_node) == STARPU_DISK_RAM)
			/* Disks don't have their own driver thread */
			handling_nodes[0] = dst_node;
		else if (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, src_node))
		{
			handling_nodes[0] = src_node;
		}
		else
		{
			STARPU_ASSERT_MSG(can_copy(src_interface, src_node, dst_interface, dst_node, dst_node), "interface %d refuses all kinds of transfers from node %d to node %d\n", handle->ops->interfaceid, src_node, dst_node);
			handling_nodes[0] = dst_node;
		}

		/* RAM -> GPU */
		src_nodes[1] = numa;
		dst_nodes[1] = dst_node;

		if (starpu_node_get_kind(dst_node) == STARPU_DISK_RAM)
			/* Disks don't have their own driver thread */
			handling_nodes[1] = src_node;
		else if (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, dst_node))
		{
			handling_nodes[1] = dst_node;
		}
		else
		{
			STARPU_ASSERT_MSG(can_copy(src_interface, src_node, dst_interface, dst_node, src_node), "interface %d refuses all kinds of transfers from node %d to node %d\n", handle->ops->interfaceid, src_node, dst_node);
			handling_nodes[1] = src_node;
		}

		return 2;
	}
	else
	{
		STARPU_ASSERT(max_len >= 1);

		src_nodes[0] = src_node;
		dst_nodes[0] = dst_node;
		handling_nodes[0] = handling_node;

#if !defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
		STARPU_ASSERT(!(mode & STARPU_R) || starpu_node_get_kind(src_node) != STARPU_CUDA_RAM || starpu_node_get_kind(dst_node) != STARPU_CUDA_RAM);
#endif

		return 1;
	}
}
/* handle->lock should be taken. r is returned locked. The node parameter
 * indicates either the source of the request, or the destination for a
 * write-only request. */
static struct _starpu_data_request *_starpu_search_existing_data_request(struct _starpu_data_replicate *replicate, unsigned node, enum starpu_data_access_mode mode, enum starpu_is_prefetch is_prefetch)
{
	struct _starpu_data_request *r;

	r = replicate->request[node];

	if (r)
	{
		_starpu_spin_checklocked(&r->handle->header_lock);

		_starpu_spin_lock(&r->lock);

		/* perhaps we need to "upgrade" the request */
		if (is_prefetch < r->prefetch)
			_starpu_update_prefetch_status(r, is_prefetch);

		if (mode & STARPU_R)
		{
			/* in case the existing request did not imply a memory
			 * transfer yet, we have to take a second refcnt now
			 * for the source, in addition to the refcnt for the
			 * destination
			 * (so that the source remains valid) */
			if (!(r->mode & STARPU_R))
			{
				replicate->refcnt++;
				replicate->handle->busy_count++;
			}

			r->mode = (enum starpu_data_access_mode) ((int) r->mode | (int) STARPU_R);
		}

		if (mode & STARPU_W)
			r->mode = (enum starpu_data_access_mode) ((int) r->mode | (int) STARPU_W);
	}

	return r;
}
/*
 * This function is called when the data is needed on the local node, this
 * returns a pointer to the local copy
 *
 *                R     STARPU_W   STARPU_RW
 *   Owner        OK    OK         OK
 *   Shared       OK    1          1
 *   Invalid      2     3          4
 *
 * case 1 : shared + (read)write :
 *   no data copy but shared->Invalid/Owner
 * case 2 : invalid + read :
 *   data copy + invalid->shared + owner->shared (STARPU_ASSERT(there is a valid))
 * case 3 : invalid + write :
 *   no data copy + invalid->owner + (owner,shared)->invalid
 * case 4 : invalid + R/STARPU_W :
 *   data copy + if (STARPU_W) (invalid->owner + owner->invalid)
 *               else (invalid,owner->shared)
 */
struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
								  struct _starpu_data_replicate *dst_replicate,
								  enum starpu_data_access_mode mode, enum starpu_is_prefetch is_prefetch,
								  unsigned async,
								  void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
{
	/* We don't care about commuting for data requests, that was handled before. */
	mode &= ~STARPU_COMMUTE;

	/* This function is called with handle's header lock taken */
	_starpu_spin_checklocked(&handle->header_lock);

	int requesting_node = dst_replicate ? dst_replicate->memory_node : -1;

	unsigned nwait = 0;

	if (mode & STARPU_W)
	{
		/* We will write to the buffer. We will have to wait for all
		 * existing requests before the last request which will
		 * invalidate all their results (which were possibly spurious,
		 * e.g. too aggressive eviction).
		 */
		unsigned i, j;
		unsigned nnodes = starpu_memory_nodes_get_count();
		for (i = 0; i < nnodes; i++)
			for (j = 0; j < nnodes; j++)
				if (handle->per_node[i].request[j])
					nwait++;

		/* If the request is not detached (i.e. the caller really wants
		 * proper ownership), no new requests will appear because a
		 * reference will be kept on the dst replicate, which will
		 * notably prevent data reclaiming.
		 */
	}

	if ((!dst_replicate || dst_replicate->state != STARPU_INVALID) && (!nwait || is_prefetch))
	{
		if (dst_replicate)
		{
#ifdef STARPU_MEMORY_STATS
			enum _starpu_cache_state old_state = dst_replicate->state;
#endif
			/* the data is already available and we don't have to wait for
			 * any request, so we can stop */
			_starpu_update_data_state(handle, dst_replicate, mode);
			_starpu_msi_cache_hit(requesting_node);

#ifdef STARPU_MEMORY_STATS
			_starpu_memory_handle_stats_cache_hit(handle, requesting_node);

			/* XXX Broken ? */
			if (old_state == STARPU_SHARED
			    && dst_replicate->state == STARPU_OWNER)
				_starpu_memory_handle_stats_shared_to_owner(handle, requesting_node);
#endif

			if (dst_replicate->mc)
			{
				if (is_prefetch == STARPU_TASK_PREFETCH)
					/* Make sure it stays there */
					dst_replicate->nb_tasks_prefetch++;

				_starpu_memchunk_recently_used(dst_replicate->mc, requesting_node);
			}
		}

		_starpu_spin_unlock(&handle->header_lock);

		if (callback_func)
			callback_func(callback_arg);

		_STARPU_LOG_OUT_TAG("data available");
		return NULL;
	}

	if (dst_replicate)
		_starpu_msi_cache_miss(requesting_node);

	/* the only remaining situation is that the local copy was invalid */
	STARPU_ASSERT((dst_replicate && dst_replicate->state == STARPU_INVALID) || nwait);

	/* find someone who already has the data */
	int src_node = -1;

	if (dst_replicate && mode & STARPU_R)
	{
		if (dst_replicate->state == STARPU_INVALID)
			src_node = _starpu_select_src_node(handle, requesting_node);
		else
			src_node = requesting_node;
		if (src_node < 0)
		{
			/* We will create it, no need to read an existing value */
			mode &= ~STARPU_R;
		}
	}
	else if (dst_replicate)
	{
		/* if the data is in write only mode (and not SCRATCH or REDUX), there is no need for a source, data will be initialized by the task itself */
		if (mode & STARPU_W)
			dst_replicate->initialized = 1;
		if (starpu_node_get_kind(requesting_node) == STARPU_CPU_RAM && !nwait
		    && !_starpu_malloc_willpin_on_node(requesting_node))
		{
			/* And this is the main RAM without pinning, really no need for a
			 * request, just quickly allocate and be done */
			if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch) == 0)
			{
				_starpu_update_data_state(handle, dst_replicate, mode);

				if (dst_replicate->mc)
				{
					if (is_prefetch == STARPU_TASK_PREFETCH)
						/* Make sure it stays there */
						dst_replicate->nb_tasks_prefetch++;

					_starpu_memchunk_recently_used(dst_replicate->mc, requesting_node);
				}

				_starpu_spin_unlock(&handle->header_lock);

				if (callback_func)
					callback_func(callback_arg);
				_STARPU_LOG_OUT_TAG("data immediately allocated");
				return NULL;
			}
		}
	}
#define MAX_REQUESTS 4
	/* We can safely assume that there won't be more than 2 hops in the
	 * current implementation */
	unsigned src_nodes[MAX_REQUESTS], dst_nodes[MAX_REQUESTS], handling_nodes[MAX_REQUESTS];
	/* keep one slot for the last W request, if any */
	int write_invalidation = (mode & STARPU_W) && nwait && !is_prefetch;

	int nhops = _starpu_determine_request_path(handle, src_node, requesting_node, mode, MAX_REQUESTS,
						   src_nodes, dst_nodes, handling_nodes, write_invalidation);

	STARPU_ASSERT(nhops >= 0 && nhops <= MAX_REQUESTS-1);

	struct _starpu_data_request *requests[nhops + write_invalidation];

	/* Did we reuse a request for that hop ? */
	int reused_requests[nhops + write_invalidation];

	/* Construct an array with a list of requests, possibly reusing existing requests */
	int hop;
	for (hop = 0; hop < nhops; hop++)
	{
		struct _starpu_data_request *r;

		unsigned hop_src_node = src_nodes[hop];
		unsigned hop_dst_node = dst_nodes[hop];
		unsigned hop_handling_node = handling_nodes[hop];

		struct _starpu_data_replicate *hop_src_replicate;
		struct _starpu_data_replicate *hop_dst_replicate;

		/* Only the first request is independent */
		unsigned ndeps = (hop == 0)?0:1;

		hop_src_replicate = &handle->per_node[hop_src_node];
		hop_dst_replicate = (hop != nhops - 1)?&handle->per_node[hop_dst_node]:dst_replicate;

		/* Try to reuse a request if possible */
#ifdef STARPU_DEVEL
#warning We do not actually want to reuse an existing request when our request is for a task with low priority, that will get executed much later. We do not want to wire down the data in between, at worst that could hog the complete gpu memory...
#endif
		r = _starpu_search_existing_data_request(hop_dst_replicate,
							 (mode & STARPU_R)?hop_src_node:hop_dst_node,
							 mode, is_prefetch);

		reused_requests[hop] = !!r;

		if (!r)
		{
			/* Create a new request if there was no request to reuse */
			r = _starpu_create_data_request(handle, hop_src_replicate,
							hop_dst_replicate, hop_handling_node,
							mode, ndeps, is_prefetch, prio, 0, origin);
			nwait++;
		}

		requests[hop] = r;
	}

	/* Chain these requests */
	for (hop = 0; hop < nhops; hop++)
	{
		struct _starpu_data_request *r;
		r = requests[hop];

		if (hop != nhops - 1)
		{
			if (!reused_requests[hop + 1])
			{
				r->next_req[r->next_req_count++] = requests[hop + 1];
				STARPU_ASSERT(r->next_req_count <= STARPU_MAXNODES);
			}
		}
		else
		{
			if (is_prefetch == STARPU_TASK_PREFETCH)
				/* Make last request add the prefetch count on the mc to keep the data
				 * there until the task gets to execute. */
				r->nb_tasks_prefetch++;

			if (!write_invalidation)
				/* The last request will perform the callback after termination */
				_starpu_data_request_append_callback(r, callback_func, callback_arg);
		}

		if (reused_requests[hop])
			_starpu_spin_unlock(&r->lock);
	}

	if (write_invalidation)
	{
		/* Some requests were still pending, we have to add yet another
		 * request, depending on them, which will invalidate their
		 * result.
		 */
		struct _starpu_data_request *r = _starpu_create_data_request(handle, dst_replicate,
									      dst_replicate, requesting_node,
									      STARPU_W, nwait, is_prefetch, prio, 1, origin);

		/* and perform the callback after termination */
		_starpu_data_request_append_callback(r, callback_func, callback_arg);

		/* We will write to the buffer. We will have to wait for all
		 * existing requests before the last request which will
		 * invalidate all their results (which were possibly spurious,
		 * e.g. too aggressive eviction).
		 */
		unsigned i, j;
		unsigned nnodes = starpu_memory_nodes_get_count();
		for (i = 0; i < nnodes; i++)
			for (j = 0; j < nnodes; j++)
			{
				struct _starpu_data_request *r2 = handle->per_node[i].request[j];
				if (r2)
				{
					_starpu_spin_lock(&r2->lock);
					if (is_prefetch < r2->prefetch)
						/* Hasten the request we will have to wait for */
						_starpu_update_prefetch_status(r2, is_prefetch);
					r2->next_req[r2->next_req_count++] = r;
					STARPU_ASSERT(r2->next_req_count <= STARPU_MAXNODES + 1);
					_starpu_spin_unlock(&r2->lock);
					nwait--;
				}
			}
		STARPU_ASSERT(nwait == 0);

		nhops++;
		requests[nhops - 1] = r;
		/* existing requests will post this one */
		reused_requests[nhops - 1] = 1;
	}

	STARPU_ASSERT(nhops);

	if (!async)
		requests[nhops - 1]->refcnt++;

	/* we only submit the first request, the remaining will be
	 * automatically submitted afterward */
	if (!reused_requests[0])
		_starpu_post_data_request(requests[0]);

	return requests[nhops - 1];
}
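
/* Fetch (or allocate) the data on 'node' for the given access mode. Unless
 * 'detached' is set, references are taken on the replicate and the handle so
 * that _starpu_release_data_on_node can release them later. When 'async' is 0,
 * this waits for the completion of the generated request(s). */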
int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *dst_replicate,
			       enum starpu_data_access_mode mode, unsigned detached, enum starpu_is_prefetch is_prefetch, unsigned async,
			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
{
	_STARPU_LOG_IN();

	int cpt = 0;
	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
	{
		cpt++;
		_starpu_datawizard_progress(1);
	}
	if (cpt == STARPU_SPIN_MAXTRY)
		_starpu_spin_lock(&handle->header_lock);

	if (mode & STARPU_R && is_prefetch > STARPU_FETCH)
	{
		unsigned src_node_mask = 0;

		unsigned nnodes = starpu_memory_nodes_get_count();
		unsigned n;
		for (n = 0; n < nnodes; n++)
		{
			if (handle->per_node[n].state != STARPU_INVALID)
			{
				/* we found a copy ! */
				src_node_mask |= (1<<n);
			}
		}

		if (src_node_mask == 0)
		{
			/* no valid copy, nothing to prefetch */
			STARPU_ASSERT_MSG(handle->init_cl, "Could not find a valid copy of the data, and no handle initialization function");
			_starpu_spin_unlock(&handle->header_lock);
			return 0;
		}
	}

	if (!detached)
	{
		/* Take references which will be released by _starpu_release_data_on_node */
		if (dst_replicate)
			dst_replicate->refcnt++;
		else if (node == STARPU_ACQUIRE_NO_NODE_LOCK_ALL)
		{
			int i;
			for (i = 0; i < STARPU_MAXNODES; i++)
				handle->per_node[i].refcnt++;
		}
		handle->busy_count++;
	}

	struct _starpu_data_request *r;
	r = _starpu_create_request_to_fetch_data(handle, dst_replicate, mode,
						 is_prefetch, async, callback_func, callback_arg, prio, origin);

	/* If no request was created, the handle was already up-to-date on the
	 * node. In this case, _starpu_create_request_to_fetch_data has already
	 * unlocked the header. */
	if (!r)
		return 0;

	_starpu_spin_unlock(&handle->header_lock);

	int ret = async?0:_starpu_wait_data_request_completion(r, 1);
	_STARPU_LOG_OUT();
	return ret;
}
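
/* Thin wrappers around _starpu_fetch_data_on_node for the various prefetch
 * levels (idle fetch, task prefetch, plain prefetch) and for a synchronous
 * fetch. */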
static int idle_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
{
	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_IDLEFETCH, 1, NULL, NULL, prio, "idle_prefetch_data_on_node");
}

static int task_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
{
	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_TASK_PREFETCH, 1, NULL, NULL, prio, "task_prefetch_data_on_node");
}

static int STARPU_ATTRIBUTE_UNUSED prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
{
	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_PREFETCH, 1, NULL, NULL, prio, "prefetch_data_on_node");
}

static int fetch_data(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
{
	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 0, STARPU_FETCH, 0, NULL, NULL, prio, "fetch_data");
}

uint32_t _starpu_get_data_refcnt(starpu_data_handle_t handle, unsigned node)
{
	return handle->per_node[node].refcnt;
}

size_t _starpu_data_get_size(starpu_data_handle_t handle)
{
	return handle->ops->get_size(handle);
}

size_t _starpu_data_get_alloc_size(starpu_data_handle_t handle)
{
	if (handle->ops->get_alloc_size)
		return handle->ops->get_alloc_size(handle);
	else
		return handle->ops->get_size(handle);
}

starpu_ssize_t _starpu_data_get_max_size(starpu_data_handle_t handle)
{
	if (handle->ops->get_max_size)
		return handle->ops->get_max_size(handle);
	else
		return -1;
}

uint32_t _starpu_data_get_footprint(starpu_data_handle_t handle)
{
	return handle->footprint;
}
/* in case the data was accessed in write mode, do not forget to
 * make it accessible again once it is possible ! */
void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_wt_mask, enum starpu_data_access_mode down_to_mode, struct _starpu_data_replicate *replicate)
{
	uint32_t wt_mask;
	wt_mask = default_wt_mask | handle->wt_mask;
	wt_mask &= (1<<starpu_memory_nodes_get_count())-1;

	/* Note that it is possible that there is no valid copy of the data (if
	 * starpu_data_invalidate was called for instance). In that case, we do
	 * not enforce any write-through mechanism. */

	unsigned memory_node = replicate->memory_node;

	if (replicate->state != STARPU_INVALID && handle->current_mode & STARPU_W)
		if (wt_mask & ~(1<<memory_node))
			_starpu_write_through_data(handle, memory_node, wt_mask);

	int cpt = 0;
	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
	{
		cpt++;
		_starpu_datawizard_progress(1);
	}
	if (cpt == STARPU_SPIN_MAXTRY)
		_starpu_spin_lock(&handle->header_lock);

	if (down_to_mode == STARPU_NONE)
	{
		/* Release refcnt taken by fetch_data_on_node */
		replicate->refcnt--;
		STARPU_ASSERT_MSG(replicate->refcnt >= 0, "handle %p released too many times", handle);

		STARPU_ASSERT_MSG(handle->busy_count > 0, "handle %p released too many times", handle);
		handle->busy_count--;
	}

	if (!_starpu_notify_data_dependencies(handle, down_to_mode))
		_starpu_spin_unlock(&handle->header_lock);
}
int _starpu_prefetch_task_input_prio(struct starpu_task *task, int target_node, int worker, int prio, enum starpu_is_prefetch prefetch)
{
#ifdef STARPU_OPENMP
	struct _starpu_job *j = _starpu_get_job_associated_to_task(task);
	/* do not attempt to prefetch task input if this is an OpenMP task resuming after blocking */
	if (j->discontinuous != 0)
		return 0;
#endif
	STARPU_ASSERT_MSG(prefetch != STARPU_PREFETCH || !task->prefetched, "Prefetching was already requested for this task! Did you set 'prefetches' to 1 in the starpu_sched_policy structure?");
	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
	unsigned index;

	for (index = 0; index < nbuffers; index++)
	{
		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, index);
		enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(task, index);

		if (mode & (STARPU_SCRATCH|STARPU_REDUX))
			continue;

		int node;
		if (target_node >= 0)
			node = _starpu_task_data_get_node_on_node(task, index, target_node);
		else
			node = _starpu_task_data_get_node_on_worker(task, index, worker);

		struct _starpu_data_replicate *replicate = &handle->per_node[node];
		if (prefetch == STARPU_PREFETCH)
			task_prefetch_data_on_node(handle, node, replicate, mode, prio);
		else
			idle_prefetch_data_on_node(handle, node, replicate, mode, prio);
	}

	if (prefetch == STARPU_PREFETCH)
		task->prefetched = 1;

	return 0;
}

int starpu_prefetch_task_input_prio(struct starpu_task *task, int target_node, int worker, int prio)
{
	return _starpu_prefetch_task_input_prio(task, target_node, worker, prio, STARPU_PREFETCH);
}

int starpu_prefetch_task_input_on_node_prio(struct starpu_task *task, unsigned target_node, int prio)
{
	return starpu_prefetch_task_input_prio(task, target_node, -1, prio);
}

int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node)
{
	int prio = task->priority;
	if (task->workerorder)
		prio = INT_MAX - task->workerorder;
	return starpu_prefetch_task_input_on_node_prio(task, node, prio);
}

int starpu_idle_prefetch_task_input_prio(struct starpu_task *task, int target_node, int worker, int prio)
{
	return _starpu_prefetch_task_input_prio(task, target_node, worker, prio, STARPU_IDLEFETCH);
}

int starpu_idle_prefetch_task_input_on_node_prio(struct starpu_task *task, unsigned target_node, int prio)
{
	return starpu_idle_prefetch_task_input_prio(task, target_node, -1, prio);
}

int starpu_idle_prefetch_task_input_on_node(struct starpu_task *task, unsigned node)
{
	int prio = task->priority;
	if (task->workerorder)
		prio = INT_MAX - task->workerorder;
	return starpu_idle_prefetch_task_input_on_node_prio(task, node, prio);
}

int starpu_prefetch_task_input_for_prio(struct starpu_task *task, unsigned worker, int prio)
{
	return starpu_prefetch_task_input_prio(task, -1, worker, prio);
}

int starpu_prefetch_task_input_for(struct starpu_task *task, unsigned worker)
{
	int prio = task->priority;
	if (task->workerorder)
		prio = INT_MAX - task->workerorder;
	return starpu_prefetch_task_input_for_prio(task, worker, prio);
}

int starpu_idle_prefetch_task_input_for_prio(struct starpu_task *task, unsigned worker, int prio)
{
	return starpu_idle_prefetch_task_input_prio(task, -1, worker, prio);
}

int starpu_idle_prefetch_task_input_for(struct starpu_task *task, unsigned worker)
{
	int prio = task->priority;
	if (task->workerorder)
		prio = INT_MAX - task->workerorder;
	return starpu_idle_prefetch_task_input_for_prio(task, worker, prio);
}
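
/* Return the replicate to be used for this access: the per-worker replicate
 * for relaxed-coherency (SCRATCH or REDUX) accesses, the per-node replicate
 * otherwise. */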
struct _starpu_data_replicate *get_replicate(starpu_data_handle_t handle, enum starpu_data_access_mode mode, int workerid, unsigned node)
{
	if (mode & (STARPU_SCRATCH|STARPU_REDUX))
	{
		STARPU_ASSERT(workerid >= 0);
		if (!handle->per_worker)
		{
			_starpu_spin_lock(&handle->header_lock);
			if (!handle->per_worker)
				_starpu_data_initialize_per_worker(handle);
			_starpu_spin_unlock(&handle->header_lock);
		}
		return &handle->per_worker[workerid];
	}
	else
		/* That's a "normal" buffer (R/W) */
		return &handle->per_node[node];
}
/* Callback used when a buffer is sent asynchronously to the sink */
static void _starpu_fetch_task_input_cb(void *arg)
{
	struct _starpu_worker *worker = (struct _starpu_worker *) arg;

	/* increase the number of buffers received */
	STARPU_WMB();
	(void)STARPU_ATOMIC_ADD(&worker->nb_buffers_transferred, 1);

#ifdef STARPU_SIMGRID
	starpu_pthread_queue_broadcast(&_starpu_simgrid_transfer_queue[worker->memory_node]);
#endif
}
/* Synchronously or asynchronously fetch data for a given task (if it's not there already)
 * Returns the number of data acquired here. */

/* _starpu_fetch_task_input must be called before executing the task.
 * __starpu_push_task_output must be called after the execution of the task. */
/* The driver can either just call _starpu_fetch_task_input with async==0,
 * or, to improve overlapping, it can call _starpu_fetch_task_input with
 * async==1, then wait for transfers to complete, then call
 * _starpu_fetch_task_input_tail to complete the fetch. */
int _starpu_fetch_task_input(struct starpu_task *task, struct _starpu_job *j, int async)
{
	struct _starpu_worker *worker = _starpu_get_local_worker_key();
	int workerid = worker->workerid;
	if (async)
	{
		worker->task_transferring = task;
		worker->nb_buffers_transferred = 0;
		if (worker->ntasks <= 1)
			_STARPU_TRACE_WORKER_START_FETCH_INPUT(NULL, workerid);
	}
	else
		_STARPU_TRACE_START_FETCH_INPUT(NULL);

	int profiling = starpu_profiling_status_get();
	if (profiling && task->profiling_info)
		_starpu_clock_gettime(&task->profiling_info->acquire_data_start_time);

	struct _starpu_data_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
	unsigned nacquires;

	unsigned index;

	nacquires = 0;
	for (index = 0; index < nbuffers; index++)
	{
		int ret;
		starpu_data_handle_t handle = descrs[index].handle;
		enum starpu_data_access_mode mode = descrs[index].mode;
		int node = _starpu_task_data_get_node_on_worker(task, descrs[index].index, workerid);
		/* We set this here for coherency with __starpu_push_task_output */
		descrs[index].node = node;
		if (mode == STARPU_NONE ||
		    (mode & ((1<<STARPU_MODE_SHIFT) - 1)) >= STARPU_ACCESS_MODE_MAX ||
		    (mode >> STARPU_MODE_SHIFT) >= (STARPU_SHIFTED_MODE_MAX >> STARPU_MODE_SHIFT))
			STARPU_ASSERT_MSG(0, "mode %d (0x%x) is bogus\n", mode, mode);

		struct _starpu_data_replicate *local_replicate;

		if (index && descrs[index-1].handle == descrs[index].handle)
			/* We have already taken this data, skip it. This
			 * depends on ordering putting writes before reads, see
			 * _starpu_compar_handles */
			continue;

		local_replicate = get_replicate(handle, mode, workerid, node);

		if (async)
		{
			ret = _starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, STARPU_FETCH, 1,
							 _starpu_fetch_task_input_cb, worker, 0, "_starpu_fetch_task_input");
#ifdef STARPU_SIMGRID
			if (_starpu_simgrid_fetching_input_cost())
				starpu_sleep(0.000001);
#endif
			if (STARPU_UNLIKELY(ret))
			{
				/* Oops, not enough memory, make the worker wait for these for now,
				 * and the synchronous call will finish by forcing eviction */
				worker->nb_buffers_totransfer = nacquires;
				_starpu_set_worker_status(worker, STATUS_WAITING);
				return 0;
			}
		}
		else
		{
			ret = fetch_data(handle, node, local_replicate, mode, 0);
#ifdef STARPU_SIMGRID
			if (_starpu_simgrid_fetching_input_cost())
				starpu_sleep(0.000001);
#endif
			if (STARPU_UNLIKELY(ret))
				goto enomem;
		}

		nacquires++;
	}
	if (async)
	{
		worker->nb_buffers_totransfer = nacquires;
		_starpu_set_worker_status(worker, STATUS_WAITING);
		return 0;
	}

	_starpu_fetch_task_input_tail(task, j, worker);

	return 0;

enomem:
	_STARPU_TRACE_END_FETCH_INPUT(NULL);
	_STARPU_DISP("something went wrong with buffer %u\n", index);

	/* try to unreference all the input that were successfully taken */
	unsigned index2;
	for (index2 = 0; index2 < index; index2++)
	{
		starpu_data_handle_t handle = descrs[index2].handle;
		enum starpu_data_access_mode mode = descrs[index2].mode;
		int node = descrs[index2].node;

		struct _starpu_data_replicate *local_replicate;

		if (index2 && descrs[index2-1].handle == descrs[index2].handle)
			/* We have already released this data, skip it. This
			 * depends on ordering putting writes before reads, see
			 * _starpu_compar_handles */
			continue;

		local_replicate = get_replicate(handle, mode, workerid, node);

		_starpu_release_data_on_node(handle, 0, STARPU_NONE, local_replicate);
	}

	return -1;
}
/* Now that we have taken the data locks in locking order, fill the codelet interfaces in function order. */
void _starpu_fetch_task_input_tail(struct starpu_task *task, struct _starpu_job *j, struct _starpu_worker *worker)
{
	int workerid = worker->workerid;

	int profiling = starpu_profiling_status_get();

	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);

	struct _starpu_data_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);

	unsigned index;
	unsigned long total_size = 0;

	for (index = 0; index < nbuffers; index++)
	{
		starpu_data_handle_t handle = descrs[index].handle;
		enum starpu_data_access_mode mode = descrs[index].mode;
		int node = descrs[index].node;

		struct _starpu_data_replicate *local_replicate;

		local_replicate = get_replicate(handle, mode, workerid, node);
		_starpu_spin_lock(&handle->header_lock);
		if (local_replicate->mc)
		{
			if (task->prefetched && local_replicate->initialized &&
			    /* See prefetch conditions in
			     * starpu_prefetch_task_input_on_node_prio and alike */
			    !(mode & (STARPU_SCRATCH|STARPU_REDUX)) &&
			    (mode & STARPU_R))
			{
				/* Allocation or transfer prefetches should have been done by now and marked
				 * this mc as needed for us.
				 * Now that we added a reference for the task, we can relieve that. */
				/* Note: the replicate might have been evicted in between, thus not 100% sure
				 * that our prefetch request is still recorded here. */
				if (local_replicate->nb_tasks_prefetch > 0)
					local_replicate->nb_tasks_prefetch--;
			}
		}
		_starpu_spin_unlock(&handle->header_lock);

		_STARPU_TASK_SET_INTERFACE(task, local_replicate->data_interface, descrs[index].index);

		/* If the replicate was not initialized yet, we have to do it now */
		if (!(mode & STARPU_SCRATCH) && !local_replicate->initialized)
			_starpu_redux_init_data_replicate(handle, local_replicate, workerid);

#ifdef STARPU_USE_FXT
		total_size += _starpu_data_get_size(handle);
#endif
	}
	_STARPU_TRACE_DATA_LOAD(workerid, total_size);

	if (profiling && task->profiling_info)
		_starpu_clock_gettime(&task->profiling_info->acquire_data_end_time);

	_STARPU_TRACE_END_FETCH_INPUT(NULL);
}
/* Release task data dependencies */
void __starpu_push_task_output(struct _starpu_job *j)
{
#ifdef STARPU_OPENMP
	STARPU_ASSERT(!j->continuation);
#endif
	int profiling = starpu_profiling_status_get();
	struct starpu_task *task = j->task;
	if (profiling && task->profiling_info)
		_starpu_clock_gettime(&task->profiling_info->release_data_start_time);

	struct _starpu_data_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);

	int workerid = starpu_worker_get_id();

	unsigned index;
	for (index = 0; index < nbuffers; index++)
	{
		starpu_data_handle_t handle = descrs[index].handle;
		enum starpu_data_access_mode mode = descrs[index].mode;
		int node = descrs[index].node;

		struct _starpu_data_replicate *local_replicate = NULL;

		if (index && descrs[index-1].handle == descrs[index].handle)
			/* We have already released this data, skip it. This
			 * depends on ordering putting writes before reads, see
			 * _starpu_compar_handles */
			continue;

		if (node != -1)
			local_replicate = get_replicate(handle, mode, workerid, node);

		/* Keep a reference for future
		 * _starpu_release_task_enforce_sequential_consistency call */
		_starpu_spin_lock(&handle->header_lock);
		handle->busy_count++;

		if (node == -1)
		{
			/* NOWHERE case, just notify dependencies */
			if (!_starpu_notify_data_dependencies(handle, STARPU_NONE))
				_starpu_spin_unlock(&handle->header_lock);
		}
		else
		{
			_starpu_spin_unlock(&handle->header_lock);
			_starpu_release_data_on_node(handle, 0, STARPU_NONE, local_replicate);
		}
	}

	if (profiling && task->profiling_info)
		_starpu_clock_gettime(&task->profiling_info->release_data_end_time);
}

/* Version for a driver running on a worker: we show the driver state in the trace */
void _starpu_push_task_output(struct _starpu_job *j)
{
	_STARPU_TRACE_START_PUSH_OUTPUT(NULL);
	__starpu_push_task_output(j);
	_STARPU_TRACE_END_PUSH_OUTPUT(NULL);
}
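
/* Per-task wrapper used by the NOWHERE fetch path: it counts the pending
 * asynchronous fetches so that the task can be terminated once the last one
 * completes. */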
struct fetch_nowhere_wrapper
{
	struct _starpu_job *j;
	unsigned pending;
};

static void _starpu_fetch_nowhere_task_input_cb(void *arg);

/* Asynchronously fetch data for a task which will have no content */
void _starpu_fetch_nowhere_task_input(struct _starpu_job *j)
{
	int profiling = starpu_profiling_status_get();
	struct starpu_task *task = j->task;
	if (profiling && task->profiling_info)
		_starpu_clock_gettime(&task->profiling_info->acquire_data_start_time);

	struct _starpu_data_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
	unsigned nfetchbuffers = 0;
	struct fetch_nowhere_wrapper *wrapper;

	unsigned index;
	for (index = 0; index < nbuffers; index++)
	{
		/* Note here we just follow what was requested, and do not use _starpu_task_data_get_node* */
		int node = -1;
		if (task->cl->specific_nodes)
			node = STARPU_CODELET_GET_NODE(task->cl, descrs[index].index);
		descrs[index].node = node;
		if (node != -1)
			nfetchbuffers++;
	}

	if (!nfetchbuffers)
	{
		/* Nothing to fetch actually, already finished! */
		__starpu_push_task_output(j);
		_starpu_handle_job_termination(j);
		_STARPU_LOG_OUT_TAG("handle_job_termination");
		return;
	}

	_STARPU_MALLOC(wrapper, (sizeof(*wrapper)));
	wrapper->j = j;
	/* +1 for the call below */
	wrapper->pending = nfetchbuffers + 1;

	for (index = 0; index < nbuffers; index++)
	{
		starpu_data_handle_t handle = descrs[index].handle;
		enum starpu_data_access_mode mode = descrs[index].mode;
		int node = descrs[index].node;
		if (node == -1)
			continue;

		if (mode == STARPU_NONE ||
		    (mode & ((1<<STARPU_MODE_SHIFT) - 1)) >= STARPU_ACCESS_MODE_MAX ||
		    (mode >> STARPU_MODE_SHIFT) >= (STARPU_SHIFTED_MODE_MAX >> STARPU_MODE_SHIFT))
			STARPU_ASSERT_MSG(0, "mode %d (0x%x) is bogus\n", mode, mode);
		STARPU_ASSERT(mode != STARPU_SCRATCH && mode != STARPU_REDUX);

		struct _starpu_data_replicate *local_replicate;

		local_replicate = get_replicate(handle, mode, -1, node);

		_starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, STARPU_FETCH, 1, _starpu_fetch_nowhere_task_input_cb, wrapper, 0, "_starpu_fetch_nowhere_task_input");
	}

	if (profiling && task->profiling_info)
		_starpu_clock_gettime(&task->profiling_info->acquire_data_end_time);

	/* Finished working with the task, release our reference */
	_starpu_fetch_nowhere_task_input_cb(wrapper);
}

static void _starpu_fetch_nowhere_task_input_cb(void *arg)
{
	/* One more transfer finished */
	struct fetch_nowhere_wrapper *wrapper = arg;

	unsigned pending = STARPU_ATOMIC_ADD(&wrapper->pending, -1);
	ANNOTATE_HAPPENS_BEFORE(&wrapper->pending);
	if (pending == 0)
	{
		ANNOTATE_HAPPENS_AFTER(&wrapper->pending);
		/* Finished transferring, task is over */
		struct _starpu_job *j = wrapper->j;
		free(wrapper);
		__starpu_push_task_output(j);
		_starpu_handle_job_termination(j);
		_STARPU_LOG_OUT_TAG("handle_job_termination");
	}
}
/* NB: this value can only be an indication of the status of the data
 * at some point, but there is no strong guarantee! */
unsigned starpu_data_is_on_node(starpu_data_handle_t handle, unsigned node)
{
	unsigned ret = 0;

	// XXX : this is just a hint, so we don't take the lock ...
	// STARPU_PTHREAD_SPIN_LOCK(&handle->header_lock);

	if (handle->per_node[node].state != STARPU_INVALID)
	{
		ret = 1;
	}
	else
	{
		unsigned i;
		unsigned nnodes = starpu_memory_nodes_get_count();
		for (i = 0; i < nnodes; i++)
		{
			if (handle->per_node[node].request[i])
				ret = 1;
		}
	}

	// STARPU_PTHREAD_SPIN_UNLOCK(&handle->header_lock);
	return ret;
}