/* StarPU --- Runtime system for heterogeneous multicore architectures.
 *
 * Copyright (C) 2009-2016  Université de Bordeaux
 * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016  CNRS
 * Copyright (C) 2014  INRIA
 *
 * StarPU is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * StarPU is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * See the GNU Lesser General Public License in COPYING.LGPL for more details.
 */

#include <common/config.h>
#include <datawizard/coherency.h>
#include <datawizard/copy_driver.h>
#include <datawizard/write_back.h>
#include <datawizard/memory_nodes.h>
#include <core/dependencies/data_concurrency.h>
#include <core/disk.h>
#include <profiling/profiling.h>
#include <math.h>
#include <core/task.h>
#include <starpu_scheduler.h>
#include <core/workers.h>
#include <limits.h>

#ifdef STARPU_SIMGRID
#include <core/simgrid.h>
#endif
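
/* This file implements DataWizard's coherency protocol: each data handle
 * keeps one replicate per memory node, in one of three states
 * (STARPU_OWNER, STARPU_SHARED, STARPU_INVALID), in the spirit of an MSI
 * cache protocol. The functions below pick a source replicate, determine
 * the (possibly multi-hop) transfer path, and build the chain of data
 * requests which bring a replicate up to date. */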
static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned src_node, unsigned dst_node, unsigned *handling_node);

int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
{
	int src_node = -1;
	unsigned i;

	unsigned nnodes = starpu_memory_nodes_get_count();

	/* first find a valid copy, either a STARPU_OWNER or a STARPU_SHARED */
	unsigned node;

	size_t size = _starpu_data_get_size(handle);
	double cost = INFINITY;
	unsigned src_node_mask = 0;

	for (node = 0; node < nnodes; node++)
	{
		if (handle->per_node[node].state != STARPU_INVALID)
		{
			/* we found a copy ! */
			src_node_mask |= (1<<node);
		}
	}
	if (src_node_mask == 0 && handle->init_cl)
	{
		/* No copy yet, but the application told us how to build it. */
		return -1;
	}
	/* we should have found at least one copy ! */
	STARPU_ASSERT_MSG(src_node_mask != 0, "The data for the handle %p is requested, but the handle does not have a valid value. Perhaps some initialization task is missing?", handle);

	/* Without knowing the size, we won't know the cost */
	if (!size)
		cost = 0;

	/* Check whether we have transfer cost for all nodes, if so, take the minimum */
	if (cost)
		for (i = 0; i < nnodes; i++)
		{
			if (src_node_mask & (1<<i))
			{
				double time = starpu_transfer_predict(i, destination, size);
				unsigned handling_node;

				/* Avoid indirect transfers */
				if (!link_supports_direct_transfers(handle, i, destination, &handling_node))
					continue;

				if (_STARPU_IS_ZERO(time))
				{
					/* No estimation, will have to revert to dumb strategy */
					cost = 0.0;
					break;
				}
				else if (time < cost)
				{
					cost = time;
					src_node = i;
				}
			}
		}

	if (cost && src_node != -1)
	{
		/* Could estimate through cost, return that */
		STARPU_ASSERT(handle->per_node[src_node].allocated);
		STARPU_ASSERT(handle->per_node[src_node].initialized);
		return src_node;
	}
	int i_ram = -1;
	int i_gpu = -1;
	int i_disk = -1;

	/* Revert to dumb strategy: take RAM unless only a GPU has it */
	for (i = 0; i < nnodes; i++)
	{
		if (src_node_mask & (1<<i))
		{
			int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node) = handle->ops->copy_methods->can_copy;
			/* Avoid transfers which the interface does not want */
			if (can_copy)
			{
				void *src_interface = handle->per_node[i].data_interface;
				void *dst_interface = handle->per_node[destination].data_interface;
				unsigned handling_node;

				if (!link_supports_direct_transfers(handle, i, destination, &handling_node))
				{
					/* Avoid going through RAM if the interface does not want it */
					void *ram_interface = handle->per_node[STARPU_MAIN_RAM].data_interface;
					if ((!can_copy(src_interface, i, ram_interface, STARPU_MAIN_RAM, i)
					     && !can_copy(src_interface, i, ram_interface, STARPU_MAIN_RAM, STARPU_MAIN_RAM))
					    || (!can_copy(ram_interface, STARPU_MAIN_RAM, dst_interface, destination, STARPU_MAIN_RAM)
						&& !can_copy(ram_interface, STARPU_MAIN_RAM, dst_interface, destination, destination)))
						continue;
				}
			}
			/* however, GPUs are expensive sources, really!
			 * Unless peer transfer is supported (and it would then have been selected above).
			 * Others should be OK. */
			if (starpu_node_get_kind(i) == STARPU_CUDA_RAM ||
			    starpu_node_get_kind(i) == STARPU_OPENCL_RAM ||
			    starpu_node_get_kind(i) == STARPU_MIC_RAM)
				i_gpu = i;

			if (starpu_node_get_kind(i) == STARPU_CPU_RAM ||
			    starpu_node_get_kind(i) == STARPU_SCC_RAM ||
			    starpu_node_get_kind(i) == STARPU_SCC_SHM)
				i_ram = i;

			if (starpu_node_get_kind(i) == STARPU_DISK_RAM)
				i_disk = i;
		}
	}
	/* we have to use cpu_ram first if we can */
	if (i_ram != -1)
		src_node = i_ram;
	/* otherwise a GPU */
	else if (i_gpu != -1)
		src_node = i_gpu;
	/* no luck, we have to use the disk memory */
	else
		src_node = i_disk;

	STARPU_ASSERT(src_node != -1);
	STARPU_ASSERT(handle->per_node[src_node].allocated);
	STARPU_ASSERT(handle->per_node[src_node].initialized);
	return src_node;
}
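
/* The cost-based part of the policy above relies on the public perfmodel
 * API. A minimal standalone sketch of it, assuming a hypothetical mask of
 * valid nodes (illustrative only, not used by this file): */
#if 0
static int select_cheapest_src(unsigned src_node_mask, unsigned destination, size_t size)
{
	unsigned node;
	int best = -1;
	double best_time = INFINITY;
	for (node = 0; node < starpu_memory_nodes_get_count(); node++)
	{
		if (!(src_node_mask & (1<<node)))
			continue;
		/* predicted transfer time from the calibrated bus model */
		double time = starpu_transfer_predict(node, destination, size);
		if (time < best_time)
		{
			best_time = time;
			best = node;
		}
	}
	return best;
}
#endif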
/* this may be called once the data is fetched, with the header and the STARPU_RW-lock held */
void _starpu_update_data_state(starpu_data_handle_t handle,
			       struct _starpu_data_replicate *requesting_replicate,
			       enum starpu_data_access_mode mode)
{
	/* There is nothing to do for relaxed coherency modes (scratch or
	 * reductions) */
	if (!(mode & STARPU_RW))
		return;

	unsigned nnodes = starpu_memory_nodes_get_count();

	/* the data is present now */
	unsigned requesting_node = requesting_replicate->memory_node;
	requesting_replicate->requested &= ~(1UL << requesting_node);

	if (mode & STARPU_W)
	{
		/* the requesting node now has the only valid copy */
		unsigned node;
		for (node = 0; node < nnodes; node++)
		{
			_STARPU_TRACE_DATA_INVALIDATE(handle, node);
			handle->per_node[node].state = STARPU_INVALID;
		}
		requesting_replicate->state = STARPU_OWNER;
		if (handle->home_node != -1 && handle->per_node[handle->home_node].state == STARPU_INVALID)
			/* Notify that this MC is now dirty */
			_starpu_memchunk_dirty(requesting_replicate->mc, requesting_replicate->memory_node);
	}
	else
	{
		/* read only */
		if (requesting_replicate->state != STARPU_OWNER)
		{
			/* there was at least another copy of the data */
			unsigned node;
			for (node = 0; node < nnodes; node++)
			{
				struct _starpu_data_replicate *replicate = &handle->per_node[node];
				if (replicate->state != STARPU_INVALID)
					replicate->state = STARPU_SHARED;
			}
			requesting_replicate->state = STARPU_SHARED;
		}
	}
}
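
/* In MSI terms, the transitions performed above are:
 *   write on node N: every replicate -> STARPU_INVALID, then N -> STARPU_OWNER
 *   read on node N:  if N was not already OWNER, every valid replicate
 *                    (including N) -> STARPU_SHARED
 */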
static int worker_supports_direct_access(unsigned node, unsigned handling_node)
{
	/* disk nodes only support disk <-> ram and disk <-> disk transfers,
	 * which are handled through dedicated paths below, not by a worker */
	if (starpu_node_get_kind(node) == STARPU_DISK_RAM || starpu_node_get_kind(handling_node) == STARPU_DISK_RAM)
		return 0;

	if (node == handling_node)
		return 1;

	if (!_starpu_memory_node_get_nworkers(handling_node))
		/* No worker to process the request from that node */
		return 0;
	int type = starpu_node_get_kind(node);
	switch (type)
	{
		case STARPU_CUDA_RAM:
		{
			/* GPUs do not always allow direct remote access: if
			 * CUDA memcpy peer (CUDA >= 4) is enabled, we allow
			 * two CUDA devices to communicate. */
#ifdef STARPU_SIMGRID
			if (starpu_node_get_kind(handling_node) == STARPU_CUDA_RAM)
			{
				char name[16];
				msg_host_t host;
				const char* cuda_memcpy_peer;
				snprintf(name, sizeof(name), "CUDA%d", _starpu_memory_node_get_devid(handling_node));
				host = _starpu_simgrid_get_host_by_name(name);
				cuda_memcpy_peer = MSG_host_get_property_value(host, "memcpy_peer");
				return cuda_memcpy_peer && atoll(cuda_memcpy_peer);
			}
			else
				return 0;
#elif defined(HAVE_CUDA_MEMCPY_PEER)
			enum starpu_node_kind kind = starpu_node_get_kind(handling_node);
			return kind == STARPU_CUDA_RAM;
#else /* !HAVE_CUDA_MEMCPY_PEER */
			/* Direct GPU-GPU transfers are not allowed in general */
			return 0;
#endif /* HAVE_CUDA_MEMCPY_PEER */
		}
		case STARPU_OPENCL_RAM:
			return 0;
		case STARPU_MIC_RAM:
			/* TODO: We don't handle direct MIC-MIC transfers yet */
			return 0;
		case STARPU_SCC_RAM:
			return 1;
		default:
			return 1;
	}
}
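
/* For reference, outside of simulation the HAVE_CUDA_MEMCPY_PEER capability
 * ultimately reflects what the CUDA runtime reports per device pair; a
 * hedged sketch of that query (illustrative, requires <cuda_runtime.h>, not
 * how this file checks it): */
#if 0
static int cuda_devices_can_peer(int dev, int peer_dev)
{
	int can = 0;
	/* asks the CUDA runtime whether dev may directly access peer_dev's memory */
	cudaDeviceCanAccessPeer(&can, dev, peer_dev);
	return can;
}
#endif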
static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned src_node, unsigned dst_node, unsigned *handling_node)
{
	int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node) = handle->ops->copy_methods->can_copy;
	void *src_interface = handle->per_node[src_node].data_interface;
	void *dst_interface = handle->per_node[dst_node].data_interface;
	/* XXX That's a hack until we fix cudaMemcpy3DPeerAsync in the block interface.
	 * Perhaps not all data interfaces provide a direct GPU-GPU transfer
	 * method ! */
#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
	if (src_node != dst_node && starpu_node_get_kind(src_node) == STARPU_CUDA_RAM && starpu_node_get_kind(dst_node) == STARPU_CUDA_RAM)
	{
		const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
		if (!copy_methods->cuda_to_cuda_async && !copy_methods->any_to_any)
			return 0;
	}
#endif
	/* Note: with CUDA, performance seems a bit better when issuing the
	 * transfer from the destination (tested without GPUDirect, but
	 * GPUDirect probably behaves the same) */
	if (worker_supports_direct_access(src_node, dst_node) && (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, dst_node)))
	{
		*handling_node = dst_node;
		return 1;
	}

	if (worker_supports_direct_access(dst_node, src_node) && (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, src_node)))
	{
		*handling_node = src_node;
		return 1;
	}
	/* Link between disk and ram */
	if ((starpu_node_get_kind(src_node) == STARPU_DISK_RAM && starpu_node_get_kind(dst_node) == STARPU_CPU_RAM) ||
	    (starpu_node_get_kind(src_node) == STARPU_CPU_RAM && starpu_node_get_kind(dst_node) == STARPU_DISK_RAM))
	{
		/* FIXME: not necessarily a worker :/ */
		*handling_node = STARPU_MAIN_RAM;
		return 1;
	}

	/* link between disk and disk, and they have the same kind */
	if (_starpu_is_same_kind_disk(src_node, dst_node))
		return 1;

	return 0;
}
/* Determines the path of a request: each hop is defined by (src,dst) and the
 * node that handles the hop. The returned value indicates the number of hops,
 * and max_len is the maximum number of hops (i.e. the size of the
 * src_nodes, dst_nodes and handling_nodes arrays). */
static int determine_request_path(starpu_data_handle_t handle,
				  int src_node, int dst_node,
				  enum starpu_data_access_mode mode, int max_len,
				  unsigned *src_nodes, unsigned *dst_nodes,
				  unsigned *handling_nodes, unsigned write_invalidation)
{
	if (src_node == dst_node || !(mode & STARPU_R))
	{
		if (write_invalidation)
			/* The invalidation request will be enough */
			return 0;

		/* The destination node should only allocate the data, no transfer is required */
		STARPU_ASSERT(max_len >= 1);
		src_nodes[0] = STARPU_MAIN_RAM; // ignored
		dst_nodes[0] = dst_node;
		handling_nodes[0] = dst_node;
		return 1;
	}

	unsigned handling_node;
	int link_is_valid = link_supports_direct_transfers(handle, src_node, dst_node, &handling_node);

	if (!link_is_valid)
	{
		int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node) = handle->ops->copy_methods->can_copy;
		void *src_interface = handle->per_node[src_node].data_interface;
		void *dst_interface = handle->per_node[dst_node].data_interface;

		/* We need an intermediate hop to implement data staging
		 * through main memory. */
		STARPU_ASSERT(max_len >= 2);
		STARPU_ASSERT(src_node >= 0);

		/* GPU -> RAM */
		src_nodes[0] = src_node;
		dst_nodes[0] = STARPU_MAIN_RAM;

		if (starpu_node_get_kind(src_node) == STARPU_DISK_RAM)
			/* Disks don't have their own driver thread */
			handling_nodes[0] = dst_node;
		else if (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, src_node))
		{
			handling_nodes[0] = src_node;
		}
		else
		{
			STARPU_ASSERT_MSG(can_copy(src_interface, src_node, dst_interface, dst_node, dst_node), "interface %d refuses all kinds of transfers from node %u to node %u\n", handle->ops->interfaceid, src_node, dst_node);
			handling_nodes[0] = dst_node;
		}

		/* RAM -> GPU */
		src_nodes[1] = STARPU_MAIN_RAM;
		dst_nodes[1] = dst_node;

		if (starpu_node_get_kind(dst_node) == STARPU_DISK_RAM)
			/* Disks don't have their own driver thread */
			handling_nodes[1] = src_node;
		else if (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, dst_node))
		{
			handling_nodes[1] = dst_node;
		}
		else
		{
			STARPU_ASSERT_MSG(can_copy(src_interface, src_node, dst_interface, dst_node, src_node), "interface %d refuses all kinds of transfers from node %u to node %u\n", handle->ops->interfaceid, src_node, dst_node);
			handling_nodes[1] = src_node;
		}

		return 2;
	}
	else
	{
		STARPU_ASSERT(max_len >= 1);
		src_nodes[0] = src_node;
		dst_nodes[0] = dst_node;
		handling_nodes[0] = handling_node;
#if !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
		STARPU_ASSERT(!(mode & STARPU_R) || starpu_node_get_kind(src_node) != STARPU_CUDA_RAM || starpu_node_get_kind(dst_node) != STARPU_CUDA_RAM);
#endif
		return 1;
	}
}
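
/* For instance, without CUDA peer copies, a fetch from one CUDA node to
 * another is staged through main memory and yields two hops:
 *   hop 0: src_nodes[0] = cuda0, dst_nodes[0] = STARPU_MAIN_RAM
 *   hop 1: src_nodes[1] = STARPU_MAIN_RAM, dst_nodes[1] = cuda1
 * with each hop handled by whichever end the interface's can_copy() accepts. */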
/* handle->lock should be taken. r is returned locked. The node parameter
 * indicates either the source of the request, or the destination for a
 * write-only request. */
static struct _starpu_data_request *_starpu_search_existing_data_request(struct _starpu_data_replicate *replicate, unsigned node, enum starpu_data_access_mode mode, unsigned is_prefetch)
{
	struct _starpu_data_request *r;

	r = replicate->request[node];

	if (r)
	{
		_starpu_spin_checklocked(&r->handle->header_lock);

		_starpu_spin_lock(&r->lock);

		/* perhaps we need to "upgrade" the request */
		if (is_prefetch < r->prefetch)
			_starpu_update_prefetch_status(r, is_prefetch);

		if (mode & STARPU_R)
		{
			/* in case the existing request did not imply a memory
			 * transfer yet, we have to take a second refcnt now
			 * for the source, in addition to the refcnt for the
			 * destination
			 * (so that the source remains valid) */
			if (!(r->mode & STARPU_R))
			{
				replicate->refcnt++;
				replicate->handle->busy_count++;
			}

			r->mode = (enum starpu_data_access_mode) ((int) r->mode | (int) STARPU_R);
		}

		if (mode & STARPU_W)
			r->mode = (enum starpu_data_access_mode) ((int) r->mode | (int) STARPU_W);
	}

	return r;
}
/*
 * This function is called when the data is needed on the local node, this
 * returns a pointer to the local copy
 *
 *            R     STARPU_W     STARPU_RW
 *  Owner     OK    OK           OK
 *  Shared    OK    1            1
 *  Invalid   2     3            4
 *
 * case 1 : shared + (read)write :
 *   no data copy but shared->Invalid/Owner
 * case 2 : invalid + read :
 *   data copy + invalid->shared + owner->shared (STARPU_ASSERT(there is a valid))
 * case 3 : invalid + write :
 *   no data copy + invalid->owner + (owner,shared)->invalid
 * case 4 : invalid + R/STARPU_W :
 *   data copy + if (STARPU_W) (invalid->owner + owner->invalid)
 *               else (invalid,owner->shared)
 */
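/* Worked example of case 2: a task reads a handle on a GPU node whose
 * replicate is INVALID while main RAM is OWNER. One read request is built,
 * the RAM replicate is copied to the GPU, and both replicates end up
 * SHARED; a later writer would then hit case 1 and invalidate the others. */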
struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
								  struct _starpu_data_replicate *dst_replicate,
								  enum starpu_data_access_mode mode, unsigned is_prefetch,
								  unsigned async,
								  void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
{
	/* We don't care about commuting for data requests, that was handled before. */
	mode &= ~STARPU_COMMUTE;

	/* This function is called with handle's header lock taken */
	_starpu_spin_checklocked(&handle->header_lock);

	unsigned requesting_node = dst_replicate ? dst_replicate->memory_node : -1;
	unsigned nwait = 0;

	if (mode & STARPU_W)
	{
		/* We will write to the buffer. We will have to wait for all
		 * existing requests before the last request which will
		 * invalidate all their results (which were possibly spurious,
		 * e.g. too aggressive eviction).
		 */
		unsigned i, j;
		unsigned nnodes = starpu_memory_nodes_get_count();
		for (i = 0; i < nnodes; i++)
			for (j = 0; j < nnodes; j++)
				if (handle->per_node[i].request[j])
					nwait++;
		/* If the request is not detached (i.e. the caller really wants
		 * proper ownership), no new requests will appear because a
		 * reference will be kept on the dst replicate, which will
		 * notably prevent data reclaiming.
		 */
	}
	if ((!dst_replicate || dst_replicate->state != STARPU_INVALID) && (!nwait || is_prefetch))
	{
		if (dst_replicate)
		{
#ifdef STARPU_MEMORY_STATS
			enum _starpu_cache_state old_state = dst_replicate->state;
#endif
			/* the data is already available and we don't have to wait for
			 * any request, so we can stop */
			_starpu_update_data_state(handle, dst_replicate, mode);
			_starpu_msi_cache_hit(requesting_node);

#ifdef STARPU_MEMORY_STATS
			_starpu_memory_handle_stats_cache_hit(handle, requesting_node);

			/* XXX Broken ? */
			if (old_state == STARPU_SHARED
			    && dst_replicate->state == STARPU_OWNER)
				_starpu_memory_handle_stats_shared_to_owner(handle, requesting_node);
#endif

			if (dst_replicate->mc)
				_starpu_memchunk_recently_used(dst_replicate->mc, requesting_node);
		}

		_starpu_spin_unlock(&handle->header_lock);

		if (callback_func)
			callback_func(callback_arg);

		_STARPU_LOG_OUT_TAG("data available");
		return NULL;
	}
	_starpu_msi_cache_miss(requesting_node);

	/* the only remaining situation is that the local copy was invalid */
	STARPU_ASSERT((dst_replicate && dst_replicate->state == STARPU_INVALID) || nwait);

	/* find someone who already has the data */
	int src_node = -1;

	if (dst_replicate && mode & STARPU_R)
	{
		if (dst_replicate->state == STARPU_INVALID)
			src_node = _starpu_select_src_node(handle, requesting_node);
		else
			src_node = requesting_node;
		if (src_node < 0)
		{
			/* We will create it, no need to read an existing value */
			mode &= ~STARPU_R;
		}
	}
	else if (dst_replicate)
	{
		/* if the data is in write only mode (and not SCRATCH or REDUX),
		 * there is no need for a source, data will be initialized by
		 * the task itself */
		if (mode & STARPU_W)
			dst_replicate->initialized = 1;
		if (requesting_node == STARPU_MAIN_RAM && !nwait)
		{
			/* And this is the main RAM, really no need for a
			 * request, just allocate */
			if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch) == 0)
			{
				_starpu_update_data_state(handle, dst_replicate, mode);

				_starpu_spin_unlock(&handle->header_lock);

				if (callback_func)
					callback_func(callback_arg);
				_STARPU_LOG_OUT_TAG("data immediately allocated");
				return NULL;
			}
		}
	}
#define MAX_REQUESTS 4
	/* We can safely assume that there won't be more than 2 hops in the
	 * current implementation */
	unsigned src_nodes[MAX_REQUESTS], dst_nodes[MAX_REQUESTS], handling_nodes[MAX_REQUESTS];
	/* keep one slot for the last W request, if any */
	int write_invalidation = (mode & STARPU_W) && nwait && !is_prefetch;
	int nhops = determine_request_path(handle, src_node, requesting_node, mode, MAX_REQUESTS,
					   src_nodes, dst_nodes, handling_nodes, write_invalidation);

	STARPU_ASSERT(nhops >= 0 && nhops <= MAX_REQUESTS-1);

	struct _starpu_data_request *requests[nhops + write_invalidation];
	/* Did we reuse a request for that hop ? */
	int reused_requests[nhops + write_invalidation];

	/* Construct an array with a list of requests, possibly reusing existing requests */
	int hop;
	for (hop = 0; hop < nhops; hop++)
	{
		struct _starpu_data_request *r;

		unsigned hop_src_node = src_nodes[hop];
		unsigned hop_dst_node = dst_nodes[hop];
		unsigned hop_handling_node = handling_nodes[hop];

		struct _starpu_data_replicate *hop_src_replicate;
		struct _starpu_data_replicate *hop_dst_replicate;
		/* Only the first request is independent */
		unsigned ndeps = (hop == 0)?0:1;

		hop_src_replicate = &handle->per_node[hop_src_node];
		hop_dst_replicate = (hop != nhops - 1)?&handle->per_node[hop_dst_node]:dst_replicate;

		/* Try to reuse a request if possible */
		r = _starpu_search_existing_data_request(hop_dst_replicate,
							 (mode & STARPU_R)?hop_src_node:hop_dst_node,
							 mode, is_prefetch);

		reused_requests[hop] = !!r;

		if (!r)
		{
			/* Create a new request if there was no request to reuse */
			r = _starpu_create_data_request(handle, hop_src_replicate,
							hop_dst_replicate, hop_handling_node,
							mode, ndeps, is_prefetch, prio, 0, origin);
			nwait++;
		}

		requests[hop] = r;
	}
	/* Chain these requests */
	for (hop = 0; hop < nhops; hop++)
	{
		struct _starpu_data_request *r;
		r = requests[hop];

		if (hop != nhops - 1)
		{
			if (!reused_requests[hop + 1])
			{
				r->next_req[r->next_req_count++] = requests[hop + 1];
				STARPU_ASSERT(r->next_req_count <= STARPU_MAXNODES);
			}
		}
		else if (!write_invalidation)
			/* The last request will perform the callback after termination */
			_starpu_data_request_append_callback(r, callback_func, callback_arg);

		if (reused_requests[hop])
			_starpu_spin_unlock(&r->lock);
	}
	if (write_invalidation)
	{
		/* Some requests were still pending, we have to add yet another
		 * request, depending on them, which will invalidate their
		 * result.
		 */
		struct _starpu_data_request *r = _starpu_create_data_request(handle, dst_replicate,
									      dst_replicate, requesting_node,
									      STARPU_W, nwait, is_prefetch, prio, 1, origin);

		/* and perform the callback after termination */
		_starpu_data_request_append_callback(r, callback_func, callback_arg);

		/* We will write to the buffer. We will have to wait for all
		 * existing requests before the last request which will
		 * invalidate all their results (which were possibly spurious,
		 * e.g. too aggressive eviction).
		 */
		unsigned i, j;
		unsigned nnodes = starpu_memory_nodes_get_count();
		for (i = 0; i < nnodes; i++)
			for (j = 0; j < nnodes; j++)
			{
				struct _starpu_data_request *r2 = handle->per_node[i].request[j];
				if (r2)
				{
					_starpu_spin_lock(&r2->lock);
					if (is_prefetch < r2->prefetch)
						/* Hasten the request we will have to wait for */
						_starpu_update_prefetch_status(r2, is_prefetch);
					r2->next_req[r2->next_req_count++] = r;
					STARPU_ASSERT(r2->next_req_count <= STARPU_MAXNODES + 1);
					_starpu_spin_unlock(&r2->lock);
					nwait--;
				}
			}
		STARPU_ASSERT(nwait == 0);

		nhops++;
		requests[nhops - 1] = r;
		/* existing requests will post this one */
		reused_requests[nhops - 1] = 1;
	}
	STARPU_ASSERT(nhops);

	if (!async)
		requests[nhops - 1]->refcnt++;

	/* we only submit the first request, the remaining will be
	 * automatically submitted afterward */
	if (!reused_requests[0])
		_starpu_post_data_request(requests[0], handling_nodes[0]);

	return requests[nhops - 1];
}
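
/* To summarize the request machinery above: a fetch is a short pipeline of
 * chained _starpu_data_request structures, one per hop, each handled by its
 * handling node. Only the first request is posted explicitly; completion of
 * one hop posts the next through next_req[], and the final hop runs the
 * user callback (or, for writes racing with pending requests, a trailing
 * invalidation request does). */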
int _starpu_fetch_data_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *dst_replicate,
			       enum starpu_data_access_mode mode, unsigned detached, unsigned is_prefetch, unsigned async,
			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
{
	unsigned local_node = _starpu_memory_node_get_local_key();
	_STARPU_LOG_IN();

	int cpt = 0;
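	/* Lock-acquisition pattern used throughout this file: rather than
	 * blocking on the header lock right away, spin a bounded number of
	 * times and drive the DataWizard progress engine between attempts,
	 * so that whatever currently holds the lock can advance; only after
	 * STARPU_SPIN_MAXTRY failed attempts do we block for real. */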
	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
	{
		cpt++;
		_starpu_datawizard_progress(local_node, 1);
	}
	if (cpt == STARPU_SPIN_MAXTRY)
		_starpu_spin_lock(&handle->header_lock);

	if (!detached)
	{
		/* Take references which will be released by _starpu_release_data_on_node */
		if (dst_replicate)
			dst_replicate->refcnt++;
		handle->busy_count++;
	}

	struct _starpu_data_request *r;
	r = _starpu_create_request_to_fetch_data(handle, dst_replicate, mode,
						 is_prefetch, async, callback_func, callback_arg, prio, origin);

	/* If no request was created, the handle was already up-to-date on the
	 * node. In this case, _starpu_create_request_to_fetch_data has already
	 * unlocked the header. */
	if (!r)
		return 0;

	_starpu_spin_unlock(&handle->header_lock);

	int ret = async?0:_starpu_wait_data_request_completion(r, 1);
	_STARPU_LOG_OUT();
	return ret;
}
static int idle_prefetch_data_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
{
	return _starpu_fetch_data_on_node(handle, replicate, mode, 1, 2, 1, NULL, NULL, prio, "idle_prefetch_data_on_node");
}

static int prefetch_data_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
{
	return _starpu_fetch_data_on_node(handle, replicate, mode, 1, 1, 1, NULL, NULL, prio, "prefetch_data_on_node");
}

static int fetch_data(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
{
	return _starpu_fetch_data_on_node(handle, replicate, mode, 0, 0, 0, NULL, NULL, prio, "fetch_data");
}
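
/* The three wrappers above only differ in the (detached, is_prefetch, async)
 * flags passed to _starpu_fetch_data_on_node:
 *
 *   wrapper                     detached  is_prefetch  async
 *   fetch_data                  0         0            0      (synchronous, takes refs)
 *   prefetch_data_on_node       1         1            1
 *   idle_prefetch_data_on_node  1         2            1      (lowest urgency)
 */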
uint32_t _starpu_get_data_refcnt(starpu_data_handle_t handle, unsigned node)
{
	return handle->per_node[node].refcnt;
}

size_t _starpu_data_get_size(starpu_data_handle_t handle)
{
	return handle->ops->get_size(handle);
}

uint32_t _starpu_data_get_footprint(starpu_data_handle_t handle)
{
	return handle->footprint;
}
/* in case the data was accessed in write mode, do not forget to
 * make it accessible again once it is possible ! */
void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_wt_mask, struct _starpu_data_replicate *replicate)
{
	uint32_t wt_mask;
	wt_mask = default_wt_mask | handle->wt_mask;
	wt_mask &= (1<<starpu_memory_nodes_get_count())-1;

	/* Note that it is possible that there is no valid copy of the data (if
	 * starpu_data_invalidate was called for instance). In that case, we do
	 * not enforce any write-through mechanism. */
	unsigned memory_node = replicate->memory_node;

	if (replicate->state != STARPU_INVALID && handle->current_mode & STARPU_W)
		if ((wt_mask & ~(1<<memory_node)))
			_starpu_write_through_data(handle, memory_node, wt_mask);

	unsigned local_node = _starpu_memory_node_get_local_key();

	int cpt = 0;
	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
	{
		cpt++;
		_starpu_datawizard_progress(local_node, 1);
	}
	if (cpt == STARPU_SPIN_MAXTRY)
		_starpu_spin_lock(&handle->header_lock);

	/* Release refcnt taken by fetch_data_on_node */
	replicate->refcnt--;
	STARPU_ASSERT_MSG(replicate->refcnt >= 0, "handle %p released too many times", handle);

	STARPU_ASSERT_MSG(handle->busy_count > 0, "handle %p released too many times", handle);
	handle->busy_count--;

	if (!_starpu_notify_data_dependencies(handle))
		_starpu_spin_unlock(&handle->header_lock);
}
static void _starpu_set_data_requested_flag_if_needed(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate)
{
	unsigned local_node = _starpu_memory_node_get_local_key();
	int cpt = 0;
	while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
	{
		cpt++;
		_starpu_datawizard_progress(local_node, 1);
	}
	if (cpt == STARPU_SPIN_MAXTRY)
		_starpu_spin_lock(&handle->header_lock);

	if (replicate->state == STARPU_INVALID)
	{
		unsigned dst_node = replicate->memory_node;
		replicate->requested |= 1UL << dst_node;
	}

	_starpu_spin_unlock(&handle->header_lock);
}
int starpu_prefetch_task_input_on_node_prio(struct starpu_task *task, unsigned node, int prio)
{
	STARPU_ASSERT(!task->prefetched);
	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
	unsigned index;
	for (index = 0; index < nbuffers; index++)
	{
		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, index);
		enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(task, index);

		if (mode & (STARPU_SCRATCH|STARPU_REDUX))
			continue;

		struct _starpu_data_replicate *replicate = &handle->per_node[node];
		prefetch_data_on_node(handle, replicate, mode, prio);

		_starpu_set_data_requested_flag_if_needed(handle, replicate);
	}

	return 0;
}

int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node)
{
	int prio = task->priority;
	if (task->workerorder)
		prio = INT_MAX - task->workerorder;
	return starpu_prefetch_task_input_on_node_prio(task, node, prio);
}

int starpu_idle_prefetch_task_input_on_node_prio(struct starpu_task *task, unsigned node, int prio)
{
	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
	unsigned index;
	for (index = 0; index < nbuffers; index++)
	{
		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, index);
		enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(task, index);

		if (mode & (STARPU_SCRATCH|STARPU_REDUX))
			continue;

		struct _starpu_data_replicate *replicate = &handle->per_node[node];
		idle_prefetch_data_on_node(handle, replicate, mode, prio);
	}

	return 0;
}

int starpu_idle_prefetch_task_input_on_node(struct starpu_task *task, unsigned node)
{
	int prio = task->priority;
	if (task->workerorder)
		prio = INT_MAX - task->workerorder;
	return starpu_idle_prefetch_task_input_on_node_prio(task, node, prio);
}
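
/* These entry points are public API; a scheduler which has already mapped a
 * task to a worker typically warms that worker's memory node up front. A
 * minimal sketch of such caller-side usage (warm_up_worker is a hypothetical
 * helper, not part of StarPU): */
#if 0
static void warm_up_worker(struct starpu_task *task, unsigned workerid)
{
	/* prefetch all of the task's buffers to the memory node of the
	 * worker the scheduler has picked */
	unsigned node = starpu_worker_get_memory_node(workerid);
	starpu_prefetch_task_input_on_node(task, node);
}
#endif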
static struct _starpu_data_replicate *get_replicate(starpu_data_handle_t handle, enum starpu_data_access_mode mode, int workerid, unsigned node)
{
	if (mode & (STARPU_SCRATCH|STARPU_REDUX))
	{
		STARPU_ASSERT(workerid >= 0);
		if (!handle->per_worker)
		{
			_starpu_spin_lock(&handle->header_lock);
			if (!handle->per_worker)
				_starpu_data_initialize_per_worker(handle);
			_starpu_spin_unlock(&handle->header_lock);
		}
		return &handle->per_worker[workerid];
	}
	else
		/* That's a "normal" buffer (R/W) */
		return &handle->per_node[node];
}
/* Synchronously fetch data for a given task (if it's not there already) */
int _starpu_fetch_task_input(struct _starpu_job *j)
{
	_STARPU_TRACE_START_FETCH_INPUT(NULL);
	int profiling = starpu_profiling_status_get();
	struct starpu_task *task = j->task;
	if (profiling && task->profiling_info)
		_starpu_clock_gettime(&task->profiling_info->acquire_data_start_time);

	struct _starpu_data_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
	unsigned local_memory_node = _starpu_memory_node_get_local_key();

	int workerid = starpu_worker_get_id_check();

#ifdef STARPU_USE_FXT
	unsigned long total_size = 0;
#endif

	unsigned index;
	for (index = 0; index < nbuffers; index++)
	{
		int ret;
		starpu_data_handle_t handle = descrs[index].handle;
		enum starpu_data_access_mode mode = descrs[index].mode;
		int node = descrs[index].node;
		if (node == -1)
			node = local_memory_node;

		if (mode == STARPU_NONE ||
		    (mode & ((1<<STARPU_MODE_SHIFT) - 1)) >= STARPU_ACCESS_MODE_MAX ||
		    (mode >> STARPU_MODE_SHIFT) >= (STARPU_SHIFTED_MODE_MAX >> STARPU_MODE_SHIFT))
			STARPU_ASSERT_MSG(0, "mode %d (0x%x) is bogus\n", mode, mode);

		struct _starpu_data_replicate *local_replicate;
		if (index && descrs[index-1].handle == descrs[index].handle)
			/* We have already taken this data, skip it. This
			 * depends on ordering putting writes before reads, see
			 * _starpu_compar_handles */
			continue;

		local_replicate = get_replicate(handle, mode, workerid, node);

		ret = fetch_data(handle, local_replicate, mode, 0);
		if (STARPU_UNLIKELY(ret))
			goto enomem;

#ifdef STARPU_USE_FXT
		total_size += _starpu_data_get_size(handle);
#endif
	}
	_STARPU_TRACE_DATA_LOAD(workerid,total_size);

	/* Now that we have taken the data locks in locking order, fill the codelet interfaces in function order. */
	for (index = 0; index < nbuffers; index++)
	{
		starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, index);
		enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(task, index);
		int node = descrs[index].node;
		if (node == -1)
			node = local_memory_node;

		struct _starpu_data_replicate *local_replicate;

		local_replicate = get_replicate(handle, mode, workerid, node);
		_STARPU_TASK_SET_INTERFACE(task, local_replicate->data_interface, index);

		/* If the replicate was not initialized yet, we have to do it now */
		if (!(mode & STARPU_SCRATCH) && !local_replicate->initialized)
			_starpu_redux_init_data_replicate(handle, local_replicate, workerid);
	}

	if (profiling && task->profiling_info)
		_starpu_clock_gettime(&task->profiling_info->acquire_data_end_time);

	_STARPU_TRACE_END_FETCH_INPUT(NULL);

	return 0;
enomem:
	_STARPU_TRACE_END_FETCH_INPUT(NULL);
	_STARPU_DISP("something went wrong with buffer %u\n", index);

	/* try to unreference all the input that were successfully taken */
	unsigned index2;
	for (index2 = 0; index2 < index; index2++)
	{
		starpu_data_handle_t handle = descrs[index2].handle;
		enum starpu_data_access_mode mode = descrs[index2].mode;
		int node = descrs[index2].node;
		if (node == -1)
			node = local_memory_node;

		struct _starpu_data_replicate *local_replicate;

		if (index2 && descrs[index2-1].handle == descrs[index2].handle)
			/* We have already released this data, skip it. This
			 * depends on ordering putting writes before reads, see
			 * _starpu_compar_handles */
			continue;

		local_replicate = get_replicate(handle, mode, workerid, node);

		_starpu_release_data_on_node(handle, 0, local_replicate);
	}

	return -1;
}
/* Release task data dependencies */
void __starpu_push_task_output(struct _starpu_job *j)
{
#ifdef STARPU_OPENMP
	STARPU_ASSERT(!j->continuation);
#endif
	int profiling = starpu_profiling_status_get();
	struct starpu_task *task = j->task;
	if (profiling && task->profiling_info)
		_starpu_clock_gettime(&task->profiling_info->release_data_start_time);

	struct _starpu_data_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);

	int workerid = starpu_worker_get_id();
	unsigned local_memory_node = _starpu_memory_node_get_local_key();

	unsigned index;
	for (index = 0; index < nbuffers; index++)
	{
		starpu_data_handle_t handle = descrs[index].handle;
		enum starpu_data_access_mode mode = descrs[index].mode;
		int node = descrs[index].node;
		if (node == -1 && task->cl->where != STARPU_NOWHERE)
			node = local_memory_node;

		struct _starpu_data_replicate *local_replicate;

		if (index && descrs[index-1].handle == descrs[index].handle)
			/* We have already released this data, skip it. This
			 * depends on ordering putting writes before reads, see
			 * _starpu_compar_handles */
			continue;

		if (node != -1)
			local_replicate = get_replicate(handle, mode, workerid, node);

		/* Keep a reference for future
		 * _starpu_release_task_enforce_sequential_consistency call */
		_starpu_spin_lock(&handle->header_lock);
		handle->busy_count++;

		if (node == -1)
		{
			/* NOWHERE case, just notify dependencies */
			if (!_starpu_notify_data_dependencies(handle))
				_starpu_spin_unlock(&handle->header_lock);
		}
		else
		{
			_starpu_spin_unlock(&handle->header_lock);
			_starpu_release_data_on_node(handle, 0, local_replicate);
		}
	}

	if (profiling && task->profiling_info)
		_starpu_clock_gettime(&task->profiling_info->release_data_end_time);
}

/* Version for a driver running on a worker: we show the driver state in the trace */
void _starpu_push_task_output(struct _starpu_job *j)
{
	_STARPU_TRACE_START_PUSH_OUTPUT(NULL);
	__starpu_push_task_output(j);
	_STARPU_TRACE_END_PUSH_OUTPUT(NULL);
}
struct fetch_nowhere_wrapper
{
	struct _starpu_job *j;
	unsigned pending;
};

static void _starpu_fetch_nowhere_task_input_cb(void *arg);

/* Asynchronously fetch data for a task which will not actually run
 * anywhere (STARPU_NOWHERE): only the data transfers are performed */
void _starpu_fetch_nowhere_task_input(struct _starpu_job *j)
{
	int profiling = starpu_profiling_status_get();
	struct starpu_task *task = j->task;
	if (profiling && task->profiling_info)
		_starpu_clock_gettime(&task->profiling_info->acquire_data_start_time);

	struct _starpu_data_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
	unsigned nfetchbuffers = 0;
	struct fetch_nowhere_wrapper *wrapper;

	unsigned index;
	for (index = 0; index < nbuffers; index++)
	{
		int node = descrs[index].node;
		if (node != -1)
			nfetchbuffers++;
	}

	if (!nfetchbuffers)
	{
		/* Nothing to fetch actually, already finished! */
		__starpu_push_task_output(j);
		_starpu_handle_job_termination(j);
		_STARPU_LOG_OUT_TAG("handle_job_termination");
		return;
	}

	wrapper = malloc(sizeof(*wrapper));
	wrapper->j = j;
	wrapper->pending = nfetchbuffers;

	for (index = 0; index < nbuffers; index++)
	{
		starpu_data_handle_t handle = descrs[index].handle;
		enum starpu_data_access_mode mode = descrs[index].mode;
		int node = descrs[index].node;
		if (node == -1)
			continue;

		if (mode == STARPU_NONE ||
		    (mode & ((1<<STARPU_MODE_SHIFT) - 1)) >= STARPU_ACCESS_MODE_MAX ||
		    (mode >> STARPU_MODE_SHIFT) >= (STARPU_SHIFTED_MODE_MAX >> STARPU_MODE_SHIFT))
			STARPU_ASSERT_MSG(0, "mode %d (0x%x) is bogus\n", mode, mode);
		STARPU_ASSERT(mode != STARPU_SCRATCH && mode != STARPU_REDUX);

		struct _starpu_data_replicate *local_replicate;

		local_replicate = get_replicate(handle, mode, -1, node);

		_starpu_fetch_data_on_node(handle, local_replicate, mode, 0, 0, 1, _starpu_fetch_nowhere_task_input_cb, wrapper, 0, "_starpu_fetch_nowhere_task_input");
	}

	if (profiling && task->profiling_info)
		_starpu_clock_gettime(&task->profiling_info->acquire_data_end_time);
}
static void _starpu_fetch_nowhere_task_input_cb(void *arg)
{
	/* One more transfer finished */
	struct fetch_nowhere_wrapper *wrapper = arg;

	unsigned pending = STARPU_ATOMIC_ADD(&wrapper->pending, -1);
	ANNOTATE_HAPPENS_BEFORE(&wrapper->pending);
	if (pending == 0)
	{
		ANNOTATE_HAPPENS_AFTER(&wrapper->pending);
		/* Finished transferring, task is over */
		struct _starpu_job *j = wrapper->j;
		free(wrapper);
		__starpu_push_task_output(j);
		_starpu_handle_job_termination(j);
		_STARPU_LOG_OUT_TAG("handle_job_termination");
	}
}
/* NB : this value can only be an indication of the status of a data
   at some point, but there is no strong guarantee ! */
unsigned _starpu_is_data_present_or_requested(starpu_data_handle_t handle, unsigned node)
{
	unsigned ret = 0;

// XXX : this is just a hint, so we don't take the lock ...
//	STARPU_PTHREAD_SPIN_LOCK(&handle->header_lock);

	if (handle->per_node[node].state != STARPU_INVALID)
	{
		ret = 1;
	}
	else
	{
		unsigned i;
		unsigned nnodes = starpu_memory_nodes_get_count();

		for (i = 0; i < nnodes; i++)
		{
			if ((handle->per_node[node].requested & (1UL << i)) || handle->per_node[node].request[i])
				ret = 1;
		}
	}

//	STARPU_PTHREAD_SPIN_UNLOCK(&handle->header_lock);
	return ret;
}
void _starpu_data_set_unregister_hook(starpu_data_handle_t handle, _starpu_data_handle_unregister_hook func)
{
	handle->unregister_hook = func;
}