coherency.c

  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. * Copyright (C) 2009-2017 Université de Bordeaux
  3. * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 CNRS
  4. * Copyright (C) 2014 INRIA
  5. *
  6. * StarPU is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU Lesser General Public License as published by
  8. * the Free Software Foundation; either version 2.1 of the License, or (at
  9. * your option) any later version.
  10. *
  11. * StarPU is distributed in the hope that it will be useful, but
  12. * WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14. *
  15. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  16. */
  17. #include <common/config.h>
  18. #include <datawizard/coherency.h>
  19. #include <datawizard/copy_driver.h>
  20. #include <datawizard/write_back.h>
  21. #include <datawizard/memory_nodes.h>
  22. #include <core/dependencies/data_concurrency.h>
  23. #include <core/disk.h>
  24. #include <profiling/profiling.h>
  25. #include <math.h>
  26. #include <core/task.h>
  27. #include <starpu_scheduler.h>
  28. #include <core/workers.h>
  29. #include <limits.h>
  30. #ifdef STARPU_SIMGRID
  31. #include <core/simgrid.h>
  32. #endif
  33. static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned src_node, unsigned dst_node, unsigned *handling_node);
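/* (Summary comment added for clarity.)
 * _starpu_select_src_node picks the memory node to read a valid copy of the
 * handle from, given the destination node. When transfer time predictions are
 * available for every candidate, the cheapest predicted direct link wins;
 * otherwise we fall back to preferring a CPU RAM copy, then a GPU copy, then
 * a disk copy. It returns -1 when no valid copy exists yet but handle->init_cl
 * can build one. */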
  34. int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
  35. {
  36. int src_node = -1;
  37. unsigned i;
  38. unsigned nnodes = starpu_memory_nodes_get_count();
  39. /* first find a valid copy, either a STARPU_OWNER or a STARPU_SHARED */
  40. unsigned node;
  41. size_t size = _starpu_data_get_size(handle);
  42. double cost = INFINITY;
  43. unsigned src_node_mask = 0;
  44. for (node = 0; node < nnodes; node++)
  45. {
  46. if (handle->per_node[node].state != STARPU_INVALID)
  47. {
  48. /* we found a copy ! */
  49. src_node_mask |= (1<<node);
  50. }
  51. }
  52. if (src_node_mask == 0 && handle->init_cl)
  53. {
  54. /* No copy yet, but application told us how to build it. */
  55. return -1;
  56. }
  57. /* we should have found at least one copy ! */
  58. STARPU_ASSERT_MSG(src_node_mask != 0, "The data for the handle %p is requested, but the handle does not have a valid value. Perhaps some initialization task is missing?", handle);
  59. /* Without knowing the size, we won't know the cost */
  60. if (!size)
  61. cost = 0;
  62. /* Check whether we have transfer cost for all nodes, if so, take the minimum */
  63. if (cost)
  64. for (i = 0; i < nnodes; i++)
  65. {
  66. if (src_node_mask & (1<<i))
  67. {
  68. double time = starpu_transfer_predict(i, destination, size);
  69. unsigned handling_node;
  70. /* Avoid indirect transfers */
  71. if (!link_supports_direct_transfers(handle, i, destination, &handling_node))
  72. continue;
  73. if (_STARPU_IS_ZERO(time))
  74. {
  75. /* No estimation, will have to revert to dumb strategy */
  76. cost = 0.0;
  77. break;
  78. }
  79. else if (time < cost)
  80. {
  81. cost = time;
  82. src_node = i;
  83. }
  84. }
  85. }
  86. if (cost && src_node != -1)
  87. {
  88. /* Could estimate through cost, return that */
  89. STARPU_ASSERT(handle->per_node[src_node].allocated);
  90. STARPU_ASSERT(handle->per_node[src_node].initialized);
  91. return src_node;
  92. }
  93. int i_ram = -1;
  94. int i_gpu = -1;
  95. int i_disk = -1;
  96. /* Revert to dumb strategy: take RAM unless only a GPU has it */
  97. for (i = 0; i < nnodes; i++)
  98. {
  99. if (src_node_mask & (1<<i))
  100. {
  101. int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node) = handle->ops->copy_methods->can_copy;
  102. /* Avoid transfers which the interface does not want */
  103. if (can_copy)
  104. {
  105. void *src_interface = handle->per_node[i].data_interface;
  106. void *dst_interface = handle->per_node[destination].data_interface;
  107. unsigned handling_node;
  108. if (!link_supports_direct_transfers(handle, i, destination, &handling_node))
  109. {
  110. /* Avoid going through RAM if the interface does not want it */
  111. void *ram_interface = handle->per_node[STARPU_MAIN_RAM].data_interface;
  112. if ((!can_copy(src_interface, i, ram_interface, STARPU_MAIN_RAM, i)
  113. && !can_copy(src_interface, i, ram_interface, STARPU_MAIN_RAM, STARPU_MAIN_RAM))
  114. || (!can_copy(ram_interface, STARPU_MAIN_RAM, dst_interface, destination, STARPU_MAIN_RAM)
  115. && !can_copy(ram_interface, STARPU_MAIN_RAM, dst_interface, destination, destination)))
  116. continue;
  117. }
  118. }
  119. /* however, GPUs are expensive sources, really!
  120. * Unless peer transfer is supported (and it would then have been selected above).
  121. * Others should be ok */
  122. if (starpu_node_get_kind(i) == STARPU_CUDA_RAM ||
  123. starpu_node_get_kind(i) == STARPU_OPENCL_RAM ||
  124. starpu_node_get_kind(i) == STARPU_MIC_RAM)
  125. i_gpu = i;
  126. if (starpu_node_get_kind(i) == STARPU_CPU_RAM ||
  127. starpu_node_get_kind(i) == STARPU_SCC_RAM ||
  128. starpu_node_get_kind(i) == STARPU_SCC_SHM ||
  129. starpu_node_get_kind(i) == STARPU_MPI_MS_RAM)
  130. i_ram = i;
  131. if (starpu_node_get_kind(i) == STARPU_DISK_RAM)
  132. i_disk = i;
  133. }
  134. }
  135. /* we have to use cpu_ram first */
  136. if (i_ram != -1)
  137. src_node = i_ram;
  138. /* otherwise prefer a GPU copy over the disk */
  139. else if (i_gpu != -1)
  140. src_node = i_gpu;
  141. else
  142. src_node = i_disk;
  143. STARPU_ASSERT(src_node != -1);
  144. STARPU_ASSERT(handle->per_node[src_node].allocated);
  145. STARPU_ASSERT(handle->per_node[src_node].initialized);
  146. return src_node;
  147. }
  148. /* this may be called once the data is fetched, with the header and STARPU_RW-lock held */
  149. void _starpu_update_data_state(starpu_data_handle_t handle,
  150. struct _starpu_data_replicate *requesting_replicate,
  151. enum starpu_data_access_mode mode)
  152. {
  153. /* There is nothing to do for relaxed coherency modes (scratch or
  154. * reductions) */
  155. if (!(mode & STARPU_RW))
  156. return;
  157. unsigned nnodes = starpu_memory_nodes_get_count();
  158. /* the data is present now */
  159. unsigned requesting_node = requesting_replicate->memory_node;
  160. requesting_replicate->requested &= ~(1UL << requesting_node);
  161. if (mode & STARPU_W)
  162. {
  163. /* the requesting node now has the only valid copy */
  164. unsigned node;
  165. for (node = 0; node < nnodes; node++)
  166. {
  167. _STARPU_TRACE_DATA_INVALIDATE(handle, node);
  168. handle->per_node[node].state = STARPU_INVALID;
  169. }
  170. requesting_replicate->state = STARPU_OWNER;
  171. if (handle->home_node != -1 && handle->per_node[handle->home_node].state == STARPU_INVALID)
  172. /* Notify that this MC is now dirty */
  173. _starpu_memchunk_dirty(requesting_replicate->mc, requesting_replicate->memory_node);
  174. }
  175. else
  176. {
  177. /* read only */
  178. if (requesting_replicate->state != STARPU_OWNER)
  179. {
  180. /* there was at least another copy of the data */
  181. unsigned node;
  182. for (node = 0; node < nnodes; node++)
  183. {
  184. struct _starpu_data_replicate *replicate = &handle->per_node[node];
  185. if (replicate->state != STARPU_INVALID)
  186. replicate->state = STARPU_SHARED;
  187. }
  188. requesting_replicate->state = STARPU_SHARED;
  189. }
  190. }
  191. }
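/* (Explanatory comment added.) In the transfer code below, the "handling
 * node" of a hop is the memory node whose driver worker actually issues the
 * copy; it may be either end of the link. Disk nodes have no driver thread of
 * their own, so another node always handles transfers involving them. */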
  192. static int worker_supports_direct_access(unsigned node, unsigned handling_node)
  193. {
  194. /* disks only support disk <-> ram and disk <-> disk transfers, which are handled separately, so no direct worker access here */
  195. if (starpu_node_get_kind(node) == STARPU_DISK_RAM || starpu_node_get_kind(handling_node) == STARPU_DISK_RAM)
  196. return 0;
  197. if (node == handling_node)
  198. return 1;
  199. if (!_starpu_memory_node_get_nworkers(handling_node))
  200. /* No worker to process the request from that node */
  201. return 0;
  202. int type = starpu_node_get_kind(node);
  203. switch (type)
  204. {
  205. case STARPU_CUDA_RAM:
  206. {
  207. /* GPUs do not always allow direct remote access: if CUDA
  208. * peer-to-peer support (CUDA 4+) is enabled, we allow two CUDA devices to communicate. */
  209. #ifdef STARPU_SIMGRID
  210. if (starpu_node_get_kind(handling_node) == STARPU_CUDA_RAM)
  211. {
  212. msg_host_t host = _starpu_simgrid_get_memnode_host(handling_node);
  213. const char* cuda_memcpy_peer = MSG_host_get_property_value(host, "memcpy_peer");
  214. return cuda_memcpy_peer && atoll(cuda_memcpy_peer);
  215. }
  216. else
  217. return 0;
  218. #elif defined(HAVE_CUDA_MEMCPY_PEER)
  219. /* not simgrid: CUDA peer-to-peer copies are available */
  220. enum starpu_node_kind kind = starpu_node_get_kind(handling_node);
  221. return kind == STARPU_CUDA_RAM;
  222. #else /* HAVE_CUDA_MEMCPY_PEER */
  223. /* Direct GPU-GPU transfers are not allowed in general */
  224. return 0;
  225. #endif /* HAVE_CUDA_MEMCPY_PEER */
  226. }
  227. case STARPU_OPENCL_RAM:
  228. return 0;
  229. case STARPU_MIC_RAM:
  230. /* TODO: We don't handle direct MIC-MIC transfers yet */
  231. return 0;
  232. case STARPU_MPI_MS_RAM:
  233. {
  234. enum starpu_node_kind kind = starpu_node_get_kind(handling_node);
  235. return kind == STARPU_MPI_MS_RAM;
  236. }
  237. case STARPU_SCC_RAM:
  238. return 1;
  239. default:
  240. return 1;
  241. }
  242. }
  243. static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned src_node, unsigned dst_node, unsigned *handling_node)
  244. {
  245. int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node) = handle->ops->copy_methods->can_copy;
  246. void *src_interface = handle->per_node[src_node].data_interface;
  247. void *dst_interface = handle->per_node[dst_node].data_interface;
  248. /* XXX That's a hack until we fix cudaMemcpy3DPeerAsync in the block interface
  249. * Perhaps not all data interfaces provide a direct GPU-GPU transfer
  250. * method! */
  251. #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
  252. if (src_node != dst_node && starpu_node_get_kind(src_node) == STARPU_CUDA_RAM && starpu_node_get_kind(dst_node) == STARPU_CUDA_RAM)
  253. {
  254. const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
  255. if (!copy_methods->cuda_to_cuda_async && !copy_methods->any_to_any)
  256. return 0;
  257. }
  258. #endif
  259. /* Note: with CUDA, performance seems a bit better when issuing the transfer from the destination (tested without GPUDirect, but GPUDirect probably behaves the same) */
  260. if (worker_supports_direct_access(src_node, dst_node) && (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, dst_node)))
  261. {
  262. *handling_node = dst_node;
  263. return 1;
  264. }
  265. if (worker_supports_direct_access(dst_node, src_node) && (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, src_node)))
  266. {
  267. *handling_node = src_node;
  268. return 1;
  269. }
  270. /* Link between disk and ram */
  271. if ((starpu_node_get_kind(src_node) == STARPU_DISK_RAM && starpu_node_get_kind(dst_node) == STARPU_CPU_RAM) ||
  272. (starpu_node_get_kind(src_node) == STARPU_CPU_RAM && starpu_node_get_kind(dst_node) == STARPU_DISK_RAM))
  273. {
  274. /* FIXME: not necessarily a worker :/ */
  275. *handling_node = STARPU_MAIN_RAM;
  276. return 1;
  277. }
  278. /* link between disk and disk, and they have the same kind */
  279. if (_starpu_is_same_kind_disk(src_node, dst_node))
  280. return 1;
  281. return 0;
  282. }
  283. /* Determines the path of a request: each hop is defined by (src,dst) and the
  284. * node that handles the hop. The returned value indicates the number of hops,
  285. * and max_len is the maximum number of hops (i.e. the size of the
  286. * src_nodes, dst_nodes and handling_nodes arrays). */
  287. static int determine_request_path(starpu_data_handle_t handle,
  288. int src_node, int dst_node,
  289. enum starpu_data_access_mode mode, int max_len,
  290. unsigned *src_nodes, unsigned *dst_nodes,
  291. unsigned *handling_nodes, unsigned write_invalidation)
  292. {
  293. if (src_node == dst_node || !(mode & STARPU_R))
  294. {
  295. if (dst_node == -1 || starpu_node_get_kind(dst_node) == STARPU_DISK_RAM)
  296. handling_nodes[0] = src_node;
  297. else
  298. handling_nodes[0] = dst_node;
  299. if (write_invalidation)
  300. /* The invalidation request will be enough */
  301. return 0;
  302. /* The destination node should only allocate the data, no transfer is required */
  303. STARPU_ASSERT(max_len >= 1);
  304. src_nodes[0] = STARPU_MAIN_RAM; // ignored
  305. dst_nodes[0] = dst_node;
  306. return 1;
  307. }
  308. unsigned handling_node;
  309. int link_is_valid = link_supports_direct_transfers(handle, src_node, dst_node, &handling_node);
  310. if (!link_is_valid)
  311. {
  312. int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node) = handle->ops->copy_methods->can_copy;
  313. void *src_interface = handle->per_node[src_node].data_interface;
  314. void *dst_interface = handle->per_node[dst_node].data_interface;
  315. /* We need an intermediate hop to implement data staging
  316. * through main memory. */
  317. STARPU_ASSERT(max_len >= 2);
  318. STARPU_ASSERT(src_node >= 0);
  319. /* GPU -> RAM */
  320. src_nodes[0] = src_node;
  321. dst_nodes[0] = STARPU_MAIN_RAM;
  322. if (starpu_node_get_kind(src_node) == STARPU_DISK_RAM)
  323. /* Disks don't have their own driver thread */
  324. handling_nodes[0] = dst_node;
  325. else if (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, src_node))
  326. {
  327. handling_nodes[0] = src_node;
  328. }
  329. else
  330. {
  331. STARPU_ASSERT_MSG(can_copy(src_interface, src_node, dst_interface, dst_node, dst_node), "interface %d refuses all kinds of transfers from node %d to node %d\n", handle->ops->interfaceid, src_node, dst_node);
  332. handling_nodes[0] = dst_node;
  333. }
  334. /* RAM -> GPU */
  335. src_nodes[1] = STARPU_MAIN_RAM;
  336. dst_nodes[1] = dst_node;
  337. if (starpu_node_get_kind(dst_node) == STARPU_DISK_RAM)
  338. /* Disks don't have their own driver thread */
  339. handling_nodes[1] = src_node;
  340. else if (!can_copy || can_copy(src_interface, src_node, dst_interface, dst_node, dst_node))
  341. {
  342. handling_nodes[1] = dst_node;
  343. }
  344. else
  345. {
  346. STARPU_ASSERT_MSG(can_copy(src_interface, src_node, dst_interface, dst_node, src_node), "interface %d refuses all kinds of transfers from node %d to node %d\n", handle->ops->interfaceid, src_node, dst_node);
  347. handling_nodes[1] = src_node;
  348. }
  349. return 2;
  350. }
  351. else
  352. {
  353. STARPU_ASSERT(max_len >= 1);
  354. src_nodes[0] = src_node;
  355. dst_nodes[0] = dst_node;
  356. handling_nodes[0] = handling_node;
  357. #if !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
  358. STARPU_ASSERT(!(mode & STARPU_R) || starpu_node_get_kind(src_node) != STARPU_CUDA_RAM || starpu_node_get_kind(dst_node) != STARPU_CUDA_RAM);
  359. #endif
  360. return 1;
  361. }
  362. }
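/* Illustrative sketch (not built; handle, cuda0_node and cuda1_node are
 * hypothetical variables): how a GPU -> GPU read without peer-to-peer support
 * is decomposed by the function above into two hops staged through main
 * memory. */
#if 0
unsigned src_nodes[4], dst_nodes[4], handling_nodes[4];
int nhops = determine_request_path(handle, cuda0_node, cuda1_node, STARPU_R,
				   4, src_nodes, dst_nodes, handling_nodes, 0);
/* Expected: nhops == 2
 *   hop 0: cuda0_node -> STARPU_MAIN_RAM, usually handled by cuda0_node
 *   hop 1: STARPU_MAIN_RAM -> cuda1_node, usually handled by cuda1_node */
#endif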
  363. /* handle->lock should be taken. r is returned locked. The node parameter
  364. * indicates either the source of the request, or the destination for a
  365. * write-only request. */
  366. static struct _starpu_data_request *_starpu_search_existing_data_request(struct _starpu_data_replicate *replicate, unsigned node, enum starpu_data_access_mode mode, unsigned is_prefetch)
  367. {
  368. struct _starpu_data_request *r;
  369. r = replicate->request[node];
  370. if (r)
  371. {
  372. _starpu_spin_checklocked(&r->handle->header_lock);
  373. _starpu_spin_lock(&r->lock);
  374. /* perhaps we need to "upgrade" the request */
  375. if (is_prefetch < r->prefetch)
  376. _starpu_update_prefetch_status(r, is_prefetch);
  377. if (mode & STARPU_R)
  378. {
  379. /* in case the existing request did not imply a memory
  380. * transfer yet, we have to take a second refcnt now
  381. * for the source, in addition to the refcnt for the
  382. * destination
  383. * (so that the source remains valid) */
  384. if (!(r->mode & STARPU_R))
  385. {
  386. replicate->refcnt++;
  387. replicate->handle->busy_count++;
  388. }
  389. r->mode = (enum starpu_data_access_mode) ((int) r->mode | (int) STARPU_R);
  390. }
  391. if (mode & STARPU_W)
  392. r->mode = (enum starpu_data_access_mode) ((int) r->mode | (int) STARPU_W);
  393. }
  394. return r;
  395. }
  396. /*
  397. * This function is called when the data is needed on the local node; it
  398. * returns a pointer to the local copy
  399. *
  400. * R STARPU_W STARPU_RW
  401. * Owner OK OK OK
  402. * Shared OK 1 1
  403. * Invalid 2 3 4
  404. *
  405. * case 1 : shared + (read)write :
  406. * no data copy but shared->Invalid/Owner
  407. * case 2 : invalid + read :
  408. * data copy + invalid->shared + owner->shared (STARPU_ASSERT(there is a valid))
  409. * case 3 : invalid + write :
  410. * no data copy + invalid->owner + (owner,shared)->invalid
  411. * case 4 : invalid + R/STARPU_W :
  412. * data copy + if (STARPU_W) (invalid->owner + owner->invalid)
  413. * else (invalid,owner->shared)
  414. */
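/* Worked example (added for illustration): suppose node 0 is the OWNER and
 * node 1 is INVALID. A STARPU_R fetch on node 1 is case 2: the data is copied
 * and both replicates become SHARED. A later STARPU_W access on node 0 is
 * case 1: no copy is needed, node 0 becomes OWNER again and node 1 is
 * invalidated. */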
  415. struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
  416. struct _starpu_data_replicate *dst_replicate,
  417. enum starpu_data_access_mode mode, unsigned is_prefetch,
  418. unsigned async,
  419. void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
  420. {
  421. /* We don't care about commuting for data requests, that was handled before. */
  422. mode &= ~STARPU_COMMUTE;
  423. /* This function is called with handle's header lock taken */
  424. _starpu_spin_checklocked(&handle->header_lock);
  425. int requesting_node = dst_replicate ? dst_replicate->memory_node : -1;
  426. unsigned nwait = 0;
  427. if (mode & STARPU_W)
  428. {
  429. /* We will write to the buffer. We will have to wait for all
  430. * existing requests before the last request which will
  431. * invalidate all their results (which were possibly spurious,
  432. * e.g. too aggressive eviction).
  433. */
  434. unsigned i, j;
  435. unsigned nnodes = starpu_memory_nodes_get_count();
  436. for (i = 0; i < nnodes; i++)
  437. for (j = 0; j < nnodes; j++)
  438. if (handle->per_node[i].request[j])
  439. nwait++;
  440. /* If the request is not detached (i.e. the caller really wants
  441. * proper ownership), no new requests will appear because a
  442. * reference will be kept on the dst replicate, which will
  443. * notably prevent data reclaiming.
  444. */
  445. }
  446. if ((!dst_replicate || dst_replicate->state != STARPU_INVALID) && (!nwait || is_prefetch))
  447. {
  448. if (dst_replicate)
  449. {
  450. #ifdef STARPU_MEMORY_STATS
  451. enum _starpu_cache_state old_state = dst_replicate->state;
  452. #endif
  453. /* the data is already available and we don't have to wait for
  454. * any request, so we can stop */
  455. _starpu_update_data_state(handle, dst_replicate, mode);
  456. _starpu_msi_cache_hit(requesting_node);
  457. #ifdef STARPU_MEMORY_STATS
  458. _starpu_memory_handle_stats_cache_hit(handle, requesting_node);
  459. /* XXX Broken ? */
  460. if (old_state == STARPU_SHARED
  461. && dst_replicate->state == STARPU_OWNER)
  462. _starpu_memory_handle_stats_shared_to_owner(handle, requesting_node);
  463. #endif
  464. if (dst_replicate->mc)
  465. _starpu_memchunk_recently_used(dst_replicate->mc, requesting_node);
  466. }
  467. _starpu_spin_unlock(&handle->header_lock);
  468. if (callback_func)
  469. callback_func(callback_arg);
  470. _STARPU_LOG_OUT_TAG("data available");
  471. return NULL;
  472. }
  473. if (dst_replicate)
  474. _starpu_msi_cache_miss(requesting_node);
  475. /* the only remaining situation is that the local copy was invalid */
  476. STARPU_ASSERT((dst_replicate && dst_replicate->state == STARPU_INVALID) || nwait);
  477. /* find someone who already has the data */
  478. int src_node = -1;
  479. if (dst_replicate && mode & STARPU_R)
  480. {
  481. if (dst_replicate->state == STARPU_INVALID)
  482. src_node = _starpu_select_src_node(handle, requesting_node);
  483. else
  484. src_node = requesting_node;
  485. if (src_node < 0)
  486. {
  487. /* We will create it, no need to read an existing value */
  488. mode &= ~STARPU_R;
  489. }
  490. }
  491. else if (dst_replicate)
  492. {
  493. /* if the data is in write only mode (and not SCRATCH or REDUX), there is no need for a source, data will be initialized by the task itself */
  494. if (mode & STARPU_W)
  495. dst_replicate->initialized = 1;
  496. if (requesting_node == STARPU_MAIN_RAM && !nwait)
  497. {
  498. /* And this is the main RAM, really no need for a
  499. * request, just allocate */
  500. if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch) == 0)
  501. {
  502. _starpu_update_data_state(handle, dst_replicate, mode);
  503. _starpu_spin_unlock(&handle->header_lock);
  504. if (callback_func)
  505. callback_func(callback_arg);
  506. _STARPU_LOG_OUT_TAG("data immediately allocated");
  507. return NULL;
  508. }
  509. }
  510. }
  511. #define MAX_REQUESTS 4
  512. /* We can safely assume that there won't be more than 2 hops in the
  513. * current implementation */
  514. unsigned src_nodes[MAX_REQUESTS], dst_nodes[MAX_REQUESTS], handling_nodes[MAX_REQUESTS];
  515. /* keep one slot for the last W request, if any */
  516. int write_invalidation = (mode & STARPU_W) && nwait && !is_prefetch;
  517. int nhops = determine_request_path(handle, src_node, requesting_node, mode, MAX_REQUESTS,
  518. src_nodes, dst_nodes, handling_nodes, write_invalidation);
  519. STARPU_ASSERT(nhops >= 0 && nhops <= MAX_REQUESTS-1);
  520. struct _starpu_data_request *requests[nhops + write_invalidation];
  521. /* Did we reuse a request for that hop ? */
  522. int reused_requests[nhops + write_invalidation];
  523. /* Construct an array with a list of requests, possibly reusing existing requests */
  524. int hop;
  525. for (hop = 0; hop < nhops; hop++)
  526. {
  527. struct _starpu_data_request *r;
  528. unsigned hop_src_node = src_nodes[hop];
  529. unsigned hop_dst_node = dst_nodes[hop];
  530. unsigned hop_handling_node = handling_nodes[hop];
  531. struct _starpu_data_replicate *hop_src_replicate;
  532. struct _starpu_data_replicate *hop_dst_replicate;
  533. /* Only the first request is independent */
  534. unsigned ndeps = (hop == 0)?0:1;
  535. hop_src_replicate = &handle->per_node[hop_src_node];
  536. hop_dst_replicate = (hop != nhops - 1)?&handle->per_node[hop_dst_node]:dst_replicate;
  537. /* Try to reuse a request if possible */
  538. r = _starpu_search_existing_data_request(hop_dst_replicate,
  539. (mode & STARPU_R)?hop_src_node:hop_dst_node,
  540. mode, is_prefetch);
  541. reused_requests[hop] = !!r;
  542. if (!r)
  543. {
  544. /* Create a new request if there was no request to reuse */
  545. r = _starpu_create_data_request(handle, hop_src_replicate,
  546. hop_dst_replicate, hop_handling_node,
  547. mode, ndeps, is_prefetch, prio, 0, origin);
  548. nwait++;
  549. }
  550. requests[hop] = r;
  551. }
  552. /* Chain these requests */
  553. for (hop = 0; hop < nhops; hop++)
  554. {
  555. struct _starpu_data_request *r;
  556. r = requests[hop];
  557. if (hop != nhops - 1)
  558. {
  559. if (!reused_requests[hop + 1])
  560. {
  561. r->next_req[r->next_req_count++] = requests[hop + 1];
  562. STARPU_ASSERT(r->next_req_count <= STARPU_MAXNODES);
  563. }
  564. }
  565. else if (!write_invalidation)
  566. /* The last request will perform the callback after termination */
  567. _starpu_data_request_append_callback(r, callback_func, callback_arg);
  568. if (reused_requests[hop])
  569. _starpu_spin_unlock(&r->lock);
  570. }
  571. if (write_invalidation)
  572. {
  573. /* Some requests were still pending, we have to add yet another
  574. * request, depending on them, which will invalidate their
  575. * result.
  576. */
  577. struct _starpu_data_request *r = _starpu_create_data_request(handle, dst_replicate,
  578. dst_replicate, requesting_node,
  579. STARPU_W, nwait, is_prefetch, prio, 1, origin);
  580. /* and perform the callback after termination */
  581. _starpu_data_request_append_callback(r, callback_func, callback_arg);
  582. /* We will write to the buffer. We will have to wait for all
  583. * existing requests before the last request which will
  584. * invalidate all their results (which were possibly spurious,
  585. * e.g. too aggressive eviction).
  586. */
  587. unsigned i, j;
  588. unsigned nnodes = starpu_memory_nodes_get_count();
  589. for (i = 0; i < nnodes; i++)
  590. for (j = 0; j < nnodes; j++)
  591. {
  592. struct _starpu_data_request *r2 = handle->per_node[i].request[j];
  593. if (r2)
  594. {
  595. _starpu_spin_lock(&r2->lock);
  596. if (is_prefetch < r2->prefetch)
  597. /* Hasten the request we will have to wait for */
  598. _starpu_update_prefetch_status(r2, is_prefetch);
  599. r2->next_req[r2->next_req_count++] = r;
  600. STARPU_ASSERT(r2->next_req_count <= STARPU_MAXNODES + 1);
  601. _starpu_spin_unlock(&r2->lock);
  602. nwait--;
  603. }
  604. }
  605. STARPU_ASSERT(nwait == 0);
  606. nhops++;
  607. requests[nhops - 1] = r;
  608. /* existing requests will post this one */
  609. reused_requests[nhops - 1] = 1;
  610. }
  611. STARPU_ASSERT(nhops);
  612. if (!async)
  613. requests[nhops - 1]->refcnt++;
  614. /* we only submit the first request, the remaining will be
  615. * automatically submitted afterward */
  616. if (!reused_requests[0])
  617. _starpu_post_data_request(requests[0]);
  618. return requests[nhops - 1];
  619. }
  620. int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *dst_replicate,
  621. enum starpu_data_access_mode mode, unsigned detached, unsigned is_prefetch, unsigned async,
  622. void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
  623. {
  624. _STARPU_LOG_IN();
  625. int cpt = 0;
  626. while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
  627. {
  628. cpt++;
  629. _starpu_datawizard_progress(1);
  630. }
  631. if (cpt == STARPU_SPIN_MAXTRY)
  632. _starpu_spin_lock(&handle->header_lock);
  633. if (!detached)
  634. {
  635. /* Take references which will be released by _starpu_release_data_on_node */
  636. if (dst_replicate)
  637. dst_replicate->refcnt++;
  638. else if (node == STARPU_ACQUIRE_NO_NODE_LOCK_ALL)
  639. {
  640. int i;
  641. for (i = 0; i < STARPU_MAXNODES; i++)
  642. handle->per_node[i].refcnt++;
  643. }
  644. handle->busy_count++;
  645. }
  646. struct _starpu_data_request *r;
  647. r = _starpu_create_request_to_fetch_data(handle, dst_replicate, mode,
  648. is_prefetch, async, callback_func, callback_arg, prio, origin);
  649. /* If no request was created, the handle was already up-to-date on the
  650. * node. In this case, _starpu_create_request_to_fetch_data has already
  651. * unlocked the header. */
  652. if (!r)
  653. return 0;
  654. _starpu_spin_unlock(&handle->header_lock);
  655. int ret = async?0:_starpu_wait_data_request_completion(r, 1);
  656. _STARPU_LOG_OUT();
  657. return ret;
  658. }
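/* (Summary comment added.) The wrappers below differ only in the detached /
 * is_prefetch / async arguments passed to _starpu_fetch_data_on_node:
 * fetch_data is a synchronous, non-detached fetch; prefetch_data_on_node is a
 * detached prefetch (is_prefetch == 1); idle_prefetch_data_on_node is an even
 * lower-priority idle prefetch (is_prefetch == 2). */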
  659. static int idle_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
  660. {
  661. return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, 2, 1, NULL, NULL, prio, "idle_prefetch_data_on_node");
  662. }
  663. static int prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
  664. {
  665. return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, 1, 1, NULL, NULL, prio, "prefetch_data_on_node");
  666. }
  667. static int fetch_data(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
  668. {
  669. return _starpu_fetch_data_on_node(handle, node, replicate, mode, 0, 0, 0, NULL, NULL, prio, "fetch_data");
  670. }
  671. uint32_t _starpu_get_data_refcnt(starpu_data_handle_t handle, unsigned node)
  672. {
  673. return handle->per_node[node].refcnt;
  674. }
  675. size_t _starpu_data_get_size(starpu_data_handle_t handle)
  676. {
  677. return handle->ops->get_size(handle);
  678. }
  679. uint32_t _starpu_data_get_footprint(starpu_data_handle_t handle)
  680. {
  681. return handle->footprint;
  682. }
  683. /* in case the data was accessed in write mode, do not forget to
  684. * make it accessible again once it is possible! */
  685. void _starpu_release_data_on_node(starpu_data_handle_t handle, uint32_t default_wt_mask, struct _starpu_data_replicate *replicate)
  686. {
  687. uint32_t wt_mask;
  688. wt_mask = default_wt_mask | handle->wt_mask;
  689. wt_mask &= (1<<starpu_memory_nodes_get_count())-1;
  690. /* Note that it is possible that there is no valid copy of the data (if
  691. * starpu_data_invalidate was called for instance). In that case, we do
  692. * not enforce any write-through mechanism. */
  693. unsigned memory_node = replicate->memory_node;
  694. if (replicate->state != STARPU_INVALID && handle->current_mode & STARPU_W)
  695. if ((wt_mask & ~(1<<memory_node)))
  696. _starpu_write_through_data(handle, memory_node, wt_mask);
  697. int cpt = 0;
  698. while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
  699. {
  700. cpt++;
  701. _starpu_datawizard_progress(1);
  702. }
  703. if (cpt == STARPU_SPIN_MAXTRY)
  704. _starpu_spin_lock(&handle->header_lock);
  705. /* Release refcnt taken by fetch_data_on_node */
  706. replicate->refcnt--;
  707. STARPU_ASSERT_MSG(replicate->refcnt >= 0, "handle %p released too many times", handle);
  708. STARPU_ASSERT_MSG(handle->busy_count > 0, "handle %p released too many times", handle);
  709. handle->busy_count--;
  710. if (!_starpu_notify_data_dependencies(handle))
  711. _starpu_spin_unlock(&handle->header_lock);
  712. }
  713. static void _starpu_set_data_requested_flag_if_needed(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate)
  714. {
  715. int cpt = 0;
  716. while (cpt < STARPU_SPIN_MAXTRY && _starpu_spin_trylock(&handle->header_lock))
  717. {
  718. cpt++;
  719. _starpu_datawizard_progress(1);
  720. }
  721. if (cpt == STARPU_SPIN_MAXTRY)
  722. _starpu_spin_lock(&handle->header_lock);
  723. if (replicate->state == STARPU_INVALID)
  724. {
  725. unsigned dst_node = replicate->memory_node;
  726. replicate->requested |= 1UL << dst_node;
  727. }
  728. _starpu_spin_unlock(&handle->header_lock);
  729. }
  730. int starpu_prefetch_task_input_on_node_prio(struct starpu_task *task, unsigned node, int prio)
  731. {
  732. STARPU_ASSERT(!task->prefetched);
  733. unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
  734. unsigned index;
  735. for (index = 0; index < nbuffers; index++)
  736. {
  737. starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, index);
  738. enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(task, index);
  739. if (mode & (STARPU_SCRATCH|STARPU_REDUX))
  740. continue;
  741. struct _starpu_data_replicate *replicate = &handle->per_node[node];
  742. prefetch_data_on_node(handle, node, replicate, mode, prio);
  743. _starpu_set_data_requested_flag_if_needed(handle, replicate);
  744. }
  745. return 0;
  746. }
  747. int starpu_prefetch_task_input_on_node(struct starpu_task *task, unsigned node)
  748. {
  749. int prio = task->priority;
  750. if (task->workerorder)
  751. prio = INT_MAX - task->workerorder;
  752. return starpu_prefetch_task_input_on_node_prio(task, node, prio);
  753. }
  754. int starpu_idle_prefetch_task_input_on_node_prio(struct starpu_task *task, unsigned node, int prio)
  755. {
  756. unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
  757. unsigned index;
  758. for (index = 0; index < nbuffers; index++)
  759. {
  760. starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, index);
  761. enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(task, index);
  762. if (mode & (STARPU_SCRATCH|STARPU_REDUX))
  763. continue;
  764. struct _starpu_data_replicate *replicate = &handle->per_node[node];
  765. idle_prefetch_data_on_node(handle, node, replicate, mode, prio);
  766. }
  767. return 0;
  768. }
  769. int starpu_idle_prefetch_task_input_on_node(struct starpu_task *task, unsigned node)
  770. {
  771. int prio = task->priority;
  772. if (task->workerorder)
  773. prio = INT_MAX - task->workerorder;
  774. return starpu_idle_prefetch_task_input_on_node_prio(task, node, prio);
  775. }
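/* Illustrative sketch (not built): a scheduling policy would typically call
 * the prefetch entry points above once it has decided which worker a task
 * will run on. The hook name here is hypothetical. */
#if 0
static void example_schedule_hook(struct starpu_task *task, int workerid)
{
	unsigned node = starpu_worker_get_memory_node(workerid);
	/* start moving the task's buffers towards that worker's memory node */
	starpu_prefetch_task_input_on_node(task, node);
}
#endif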
  776. struct _starpu_data_replicate *get_replicate(starpu_data_handle_t handle, enum starpu_data_access_mode mode, int workerid, unsigned node)
  777. {
  778. if (mode & (STARPU_SCRATCH|STARPU_REDUX))
  779. {
  780. STARPU_ASSERT(workerid >= 0);
  781. if (!handle->per_worker)
  782. {
  783. _starpu_spin_lock(&handle->header_lock);
  784. if (!handle->per_worker)
  785. _starpu_data_initialize_per_worker(handle);
  786. _starpu_spin_unlock(&handle->header_lock);
  787. }
  788. return &handle->per_worker[workerid];
  789. }
  790. else
  791. /* That's a "normal" buffer (R/W) */
  792. return &handle->per_node[node];
  793. }
  794. /* Callback used when a buffer is sent asynchronously to the sink */
  795. static void _starpu_fetch_task_input_cb(void *arg)
  796. {
  797. struct _starpu_worker * worker = (struct _starpu_worker *) arg;
  798. /* increase the number of buffers received */
  799. STARPU_WMB();
  800. (void)STARPU_ATOMIC_ADD(&worker->nb_buffers_transferred, 1);
  801. #ifdef STARPU_SIMGRID
  802. starpu_pthread_queue_broadcast(&_starpu_simgrid_transfer_queue[worker->memory_node]);
  803. #endif
  804. }
  805. /* Synchronously or asynchronously fetch data for a given task (if it's not there already).
  806. * Returns 0 on success, -1 if a buffer could not be acquired. */
  807. /* _starpu_fetch_task_input must be called before
  808. * executing the task. __starpu_push_task_output must be called after the
  809. * execution of the task. */
  810. /* The driver can either just call _starpu_fetch_task_input with async==0,
  811. * or to improve overlapping, it can call _starpu_fetch_task_input with
  812. * async==1, then wait for transfers to complete, then call
  813. * _starpu_fetch_task_input_tail to complete the fetch. */
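/* A minimal driver-side sketch of the asynchronous variant described above
 * (illustrative only, error handling omitted; "worker", "task" and "j" are
 * assumed to come from the driver loop): */
#if 0
if (_starpu_fetch_task_input(task, j, 1) == 0)
{
	/* wait until all the per-buffer completion callbacks have run */
	while (worker->nb_buffers_transferred < worker->nb_buffers_totransfer)
		_starpu_datawizard_progress(1);
	/* transfers done: fill the codelet interfaces and run the task */
	_starpu_fetch_task_input_tail(task, j, worker);
}
#endif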
  814. int _starpu_fetch_task_input(struct starpu_task *task, struct _starpu_job *j, int async)
  815. {
  816. struct _starpu_worker *worker = _starpu_get_local_worker_key();
  817. int workerid = worker->workerid;
  818. if (async)
  819. {
  820. worker->task_transferring = task;
  821. worker->nb_buffers_transferred = 0;
  822. if (worker->ntasks <= 1)
  823. _STARPU_TRACE_WORKER_START_FETCH_INPUT(NULL, workerid);
  824. }
  825. else
  826. _STARPU_TRACE_START_FETCH_INPUT(NULL);
  827. int profiling = starpu_profiling_status_get();
  828. if (profiling && task->profiling_info)
  829. _starpu_clock_gettime(&task->profiling_info->acquire_data_start_time);
  830. struct _starpu_data_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
  831. unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
  832. unsigned nacquires;
  833. unsigned local_memory_node = worker->memory_node;
  834. unsigned index;
  835. nacquires = 0;
  836. for (index = 0; index < nbuffers; index++)
  837. {
  838. int ret;
  839. starpu_data_handle_t handle = descrs[index].handle;
  840. enum starpu_data_access_mode mode = descrs[index].mode;
  841. int node = descrs[index].node;
  842. if (node == -1)
  843. node = local_memory_node;
  844. if (mode == STARPU_NONE ||
  845. (mode & ((1<<STARPU_MODE_SHIFT) - 1)) >= STARPU_ACCESS_MODE_MAX ||
  846. (mode >> STARPU_MODE_SHIFT) >= (STARPU_SHIFTED_MODE_MAX >> STARPU_MODE_SHIFT))
  847. STARPU_ASSERT_MSG(0, "mode %d (0x%x) is bogus\n", mode, mode);
  848. struct _starpu_data_replicate *local_replicate;
  849. if (index && descrs[index-1].handle == descrs[index].handle)
  850. /* We have already taken this data, skip it. This
  851. * depends on ordering putting writes before reads, see
  852. * _starpu_compar_handles */
  853. continue;
  854. local_replicate = get_replicate(handle, mode, workerid, node);
  855. if (async)
  856. {
  857. ret = _starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, 0, 1,
  858. _starpu_fetch_task_input_cb, worker, 0, "_starpu_src_common_worker_internal_work");
  859. #ifdef STARPU_SIMGRID
  860. if (_starpu_simgrid_fetching_input_cost())
  861. MSG_process_sleep(0.000001);
  862. #endif
  863. if (STARPU_UNLIKELY(ret))
  864. {
  865. /* Ooops, not enough memory: make the worker wait for these transfers for now; the synchronous call will finish by forcing eviction */
  866. worker->nb_buffers_totransfer = nacquires;
  867. return 0;
  868. }
  869. }
  870. else
  871. {
  872. ret = fetch_data(handle, node, local_replicate, mode, 0);
  873. #ifdef STARPU_SIMGRID
  874. if (_starpu_simgrid_fetching_input_cost())
  875. MSG_process_sleep(0.000001);
  876. #endif
  877. if (STARPU_UNLIKELY(ret))
  878. goto enomem;
  879. }
  880. nacquires++;
  881. }
  882. if (async)
  883. {
  884. worker->nb_buffers_totransfer = nacquires;
  885. return 0;
  886. }
  887. _starpu_fetch_task_input_tail(task, j, worker);
  888. return 0;
  889. enomem:
  890. _STARPU_TRACE_END_FETCH_INPUT(NULL);
  891. _STARPU_DISP("something went wrong with buffer %u\n", index);
  892. /* try to unreference all the inputs that were successfully taken */
  893. unsigned index2;
  894. for (index2 = 0; index2 < index; index2++)
  895. {
  896. starpu_data_handle_t handle = descrs[index2].handle;
  897. enum starpu_data_access_mode mode = descrs[index2].mode;
  898. int node = descrs[index2].node; /* note: index2, we are walking the buffers already acquired */
  899. if (node == -1)
  900. node = local_memory_node;
  901. struct _starpu_data_replicate *local_replicate;
  902. if (index2 && descrs[index2-1].handle == descrs[index2].handle)
  903. /* We have already released this data, skip it. This
  904. * depends on ordering putting writes before reads, see
  905. * _starpu_compar_handles */
  906. continue;
  907. local_replicate = get_replicate(handle, mode, workerid, node);
  908. _starpu_release_data_on_node(handle, 0, local_replicate);
  909. }
  910. return -1;
  911. }
  912. /* Now that we have taken the data locks in locking order, fill the codelet interfaces in function order. */
  913. void _starpu_fetch_task_input_tail(struct starpu_task *task, struct _starpu_job *j, struct _starpu_worker *worker)
  914. {
  915. int workerid = worker->workerid;
  916. int profiling = starpu_profiling_status_get();
  917. struct _starpu_data_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
  918. unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
  919. unsigned local_memory_node = worker->memory_node;
  920. unsigned index;
  921. unsigned long total_size = 0;
  922. for (index = 0; index < nbuffers; index++)
  923. {
  924. starpu_data_handle_t handle = STARPU_TASK_GET_HANDLE(task, index);
  925. enum starpu_data_access_mode mode = STARPU_TASK_GET_MODE(task, index);
  926. int node = descrs[index].node;
  927. if (node == -1)
  928. node = local_memory_node;
  929. struct _starpu_data_replicate *local_replicate;
  930. local_replicate = get_replicate(handle, mode, workerid, node);
  931. if (local_replicate->mc)
  932. local_replicate->mc->diduse = 1;
  933. _STARPU_TASK_SET_INTERFACE(task , local_replicate->data_interface, index);
  934. /* If the replicate was not initialized yet, we have to do it now */
  935. if (!(mode & STARPU_SCRATCH) && !local_replicate->initialized)
  936. _starpu_redux_init_data_replicate(handle, local_replicate, workerid);
  937. #ifdef STARPU_USE_FXT
  938. total_size += _starpu_data_get_size(handle);
  939. #endif
  940. }
  941. _STARPU_TRACE_DATA_LOAD(workerid,total_size);
  942. if (profiling && task->profiling_info)
  943. _starpu_clock_gettime(&task->profiling_info->acquire_data_end_time);
  944. _STARPU_TRACE_END_FETCH_INPUT(NULL);
  945. }
  946. /* Release task data dependencies */
  947. void __starpu_push_task_output(struct _starpu_job *j)
  948. {
  949. #ifdef STARPU_OPENMP
  950. STARPU_ASSERT(!j->continuation);
  951. #endif
  952. int profiling = starpu_profiling_status_get();
  953. struct starpu_task *task = j->task;
  954. if (profiling && task->profiling_info)
  955. _starpu_clock_gettime(&task->profiling_info->release_data_start_time);
  956. struct _starpu_data_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
  957. unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
  958. int workerid = starpu_worker_get_id();
  959. unsigned local_memory_node = _starpu_memory_node_get_local_key();
  960. unsigned index;
  961. for (index = 0; index < nbuffers; index++)
  962. {
  963. starpu_data_handle_t handle = descrs[index].handle;
  964. enum starpu_data_access_mode mode = descrs[index].mode;
  965. int node = descrs[index].node;
  966. if (node == -1 && task->cl->where != STARPU_NOWHERE)
  967. node = local_memory_node;
  968. struct _starpu_data_replicate *local_replicate = NULL;
  969. if (index && descrs[index-1].handle == descrs[index].handle)
  970. /* We have already released this data, skip it. This
  971. * depends on ordering putting writes before reads, see
  972. * _starpu_compar_handles */
  973. continue;
  974. if (node != -1)
  975. local_replicate = get_replicate(handle, mode, workerid, node);
  976. /* Keep a reference for future
  977. * _starpu_release_task_enforce_sequential_consistency call */
  978. _starpu_spin_lock(&handle->header_lock);
  979. handle->busy_count++;
  980. if (node == -1)
  981. {
  982. /* NOWHERE case, just notify dependencies */
  983. if (!_starpu_notify_data_dependencies(handle))
  984. _starpu_spin_unlock(&handle->header_lock);
  985. }
  986. else
  987. {
  988. _starpu_spin_unlock(&handle->header_lock);
  989. _starpu_release_data_on_node(handle, 0, local_replicate);
  990. }
  991. }
  992. if (profiling && task->profiling_info)
  993. _starpu_clock_gettime(&task->profiling_info->release_data_end_time);
  994. }
  995. /* Version for a driver running on a worker: we show the driver state in the trace */
  996. void _starpu_push_task_output(struct _starpu_job *j)
  997. {
  998. _STARPU_TRACE_START_PUSH_OUTPUT(NULL);
  999. __starpu_push_task_output(j);
  1000. _STARPU_TRACE_END_PUSH_OUTPUT(NULL);
  1001. }
  1002. struct fetch_nowhere_wrapper
  1003. {
  1004. struct _starpu_job *j;
  1005. unsigned pending;
  1006. };
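/* (Explanatory comment added.) "pending" counts the outstanding fetch
 * callbacks plus one extra reference held by _starpu_fetch_nowhere_task_input
 * itself; whoever drops the counter to zero (a transfer callback or the final
 * call at the end of that function) pushes the task output and terminates the
 * job, so termination happens exactly once. */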
  1007. static void _starpu_fetch_nowhere_task_input_cb(void *arg);
  1008. /* Asynchronously fetch data for a task which will not actually execute anywhere (STARPU_NOWHERE): it only fetches its data and then terminates */
  1009. void _starpu_fetch_nowhere_task_input(struct _starpu_job *j)
  1010. {
  1011. int profiling = starpu_profiling_status_get();
  1012. struct starpu_task *task = j->task;
  1013. if (profiling && task->profiling_info)
  1014. _starpu_clock_gettime(&task->profiling_info->acquire_data_start_time);
  1015. struct _starpu_data_descr *descrs = _STARPU_JOB_GET_ORDERED_BUFFERS(j);
  1016. unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
  1017. unsigned nfetchbuffers = 0;
  1018. struct fetch_nowhere_wrapper *wrapper;
  1019. unsigned index;
  1020. for (index = 0; index < nbuffers; index++)
  1021. {
  1022. int node = descrs[index].node;
  1023. if (node != -1)
  1024. nfetchbuffers++;
  1025. }
  1026. if (!nfetchbuffers)
  1027. {
  1028. /* Nothing to fetch actually, already finished! */
  1029. __starpu_push_task_output(j);
  1030. _starpu_handle_job_termination(j);
  1031. _STARPU_LOG_OUT_TAG("handle_job_termination");
  1032. return;
  1033. }
  1034. _STARPU_MALLOC(wrapper, (sizeof(*wrapper)));
  1035. wrapper->j = j;
  1036. /* +1 for the call below */
  1037. wrapper->pending = nfetchbuffers + 1;
  1038. for (index = 0; index < nbuffers; index++)
  1039. {
  1040. starpu_data_handle_t handle = descrs[index].handle;
  1041. enum starpu_data_access_mode mode = descrs[index].mode;
  1042. int node = descrs[index].node;
  1043. if (node == -1)
  1044. continue;
  1045. if (mode == STARPU_NONE ||
  1046. (mode & ((1<<STARPU_MODE_SHIFT) - 1)) >= STARPU_ACCESS_MODE_MAX ||
  1047. (mode >> STARPU_MODE_SHIFT) >= (STARPU_SHIFTED_MODE_MAX >> STARPU_MODE_SHIFT))
  1048. STARPU_ASSERT_MSG(0, "mode %d (0x%x) is bogus\n", mode, mode);
  1049. STARPU_ASSERT(mode != STARPU_SCRATCH && mode != STARPU_REDUX);
  1050. struct _starpu_data_replicate *local_replicate;
  1051. local_replicate = get_replicate(handle, mode, -1, node);
  1052. _starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, 0, 1, _starpu_fetch_nowhere_task_input_cb, wrapper, 0, "_starpu_fetch_nowhere_task_input");
  1053. }
  1054. if (profiling && task->profiling_info)
  1055. _starpu_clock_gettime(&task->profiling_info->acquire_data_end_time);
  1056. /* Finished working with the task, release our reference */
  1057. _starpu_fetch_nowhere_task_input_cb(wrapper);
  1058. }
  1059. static void _starpu_fetch_nowhere_task_input_cb(void *arg)
  1060. {
  1061. /* One more transfer finished */
  1062. struct fetch_nowhere_wrapper *wrapper = arg;
  1063. unsigned pending = STARPU_ATOMIC_ADD(&wrapper->pending, -1);
  1064. ANNOTATE_HAPPENS_BEFORE(&wrapper->pending);
  1065. if (pending == 0)
  1066. {
  1067. ANNOTATE_HAPPENS_AFTER(&wrapper->pending);
  1068. /* Finished transferring, task is over */
  1069. struct _starpu_job *j = wrapper->j;
  1070. free(wrapper);
  1071. __starpu_push_task_output(j);
  1072. _starpu_handle_job_termination(j);
  1073. _STARPU_LOG_OUT_TAG("handle_job_termination");
  1074. }
  1075. }
  1076. /* NB: this value can only be an indication of the status of the data
  1077. at some point, but there is no strong guarantee! */
  1078. unsigned _starpu_is_data_present_or_requested(starpu_data_handle_t handle, unsigned node)
  1079. {
  1080. unsigned ret = 0;
  1081. // XXX : this is just a hint, so we don't take the lock ...
  1082. // STARPU_PTHREAD_SPIN_LOCK(&handle->header_lock);
  1083. if (handle->per_node[node].state != STARPU_INVALID)
  1084. {
  1085. ret = 1;
  1086. }
  1087. else
  1088. {
  1089. unsigned i;
  1090. unsigned nnodes = starpu_memory_nodes_get_count();
  1091. for (i = 0; i < nnodes; i++)
  1092. {
  1093. if ((handle->per_node[node].requested & (1UL << i)) || handle->per_node[node].request[i])
  1094. ret = 1;
  1095. }
  1096. }
  1097. // STARPU_PTHREAD_SPIN_UNLOCK(&handle->header_lock);
  1098. return ret;
  1099. }
  1100. void _starpu_data_set_unregister_hook(starpu_data_handle_t handle, _starpu_data_handle_unregister_hook func)
  1101. {
  1102. handle->unregister_hook = func;
  1103. }