starpu_mpi.c 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009-2021 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  4. * Copyright (C) 2019,2021 Federal University of Rio Grande do Sul (UFRGS)
  5. *
  6. * StarPU is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU Lesser General Public License as published by
  8. * the Free Software Foundation; either version 2.1 of the License, or (at
  9. * your option) any later version.
  10. *
  11. * StarPU is distributed in the hope that it will be useful, but
  12. * WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14. *
  15. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  16. */
  17. #include <stdlib.h>
  18. #include <limits.h>
  19. #include <starpu_mpi.h>
  20. #include <starpu_mpi_datatype.h>
  21. #include <starpu_mpi_private.h>
  22. #include <starpu_mpi_cache.h>
  23. #include <starpu_profiling.h>
  24. #include <starpu_mpi_stats.h>
  25. #include <starpu_mpi_cache.h>
  26. #include <starpu_mpi_select_node.h>
  27. #include <starpu_mpi_init.h>
  28. #include <common/config.h>
  29. #include <common/thread.h>
  30. #include <datawizard/interfaces/data_interface.h>
  31. #include <datawizard/coherency.h>
  32. #include <core/simgrid.h>
  33. #include <core/task.h>
  34. #include <core/topology.h>
  35. int _starpu_mpi_choose_node(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
  36. {
  37. return STARPU_MAIN_RAM;
  38. /* TODO: this is completely untested */
  39. if (mode & STARPU_W)
  40. {
  41. /* TODO: lookup NIC location */
  42. /* Where to receive the data? */
  43. if (handle->home_node >= 0 && starpu_node_get_kind(handle->home_node) == STARPU_CPU_RAM)
  44. /* For now, better use the home node to avoid duplicates */
  45. return handle->home_node;
  46. if (starpu_memory_nodes_get_numa_count() == 1)
  47. return STARPU_MAIN_RAM;
  48. /* Several potential places */
  49. unsigned i;
  50. for (i = 0; i < STARPU_MAXNODES; i++)
  51. {
  52. /* TODO: we may want to take as a hint that it's allocated on the GPU as
  53. * a clue that we want to push to the GPU */
  54. if (starpu_node_get_kind(i) == STARPU_CPU_RAM &&
  55. handle->per_node[i].allocated)
  56. /* This node already has allocated buffers, let's just use it */
  57. return i;
  58. }
  59. /* No luck, take the least loaded node */
  60. starpu_ssize_t maximum = 0;
  61. starpu_ssize_t needed = _starpu_data_get_alloc_size(handle);
  62. unsigned node;
  63. for (i = 0; i < STARPU_MAXNODES; i++)
  64. {
  65. if (starpu_node_get_kind(i) == STARPU_CPU_RAM)
  66. {
  67. starpu_ssize_t size = starpu_memory_get_available(i);
  68. if (size >= needed && size > maximum)
  69. {
  70. node = i;
  71. maximum = size;
  72. }
  73. }
  74. }
  75. return node;
  76. }
  77. else
  78. {
  79. if (starpu_memory_nodes_get_numa_count() == 1)
  80. return STARPU_MAIN_RAM;
  81. /* Several potential places */
  82. unsigned i;
  83. for (i = 0; i < STARPU_MAXNODES; i++)
  84. {
  85. /* TODO: GPUDirect */
  86. if (starpu_node_get_kind(i) == STARPU_CPU_RAM &&
  87. handle->per_node[i].state != STARPU_INVALID)
  88. /* This node already has the value, let's just use it */
  89. /* TODO: rather pick up place next to NIC */
  90. return i;
  91. }
  92. /* No luck, take the least loaded node, to transfer from e.g. GPU */
  93. starpu_ssize_t maximum = 0;
  94. starpu_ssize_t needed = _starpu_data_get_alloc_size(handle);
  95. unsigned node;
  96. for (i = 0; i < STARPU_MAXNODES; i++)
  97. {
  98. if (starpu_node_get_kind(i) == STARPU_CPU_RAM)
  99. {
  100. starpu_ssize_t size = starpu_memory_get_available(i);
  101. if (size >= needed && size > maximum)
  102. {
  103. node = i;
  104. maximum = size;
  105. }
  106. }
  107. }
  108. return node;
  109. }
  110. }
  111. static void _starpu_mpi_acquired_callback(void *arg, int *nodep, enum starpu_data_access_mode mode)
  112. {
  113. struct _starpu_mpi_req *req = arg;
  114. int node = *nodep;
  115. /* The data was acquired in terms of dependencies, we can now look the
  116. * current state of the handle and decide which node we prefer for the data
  117. * fetch */
  118. if (node < 0)
  119. node = _starpu_mpi_choose_node(req->data_handle, mode);
  120. req->node = *nodep = node;
  121. }
  122. static void _starpu_mpi_isend_irecv_common(struct _starpu_mpi_req *req, enum starpu_data_access_mode mode, int sequential_consistency)
  123. {
  124. int node = -1;
  125. /* Asynchronously request StarPU to fetch the data in main memory: when
  126. * it is available in main memory, _starpu_mpi_submit_ready_request(req) is called and
  127. * the request is actually submitted */
  128. if (_starpu_mpi_mem_throttle && mode & STARPU_W && !req->data_handle->initialized)
  129. {
  130. /* We will trigger allocation, pre-reserve for it */
  131. size_t size = starpu_data_get_size(req->data_handle);
  132. if (size)
  133. {
  134. /* FIXME: rather take the less-loaded NUMA node */
  135. node = STARPU_MAIN_RAM;
  136. /* This will potentially block */
  137. starpu_memory_allocate(node, size, STARPU_MEMORY_WAIT);
  138. req->reserved_size = size;
  139. /* This also decides where we will store the data */
  140. req->node = node;
  141. }
  142. }
  143. if (sequential_consistency)
  144. {
  145. starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, node, mode, _starpu_mpi_acquired_callback, _starpu_mpi_submit_ready_request, (void *)req, 1 /*sequential consistency*/, 1, &req->pre_sync_jobid, &req->post_sync_jobid, req->prio);
  146. }
  147. else
  148. {
  149. /* post_sync_job_id has already been filled */
  150. starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, node, mode, _starpu_mpi_acquired_callback, _starpu_mpi_submit_ready_request, (void *)req, 0 /*sequential consistency*/, 1, &req->pre_sync_jobid, NULL, req->prio);
  151. }
  152. }
  153. static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, int prio, void (*callback)(void *), void *arg, int sequential_consistency)
  154. {
  155. if (STARPU_UNLIKELY(_starpu_mpi_fake_world_size != -1))
  156. {
  157. /* Don't actually do the communication */
  158. return NULL;
  159. }
  160. #ifdef STARPU_MPI_PEDANTIC_ISEND
  161. enum starpu_data_access_mode mode = STARPU_RW;
  162. #else
  163. enum starpu_data_access_mode mode = STARPU_R;
  164. #endif
  165. struct _starpu_mpi_req *req = _starpu_mpi_request_fill(data_handle, dest, data_tag, comm, detached, sync, prio, callback, arg, SEND_REQ, _mpi_backend._starpu_mpi_backend_isend_size_func, sequential_consistency, 0, 0);
  166. _starpu_mpi_req_willpost(req);
  167. if (_starpu_mpi_use_coop_sends && detached == 1 && sync == 0 && callback == NULL)
  168. {
  169. /* It's a send & forget send, we can perhaps optimize its distribution over several nodes */
  170. _starpu_mpi_coop_send(data_handle, req, mode, sequential_consistency);
  171. return req;
  172. }
  173. /* Post normally */
  174. _starpu_mpi_isend_irecv_common(req, mode, sequential_consistency);
  175. return req;
  176. }
  177. int starpu_mpi_isend_prio(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm)
  178. {
  179. _STARPU_MPI_LOG_IN();
  180. STARPU_MPI_ASSERT_MSG(public_req, "starpu_mpi_isend needs a valid starpu_mpi_req");
  181. struct _starpu_mpi_req *req;
  182. _STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(dest, data_tag, 0);
  183. req = _starpu_mpi_isend_common(data_handle, dest, data_tag, comm, 0, 0, prio, NULL, NULL, 1);
  184. _STARPU_MPI_TRACE_ISEND_COMPLETE_END(dest, data_tag, 0);
  185. STARPU_MPI_ASSERT_MSG(req, "Invalid return for _starpu_mpi_isend_common");
  186. *public_req = req;
  187. _STARPU_MPI_LOG_OUT();
  188. return 0;
  189. }
  190. int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm)
  191. {
  192. return starpu_mpi_isend_prio(data_handle, public_req, dest, data_tag, 0, comm);
  193. }
  194. int starpu_mpi_isend_detached_prio(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm, void (*callback)(void *), void *arg)
  195. {
  196. _STARPU_MPI_LOG_IN();
  197. _starpu_mpi_isend_common(data_handle, dest, data_tag, comm, 1, 0, prio, callback, arg, 1);
  198. _STARPU_MPI_LOG_OUT();
  199. return 0;
  200. }
  201. int starpu_mpi_isend_detached(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
  202. {
  203. return starpu_mpi_isend_detached_prio(data_handle, dest, data_tag, 0, comm, callback, arg);
  204. }
  205. int starpu_mpi_send_prio(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm)
  206. {
  207. starpu_mpi_req req;
  208. MPI_Status status;
  209. _STARPU_MPI_LOG_IN();
  210. starpu_mpi_isend_prio(data_handle, &req, dest, data_tag, prio, comm);
  211. memset(&status, 0, sizeof(MPI_Status));
  212. starpu_mpi_wait(&req, &status);
  213. _STARPU_MPI_LOG_OUT();
  214. return 0;
  215. }
  216. int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm)
  217. {
  218. return starpu_mpi_send_prio(data_handle, dest, data_tag, 0, comm);
  219. }
  220. int starpu_mpi_issend_prio(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm)
  221. {
  222. _STARPU_MPI_LOG_IN();
  223. STARPU_MPI_ASSERT_MSG(public_req, "starpu_mpi_issend needs a valid starpu_mpi_req");
  224. struct _starpu_mpi_req *req;
  225. req = _starpu_mpi_isend_common(data_handle, dest, data_tag, comm, 0, 1, prio, NULL, NULL, 1);
  226. STARPU_MPI_ASSERT_MSG(req, "Invalid return for _starpu_mpi_isend_common");
  227. *public_req = req;
  228. _STARPU_MPI_LOG_OUT();
  229. return 0;
  230. }
  231. int starpu_mpi_issend(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm)
  232. {
  233. return starpu_mpi_issend_prio(data_handle, public_req, dest, data_tag, 0, comm);
  234. }
  235. int starpu_mpi_issend_detached_prio(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm, void (*callback)(void *), void *arg)
  236. {
  237. _STARPU_MPI_LOG_IN();
  238. _starpu_mpi_isend_common(data_handle, dest, data_tag, comm, 1, 1, prio, callback, arg, 1);
  239. _STARPU_MPI_LOG_OUT();
  240. return 0;
  241. }
  242. int starpu_mpi_issend_detached(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
  243. {
  244. return starpu_mpi_issend_detached_prio(data_handle, dest, data_tag, 0, comm, callback, arg);
  245. }
  246. struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count, int prio)
  247. {
  248. if (_starpu_mpi_fake_world_size != -1)
  249. {
  250. /* Don't actually do the communication */
  251. return NULL;
  252. }
  253. struct _starpu_mpi_req *req = _starpu_mpi_request_fill(data_handle, source, data_tag, comm, detached, sync, prio, callback, arg, RECV_REQ, _mpi_backend._starpu_mpi_backend_irecv_size_func, sequential_consistency, is_internal_req, count);
  254. _starpu_mpi_req_willpost(req);
  255. if (sequential_consistency == 0)
  256. {
  257. /* Synchronization task jobid from redux is used */
  258. _starpu_mpi_redux_fill_post_sync_jobid(arg, &(req->post_sync_jobid));
  259. }
  260. _starpu_mpi_isend_irecv_common(req, STARPU_W, sequential_consistency);
  261. return req;
  262. }
  263. int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm)
  264. {
  265. _STARPU_MPI_LOG_IN();
  266. STARPU_MPI_ASSERT_MSG(public_req, "starpu_mpi_irecv needs a valid starpu_mpi_req");
  267. struct _starpu_mpi_req *req;
  268. _STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(source, data_tag);
  269. req = _starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 0, 0, NULL, NULL, 1, 0, 0, STARPU_DEFAULT_PRIO);
  270. _STARPU_MPI_TRACE_IRECV_COMPLETE_END(source, data_tag);
  271. STARPU_MPI_ASSERT_MSG(req, "Invalid return for _starpu_mpi_irecv_common");
  272. *public_req = req;
  273. _STARPU_MPI_LOG_OUT();
  274. return 0;
  275. }
  276. int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
  277. {
  278. _STARPU_MPI_LOG_IN();
  279. _starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, 1, 0, 0, STARPU_DEFAULT_PRIO);
  280. _STARPU_MPI_LOG_OUT();
  281. return 0;
  282. }
  283. int starpu_mpi_irecv_detached_prio(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm, void (*callback)(void *), void *arg)
  284. {
  285. _STARPU_MPI_LOG_IN();
  286. _starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, 1, 0, 0, prio);
  287. _STARPU_MPI_LOG_OUT();
  288. return 0;
  289. }
  290. int starpu_mpi_irecv_detached_sequential_consistency(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, void (*callback)(void *), void *arg, int sequential_consistency)
  291. {
  292. _STARPU_MPI_LOG_IN();
  293. _starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, sequential_consistency, 0, 0, STARPU_DEFAULT_PRIO);
  294. _STARPU_MPI_LOG_OUT();
  295. return 0;
  296. }
  297. int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, MPI_Status *status)
  298. {
  299. STARPU_ASSERT_MSG(status != NULL || status == MPI_STATUS_IGNORE, "MPI_Status value cannot be NULL or different from MPI_STATUS_IGNORE");
  300. starpu_mpi_req req;
  301. _STARPU_MPI_LOG_IN();
  302. starpu_mpi_irecv(data_handle, &req, source, data_tag, comm);
  303. starpu_mpi_wait(&req, status);
  304. _STARPU_MPI_LOG_OUT();
  305. return 0;
  306. }
  307. int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
  308. {
  309. STARPU_ASSERT_MSG(status != NULL || status == MPI_STATUS_IGNORE, "MPI_Status value cannot be NULL or different from MPI_STATUS_IGNORE");
  310. return _mpi_backend._starpu_mpi_backend_wait(public_req, status);
  311. }
  312. int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
  313. {
  314. return _mpi_backend._starpu_mpi_backend_test(public_req, flag, status);
  315. }
  316. int starpu_mpi_barrier(MPI_Comm comm)
  317. {
  318. return _mpi_backend._starpu_mpi_backend_barrier(comm);
  319. }
  320. void _starpu_mpi_data_clear(starpu_data_handle_t data_handle)
  321. {
  322. struct _starpu_mpi_data *data = data_handle->mpi_data;
  323. _mpi_backend._starpu_mpi_backend_data_clear(data_handle);
  324. _starpu_mpi_cache_data_clear(data_handle);
  325. _starpu_spin_destroy(&data->coop_lock);
  326. free(data->redux_map);
  327. data->redux_map = NULL;
  328. free(data);
  329. }
  330. struct _starpu_mpi_data *_starpu_mpi_data_get(starpu_data_handle_t data_handle)
  331. {
  332. struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
  333. if (mpi_data)
  334. {
  335. STARPU_ASSERT(mpi_data->magic == 42);
  336. }
  337. else
  338. {
  339. _STARPU_CALLOC(mpi_data, 1, sizeof(struct _starpu_mpi_data));
  340. mpi_data->magic = 42;
  341. mpi_data->node_tag.data_tag = -1;
  342. mpi_data->node_tag.node.rank = -1;
  343. mpi_data->node_tag.node.comm = MPI_COMM_WORLD;
  344. _starpu_spin_init(&mpi_data->coop_lock);
  345. data_handle->mpi_data = mpi_data;
  346. _starpu_mpi_cache_data_init(data_handle);
  347. _starpu_data_set_unregister_hook(data_handle, _starpu_mpi_data_clear);
  348. }
  349. return mpi_data;
  350. }
  351. void starpu_mpi_data_register_comm(starpu_data_handle_t data_handle, starpu_mpi_tag_t data_tag, int rank, MPI_Comm comm)
  352. {
  353. struct _starpu_mpi_data *mpi_data = _starpu_mpi_data_get(data_handle);
  354. if (data_tag != -1)
  355. {
  356. _mpi_backend._starpu_mpi_backend_data_register(data_handle, data_tag);
  357. mpi_data->node_tag.data_tag = data_tag;
  358. _STARPU_MPI_TRACE_DATA_SET_TAG(data_handle, data_tag);
  359. }
  360. if (rank != -1)
  361. {
  362. _STARPU_MPI_TRACE_DATA_SET_RANK(data_handle, rank);
  363. mpi_data->node_tag.node.rank = rank;
  364. mpi_data->node_tag.node.comm = comm;
  365. }
  366. }
  367. void starpu_mpi_data_set_rank_comm(starpu_data_handle_t handle, int rank, MPI_Comm comm)
  368. {
  369. starpu_mpi_data_register_comm(handle, -1, rank, comm);
  370. }
  371. void starpu_mpi_data_set_tag(starpu_data_handle_t handle, starpu_mpi_tag_t data_tag)
  372. {
  373. starpu_mpi_data_register_comm(handle, data_tag, -1, MPI_COMM_WORLD);
  374. }
  375. int starpu_mpi_data_get_rank(starpu_data_handle_t data)
  376. {
  377. STARPU_ASSERT_MSG(data->mpi_data, "starpu_mpi_data_register MUST be called for data %p\n", data);
  378. return ((struct _starpu_mpi_data *)(data->mpi_data))->node_tag.node.rank;
  379. }
  380. starpu_mpi_tag_t starpu_mpi_data_get_tag(starpu_data_handle_t data)
  381. {
  382. STARPU_ASSERT_MSG(data->mpi_data, "starpu_mpi_data_register MUST be called for data %p\n", data);
  383. return ((struct _starpu_mpi_data *)(data->mpi_data))->node_tag.data_tag;
  384. }
  385. char* starpu_mpi_data_get_redux_map(starpu_data_handle_t data)
  386. {
  387. STARPU_ASSERT_MSG(data->mpi_data, "starpu_mpi_data_register MUST be called for data %p\n", data);
  388. return ((struct _starpu_mpi_data *)(data->mpi_data))->redux_map;
  389. }
  390. void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg)
  391. {
  392. int me, rank;
  393. starpu_mpi_tag_t data_tag;
  394. rank = starpu_mpi_data_get_rank(data_handle);
  395. if (rank == -1)
  396. {
  397. _STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register() or starpu_mpi_data_register_comm()\n");
  398. }
  399. starpu_mpi_comm_rank(comm, &me);
  400. if (node == rank)
  401. return;
  402. data_tag = starpu_mpi_data_get_tag(data_handle);
  403. if (data_tag == -1)
  404. {
  405. _STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register() or starpu_mpi_data_register_comm()\n");
  406. }
  407. if (me == node)
  408. {
  409. _STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
  410. int already_received = starpu_mpi_cached_receive_set(data_handle);
  411. if (already_received == 0)
  412. {
  413. _STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data_handle, rank);
  414. starpu_mpi_irecv_detached(data_handle, rank, data_tag, comm, callback, arg);
  415. }
  416. }
  417. else if (me == rank)
  418. {
  419. _STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
  420. int already_sent = starpu_mpi_cached_send_set(data_handle, node);
  421. if (already_sent == 0)
  422. {
  423. _STARPU_MPI_DEBUG(1, "Sending data %p to %d\n", data_handle, node);
  424. starpu_mpi_isend_detached(data_handle, node, data_tag, comm, NULL, NULL);
  425. }
  426. }
  427. }
  428. void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle, int node)
  429. {
  430. int me, rank;
  431. starpu_mpi_tag_t data_tag;
  432. rank = starpu_mpi_data_get_rank(data_handle);
  433. if (rank == -1)
  434. {
  435. _STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
  436. }
  437. starpu_mpi_comm_rank(comm, &me);
  438. if (node == rank)
  439. return;
  440. data_tag = starpu_mpi_data_get_tag(data_handle);
  441. if (data_tag == -1)
  442. {
  443. _STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
  444. }
  445. if (me == node)
  446. {
  447. MPI_Status status;
  448. _STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
  449. int already_received = starpu_mpi_cached_receive_set(data_handle);
  450. if (already_received == 0)
  451. {
  452. _STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data_handle, rank);
  453. starpu_mpi_recv(data_handle, rank, data_tag, comm, &status);
  454. }
  455. }
  456. else if (me == rank)
  457. {
  458. _STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
  459. int already_sent = starpu_mpi_cached_send_set(data_handle, node);
  460. if (already_sent == 0)
  461. {
  462. _STARPU_MPI_DEBUG(1, "Sending data %p to %d\n", data_handle, node);
  463. starpu_mpi_send(data_handle, node, data_tag, comm);
  464. }
  465. }
  466. }
  467. void starpu_mpi_get_data_on_all_nodes_detached(MPI_Comm comm, starpu_data_handle_t data_handle)
  468. {
  469. int size, i;
  470. starpu_mpi_comm_size(comm, &size);
  471. for (i = 0; i < size; i++)
  472. starpu_mpi_get_data_on_node_detached(comm, data_handle, i, NULL, NULL);
  473. }
  474. void starpu_mpi_data_migrate(MPI_Comm comm, starpu_data_handle_t data, int new_rank)
  475. {
  476. int old_rank = starpu_mpi_data_get_rank(data);
  477. if (new_rank == old_rank)
  478. /* Already there */
  479. return;
  480. /* First submit data migration if it's not already on destination */
  481. starpu_mpi_get_data_on_node_detached(comm, data, new_rank, NULL, NULL);
  482. /* And note new owner */
  483. starpu_mpi_data_set_rank_comm(data, new_rank, comm);
  484. /* Flush cache in all other nodes */
  485. /* TODO: Ideally we'd transmit the knowledge of who owns it */
  486. /* TODO: or at least remember that the previous owner has the data, that's an easy case to support */
  487. starpu_mpi_cache_flush(comm, data);
  488. return;
  489. }
  490. int starpu_mpi_wait_for_all(MPI_Comm comm)
  491. {
  492. return _mpi_backend._starpu_mpi_backend_wait_for_all(comm);
  493. }