starpu_mpi.c 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009-2021 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  4. * Copyright (C) 2019 Federal University of Rio Grande do Sul (UFRGS)
  5. *
  6. * StarPU is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU Lesser General Public License as published by
  8. * the Free Software Foundation; either version 2.1 of the License, or (at
  9. * your option) any later version.
  10. *
  11. * StarPU is distributed in the hope that it will be useful, but
  12. * WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14. *
  15. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  16. */
  17. #include <stdlib.h>
  18. #include <limits.h>
  19. #include <starpu_mpi.h>
  20. #include <starpu_mpi_datatype.h>
  21. #include <starpu_mpi_private.h>
  22. #include <starpu_mpi_cache.h>
  23. #include <starpu_profiling.h>
  24. #include <starpu_mpi_stats.h>
  25. #include <starpu_mpi_cache.h>
  26. #include <starpu_mpi_select_node.h>
  27. #include <starpu_mpi_init.h>
  28. #include <common/config.h>
  29. #include <common/thread.h>
  30. #include <datawizard/interfaces/data_interface.h>
  31. #include <datawizard/coherency.h>
  32. #include <core/simgrid.h>
  33. #include <core/task.h>
  34. #include <core/topology.h>
  35. int _starpu_mpi_choose_node(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
  36. {
  37. return STARPU_MAIN_RAM;
  38. /* TODO: this is completely untested */
  39. if (mode & STARPU_W)
  40. {
  41. /* TODO: lookup NIC location */
  42. /* Where to receive the data? */
  43. if (handle->home_node >= 0 && starpu_node_get_kind(handle->home_node) == STARPU_CPU_RAM)
  44. /* For now, better use the home node to avoid duplicates */
  45. return handle->home_node;
  46. if (starpu_memory_nodes_get_numa_count() == 1)
  47. return STARPU_MAIN_RAM;
  48. /* Several potential places */
  49. unsigned i;
  50. for (i = 0; i < STARPU_MAXNODES; i++)
  51. {
  52. /* TODO: we may want to take as a hint that it's allocated on the GPU as
  53. * a clue that we want to push to the GPU */
  54. if (starpu_node_get_kind(i) == STARPU_CPU_RAM &&
  55. handle->per_node[i].allocated)
  56. /* This node already has allocated buffers, let's just use it */
  57. return i;
  58. }
  59. /* No luck, take the least loaded node */
  60. starpu_ssize_t maximum = 0;
  61. starpu_ssize_t needed = _starpu_data_get_alloc_size(handle);
  62. unsigned node;
  63. for (i = 0; i < STARPU_MAXNODES; i++)
  64. {
  65. if (starpu_node_get_kind(i) == STARPU_CPU_RAM)
  66. {
  67. starpu_ssize_t size = starpu_memory_get_available(i);
  68. if (size >= needed && size > maximum)
  69. {
  70. node = i;
  71. maximum = size;
  72. }
  73. }
  74. }
  75. return node;
  76. }
  77. else
  78. {
  79. if (starpu_memory_nodes_get_numa_count() == 1)
  80. return STARPU_MAIN_RAM;
  81. /* Several potential places */
  82. unsigned i;
  83. for (i = 0; i < STARPU_MAXNODES; i++)
  84. {
  85. /* TODO: GPUDirect */
  86. if (starpu_node_get_kind(i) == STARPU_CPU_RAM &&
  87. handle->per_node[i].state != STARPU_INVALID)
  88. /* This node already has the value, let's just use it */
  89. /* TODO: rather pick up place next to NIC */
  90. return i;
  91. }
  92. /* No luck, take the least loaded node, to transfer from e.g. GPU */
  93. starpu_ssize_t maximum = 0;
  94. starpu_ssize_t needed = _starpu_data_get_alloc_size(handle);
  95. unsigned node;
  96. for (i = 0; i < STARPU_MAXNODES; i++)
  97. {
  98. if (starpu_node_get_kind(i) == STARPU_CPU_RAM)
  99. {
  100. starpu_ssize_t size = starpu_memory_get_available(i);
  101. if (size >= needed && size > maximum)
  102. {
  103. node = i;
  104. maximum = size;
  105. }
  106. }
  107. }
  108. return node;
  109. }
  110. }
  111. static void _starpu_mpi_acquired_callback(void *arg, int *nodep, enum starpu_data_access_mode mode)
  112. {
  113. struct _starpu_mpi_req *req = arg;
  114. int node = *nodep;
  115. /* The data was acquired in terms of dependencies, we can now look the
  116. * current state of the handle and decide which node we prefer for the data
  117. * fetch */
  118. if (node < 0)
  119. node = _starpu_mpi_choose_node(req->data_handle, mode);
  120. req->node = *nodep = node;
  121. }
  122. static void _starpu_mpi_isend_irecv_common(struct _starpu_mpi_req *req, enum starpu_data_access_mode mode, int sequential_consistency)
  123. {
  124. int node = -1;
  125. /* Asynchronously request StarPU to fetch the data in main memory: when
  126. * it is available in main memory, _starpu_mpi_submit_ready_request(req) is called and
  127. * the request is actually submitted */
  128. if (_starpu_mpi_mem_throttle && mode & STARPU_W && !req->data_handle->initialized)
  129. {
  130. /* We will trigger allocation, pre-reserve for it */
  131. size_t size = starpu_data_get_size(req->data_handle);
  132. if (size)
  133. {
  134. /* FIXME: rather take the less-loaded NUMA node */
  135. node = STARPU_MAIN_RAM;
  136. /* This will potentially block */
  137. starpu_memory_allocate(node, size, STARPU_MEMORY_WAIT);
  138. req->reserved_size = size;
  139. /* This also decides where we will store the data */
  140. req->node = node;
  141. }
  142. }
  143. if (sequential_consistency)
  144. {
  145. starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, node, mode, _starpu_mpi_acquired_callback, _starpu_mpi_submit_ready_request, (void *)req, 1 /*sequential consistency*/, 1, &req->pre_sync_jobid, &req->post_sync_jobid, req->prio);
  146. }
  147. else
  148. {
  149. /* post_sync_job_id has already been filled */
  150. starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, node, mode, _starpu_mpi_acquired_callback, _starpu_mpi_submit_ready_request, (void *)req, 0 /*sequential consistency*/, 1, &req->pre_sync_jobid, NULL, req->prio);
  151. }
  152. }
  153. static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, int prio, void (*callback)(void *), void *arg, int sequential_consistency)
  154. {
  155. if (STARPU_UNLIKELY(_starpu_mpi_fake_world_size != -1))
  156. {
  157. /* Don't actually do the communication */
  158. return NULL;
  159. }
  160. #ifdef STARPU_MPI_PEDANTIC_ISEND
  161. enum starpu_data_access_mode mode = STARPU_RW;
  162. #else
  163. enum starpu_data_access_mode mode = STARPU_R;
  164. #endif
  165. struct _starpu_mpi_req *req = _starpu_mpi_request_fill(data_handle, dest, data_tag, comm, detached, sync, prio, callback, arg, SEND_REQ, _mpi_backend._starpu_mpi_backend_isend_size_func, sequential_consistency, 0, 0);
  166. _starpu_mpi_req_willpost(req);
  167. if (_starpu_mpi_use_coop_sends && detached == 1 && sync == 0 && callback == NULL)
  168. {
  169. /* It's a send & forget send, we can perhaps optimize its distribution over several nodes */
  170. _starpu_mpi_coop_send(data_handle, req, mode, sequential_consistency);
  171. return req;
  172. }
  173. /* Post normally */
  174. _starpu_mpi_isend_irecv_common(req, mode, sequential_consistency);
  175. return req;
  176. }
  177. int starpu_mpi_isend_prio(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm)
  178. {
  179. _STARPU_MPI_LOG_IN();
  180. STARPU_MPI_ASSERT_MSG(public_req, "starpu_mpi_isend needs a valid starpu_mpi_req");
  181. struct _starpu_mpi_req *req;
  182. _STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(dest, data_tag, 0);
  183. req = _starpu_mpi_isend_common(data_handle, dest, data_tag, comm, 0, 0, prio, NULL, NULL, 1);
  184. _STARPU_MPI_TRACE_ISEND_COMPLETE_END(dest, data_tag, 0);
  185. STARPU_MPI_ASSERT_MSG(req, "Invalid return for _starpu_mpi_isend_common");
  186. *public_req = req;
  187. _STARPU_MPI_LOG_OUT();
  188. return 0;
  189. }
  190. int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm)
  191. {
  192. return starpu_mpi_isend_prio(data_handle, public_req, dest, data_tag, 0, comm);
  193. }
  194. int starpu_mpi_isend_detached_prio(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm, void (*callback)(void *), void *arg)
  195. {
  196. _STARPU_MPI_LOG_IN();
  197. _starpu_mpi_isend_common(data_handle, dest, data_tag, comm, 1, 0, prio, callback, arg, 1);
  198. _STARPU_MPI_LOG_OUT();
  199. return 0;
  200. }
  201. int starpu_mpi_isend_detached(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
  202. {
  203. return starpu_mpi_isend_detached_prio(data_handle, dest, data_tag, 0, comm, callback, arg);
  204. }
  205. int starpu_mpi_send_prio(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm)
  206. {
  207. starpu_mpi_req req;
  208. MPI_Status status;
  209. _STARPU_MPI_LOG_IN();
  210. starpu_mpi_isend_prio(data_handle, &req, dest, data_tag, prio, comm);
  211. memset(&status, 0, sizeof(MPI_Status));
  212. starpu_mpi_wait(&req, &status);
  213. _STARPU_MPI_LOG_OUT();
  214. return 0;
  215. }
  216. int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm)
  217. {
  218. return starpu_mpi_send_prio(data_handle, dest, data_tag, 0, comm);
  219. }
  220. int starpu_mpi_issend_prio(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm)
  221. {
  222. _STARPU_MPI_LOG_IN();
  223. STARPU_MPI_ASSERT_MSG(public_req, "starpu_mpi_issend needs a valid starpu_mpi_req");
  224. struct _starpu_mpi_req *req;
  225. req = _starpu_mpi_isend_common(data_handle, dest, data_tag, comm, 0, 1, prio, NULL, NULL, 1);
  226. STARPU_MPI_ASSERT_MSG(req, "Invalid return for _starpu_mpi_isend_common");
  227. *public_req = req;
  228. _STARPU_MPI_LOG_OUT();
  229. return 0;
  230. }
  231. int starpu_mpi_issend(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm)
  232. {
  233. return starpu_mpi_issend_prio(data_handle, public_req, dest, data_tag, 0, comm);
  234. }
  235. int starpu_mpi_issend_detached_prio(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm, void (*callback)(void *), void *arg)
  236. {
  237. _STARPU_MPI_LOG_IN();
  238. _starpu_mpi_isend_common(data_handle, dest, data_tag, comm, 1, 1, prio, callback, arg, 1);
  239. _STARPU_MPI_LOG_OUT();
  240. return 0;
  241. }
  242. int starpu_mpi_issend_detached(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
  243. {
  244. return starpu_mpi_issend_detached_prio(data_handle, dest, data_tag, 0, comm, callback, arg);
  245. }
  246. struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count, int prio)
  247. {
  248. if (_starpu_mpi_fake_world_size != -1)
  249. {
  250. /* Don't actually do the communication */
  251. return NULL;
  252. }
  253. struct _starpu_mpi_req *req = _starpu_mpi_request_fill(data_handle, source, data_tag, comm, detached, sync, prio, callback, arg, RECV_REQ, _mpi_backend._starpu_mpi_backend_irecv_size_func, sequential_consistency, is_internal_req, count);
  254. _starpu_mpi_req_willpost(req);
  255. if (sequential_consistency == 0)
  256. {
  257. /* Synchronization task jobid from redux is used */
  258. _starpu_mpi_redux_fill_post_sync_jobid(arg, &(req->post_sync_jobid));
  259. }
  260. _starpu_mpi_isend_irecv_common(req, STARPU_W, sequential_consistency);
  261. return req;
  262. }
  263. int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm)
  264. {
  265. _STARPU_MPI_LOG_IN();
  266. STARPU_MPI_ASSERT_MSG(public_req, "starpu_mpi_irecv needs a valid starpu_mpi_req");
  267. struct _starpu_mpi_req *req;
  268. _STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(source, data_tag);
  269. req = _starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 0, 0, NULL, NULL, 1, 0, 0, STARPU_DEFAULT_PRIO);
  270. _STARPU_MPI_TRACE_IRECV_COMPLETE_END(source, data_tag);
  271. STARPU_MPI_ASSERT_MSG(req, "Invalid return for _starpu_mpi_irecv_common");
  272. *public_req = req;
  273. _STARPU_MPI_LOG_OUT();
  274. return 0;
  275. }
  276. int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
  277. {
  278. _STARPU_MPI_LOG_IN();
  279. _starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, 1, 0, 0, STARPU_DEFAULT_PRIO);
  280. _STARPU_MPI_LOG_OUT();
  281. return 0;
  282. }
  283. int starpu_mpi_irecv_detached_prio(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm, void (*callback)(void *), void *arg)
  284. {
  285. _STARPU_MPI_LOG_IN();
  286. _starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, 1, 0, 0, prio);
  287. _STARPU_MPI_LOG_OUT();
  288. return 0;
  289. }
  290. int starpu_mpi_irecv_detached_sequential_consistency(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, void (*callback)(void *), void *arg, int sequential_consistency)
  291. {
  292. _STARPU_MPI_LOG_IN();
  293. _starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, sequential_consistency, 0, 0, STARPU_DEFAULT_PRIO);
  294. _STARPU_MPI_LOG_OUT();
  295. return 0;
  296. }
  297. int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, MPI_Status *status)
  298. {
  299. STARPU_ASSERT_MSG(status != NULL || status == MPI_STATUS_IGNORE, "MPI_Status value cannot be NULL or different from MPI_STATUS_IGNORE");
  300. starpu_mpi_req req;
  301. _STARPU_MPI_LOG_IN();
  302. starpu_mpi_irecv(data_handle, &req, source, data_tag, comm);
  303. starpu_mpi_wait(&req, status);
  304. _STARPU_MPI_LOG_OUT();
  305. return 0;
  306. }
  307. int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
  308. {
  309. STARPU_ASSERT_MSG(status != NULL || status == MPI_STATUS_IGNORE, "MPI_Status value cannot be NULL or different from MPI_STATUS_IGNORE");
  310. return _mpi_backend._starpu_mpi_backend_wait(public_req, status);
  311. }
  312. int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
  313. {
  314. return _mpi_backend._starpu_mpi_backend_test(public_req, flag, status);
  315. }
  316. int starpu_mpi_barrier(MPI_Comm comm)
  317. {
  318. return _mpi_backend._starpu_mpi_backend_barrier(comm);
  319. }
  320. void _starpu_mpi_data_clear(starpu_data_handle_t data_handle)
  321. {
  322. struct _starpu_mpi_data *data = data_handle->mpi_data;
  323. _mpi_backend._starpu_mpi_backend_data_clear(data_handle);
  324. _starpu_mpi_cache_data_clear(data_handle);
  325. _starpu_spin_destroy(&data->coop_lock);
  326. if (data->redux_map != REDUX_CONTRIB)
  327. free(data->redux_map);
  328. free(data);
  329. data_handle->mpi_data = NULL;
  330. }
  331. struct _starpu_mpi_data *_starpu_mpi_data_get(starpu_data_handle_t data_handle)
  332. {
  333. struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
  334. if (mpi_data)
  335. {
  336. STARPU_ASSERT(mpi_data->magic == 42);
  337. }
  338. else
  339. {
  340. _STARPU_CALLOC(mpi_data, 1, sizeof(struct _starpu_mpi_data));
  341. mpi_data->magic = 42;
  342. mpi_data->node_tag.data_tag = -1;
  343. mpi_data->node_tag.node.rank = -1;
  344. mpi_data->node_tag.node.comm = MPI_COMM_WORLD;
  345. _starpu_spin_init(&mpi_data->coop_lock);
  346. data_handle->mpi_data = mpi_data;
  347. _starpu_mpi_cache_data_init(data_handle);
  348. _starpu_data_set_unregister_hook(data_handle, _starpu_mpi_data_clear);
  349. }
  350. return mpi_data;
  351. }
  352. void starpu_mpi_data_register_comm(starpu_data_handle_t data_handle, starpu_mpi_tag_t data_tag, int rank, MPI_Comm comm)
  353. {
  354. struct _starpu_mpi_data *mpi_data = _starpu_mpi_data_get(data_handle);
  355. if (data_tag != -1)
  356. {
  357. _mpi_backend._starpu_mpi_backend_data_register(data_handle, data_tag);
  358. mpi_data->node_tag.data_tag = data_tag;
  359. _STARPU_MPI_TRACE_DATA_SET_TAG(data_handle, data_tag);
  360. }
  361. if (rank != -1)
  362. {
  363. _STARPU_MPI_TRACE_DATA_SET_RANK(data_handle, rank);
  364. mpi_data->node_tag.node.rank = rank;
  365. mpi_data->node_tag.node.comm = comm;
  366. }
  367. }
  368. void starpu_mpi_data_set_rank_comm(starpu_data_handle_t handle, int rank, MPI_Comm comm)
  369. {
  370. starpu_mpi_data_register_comm(handle, -1, rank, comm);
  371. }
  372. void starpu_mpi_data_set_tag(starpu_data_handle_t handle, starpu_mpi_tag_t data_tag)
  373. {
  374. starpu_mpi_data_register_comm(handle, data_tag, -1, MPI_COMM_WORLD);
  375. }
  376. int starpu_mpi_data_get_rank(starpu_data_handle_t data)
  377. {
  378. STARPU_ASSERT_MSG(data->mpi_data, "starpu_mpi_data_register MUST be called for data %p\n", data);
  379. return ((struct _starpu_mpi_data *)(data->mpi_data))->node_tag.node.rank;
  380. }
  381. starpu_mpi_tag_t starpu_mpi_data_get_tag(starpu_data_handle_t data)
  382. {
  383. STARPU_ASSERT_MSG(data->mpi_data, "starpu_mpi_data_register MUST be called for data %p\n", data);
  384. return ((struct _starpu_mpi_data *)(data->mpi_data))->node_tag.data_tag;
  385. }
  386. char* starpu_mpi_data_get_redux_map(starpu_data_handle_t data)
  387. {
  388. STARPU_ASSERT_MSG(data->mpi_data, "starpu_mpi_data_register MUST be called for data %p\n", data);
  389. return ((struct _starpu_mpi_data *)(data->mpi_data))->redux_map;
  390. }
  391. void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg)
  392. {
  393. int me, rank;
  394. starpu_mpi_tag_t data_tag;
  395. rank = starpu_mpi_data_get_rank(data_handle);
  396. if (rank == -1)
  397. {
  398. _STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register() or starpu_mpi_data_register_comm()\n");
  399. }
  400. starpu_mpi_comm_rank(comm, &me);
  401. if (node == rank)
  402. return;
  403. data_tag = starpu_mpi_data_get_tag(data_handle);
  404. if (data_tag == -1)
  405. {
  406. _STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register() or starpu_mpi_data_register_comm()\n");
  407. }
  408. if (me == node)
  409. {
  410. _STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
  411. int already_received = starpu_mpi_cached_receive_set(data_handle);
  412. if (already_received == 0)
  413. {
  414. _STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data_handle, rank);
  415. starpu_mpi_irecv_detached(data_handle, rank, data_tag, comm, callback, arg);
  416. }
  417. }
  418. else if (me == rank)
  419. {
  420. _STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
  421. int already_sent = starpu_mpi_cached_send_set(data_handle, node);
  422. if (already_sent == 0)
  423. {
  424. _STARPU_MPI_DEBUG(1, "Sending data %p to %d\n", data_handle, node);
  425. starpu_mpi_isend_detached(data_handle, node, data_tag, comm, NULL, NULL);
  426. }
  427. }
  428. }
  429. void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle, int node)
  430. {
  431. int me, rank;
  432. starpu_mpi_tag_t data_tag;
  433. rank = starpu_mpi_data_get_rank(data_handle);
  434. if (rank == -1)
  435. {
  436. _STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
  437. }
  438. starpu_mpi_comm_rank(comm, &me);
  439. if (node == rank)
  440. return;
  441. data_tag = starpu_mpi_data_get_tag(data_handle);
  442. if (data_tag == -1)
  443. {
  444. _STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
  445. }
  446. if (me == node)
  447. {
  448. MPI_Status status;
  449. _STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
  450. int already_received = starpu_mpi_cached_receive_set(data_handle);
  451. if (already_received == 0)
  452. {
  453. _STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data_handle, rank);
  454. starpu_mpi_recv(data_handle, rank, data_tag, comm, &status);
  455. }
  456. }
  457. else if (me == rank)
  458. {
  459. _STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
  460. int already_sent = starpu_mpi_cached_send_set(data_handle, node);
  461. if (already_sent == 0)
  462. {
  463. _STARPU_MPI_DEBUG(1, "Sending data %p to %d\n", data_handle, node);
  464. starpu_mpi_send(data_handle, node, data_tag, comm);
  465. }
  466. }
  467. }
  468. void starpu_mpi_get_data_on_all_nodes_detached(MPI_Comm comm, starpu_data_handle_t data_handle)
  469. {
  470. int size, i;
  471. starpu_mpi_comm_size(comm, &size);
  472. for (i = 0; i < size; i++)
  473. starpu_mpi_get_data_on_node_detached(comm, data_handle, i, NULL, NULL);
  474. }
  475. void starpu_mpi_data_migrate(MPI_Comm comm, starpu_data_handle_t data, int new_rank)
  476. {
  477. int old_rank = starpu_mpi_data_get_rank(data);
  478. if (new_rank == old_rank)
  479. /* Already there */
  480. return;
  481. /* First submit data migration if it's not already on destination */
  482. starpu_mpi_get_data_on_node_detached(comm, data, new_rank, NULL, NULL);
  483. /* And note new owner */
  484. starpu_mpi_data_set_rank_comm(data, new_rank, comm);
  485. /* Flush cache in all other nodes */
  486. /* TODO: Ideally we'd transmit the knowledge of who owns it */
  487. /* TODO: or at least remember that the previous owner has the data, that's an easy case to support */
  488. starpu_mpi_cache_flush(comm, data);
  489. return;
  490. }
  491. int starpu_mpi_wait_for_all(MPI_Comm comm)
  492. {
  493. return _mpi_backend._starpu_mpi_backend_wait_for_all(comm);
  494. }