starpu_mpi.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009-2020 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  4. * Copyright (C) 2019 Federal University of Rio Grande do Sul (UFRGS)
  5. *
  6. * StarPU is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU Lesser General Public License as published by
  8. * the Free Software Foundation; either version 2.1 of the License, or (at
  9. * your option) any later version.
  10. *
  11. * StarPU is distributed in the hope that it will be useful, but
  12. * WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14. *
  15. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  16. */
  17. #include <stdlib.h>
  18. #include <limits.h>
  19. #include <starpu_mpi.h>
  20. #include <starpu_mpi_datatype.h>
  21. #include <starpu_mpi_private.h>
  22. #include <starpu_mpi_cache.h>
  23. #include <starpu_profiling.h>
  24. #include <starpu_mpi_stats.h>
  25. #include <starpu_mpi_cache.h>
  26. #include <starpu_mpi_select_node.h>
  27. #include <starpu_mpi_init.h>
  28. #include <common/config.h>
  29. #include <common/thread.h>
  30. #include <datawizard/interfaces/data_interface.h>
  31. #include <datawizard/coherency.h>
  32. #include <core/simgrid.h>
  33. #include <core/task.h>
  34. #include <core/topology.h>
  35. #include <core/workers.h>
  36. static void _starpu_mpi_isend_irecv_common(struct _starpu_mpi_req *req, enum starpu_data_access_mode mode, int sequential_consistency)
  37. {
  38. /* Asynchronously request StarPU to fetch the data in main memory: when
  39. * it is available in main memory, _starpu_mpi_submit_ready_request(req) is called and
  40. * the request is actually submitted */
  41. if (_starpu_mpi_mem_throttle && mode & STARPU_W && !req->data_handle->initialized)
  42. {
  43. /* We will trigger allocation, pre-reserve for it */
  44. size_t size = starpu_data_get_size(req->data_handle);
  45. if (size)
  46. {
  47. /* This will potentially block */
  48. starpu_memory_allocate(STARPU_MAIN_RAM, size, STARPU_MEMORY_WAIT);
  49. req->reserved_size = size;
  50. }
  51. }
  52. starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, STARPU_MAIN_RAM, mode, _starpu_mpi_submit_ready_request, (void *)req, sequential_consistency, 1, &req->pre_sync_jobid, &req->post_sync_jobid);
  53. }
  54. static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, int prio, void (*callback)(void *), void *arg, int sequential_consistency)
  55. {
  56. if (STARPU_UNLIKELY(_starpu_mpi_fake_world_size != -1))
  57. {
  58. /* Don't actually do the communication */
  59. return NULL;
  60. }
  61. #ifdef STARPU_MPI_PEDANTIC_ISEND
  62. enum starpu_data_access_mode mode = STARPU_RW;
  63. #else
  64. enum starpu_data_access_mode mode = STARPU_R;
  65. #endif
  66. struct _starpu_mpi_req *req = _starpu_mpi_request_fill(data_handle, dest, data_tag, comm, detached, sync, prio, callback, arg, SEND_REQ, _mpi_backend._starpu_mpi_backend_isend_size_func, sequential_consistency, 0, 0);
  67. _starpu_mpi_req_willpost(req);
  68. if (_starpu_mpi_use_coop_sends && detached == 1 && sync == 0 && callback == NULL)
  69. {
  70. /* It's a send & forget send, we can perhaps optimize its distribution over several nodes */
  71. _starpu_mpi_coop_send(data_handle, req, mode, sequential_consistency);
  72. return req;
  73. }
  74. /* Post normally */
  75. _starpu_mpi_isend_irecv_common(req, mode, sequential_consistency);
  76. return req;
  77. }
  78. int starpu_mpi_isend_prio(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm)
  79. {
  80. _STARPU_MPI_LOG_IN();
  81. STARPU_MPI_ASSERT_MSG(public_req, "starpu_mpi_isend needs a valid starpu_mpi_req");
  82. struct _starpu_mpi_req *req;
  83. _STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(dest, data_tag, 0);
  84. req = _starpu_mpi_isend_common(data_handle, dest, data_tag, comm, 0, 0, prio, NULL, NULL, 1);
  85. _STARPU_MPI_TRACE_ISEND_COMPLETE_END(dest, data_tag, 0);
  86. STARPU_MPI_ASSERT_MSG(req, "Invalid return for _starpu_mpi_isend_common");
  87. *public_req = req;
  88. _STARPU_MPI_LOG_OUT();
  89. return 0;
  90. }
  91. int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm)
  92. {
  93. return starpu_mpi_isend_prio(data_handle, public_req, dest, data_tag, 0, comm);
  94. }
  95. int starpu_mpi_isend_detached_prio(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm, void (*callback)(void *), void *arg)
  96. {
  97. _STARPU_MPI_LOG_IN();
  98. _starpu_mpi_isend_common(data_handle, dest, data_tag, comm, 1, 0, prio, callback, arg, 1);
  99. _STARPU_MPI_LOG_OUT();
  100. return 0;
  101. }
  102. int starpu_mpi_isend_detached(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
  103. {
  104. return starpu_mpi_isend_detached_prio(data_handle, dest, data_tag, 0, comm, callback, arg);
  105. }
  106. int starpu_mpi_send_prio(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm)
  107. {
  108. starpu_mpi_req req;
  109. MPI_Status status;
  110. _STARPU_MPI_LOG_IN();
  111. starpu_mpi_isend_prio(data_handle, &req, dest, data_tag, prio, comm);
  112. memset(&status, 0, sizeof(MPI_Status));
  113. starpu_mpi_wait(&req, &status);
  114. _STARPU_MPI_LOG_OUT();
  115. return 0;
  116. }
  117. int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm)
  118. {
  119. return starpu_mpi_send_prio(data_handle, dest, data_tag, 0, comm);
  120. }
  121. int starpu_mpi_issend_prio(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm)
  122. {
  123. _STARPU_MPI_LOG_IN();
  124. STARPU_MPI_ASSERT_MSG(public_req, "starpu_mpi_issend needs a valid starpu_mpi_req");
  125. struct _starpu_mpi_req *req;
  126. req = _starpu_mpi_isend_common(data_handle, dest, data_tag, comm, 0, 1, prio, NULL, NULL, 1);
  127. STARPU_MPI_ASSERT_MSG(req, "Invalid return for _starpu_mpi_isend_common");
  128. *public_req = req;
  129. _STARPU_MPI_LOG_OUT();
  130. return 0;
  131. }
  132. int starpu_mpi_issend(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm)
  133. {
  134. return starpu_mpi_issend_prio(data_handle, public_req, dest, data_tag, 0, comm);
  135. }
  136. int starpu_mpi_issend_detached_prio(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm, void (*callback)(void *), void *arg)
  137. {
  138. _STARPU_MPI_LOG_IN();
  139. _starpu_mpi_isend_common(data_handle, dest, data_tag, comm, 1, 1, prio, callback, arg, 1);
  140. _STARPU_MPI_LOG_OUT();
  141. return 0;
  142. }
  143. int starpu_mpi_issend_detached(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
  144. {
  145. return starpu_mpi_issend_detached_prio(data_handle, dest, data_tag, 0, comm, callback, arg);
  146. }
  147. struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count)
  148. {
  149. if (_starpu_mpi_fake_world_size != -1)
  150. {
  151. /* Don't actually do the communication */
  152. return NULL;
  153. }
  154. struct _starpu_mpi_req *req = _starpu_mpi_request_fill(data_handle, source, data_tag, comm, detached, sync, 0, callback, arg, RECV_REQ, _mpi_backend._starpu_mpi_backend_irecv_size_func, sequential_consistency, is_internal_req, count);
  155. _starpu_mpi_req_willpost(req);
  156. _starpu_mpi_isend_irecv_common(req, STARPU_W, sequential_consistency);
  157. return req;
  158. }
  159. int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm)
  160. {
  161. _STARPU_MPI_LOG_IN();
  162. STARPU_MPI_ASSERT_MSG(public_req, "starpu_mpi_irecv needs a valid starpu_mpi_req");
  163. struct _starpu_mpi_req *req;
  164. _STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(source, data_tag);
  165. req = _starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 0, 0, NULL, NULL, 1, 0, 0);
  166. _STARPU_MPI_TRACE_IRECV_COMPLETE_END(source, data_tag);
  167. STARPU_MPI_ASSERT_MSG(req, "Invalid return for _starpu_mpi_irecv_common");
  168. *public_req = req;
  169. _STARPU_MPI_LOG_OUT();
  170. return 0;
  171. }
  172. int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
  173. {
  174. _STARPU_MPI_LOG_IN();
  175. _starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, 1, 0, 0);
  176. _STARPU_MPI_LOG_OUT();
  177. return 0;
  178. }
  179. int starpu_mpi_irecv_detached_sequential_consistency(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, void (*callback)(void *), void *arg, int sequential_consistency)
  180. {
  181. _STARPU_MPI_LOG_IN();
  182. _starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, sequential_consistency, 0, 0);
  183. _STARPU_MPI_LOG_OUT();
  184. return 0;
  185. }
  186. int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, MPI_Status *status)
  187. {
  188. starpu_mpi_req req;
  189. _STARPU_MPI_LOG_IN();
  190. starpu_mpi_irecv(data_handle, &req, source, data_tag, comm);
  191. starpu_mpi_wait(&req, status);
  192. _STARPU_MPI_LOG_OUT();
  193. return 0;
  194. }
  195. int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
  196. {
  197. return _mpi_backend._starpu_mpi_backend_wait(public_req, status);
  198. }
  199. int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
  200. {
  201. return _mpi_backend._starpu_mpi_backend_test(public_req, flag, status);
  202. }
  203. int starpu_mpi_barrier(MPI_Comm comm)
  204. {
  205. return _mpi_backend._starpu_mpi_backend_barrier(comm);
  206. }
  207. void _starpu_mpi_data_clear(starpu_data_handle_t data_handle)
  208. {
  209. _mpi_backend._starpu_mpi_backend_data_clear(data_handle);
  210. _starpu_mpi_cache_data_clear(data_handle);
  211. free(data_handle->mpi_data);
  212. data_handle->mpi_data = NULL;
  213. }
  214. struct _starpu_mpi_data *_starpu_mpi_data_get(starpu_data_handle_t data_handle)
  215. {
  216. struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
  217. if (mpi_data)
  218. {
  219. STARPU_ASSERT(mpi_data->magic == 42);
  220. }
  221. else
  222. {
  223. _STARPU_CALLOC(mpi_data, 1, sizeof(struct _starpu_mpi_data));
  224. mpi_data->magic = 42;
  225. mpi_data->node_tag.data_tag = -1;
  226. mpi_data->node_tag.node.rank = -1;
  227. mpi_data->node_tag.node.comm = MPI_COMM_WORLD;
  228. _starpu_spin_init(&mpi_data->coop_lock);
  229. data_handle->mpi_data = mpi_data;
  230. _starpu_mpi_cache_data_init(data_handle);
  231. _starpu_data_set_unregister_hook(data_handle, _starpu_mpi_data_clear);
  232. }
  233. return mpi_data;
  234. }
  235. void starpu_mpi_data_register_comm(starpu_data_handle_t data_handle, starpu_mpi_tag_t data_tag, int rank, MPI_Comm comm)
  236. {
  237. struct _starpu_mpi_data *mpi_data = _starpu_mpi_data_get(data_handle);
  238. if (data_tag != -1)
  239. {
  240. _mpi_backend._starpu_mpi_backend_data_register(data_handle, data_tag);
  241. mpi_data->node_tag.data_tag = data_tag;
  242. _STARPU_MPI_TRACE_DATA_SET_TAG(data_handle, data_tag);
  243. }
  244. if (rank != -1)
  245. {
  246. _STARPU_MPI_TRACE_DATA_SET_RANK(data_handle, rank);
  247. mpi_data->node_tag.node.rank = rank;
  248. mpi_data->node_tag.node.comm = comm;
  249. }
  250. }
  251. void starpu_mpi_data_set_rank_comm(starpu_data_handle_t handle, int rank, MPI_Comm comm)
  252. {
  253. starpu_mpi_data_register_comm(handle, -1, rank, comm);
  254. }
  255. void starpu_mpi_data_set_tag(starpu_data_handle_t handle, starpu_mpi_tag_t data_tag)
  256. {
  257. starpu_mpi_data_register_comm(handle, data_tag, -1, MPI_COMM_WORLD);
  258. }
  259. int starpu_mpi_data_get_rank(starpu_data_handle_t data)
  260. {
  261. STARPU_ASSERT_MSG(data->mpi_data, "starpu_mpi_data_register MUST be called for data %p\n", data);
  262. return ((struct _starpu_mpi_data *)(data->mpi_data))->node_tag.node.rank;
  263. }
  264. starpu_mpi_tag_t starpu_mpi_data_get_tag(starpu_data_handle_t data)
  265. {
  266. STARPU_ASSERT_MSG(data->mpi_data, "starpu_mpi_data_register MUST be called for data %p\n", data);
  267. return ((struct _starpu_mpi_data *)(data->mpi_data))->node_tag.data_tag;
  268. }
  269. void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg)
  270. {
  271. int me, rank;
  272. starpu_mpi_tag_t data_tag;
  273. rank = starpu_mpi_data_get_rank(data_handle);
  274. if (rank == -1)
  275. {
  276. _STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register() or starpu_mpi_data_register_comm()\n");
  277. }
  278. starpu_mpi_comm_rank(comm, &me);
  279. if (node == rank)
  280. return;
  281. data_tag = starpu_mpi_data_get_tag(data_handle);
  282. if (data_tag == -1)
  283. {
  284. _STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register() or starpu_mpi_data_register_comm()\n");
  285. }
  286. if (me == node)
  287. {
  288. _STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
  289. int already_received = starpu_mpi_cached_receive_set(data_handle);
  290. if (already_received == 0)
  291. {
  292. _STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data_handle, rank);
  293. starpu_mpi_irecv_detached(data_handle, rank, data_tag, comm, callback, arg);
  294. }
  295. }
  296. else if (me == rank)
  297. {
  298. _STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
  299. int already_sent = starpu_mpi_cached_send_set(data_handle, node);
  300. if (already_sent == 0)
  301. {
  302. _STARPU_MPI_DEBUG(1, "Sending data %p to %d\n", data_handle, node);
  303. starpu_mpi_isend_detached(data_handle, node, data_tag, comm, NULL, NULL);
  304. }
  305. }
  306. }
  307. void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle, int node)
  308. {
  309. int me, rank;
  310. starpu_mpi_tag_t data_tag;
  311. rank = starpu_mpi_data_get_rank(data_handle);
  312. if (rank == -1)
  313. {
  314. _STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
  315. }
  316. starpu_mpi_comm_rank(comm, &me);
  317. if (node == rank)
  318. return;
  319. data_tag = starpu_mpi_data_get_tag(data_handle);
  320. if (data_tag == -1)
  321. {
  322. _STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
  323. }
  324. if (me == node)
  325. {
  326. MPI_Status status;
  327. _STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
  328. int already_received = starpu_mpi_cached_receive_set(data_handle);
  329. if (already_received == 0)
  330. {
  331. _STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data_handle, rank);
  332. starpu_mpi_recv(data_handle, rank, data_tag, comm, &status);
  333. }
  334. }
  335. else if (me == rank)
  336. {
  337. _STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
  338. int already_sent = starpu_mpi_cached_send_set(data_handle, node);
  339. if (already_sent == 0)
  340. {
  341. _STARPU_MPI_DEBUG(1, "Sending data %p to %d\n", data_handle, node);
  342. starpu_mpi_send(data_handle, node, data_tag, comm);
  343. }
  344. }
  345. }
  346. void starpu_mpi_get_data_on_all_nodes_detached(MPI_Comm comm, starpu_data_handle_t data_handle)
  347. {
  348. int size, i;
  349. starpu_mpi_comm_size(comm, &size);
  350. for (i = 0; i < size; i++)
  351. starpu_mpi_get_data_on_node_detached(comm, data_handle, i, NULL, NULL);
  352. }
  353. void starpu_mpi_data_migrate(MPI_Comm comm, starpu_data_handle_t data, int new_rank)
  354. {
  355. int old_rank = starpu_mpi_data_get_rank(data);
  356. if (new_rank == old_rank)
  357. /* Already there */
  358. return;
  359. /* First submit data migration if it's not already on destination */
  360. starpu_mpi_get_data_on_node_detached(comm, data, new_rank, NULL, NULL);
  361. /* And note new owner */
  362. starpu_mpi_data_set_rank_comm(data, new_rank, comm);
  363. /* Flush cache in all other nodes */
  364. /* TODO: Ideally we'd transmit the knowledge of who owns it */
  365. /* TODO: or at least remember that the previous owner has the data, that's an easy case to support */
  366. starpu_mpi_cache_flush(comm, data);
  367. return;
  368. }
  369. int starpu_mpi_wait_for_all(MPI_Comm comm)
  370. {
  371. return _mpi_backend._starpu_mpi_backend_wait_for_all(comm);
  372. }