starpu_mpi.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009-2020 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  4. * Copyright (C) 2019 Federal University of Rio Grande do Sul (UFRGS)
  5. *
  6. * StarPU is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU Lesser General Public License as published by
  8. * the Free Software Foundation; either version 2.1 of the License, or (at
  9. * your option) any later version.
  10. *
  11. * StarPU is distributed in the hope that it will be useful, but
  12. * WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14. *
  15. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  16. */
  17. #include <stdlib.h>
  18. #include <limits.h>
  19. #include <starpu_mpi.h>
  20. #include <starpu_mpi_datatype.h>
  21. #include <starpu_mpi_private.h>
  22. #include <starpu_mpi_cache.h>
  23. #include <starpu_profiling.h>
  24. #include <starpu_mpi_stats.h>
  25. #include <starpu_mpi_cache.h>
  26. #include <starpu_mpi_select_node.h>
  27. #include <starpu_mpi_init.h>
  28. #include <common/config.h>
  29. #include <common/thread.h>
  30. #include <datawizard/interfaces/data_interface.h>
  31. #include <datawizard/coherency.h>
  32. #include <core/simgrid.h>
  33. #include <core/task.h>
  34. #include <core/topology.h>
  35. #include <core/workers.h>
  36. static void _starpu_mpi_isend_irecv_common(struct _starpu_mpi_req *req, enum starpu_data_access_mode mode, int sequential_consistency)
  37. {
  38. /* Asynchronously request StarPU to fetch the data in main memory: when
  39. * it is available in main memory, _starpu_mpi_submit_ready_request(req) is called and
  40. * the request is actually submitted */
  41. if (_starpu_mpi_mem_throttle && mode & STARPU_W && !req->data_handle->initialized)
  42. {
  43. /* We will trigger allocation, pre-reserve for it */
  44. size_t size = starpu_data_get_size(req->data_handle);
  45. if (size)
  46. {
  47. /* This will potentially block */
  48. starpu_memory_allocate(STARPU_MAIN_RAM, size, STARPU_MEMORY_WAIT);
  49. req->reserved_size = size;
  50. }
  51. }
  52. if (sequential_consistency)
  53. {
  54. starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, STARPU_MAIN_RAM, mode, _starpu_mpi_submit_ready_request, (void *)req, 1 /*sequential consistency*/, 1, &req->pre_sync_jobid, &req->post_sync_jobid);
  55. }
  56. else
  57. {
  58. /* post_sync_job_id has already been filled */
  59. starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(req->data_handle, STARPU_MAIN_RAM, mode, _starpu_mpi_submit_ready_request, (void *)req, 0 /*sequential consistency*/, 1, &req->pre_sync_jobid, NULL);
  60. }
  61. }
  62. static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, int prio, void (*callback)(void *), void *arg, int sequential_consistency)
  63. {
  64. if (STARPU_UNLIKELY(_starpu_mpi_fake_world_size != -1))
  65. {
  66. /* Don't actually do the communication */
  67. return NULL;
  68. }
  69. #ifdef STARPU_MPI_PEDANTIC_ISEND
  70. enum starpu_data_access_mode mode = STARPU_RW;
  71. #else
  72. enum starpu_data_access_mode mode = STARPU_R;
  73. #endif
  74. struct _starpu_mpi_req *req = _starpu_mpi_request_fill(data_handle, dest, data_tag, comm, detached, sync, prio, callback, arg, SEND_REQ, _mpi_backend._starpu_mpi_backend_isend_size_func, sequential_consistency, 0, 0);
  75. _starpu_mpi_req_willpost(req);
  76. if (_starpu_mpi_use_coop_sends && detached == 1 && sync == 0 && callback == NULL)
  77. {
  78. /* It's a send & forget send, we can perhaps optimize its distribution over several nodes */
  79. _starpu_mpi_coop_send(data_handle, req, mode, sequential_consistency);
  80. return req;
  81. }
  82. /* Post normally */
  83. _starpu_mpi_isend_irecv_common(req, mode, sequential_consistency);
  84. return req;
  85. }
  86. int starpu_mpi_isend_prio(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm)
  87. {
  88. _STARPU_MPI_LOG_IN();
  89. STARPU_MPI_ASSERT_MSG(public_req, "starpu_mpi_isend needs a valid starpu_mpi_req");
  90. struct _starpu_mpi_req *req;
  91. _STARPU_MPI_TRACE_ISEND_COMPLETE_BEGIN(dest, data_tag, 0);
  92. req = _starpu_mpi_isend_common(data_handle, dest, data_tag, comm, 0, 0, prio, NULL, NULL, 1);
  93. _STARPU_MPI_TRACE_ISEND_COMPLETE_END(dest, data_tag, 0);
  94. STARPU_MPI_ASSERT_MSG(req, "Invalid return for _starpu_mpi_isend_common");
  95. *public_req = req;
  96. _STARPU_MPI_LOG_OUT();
  97. return 0;
  98. }
  99. int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm)
  100. {
  101. return starpu_mpi_isend_prio(data_handle, public_req, dest, data_tag, 0, comm);
  102. }
  103. int starpu_mpi_isend_detached_prio(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm, void (*callback)(void *), void *arg)
  104. {
  105. _STARPU_MPI_LOG_IN();
  106. _starpu_mpi_isend_common(data_handle, dest, data_tag, comm, 1, 0, prio, callback, arg, 1);
  107. _STARPU_MPI_LOG_OUT();
  108. return 0;
  109. }
  110. int starpu_mpi_isend_detached(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
  111. {
  112. return starpu_mpi_isend_detached_prio(data_handle, dest, data_tag, 0, comm, callback, arg);
  113. }
  114. int starpu_mpi_send_prio(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm)
  115. {
  116. starpu_mpi_req req;
  117. MPI_Status status;
  118. _STARPU_MPI_LOG_IN();
  119. starpu_mpi_isend_prio(data_handle, &req, dest, data_tag, prio, comm);
  120. memset(&status, 0, sizeof(MPI_Status));
  121. starpu_mpi_wait(&req, &status);
  122. _STARPU_MPI_LOG_OUT();
  123. return 0;
  124. }
  125. int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm)
  126. {
  127. return starpu_mpi_send_prio(data_handle, dest, data_tag, 0, comm);
  128. }
  129. int starpu_mpi_issend_prio(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm)
  130. {
  131. _STARPU_MPI_LOG_IN();
  132. STARPU_MPI_ASSERT_MSG(public_req, "starpu_mpi_issend needs a valid starpu_mpi_req");
  133. struct _starpu_mpi_req *req;
  134. req = _starpu_mpi_isend_common(data_handle, dest, data_tag, comm, 0, 1, prio, NULL, NULL, 1);
  135. STARPU_MPI_ASSERT_MSG(req, "Invalid return for _starpu_mpi_isend_common");
  136. *public_req = req;
  137. _STARPU_MPI_LOG_OUT();
  138. return 0;
  139. }
  140. int starpu_mpi_issend(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm)
  141. {
  142. return starpu_mpi_issend_prio(data_handle, public_req, dest, data_tag, 0, comm);
  143. }
  144. int starpu_mpi_issend_detached_prio(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm, void (*callback)(void *), void *arg)
  145. {
  146. _STARPU_MPI_LOG_IN();
  147. _starpu_mpi_isend_common(data_handle, dest, data_tag, comm, 1, 1, prio, callback, arg, 1);
  148. _STARPU_MPI_LOG_OUT();
  149. return 0;
  150. }
  151. int starpu_mpi_issend_detached(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
  152. {
  153. return starpu_mpi_issend_detached_prio(data_handle, dest, data_tag, 0, comm, callback, arg);
  154. }
  155. struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count)
  156. {
  157. if (_starpu_mpi_fake_world_size != -1)
  158. {
  159. /* Don't actually do the communication */
  160. return NULL;
  161. }
  162. struct _starpu_mpi_req *req = _starpu_mpi_request_fill(data_handle, source, data_tag, comm, detached, sync, 0, callback, arg, RECV_REQ, _mpi_backend._starpu_mpi_backend_irecv_size_func, sequential_consistency, is_internal_req, count);
  163. _starpu_mpi_req_willpost(req);
  164. if (sequential_consistency == 0)
  165. {
  166. /* Synchronization task jobid from redux is used */
  167. _starpu_mpi_redux_fill_post_sync_jobid(arg, &(req->post_sync_jobid));
  168. }
  169. _starpu_mpi_isend_irecv_common(req, STARPU_W, sequential_consistency);
  170. return req;
  171. }
  172. int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *public_req, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm)
  173. {
  174. _STARPU_MPI_LOG_IN();
  175. STARPU_MPI_ASSERT_MSG(public_req, "starpu_mpi_irecv needs a valid starpu_mpi_req");
  176. struct _starpu_mpi_req *req;
  177. _STARPU_MPI_TRACE_IRECV_COMPLETE_BEGIN(source, data_tag);
  178. req = _starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 0, 0, NULL, NULL, 1, 0, 0);
  179. _STARPU_MPI_TRACE_IRECV_COMPLETE_END(source, data_tag);
  180. STARPU_MPI_ASSERT_MSG(req, "Invalid return for _starpu_mpi_irecv_common");
  181. *public_req = req;
  182. _STARPU_MPI_LOG_OUT();
  183. return 0;
  184. }
  185. int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, void (*callback)(void *), void *arg)
  186. {
  187. _STARPU_MPI_LOG_IN();
  188. _starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, 1, 0, 0);
  189. _STARPU_MPI_LOG_OUT();
  190. return 0;
  191. }
  192. int starpu_mpi_irecv_detached_sequential_consistency(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, void (*callback)(void *), void *arg, int sequential_consistency)
  193. {
  194. _STARPU_MPI_LOG_IN();
  195. _starpu_mpi_irecv_common(data_handle, source, data_tag, comm, 1, 0, callback, arg, sequential_consistency, 0, 0);
  196. _STARPU_MPI_LOG_OUT();
  197. return 0;
  198. }
  199. int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, MPI_Status *status)
  200. {
  201. starpu_mpi_req req;
  202. _STARPU_MPI_LOG_IN();
  203. starpu_mpi_irecv(data_handle, &req, source, data_tag, comm);
  204. starpu_mpi_wait(&req, status);
  205. _STARPU_MPI_LOG_OUT();
  206. return 0;
  207. }
  208. int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
  209. {
  210. return _mpi_backend._starpu_mpi_backend_wait(public_req, status);
  211. }
  212. int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
  213. {
  214. return _mpi_backend._starpu_mpi_backend_test(public_req, flag, status);
  215. }
  216. int starpu_mpi_barrier(MPI_Comm comm)
  217. {
  218. return _mpi_backend._starpu_mpi_backend_barrier(comm);
  219. }
  220. void _starpu_mpi_data_clear(starpu_data_handle_t data_handle)
  221. {
  222. _mpi_backend._starpu_mpi_backend_data_clear(data_handle);
  223. _starpu_mpi_cache_data_clear(data_handle);
  224. free(data_handle->mpi_data);
  225. data_handle->mpi_data = NULL;
  226. }
  227. struct _starpu_mpi_data *_starpu_mpi_data_get(starpu_data_handle_t data_handle)
  228. {
  229. struct _starpu_mpi_data *mpi_data = data_handle->mpi_data;
  230. if (mpi_data)
  231. {
  232. STARPU_ASSERT(mpi_data->magic == 42);
  233. }
  234. else
  235. {
  236. _STARPU_CALLOC(mpi_data, 1, sizeof(struct _starpu_mpi_data));
  237. mpi_data->magic = 42;
  238. mpi_data->node_tag.data_tag = -1;
  239. mpi_data->node_tag.node.rank = -1;
  240. mpi_data->node_tag.node.comm = MPI_COMM_WORLD;
  241. _starpu_spin_init(&mpi_data->coop_lock);
  242. data_handle->mpi_data = mpi_data;
  243. _starpu_mpi_cache_data_init(data_handle);
  244. _starpu_data_set_unregister_hook(data_handle, _starpu_mpi_data_clear);
  245. }
  246. return mpi_data;
  247. }
  248. void starpu_mpi_data_register_comm(starpu_data_handle_t data_handle, starpu_mpi_tag_t data_tag, int rank, MPI_Comm comm)
  249. {
  250. struct _starpu_mpi_data *mpi_data = _starpu_mpi_data_get(data_handle);
  251. if (data_tag != -1)
  252. {
  253. _mpi_backend._starpu_mpi_backend_data_register(data_handle, data_tag);
  254. mpi_data->node_tag.data_tag = data_tag;
  255. _STARPU_MPI_TRACE_DATA_SET_TAG(data_handle, data_tag);
  256. }
  257. if (rank != -1)
  258. {
  259. _STARPU_MPI_TRACE_DATA_SET_RANK(data_handle, rank);
  260. mpi_data->node_tag.node.rank = rank;
  261. mpi_data->node_tag.node.comm = comm;
  262. }
  263. }
  264. void starpu_mpi_data_set_rank_comm(starpu_data_handle_t handle, int rank, MPI_Comm comm)
  265. {
  266. starpu_mpi_data_register_comm(handle, -1, rank, comm);
  267. }
  268. void starpu_mpi_data_set_tag(starpu_data_handle_t handle, starpu_mpi_tag_t data_tag)
  269. {
  270. starpu_mpi_data_register_comm(handle, data_tag, -1, MPI_COMM_WORLD);
  271. }
  272. int starpu_mpi_data_get_rank(starpu_data_handle_t data)
  273. {
  274. STARPU_ASSERT_MSG(data->mpi_data, "starpu_mpi_data_register MUST be called for data %p\n", data);
  275. return ((struct _starpu_mpi_data *)(data->mpi_data))->node_tag.node.rank;
  276. }
  277. starpu_mpi_tag_t starpu_mpi_data_get_tag(starpu_data_handle_t data)
  278. {
  279. STARPU_ASSERT_MSG(data->mpi_data, "starpu_mpi_data_register MUST be called for data %p\n", data);
  280. return ((struct _starpu_mpi_data *)(data->mpi_data))->node_tag.data_tag;
  281. }
  282. void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg)
  283. {
  284. int me, rank;
  285. starpu_mpi_tag_t data_tag;
  286. rank = starpu_mpi_data_get_rank(data_handle);
  287. if (rank == -1)
  288. {
  289. _STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register() or starpu_mpi_data_register_comm()\n");
  290. }
  291. starpu_mpi_comm_rank(comm, &me);
  292. if (node == rank)
  293. return;
  294. data_tag = starpu_mpi_data_get_tag(data_handle);
  295. if (data_tag == -1)
  296. {
  297. _STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register() or starpu_mpi_data_register_comm()\n");
  298. }
  299. if (me == node)
  300. {
  301. _STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
  302. int already_received = starpu_mpi_cached_receive_set(data_handle);
  303. if (already_received == 0)
  304. {
  305. _STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data_handle, rank);
  306. starpu_mpi_irecv_detached(data_handle, rank, data_tag, comm, callback, arg);
  307. }
  308. }
  309. else if (me == rank)
  310. {
  311. _STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
  312. int already_sent = starpu_mpi_cached_send_set(data_handle, node);
  313. if (already_sent == 0)
  314. {
  315. _STARPU_MPI_DEBUG(1, "Sending data %p to %d\n", data_handle, node);
  316. starpu_mpi_isend_detached(data_handle, node, data_tag, comm, NULL, NULL);
  317. }
  318. }
  319. }
  320. void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle, int node)
  321. {
  322. int me, rank;
  323. starpu_mpi_tag_t data_tag;
  324. rank = starpu_mpi_data_get_rank(data_handle);
  325. if (rank == -1)
  326. {
  327. _STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
  328. }
  329. starpu_mpi_comm_rank(comm, &me);
  330. if (node == rank)
  331. return;
  332. data_tag = starpu_mpi_data_get_tag(data_handle);
  333. if (data_tag == -1)
  334. {
  335. _STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
  336. }
  337. if (me == node)
  338. {
  339. MPI_Status status;
  340. _STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
  341. int already_received = starpu_mpi_cached_receive_set(data_handle);
  342. if (already_received == 0)
  343. {
  344. _STARPU_MPI_DEBUG(1, "Receiving data %p from %d\n", data_handle, rank);
  345. starpu_mpi_recv(data_handle, rank, data_tag, comm, &status);
  346. }
  347. }
  348. else if (me == rank)
  349. {
  350. _STARPU_MPI_DEBUG(1, "Migrating data %p from %d to %d\n", data_handle, rank, node);
  351. int already_sent = starpu_mpi_cached_send_set(data_handle, node);
  352. if (already_sent == 0)
  353. {
  354. _STARPU_MPI_DEBUG(1, "Sending data %p to %d\n", data_handle, node);
  355. starpu_mpi_send(data_handle, node, data_tag, comm);
  356. }
  357. }
  358. }
  359. void starpu_mpi_get_data_on_all_nodes_detached(MPI_Comm comm, starpu_data_handle_t data_handle)
  360. {
  361. int size, i;
  362. starpu_mpi_comm_size(comm, &size);
  363. for (i = 0; i < size; i++)
  364. starpu_mpi_get_data_on_node_detached(comm, data_handle, i, NULL, NULL);
  365. }
  366. void starpu_mpi_data_migrate(MPI_Comm comm, starpu_data_handle_t data, int new_rank)
  367. {
  368. int old_rank = starpu_mpi_data_get_rank(data);
  369. if (new_rank == old_rank)
  370. /* Already there */
  371. return;
  372. /* First submit data migration if it's not already on destination */
  373. starpu_mpi_get_data_on_node_detached(comm, data, new_rank, NULL, NULL);
  374. /* And note new owner */
  375. starpu_mpi_data_set_rank_comm(data, new_rank, comm);
  376. /* Flush cache in all other nodes */
  377. /* TODO: Ideally we'd transmit the knowledge of who owns it */
  378. /* TODO: or at least remember that the previous owner has the data, that's an easy case to support */
  379. starpu_mpi_cache_flush(comm, data);
  380. return;
  381. }
  382. int starpu_mpi_wait_for_all(MPI_Comm comm)
  383. {
  384. return _mpi_backend._starpu_mpi_backend_wait_for_all(comm);
  385. }