/* StarPU --- Runtime system for heterogeneous multicore architectures.
 *
 * Copyright (C) 2009-2021 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
 * Copyright (C) 2021 Federal University of Rio Grande do Sul (UFRGS)
 *
 * StarPU is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * StarPU is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * See the GNU Lesser General Public License in COPYING.LGPL for more details.
 */

#ifndef __STARPU_MPI_H__
#define __STARPU_MPI_H__

#include <starpu.h>

#if defined(STARPU_USE_MPI)

#include <mpi.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C"
{
#endif

/**
   @defgroup API_MPI_Support MPI Support
   @{
*/

/**
   @name Initialisation
   @{
*/

/**
   Initialize the StarPU library with the given \p conf, and
   initialize the StarPU-MPI library with the given MPI communicator
   \p comm. \p initialize_mpi indicates whether MPI should be
   initialized by StarPU or not. StarPU-MPI takes the opportunity to
   modify \p conf to either reserve a core for its MPI thread (by
   default), or execute MPI calls on the CPU driver 0 between tasks.
*/
int starpu_mpi_init_conf(int *argc, char ***argv, int initialize_mpi, MPI_Comm comm, struct starpu_conf *conf);
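
/* Illustrative sketch (not part of the upstream documentation): a typical
   initialization/shutdown sequence built around starpu_mpi_init_conf().
   \code{.c}
   int main(int argc, char **argv)
   {
           struct starpu_conf conf;
           starpu_conf_init(&conf);
           // let StarPU-MPI initialize MPI itself and tune 'conf'
           // (e.g. reserve a core for its MPI thread)
           int ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, &conf);
           if (ret != 0) return 1;
           // ... register data handles and submit tasks here ...
           // starpu_mpi_shutdown() cleans up StarPU-MPI (and, when initialized through
           // starpu_mpi_init_conf(), is expected to shut StarPU down as well -- check the
           // documentation of the StarPU version in use)
           starpu_mpi_shutdown();
           return 0;
   }
   \endcode
*/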

/**
   Same as starpu_mpi_init_conf(), except that this does not initialize the
   StarPU library. The caller thus has to call starpu_init() before this, and it
   cannot reserve a core for the MPI communications.
*/
int starpu_mpi_init_comm(int *argc, char ***argv, int initialize_mpi, MPI_Comm comm);

/**
   Call starpu_mpi_init_comm() with the MPI communicator \c MPI_COMM_WORLD.
*/
int starpu_mpi_init(int *argc, char ***argv, int initialize_mpi);

/**
   @deprecated
   This function is deprecated; use starpu_mpi_init() instead. It does
   not call \c MPI_Init(), which should therefore be called beforehand.
*/
int starpu_mpi_initialize(void) STARPU_DEPRECATED;

/**
   @deprecated
   This function is deprecated; use starpu_mpi_init() instead. MPI will
   be initialized by StarPU-MPI by calling <c>MPI_Init_thread(argc, argv,
   MPI_THREAD_SERIALIZED, ...)</c>.
*/
int starpu_mpi_initialize_extended(int *rank, int *world_size) STARPU_DEPRECATED;

/**
   Clean up the StarPU-MPI library. This must be called after calling any
   \c starpu_mpi functions and before the call to starpu_shutdown(),
   if any. \c MPI_Finalize() will be called if StarPU-MPI has been
   initialized by starpu_mpi_init().
*/
int starpu_mpi_shutdown(void);

/**
   Retrieve the current amount of communications from the current node
   in the array \p comm_amounts, which must have a size greater than or
   equal to the world size. Communications statistics must be enabled
   (see \ref STARPU_COMM_STATS).
*/
void starpu_mpi_comm_amounts_retrieve(size_t *comm_amounts);

/**
   Return in \p size the size of the communicator \p comm
*/
int starpu_mpi_comm_size(MPI_Comm comm, int *size);

/**
   Return in \p rank the rank of the calling process in the
   communicator \p comm
*/
int starpu_mpi_comm_rank(MPI_Comm comm, int *rank);

/**
   Return the rank of the calling process in the communicator \c
   MPI_COMM_WORLD
*/
int starpu_mpi_world_rank(void);

/**
   Return the size of the communicator \c MPI_COMM_WORLD
*/
int starpu_mpi_world_size(void);

/**
   When given to the function starpu_mpi_comm_get_attr(), retrieve the
   value of the upper bound for tag values.
*/
#define STARPU_MPI_TAG_UB MPI_TAG_UB

/**
   Retrieve an attribute value by key, similarly to the MPI function
   \c MPI_Comm_get_attr(), except that the value is a pointer to
   int64_t instead of int. If an attribute is attached on \p comm to
   \p keyval, then the call returns \p flag equal to \c 1, and the
   attribute value in \p attribute_val. Otherwise, \p flag is set to
   \c 0.
*/
int starpu_mpi_comm_get_attr(MPI_Comm comm, int keyval, void *attribute_val, int *flag);

/**
   Get the logical index of the core where the MPI thread is bound.
*/
int starpu_mpi_get_thread_cpuid(void);

int starpu_mpi_get_communication_tag(void);
void starpu_mpi_set_communication_tag(int tag);

/** @} */

/**
   @name Communication
   \anchor MPIPtpCommunication
   @{
*/

/**
   Opaque type for communication requests.
*/
typedef void *starpu_mpi_req;

/**
   Type of the message tag.
*/
typedef int64_t starpu_mpi_tag_t;

/**
   Post a standard-mode, non-blocking send of \p data_handle to the
   node \p dest using the message tag \p data_tag within the
   communicator \p comm. After the call, the pointer to the request \p
   req can be used to test or to wait for the completion of the
   communication.
*/
int starpu_mpi_isend(starpu_data_handle_t data_handle, starpu_mpi_req *req, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm);

/**
   Similar to starpu_mpi_isend(), but takes a priority \p prio.
*/
int starpu_mpi_isend_prio(starpu_data_handle_t data_handle, starpu_mpi_req *req, int dest, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm);

/**
   Post a non-blocking receive in \p data_handle from the node \p
   source using the message tag \p data_tag within the communicator \p
   comm. After the call, the pointer to the request \p req can be used
   to test or to wait for the completion of the communication.
*/
int starpu_mpi_irecv(starpu_data_handle_t data_handle, starpu_mpi_req *req, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm);

/**
   Perform a standard-mode, blocking send of \p data_handle to the
   node \p dest using the message tag \p data_tag within the
   communicator \p comm.
*/
int starpu_mpi_send(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm);

/**
   Similar to starpu_mpi_send(), but takes a priority \p prio.
*/
int starpu_mpi_send_prio(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm);

/**
   Perform a standard-mode, blocking receive in \p data_handle from
   the node \p source using the message tag \p data_tag within the
   communicator \p comm.
   The value of \p status cannot be NULL; use the predefined value
   MPI_STATUS_IGNORE to ignore the status.
*/
int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, MPI_Status *status);
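
/* Illustrative sketch (not from the upstream documentation): rank 0 sends a
   registered variable to rank 1 with the blocking calls above. The tag value
   0x42 is arbitrary.
   \code{.c}
   float value = 42.f;
   starpu_data_handle_t handle;
   starpu_variable_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)&value, sizeof(value));
   if (starpu_mpi_world_rank() == 0)
           starpu_mpi_send(handle, 1, 0x42, MPI_COMM_WORLD);
   else if (starpu_mpi_world_rank() == 1)
           starpu_mpi_recv(handle, 0, 0x42, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
   \endcode
*/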

/**
   Post a standard-mode, non-blocking send of \p data_handle to the
   node \p dest using the message tag \p data_tag within the
   communicator \p comm. On completion, the \p callback function is
   called with the argument \p arg.
   Similarly to the pthread detached functionality, when a detached
   communication completes, its resources are automatically released
   back to the system; there is no need to test or to wait for the
   completion of the request.
*/
int starpu_mpi_isend_detached(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm, void (*callback)(void *), void *arg);

/**
   Similar to starpu_mpi_isend_detached(), but takes a priority \p prio.
*/
int starpu_mpi_isend_detached_prio(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm, void (*callback)(void *), void *arg);

/**
   Post a non-blocking receive in \p data_handle from the node \p
   source using the message tag \p data_tag within the communicator \p
   comm. On completion, the \p callback function is called with the
   argument \p arg.
   Similarly to the pthread detached functionality, when a detached
   communication completes, its resources are automatically released
   back to the system; there is no need to test or to wait for the
   completion of the request.
*/
int starpu_mpi_irecv_detached(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, void (*callback)(void *), void *arg);
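
/* Illustrative sketch (not from the upstream documentation): a detached
   exchange between ranks 0 and 1; 'handle' is assumed to be a registered data
   handle and the tag value is arbitrary. No wait/test is needed afterwards.
   \code{.c}
   void recv_done(void *arg)
   {
           // called by StarPU-MPI when the detached receive completes
           int *done = arg;
           *done = 1;
   }

   // ...
   int done = 0;
   if (starpu_mpi_world_rank() == 0)
           starpu_mpi_isend_detached(handle, 1, 0x17, MPI_COMM_WORLD, NULL, NULL);
   else if (starpu_mpi_world_rank() == 1)
           starpu_mpi_irecv_detached(handle, 0, 0x17, MPI_COMM_WORLD, recv_done, &done);
   \endcode
*/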

/**
   Similar to starpu_mpi_irecv_detached(), but takes a priority \p prio.
*/
int starpu_mpi_irecv_detached_prio(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm, void (*callback)(void *), void *arg);

/**
   Post a non-blocking receive in \p data_handle from the node \p
   source using the message tag \p data_tag within the communicator \p
   comm. On completion, the \p callback function is called with the
   argument \p arg.
   The parameter \p sequential_consistency allows enabling or disabling
   the sequential consistency for \p data_handle (sequential
   consistency will be enabled or disabled based on the value of the
   parameter \p sequential_consistency and the value of the sequential
   consistency defined for \p data_handle).
   Similarly to the pthread detached functionality, when a detached
   communication completes, its resources are automatically released
   back to the system; there is no need to test or to wait for the
   completion of the request.
*/
int starpu_mpi_irecv_detached_sequential_consistency(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, void (*callback)(void *), void *arg, int sequential_consistency);

/**
   Perform a synchronous-mode, non-blocking send of \p data_handle to
   the node \p dest using the message tag \p data_tag within the
   communicator \p comm.
*/
int starpu_mpi_issend(starpu_data_handle_t data_handle, starpu_mpi_req *req, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm);

/**
   Similar to starpu_mpi_issend(), but takes a priority \p prio.
*/
int starpu_mpi_issend_prio(starpu_data_handle_t data_handle, starpu_mpi_req *req, int dest, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm);

/**
   Perform a synchronous-mode, non-blocking send of \p data_handle to
   the node \p dest using the message tag \p data_tag within the
   communicator \p comm. On completion, the \p callback function is
   called with the argument \p arg.
   Similarly to the pthread detached functionality, when a detached
   communication completes, its resources are automatically released
   back to the system; there is no need to test or to wait for the
   completion of the request.
*/
int starpu_mpi_issend_detached(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm, void (*callback)(void *), void *arg);

/**
   Similar to starpu_mpi_issend_detached(), but takes a priority \p prio.
*/
int starpu_mpi_issend_detached_prio(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm, void (*callback)(void *), void *arg);

/**
   Return when the operation identified by request \p req is complete.
   The value of \p status cannot be NULL; use the predefined value
   MPI_STATUS_IGNORE to ignore the status.
*/
int starpu_mpi_wait(starpu_mpi_req *req, MPI_Status *status);
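
/* Illustrative sketch (not from the upstream documentation): the non-blocking
   variants paired with starpu_mpi_wait(); 'handle' is assumed to be a
   registered data handle and the tag value is arbitrary.
   \code{.c}
   starpu_mpi_req req;
   if (starpu_mpi_world_rank() == 0)
   {
           starpu_mpi_isend(handle, &req, 1, 0x24, MPI_COMM_WORLD);
           // ... overlap computation with the communication ...
           starpu_mpi_wait(&req, MPI_STATUS_IGNORE);
   }
   else if (starpu_mpi_world_rank() == 1)
   {
           starpu_mpi_irecv(handle, &req, 0, 0x24, MPI_COMM_WORLD);
           // ... overlap computation with the communication ...
           starpu_mpi_wait(&req, MPI_STATUS_IGNORE);
   }
   \endcode
*/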

/**
   If the operation identified by \p req is complete, set \p flag to
   1. The \p status object is set to contain information on the
   completed operation.
*/
int starpu_mpi_test(starpu_mpi_req *req, int *flag, MPI_Status *status);

/**
   Block the caller until all group members of the communicator \p
   comm have called it.
*/
int starpu_mpi_barrier(MPI_Comm comm);

/**
   Wait until all StarPU tasks and communications for the given
   communicator are completed.
*/
int starpu_mpi_wait_for_all(MPI_Comm comm);

/**
   Post a standard-mode, non-blocking send of \p data_handle to the
   node \p dest using the message tag \p data_tag within the
   communicator \p comm. On completion, \p tag is unlocked.
*/
int starpu_mpi_isend_detached_unlock_tag(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm, starpu_tag_t tag);

/**
   Similar to starpu_mpi_isend_detached_unlock_tag(), but takes a
   priority \p prio.
*/
int starpu_mpi_isend_detached_unlock_tag_prio(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, int prio, MPI_Comm comm, starpu_tag_t tag);

/**
   Post a non-blocking receive in \p data_handle from the node \p
   source using the message tag \p data_tag within the communicator \p
   comm. On completion, \p tag is unlocked.
*/
int starpu_mpi_irecv_detached_unlock_tag(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, starpu_tag_t tag);

/**
   Post \p array_size standard-mode, non-blocking sends. The n-th post
   sends the n-th data of the array \p data_handle to the n-th node of
   the array \p dest using the n-th message tag of the array \p
   data_tag within the n-th communicator of the array \p comm. On
   completion of all the requests, \p tag is unlocked.
*/
int starpu_mpi_isend_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *dest, starpu_mpi_tag_t *data_tag, MPI_Comm *comm, starpu_tag_t tag);

/**
   Similar to starpu_mpi_isend_array_detached_unlock_tag(), but takes a
   priority \p prio.
*/
int starpu_mpi_isend_array_detached_unlock_tag_prio(unsigned array_size, starpu_data_handle_t *data_handle, int *dest, starpu_mpi_tag_t *data_tag, int *prio, MPI_Comm *comm, starpu_tag_t tag);

/**
   Post \p array_size non-blocking receives. The n-th post receives in
   the n-th data of the array \p data_handle from the n-th node of the
   array \p source using the n-th message tag of the array \p data_tag
   within the n-th communicator of the array \p comm. On completion of
   all the requests, \p tag is unlocked.
*/
int starpu_mpi_irecv_array_detached_unlock_tag(unsigned array_size, starpu_data_handle_t *data_handle, int *source, starpu_mpi_tag_t *data_tag, MPI_Comm *comm, starpu_tag_t tag);

typedef int (*starpu_mpi_datatype_allocate_func_t)(starpu_data_handle_t, MPI_Datatype *);
typedef int (*starpu_mpi_datatype_node_allocate_func_t)(starpu_data_handle_t, unsigned node, MPI_Datatype *);
typedef void (*starpu_mpi_datatype_free_func_t)(MPI_Datatype *);

/**
   Register functions to create and free an MPI datatype for the given
   handle.
   Similar to starpu_mpi_interface_datatype_register().
   It is important that the function is called before any
   communication can take place for a data with the given handle. See
   \ref ExchangingUserDefinedDataInterface for an example.
*/
int starpu_mpi_datatype_register(starpu_data_handle_t handle, starpu_mpi_datatype_allocate_func_t allocate_datatype_func, starpu_mpi_datatype_free_func_t free_datatype_func);
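
/* Illustrative sketch (not from the upstream documentation): registering
   datatype functions for a handle whose data is assumed to live in one
   contiguous buffer, so a plain MPI_Type_contiguous over bytes is enough.
   The return-value convention of the allocation function is an assumption.
   \code{.c}
   int my_allocate_datatype(starpu_data_handle_t handle, MPI_Datatype *datatype)
   {
           int size = (int)starpu_data_get_size(handle);
           // describe the handle as 'size' contiguous bytes
           if (MPI_Type_contiguous(size, MPI_BYTE, datatype) != MPI_SUCCESS)
                   return -1;
           return MPI_Type_commit(datatype) == MPI_SUCCESS ? 0 : -1;
   }

   void my_free_datatype(MPI_Datatype *datatype)
   {
           MPI_Type_free(datatype);
   }

   // ...
   starpu_mpi_datatype_register(handle, my_allocate_datatype, my_free_datatype);
   \endcode
*/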

/**
   Register functions to create and free an MPI datatype for the given
   interface id.
   Similar to starpu_mpi_datatype_register().
   It is important that the function is called before any
   communication can take place for a data with the given handle. See
   \ref ExchangingUserDefinedDataInterface for an example.
*/
int starpu_mpi_interface_datatype_register(enum starpu_data_interface_id id, starpu_mpi_datatype_allocate_func_t allocate_datatype_func, starpu_mpi_datatype_free_func_t free_datatype_func);

/**
   Register functions to create and free an MPI datatype for the given
   handle.
   Similar to starpu_mpi_interface_datatype_node_register().
   It is important that the function is called before any
   communication can take place for a data with the given handle. See
   \ref ExchangingUserDefinedDataInterface for an example.
*/
int starpu_mpi_datatype_node_register(starpu_data_handle_t handle, starpu_mpi_datatype_node_allocate_func_t allocate_datatype_func, starpu_mpi_datatype_free_func_t free_datatype_func);

/**
   Register functions to create and free an MPI datatype for the given
   interface id.
   Similar to starpu_mpi_datatype_node_register().
   It is important that the function is called before any
   communication can take place for a data with the given handle. See
   \ref ExchangingUserDefinedDataInterface for an example.
*/
int starpu_mpi_interface_datatype_node_register(enum starpu_data_interface_id id, starpu_mpi_datatype_node_allocate_func_t allocate_datatype_func, starpu_mpi_datatype_free_func_t free_datatype_func);

/**
   Unregister the MPI datatype functions stored for the interface of
   the given handle.
*/
int starpu_mpi_datatype_unregister(starpu_data_handle_t handle);

/**
   Unregister the MPI datatype functions stored for the interface of
   the given interface id. Similar to starpu_mpi_datatype_unregister().
*/
int starpu_mpi_interface_datatype_unregister(enum starpu_data_interface_id id);

/** @} */

/**
   @name Communication Cache
   @{
*/

/**
   Return 1 if the communication cache is enabled, 0 otherwise.
*/
int starpu_mpi_cache_is_enabled();

/**
   If \p enabled is 1, enable the communication cache. Otherwise,
   clean the cache if it was enabled and disable it.
*/
int starpu_mpi_cache_set(int enabled);

/**
   Clear the send and receive communication cache for the data \p
   data_handle and invalidate the value. The function has to be called
   at the same point of task graph submission by all the MPI nodes on
   which the handle was registered. The function does nothing if the
   cache mechanism is disabled (see \ref STARPU_MPI_CACHE).
*/
void starpu_mpi_cache_flush(MPI_Comm comm, starpu_data_handle_t data_handle);

/**
   Clear the send and receive communication cache for all data and
   invalidate their values. The function has to be called at the same
   point of task graph submission by all the MPI nodes. The function
   does nothing if the cache mechanism is disabled (see \ref
   STARPU_MPI_CACHE).
*/
void starpu_mpi_cache_flush_all_data(MPI_Comm comm);

/**
   Test whether \p data_handle is cached for reception, i.e. the value
   was previously received from the owner node, and not flushed since
   then.
*/
int starpu_mpi_cached_receive(starpu_data_handle_t data_handle);

/**
 * If \p data is already available in the reception cache, return 1.
 * If \p data is NOT available in the reception cache, add it to the
 * cache and return 0.
 * Return 0 if the communication cache is not enabled.
 */
int starpu_mpi_cached_receive_set(starpu_data_handle_t data);

/**
 * Remove \p data from the reception cache.
 */
void starpu_mpi_cached_receive_clear(starpu_data_handle_t data);

/**
   Test whether \p data_handle is cached for emission to node \p dest,
   i.e. the value was previously sent to \p dest, and not flushed
   since then.
*/
int starpu_mpi_cached_send(starpu_data_handle_t data_handle, int dest);

/**
 * If \p data is already available in the emission cache for node
 * \p dest, return 1.
 * If \p data is NOT available in the emission cache for node \p dest,
 * add it to the cache and return 0.
 * Return 0 if the communication cache is not enabled.
 */
int starpu_mpi_cached_send_set(starpu_data_handle_t data, int dest);

/**
 * Remove \p data from the emission cache.
 */
void starpu_mpi_cached_send_clear(starpu_data_handle_t data);

/** @} */

/**
   @name MPI Insert Task
   \anchor MPIInsertTask
   @{
*/

/**
   Can be used as rank when calling starpu_mpi_data_register() and
   alike, to specify that the data is per-node: each node will have
   its own value. Tasks writing to such data will be replicated on all
   nodes (and all parameters then have to be per-node). Tasks not
   writing to such data will just take the node-local value without
   any MPI communication.
*/
#define STARPU_MPI_PER_NODE -2

/**
   Register to MPI a StarPU data handle with the given tag, rank and
   MPI communicator. It also automatically clears the MPI
   communication cache when unregistering the data.
*/
void starpu_mpi_data_register_comm(starpu_data_handle_t data_handle, starpu_mpi_tag_t data_tag, int rank, MPI_Comm comm);

/**
   Register to MPI a StarPU data handle with the given tag, rank and
   the MPI communicator \c MPI_COMM_WORLD.
   It also automatically clears the MPI communication cache when
   unregistering the data.
*/
#define starpu_mpi_data_register(data_handle, data_tag, rank) starpu_mpi_data_register_comm(data_handle, data_tag, rank, MPI_COMM_WORLD)
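
/* Illustrative sketch (not from the upstream documentation): registering a
   vector handle to StarPU and then to StarPU-MPI. Only the owner (rank 0 here)
   provides real memory; NX and 'vector' are placeholders.
   \code{.c}
   starpu_data_handle_t handle;
   if (starpu_mpi_world_rank() == 0)
           starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));
   else
           starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, NX, sizeof(float));
   // tag 42 identifies this handle in MPI messages; rank 0 owns the data
   starpu_mpi_data_register(handle, 42, 0);
   \endcode
*/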

/**
   Register to MPI a StarPU data handle with the given tag. No rank
   will be defined.
   It also automatically clears the MPI communication cache when
   unregistering the data.
*/
void starpu_mpi_data_set_tag(starpu_data_handle_t handle, starpu_mpi_tag_t data_tag);

/**
   Symbol kept for backward compatibility. Call function starpu_mpi_data_set_tag().
*/
#define starpu_data_set_tag starpu_mpi_data_set_tag

/**
   Register to MPI a StarPU data handle with the given rank and given
   communicator. No tag will be defined.
   It also automatically clears the MPI communication cache when
   unregistering the data.
*/
void starpu_mpi_data_set_rank_comm(starpu_data_handle_t handle, int rank, MPI_Comm comm);

/**
   Register to MPI a StarPU data handle with the given rank and the
   MPI communicator \c MPI_COMM_WORLD. No tag will be defined.
   It also automatically clears the MPI communication cache when
   unregistering the data.
*/
#define starpu_mpi_data_set_rank(handle, rank) starpu_mpi_data_set_rank_comm(handle, rank, MPI_COMM_WORLD)

/**
   Symbol kept for backward compatibility. Call function starpu_mpi_data_set_rank().
*/
#define starpu_data_set_rank starpu_mpi_data_set_rank

/**
   Return the rank of the given data.
*/
int starpu_mpi_data_get_rank(starpu_data_handle_t handle);

/**
   Symbol kept for backward compatibility. Call function starpu_mpi_data_get_rank().
*/
#define starpu_data_get_rank starpu_mpi_data_get_rank

/**
   Return the tag of the given data.
*/
starpu_mpi_tag_t starpu_mpi_data_get_tag(starpu_data_handle_t handle);

/**
   Return the redux map of the given data.
*/
char* starpu_mpi_data_get_redux_map(starpu_data_handle_t handle);

/**
   Symbol kept for backward compatibility. Call function starpu_mpi_data_get_tag().
*/
#define starpu_data_get_tag starpu_mpi_data_get_tag

/**
   Create and submit a task corresponding to \p codelet with the
   following arguments. The argument list must be zero-terminated.
   The arguments following the codelet are the same types as for the
   function starpu_task_insert().
   Access modes for data can also be set with ::STARPU_SSEND to
   specify that the data has to be sent using a synchronous and
   non-blocking mode (see starpu_mpi_issend()).
   The extra argument ::STARPU_EXECUTE_ON_NODE followed by an integer
   allows specifying the MPI node to execute the codelet. It is also
   possible to specify that the node owning a specific data will
   execute the codelet, by using ::STARPU_EXECUTE_ON_DATA followed by
   a data handle.
   The internal algorithm is as follows:
   <ol>
   <li>
   Find out which MPI node is going to execute the codelet.
   <ul>
   <li>
   If there is only one node owning data in ::STARPU_W mode, it
   will be selected;
   <li>
   If there are several nodes owning data in ::STARPU_W mode, a
   node will be selected according to a given node selection
   policy (see ::STARPU_NODE_SELECTION_POLICY or
   starpu_mpi_node_selection_set_current_policy());
   <li>
   The argument ::STARPU_EXECUTE_ON_NODE followed by an integer
   can be used to specify the node;
   <li>
   The argument ::STARPU_EXECUTE_ON_DATA followed by a data handle
   can be used to specify that the node owning the given data will
   execute the codelet.
   </ul>
   </li>
   <li>
   Send and receive data as requested. Nodes owning data which need to
   be read by the task send them to the MPI node which will execute
   it. The latter receives them.
   </li>
   <li>
   Execute the codelet. This is done by the MPI node selected in the
   1st step of the algorithm.
   </li>
   <li>
   If several MPI nodes own data to be written to, send the written
   data back to their owners.
   </li>
   </ol>
   The algorithm also includes a communication cache mechanism that
   avoids sending data twice to the same MPI node, unless the data
   has been modified. The cache can be disabled (see \ref
   STARPU_MPI_CACHE).
*/
int starpu_mpi_task_insert(MPI_Comm comm, struct starpu_codelet *codelet, ...);
#ifdef STARPU_USE_FXT
#define starpu_mpi_task_insert(comm, cl, ...) \
	starpu_mpi_task_insert((comm), (cl), STARPU_TASK_FILE, __FILE__, STARPU_TASK_LINE, __LINE__, ##__VA_ARGS__)
#endif
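
/* Illustrative sketch (not from the upstream documentation): inserting a task
   over distributed data; 'cl' is assumed to be a codelet taking one read-only
   and one read-write buffer. Every participating node calls this with the same
   arguments; StarPU-MPI posts the required transfers automatically.
   \code{.c}
   starpu_mpi_task_insert(MPI_COMM_WORLD, &cl,
                          STARPU_R, handle_a,
                          STARPU_RW, handle_b,
                          0);
   \endcode
*/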

/**
   Call starpu_mpi_task_insert(). Symbol kept for backward compatibility.
*/
int starpu_mpi_insert_task(MPI_Comm comm, struct starpu_codelet *codelet, ...);
#ifdef STARPU_USE_FXT
#define starpu_mpi_insert_task(comm, cl, ...) \
	starpu_mpi_insert_task((comm), (cl), STARPU_TASK_FILE, __FILE__, STARPU_TASK_LINE, __LINE__, ##__VA_ARGS__)
#endif

/**
   Create a task corresponding to \p codelet with the following given
   arguments. The argument list must be zero-terminated. The function
   performs the first two steps of the function
   starpu_mpi_task_insert(), i.e. submitting the MPI communications
   needed before the execution of the task, and the creation of the
   task on one node. Only the MPI node selected in the first step of
   the algorithm will return a valid task structure which can then be
   submitted; the others will return <c>NULL</c>. The function
   starpu_mpi_task_post_build() MUST be called after that on all
   nodes, and after the submission of the task on the node which
   creates it, with the SAME list of arguments.
*/
struct starpu_task *starpu_mpi_task_build(MPI_Comm comm, struct starpu_codelet *codelet, ...);
#ifdef STARPU_USE_FXT
#define starpu_mpi_task_build(comm, cl, ...) \
	starpu_mpi_task_build((comm), (cl), STARPU_TASK_FILE, __FILE__, STARPU_TASK_LINE, __LINE__, ##__VA_ARGS__)
#endif

/**
   MUST be called after a call to starpu_mpi_task_build(),
   with the SAME list of arguments. Perform the fourth -- and last --
   step of the algorithm described in starpu_mpi_task_insert().
*/
int starpu_mpi_task_post_build(MPI_Comm comm, struct starpu_codelet *codelet, ...);
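
/* Illustrative sketch (not from the upstream documentation): the build /
   post-build pair, which lets the selected node tweak the task before
   submitting it; 'cl' and 'handle' are placeholders.
   \code{.c}
   struct starpu_task *task = starpu_mpi_task_build(MPI_COMM_WORLD, &cl,
                                                    STARPU_RW, handle,
                                                    0);
   if (task)
   {
           // only the executing node gets a non-NULL task and submits it
           task->priority = 42;
           starpu_task_submit(task);
   }
   starpu_mpi_task_post_build(MPI_COMM_WORLD, &cl,
                              STARPU_RW, handle,
                              0);
   \endcode
*/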

/**
   Transfer data \p data_handle to MPI node \p node, sending it from
   its owner if needed. At least the target node and the owner have to
   call the function.
*/
void starpu_mpi_get_data_on_node(MPI_Comm comm, starpu_data_handle_t data_handle, int node);

/**
   Transfer data \p data_handle to MPI node \p node, sending it from
   its owner if needed. At least the target node and the owner have to
   call the function. On reception, the \p callback function is called
   with the argument \p arg.
*/
void starpu_mpi_get_data_on_node_detached(MPI_Comm comm, starpu_data_handle_t data_handle, int node, void (*callback)(void*), void *arg);

/**
   Transfer data \p data_handle to all MPI nodes, sending it from its
   owner if needed. All nodes have to call the function.
*/
void starpu_mpi_get_data_on_all_nodes_detached(MPI_Comm comm, starpu_data_handle_t data_handle);

/**
   Submit migration of the data onto the \p new_rank MPI node. This
   means both submitting the transfer of the data to node \p new_rank
   if it hasn't been submitted already, and setting the home node of
   the data to the new node. Further data transfers submitted by
   starpu_mpi_task_insert() will be done from that new node. This
   function thus needs to be called on all nodes which have registered
   the data, at the same point of task submission. This also flushes
   the cache for this data to avoid inconsistencies.
*/
void starpu_mpi_data_migrate(MPI_Comm comm, starpu_data_handle_t handle, int new_rank);

/** @} */

/**
   @name Node Selection Policy
   \anchor MPINodeSelectionPolicy
   @{
*/

/**
   Define the current policy.
*/
#define STARPU_MPI_NODE_SELECTION_CURRENT_POLICY -1

/**
   Define the policy in which the selected node is the one having the
   most data in ::STARPU_R mode.
*/
#define STARPU_MPI_NODE_SELECTION_MOST_R_DATA 0

typedef int (*starpu_mpi_select_node_policy_func_t)(int me, int nb_nodes, struct starpu_data_descr *descr, int nb_data);

/**
   Register a new policy which can then be used when there are several
   nodes owning data in ::STARPU_W mode.
   Here is an example of a function defining a node selection policy.
   The codelet will be executed on the node owning the first data with
   a size bigger than 1 MB, or on node 0 if no data is big enough.
   \code{.c}
   int my_node_selection_policy(int me, int nb_nodes, struct starpu_data_descr *descr, int nb_data)
   {
           // me is the current MPI rank
           // nb_nodes is the number of MPI nodes
           // descr is the description of the data specified when calling starpu_mpi_task_insert
           // nb_data is the number of data in descr
           int i;
           for(i = 0 ; i < nb_data ; i++)
           {
                   starpu_data_handle_t data = descr[i].handle;
                   enum starpu_data_access_mode mode = descr[i].mode;
                   if (mode & STARPU_R)
                   {
                           int rank = starpu_data_get_rank(data);
                           size_t size = starpu_data_get_size(data);
                           if (size > 1024*1024) return rank;
                   }
           }
           return 0;
   }
   \endcode
*/
int starpu_mpi_node_selection_register_policy(starpu_mpi_select_node_policy_func_t policy_func);

/**
   Unregister a previously registered policy.
*/
int starpu_mpi_node_selection_unregister_policy(int policy);

/**
   Return the current policy used to select the node which will
   execute the codelet.
*/
int starpu_mpi_node_selection_get_current_policy();

/**
   Set the current policy used to select the node which will execute
   the codelet. The policy ::STARPU_MPI_NODE_SELECTION_MOST_R_DATA
   selects the node having the most data in ::STARPU_R mode so as to
   minimize the amount of data to be transferred.
*/
int starpu_mpi_node_selection_set_current_policy(int policy);
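
/* Illustrative sketch (not from the upstream documentation): registering the
   my_node_selection_policy() function shown above and using it either globally
   or for a single insertion; 'cl' and the handles are placeholders.
   \code{.c}
   int policy = starpu_mpi_node_selection_register_policy(my_node_selection_policy);

   // make it the default policy ...
   starpu_mpi_node_selection_set_current_policy(policy);

   // ... or request it for one particular task insertion
   starpu_mpi_task_insert(MPI_COMM_WORLD, &cl,
                          STARPU_NODE_SELECTION_POLICY, policy,
                          STARPU_W, handle_a, STARPU_W, handle_b,
                          0);
   \endcode
*/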

/** @} */

/**
   @name Collective Operations
   \anchor MPICollectiveOperations
   @{
*/

/**
   Perform a reduction on the given data \p data_handle. All nodes send
   the data to its owner node which will perform a reduction.
*/
void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle);
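
/* Illustrative sketch (not from the upstream documentation): every node
   contributes to 'handle' through tasks accessing it in STARPU_REDUX mode,
   then the partial results are combined on the owner. 'redux_cl', 'init_cl'
   and 'contrib_cl' are placeholder codelets.
   \code{.c}
   // tell StarPU how to initialize and combine per-node contributions
   starpu_data_set_reduction_methods(handle, &redux_cl, &init_cl);

   // each node accumulates into its local copy
   starpu_mpi_task_insert(MPI_COMM_WORLD, &contrib_cl, STARPU_REDUX, handle, 0);

   // combine the contributions on the node owning 'handle'
   starpu_mpi_redux_data(MPI_COMM_WORLD, handle);
   \endcode
*/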

/**
   Similar to starpu_mpi_redux_data(), but takes a priority \p prio.
*/
void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle, int prio);

/**
   Perform a reduction on the given data \p data_handle.
   Nodes perform the reduction in a tree-based fashion.
   The tree used is an \p arity -ary tree.
*/
void starpu_mpi_redux_data_tree(MPI_Comm comm, starpu_data_handle_t data_handle, int arity);

/**
   Similar to starpu_mpi_redux_data_tree(), but takes a priority \p prio.
*/
void starpu_mpi_redux_data_prio_tree(MPI_Comm comm, starpu_data_handle_t data_handle, int prio, int arity);

/**
   Scatter data among processes of the communicator based on the
   ownership of the data. For each data of the array \p data_handles,
   the process \p root sends the data to the process owning this data.
   Processes receiving data must have valid data handles to receive
   them. On completion of the collective communication, the \p
   scallback function is called with the argument \p sarg on the
   process \p root, and the \p rcallback function is called with the
   argument \p rarg on any other process.
*/
int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg);

/**
   Gather data from the different processes of the communicator onto
   the process \p root. Each process owning a data handle in the array
   \p data_handles will send it to the process \p root. The process
   \p root must have valid data handles to receive the data. On
   completion of the collective communication, the \p rcallback
   function is called with the argument \p rarg on the process \p root,
   and the \p scallback function is called with the argument \p sarg on
   any other process.
*/
int starpu_mpi_gather_detached(starpu_data_handle_t *data_handles, int count, int root, MPI_Comm comm, void (*scallback)(void *), void *sarg, void (*rcallback)(void *), void *rarg);
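
/* Illustrative sketch (not from the upstream documentation): rank 0 scatters
   initial values to the owners of the handles in 'data_handles', tasks run,
   and the results are gathered back; 'count' is the number of handles and no
   callbacks are used here.
   \code{.c}
   starpu_mpi_scatter_detached(data_handles, count, 0, MPI_COMM_WORLD,
                               NULL, NULL, NULL, NULL);
   // ... insert tasks working on the scattered data ...
   starpu_mpi_gather_detached(data_handles, count, 0, MPI_COMM_WORLD,
                              NULL, NULL, NULL, NULL);
   \endcode
*/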

/** @} */

int starpu_mpi_pre_submit_hook_register(void (*f)(struct starpu_task *));
int starpu_mpi_pre_submit_hook_unregister();

/** @} */

#ifdef __cplusplus
}
#endif

#endif // STARPU_USE_MPI
#endif // __STARPU_MPI_H__