starpu_mpi_private.h 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2013,2016,2017 Inria
  4. * Copyright (C) 2010-2017, 2019 CNRS
  5. * Copyright (C) 2010-2018 Université de Bordeaux
  6. *
  7. * StarPU is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU Lesser General Public License as published by
  9. * the Free Software Foundation; either version 2.1 of the License, or (at
  10. * your option) any later version.
  11. *
  12. * StarPU is distributed in the hope that it will be useful, but
  13. * WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  15. *
  16. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  17. */
  18. #ifndef __STARPU_MPI_PRIVATE_H__
  19. #define __STARPU_MPI_PRIVATE_H__
  20. #include <starpu.h>
  21. #include <common/config.h>
  22. #include <common/uthash.h>
  23. #include <starpu_mpi.h>
  24. #include <starpu_mpi_fxt.h>
  25. #include <common/list.h>
  26. #include <common/prio_list.h>
  27. #include <common/starpu_spinlock.h>
  28. #include <core/simgrid.h>
  29. #if defined(STARPU_USE_MPI_NMAD)
  30. #include <pioman.h>
  31. #include <nm_sendrecv_interface.h>
  32. #include <nm_session_interface.h>
  33. #endif
  34. #ifdef __cplusplus
  35. extern "C"
  36. {
  37. #endif
  38. #ifdef STARPU_SIMGRID
  39. starpu_pthread_wait_t wait;
  40. starpu_pthread_queue_t dontsleep;
  41. struct _starpu_simgrid_mpi_req
  42. {
  43. MPI_Request *request;
  44. MPI_Status *status;
  45. starpu_pthread_queue_t *queue;
  46. unsigned *done;
  47. };
  48. int _starpu_mpi_simgrid_mpi_test(unsigned *done, int *flag);
  49. void _starpu_mpi_simgrid_wait_req(MPI_Request *request, MPI_Status *status, starpu_pthread_queue_t *queue, unsigned *done);
  50. #endif
  /* Rank used as prefix by the logging macros of this header; they query
   * starpu_mpi_comm_rank() to fill it in while it is still -1. */
  extern int _starpu_debug_rank;

  char *_starpu_mpi_get_mpi_error_code(int code);

  /* Tested by _STARPU_MPI_COMM_DEBUG (verbose builds) before tracing a
   * communication. */
  extern int _starpu_mpi_comm_debug;

  #ifdef STARPU_MPI_VERBOSE
  /* _STARPU_MPI_DEBUG only prints for levels within [min, max]. */
  extern int _starpu_debug_level_min;
  extern int _starpu_debug_level_max;
  void _starpu_mpi_set_debug_level_min(int level);
  void _starpu_mpi_set_debug_level_max(int level);
  #endif

  /* Settings of the MPI layer, defined elsewhere.
   * NOTE(review): presumably set up by _starpu_mpi_env_init() -- confirm. */
  extern int _starpu_mpi_fake_world_size;
  extern int _starpu_mpi_fake_world_rank;
  extern int _starpu_mpi_use_prio;
  extern int _starpu_mpi_thread_cpuid;
  extern int _starpu_mpi_use_coop_sends;

  void _starpu_mpi_env_init(void);
  /* STARPU_MPI_ASSERT_MSG(x, msg, ...): check x and, when it does not hold,
   * print a printf-style message (msg must be a string literal, it is pasted
   * into the format) prefixed with the MPI rank and the current function.
   *
   * Three variants:
   *  - STARPU_NO_ASSERT: x is compiled (type-checked) but never evaluated;
   *  - CUDA compiler on Windows: the message is printed, then the process is
   *    crashed by writing through a NULL pointer instead of calling assert();
   *  - default: the message is printed, then assert(x) is executed. Note that
   *    x is therefore evaluated twice (once here, once by assert()), so it
   *    must not have side effects.
   */
  #ifdef STARPU_NO_ASSERT
  # define STARPU_MPI_ASSERT_MSG(x, msg, ...) do { if (0) { (void) (x); }} while(0)
  #else
  # if defined(__CUDACC__) && defined(STARPU_HAVE_WINDOWS)
  int _starpu_debug_rank;
  # define STARPU_MPI_ASSERT_MSG(x, msg, ...) \
      do \
      { \
          if (STARPU_UNLIKELY(!(x))) \
          { \
              if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
              fprintf(stderr, "\n[%d][starpu_mpi][%s][assert failure] " msg "\n\n", _starpu_debug_rank, __starpu_func__, ## __VA_ARGS__); *(int*)NULL = 0; \
          } \
      } while(0)
  # else
  # define STARPU_MPI_ASSERT_MSG(x, msg, ...) \
      do \
      { \
          if (STARPU_UNLIKELY(!(x))) \
          { \
              if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
              fprintf(stderr, "\n[%d][starpu_mpi][%s][assert failure] " msg "\n\n", _starpu_debug_rank, __starpu_func__, ## __VA_ARGS__); \
          } \
          assert(x); \
      } while(0)
  # endif
  #endif
  /* Allocation wrappers: perform the allocation into ptr and report a failure
   * through STARPU_MPI_ASSERT_MSG together with the requested byte count.
   *
   * Every macro argument is parenthesized where it takes part in an
   * expression, so that compound arguments (e.g. `n + 1` for nmemb) yield the
   * right byte count in the message and ptr may be any lvalue expression.
   * The size arguments may still be evaluated more than once (allocation call
   * and message), so they must not have side effects.
   *
   * _STARPU_MPI_REALLOC only overwrites ptr after the result has been
   * checked, so the original pointer is not lost before the assertion fires.
   */
  #define _STARPU_MPI_MALLOC(ptr, size) \
      do \
      { \
          (ptr) = malloc(size); \
          STARPU_MPI_ASSERT_MSG((ptr) != NULL, "Cannot allocate %ld bytes\n", (long) (size)); \
      } while (0)
  #define _STARPU_MPI_CALLOC(ptr, nmemb, size) \
      do \
      { \
          (ptr) = calloc(nmemb, size); \
          STARPU_MPI_ASSERT_MSG((ptr) != NULL, "Cannot allocate %ld bytes\n", (long) ((nmemb)*(size))); \
      } while (0)
  #define _STARPU_MPI_REALLOC(ptr, size) \
      do \
      { \
          void *_new_ptr = realloc(ptr, size); \
          STARPU_MPI_ASSERT_MSG(_new_ptr != NULL, "Cannot reallocate %ld bytes\n", (long) (size)); \
          (ptr) = _new_ptr; \
      } while (0)
  /* Debug traces, only active when STARPU_MPI_VERBOSE is defined.
   *
   *  - _STARPU_MPI_COMM_DEBUG prints one line per communication when
   *    _starpu_mpi_comm_debug is non-zero: rank, direction (`way`), peer node,
   *    tags, communicator name, buffer pointer, count and datatype size.
   *  - _STARPU_MPI_COMM_TO_DEBUG / _STARPU_MPI_COMM_FROM_DEBUG are the same
   *    trace with the direction fixed to "-->" / "<--".
   *  - _STARPU_MPI_DEBUG prints a message when output is not silenced and
   *    level lies within [_starpu_debug_level_min, _starpu_debug_level_max];
   *    the line is indented proportionally to the rank.
   *
   * All variants expand to a single `do { ... } while(0)` statement WITHOUT a
   * trailing semicolon, exactly like the disabled variants, so that
   * `if (c) _STARPU_MPI_DEBUG(...); else ...` compiles the same way in both
   * build flavours. Callers supply the semicolon.
   */
  #ifdef STARPU_MPI_VERBOSE
  # define _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, node, tag, utag, comm, way) \
      do \
      { \
          if (_starpu_mpi_comm_debug) \
          { \
              int __size; \
              char _comm_name[128]; \
              int _comm_name_len; \
              int _rank; \
              starpu_mpi_comm_rank(comm, &_rank); \
              MPI_Type_size(datatype, &__size); \
              MPI_Comm_get_name(comm, _comm_name, &_comm_name_len); \
              fprintf(stderr, "[%d][starpu_mpi] :%d:%s:%d:%d:%ld:%s:%p:%ld:%d:%s:%d\n", _rank, _rank, way, node, tag, utag, _comm_name, ptr, count, __size, __starpu_func__ , __LINE__); \
              fflush(stderr); \
          } \
      } while(0)
  # define _STARPU_MPI_COMM_TO_DEBUG(ptr, count, datatype, dest, tag, utag, comm) _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, dest, tag, utag, comm, "-->")
  # define _STARPU_MPI_COMM_FROM_DEBUG(ptr, count, datatype, source, tag, utag, comm) _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, source, tag, utag, comm, "<--")
  # define _STARPU_MPI_DEBUG(level, fmt, ...) \
      do \
      { \
          if (!_starpu_silent && _starpu_debug_level_min <= (level) && (level) <= _starpu_debug_level_max) \
          { \
              if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
              fprintf(stderr, "%*s[%d][starpu_mpi][%s:%d] " fmt , (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ , __LINE__,## __VA_ARGS__); \
              fflush(stderr); \
          } \
      } while(0)
  #else
  # define _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, node, tag, utag, comm, way) do { } while(0)
  # define _STARPU_MPI_COMM_TO_DEBUG(ptr, count, datatype, dest, tag, utag, comm) do { } while(0)
  # define _STARPU_MPI_COMM_FROM_DEBUG(ptr, count, datatype, source, tag, utag, comm) do { } while(0)
  # define _STARPU_MPI_DEBUG(level, fmt, ...) do { } while(0)
  #endif
  /* _STARPU_MPI_DISP(fmt, ...): unless output is silenced (_starpu_silent),
   * print a message on stderr prefixed with the rank, the function name and
   * the line number, indented proportionally to the rank, then flush.
   *
   * _STARPU_MPI_MSG(fmt, ...): same prefix without the indentation, and
   * printed regardless of _starpu_silent.
   *
   * NOTE(review): both expansions end with `while(0);`, i.e. they already
   * contain their terminating semicolon. `if (c) _STARPU_MPI_MSG(...); else`
   * therefore does not compile. The semicolon is kept here because callers
   * that omit their own `;` cannot be ruled out from this file alone.
   */
  #define _STARPU_MPI_DISP(fmt, ...) \
      do \
      { \
          if (!_starpu_silent) \
          { \
              if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
              fprintf(stderr, "%*s[%d][starpu_mpi][%s:%d] " fmt , (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ , __LINE__ ,## __VA_ARGS__); \
              fflush(stderr); \
          } \
      } while(0);
  #define _STARPU_MPI_MSG(fmt, ...) \
      do \
      { \
          if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
          fprintf(stderr, "[%d][starpu_mpi][%s:%d] " fmt , _starpu_debug_rank, __starpu_func__ , __LINE__ ,## __VA_ARGS__); \
          fflush(stderr); \
      } while(0);
  /* Function entry/exit traces, only active when STARPU_MPI_EXTRA_VERBOSE is
   * defined: unless output is silenced, print "-->" (entry) or "<--" (exit)
   * on stderr, prefixed with rank, function name and line number and indented
   * proportionally to the rank.
   *
   * The disabled variants expand to an empty `do { } while(0)` statement
   * rather than to nothing, like the other disabled debug macros of this
   * header: both build flavours then require the caller's semicolon and
   * `if (c) _STARPU_MPI_LOG_IN();` no longer has an empty body.
   */
  #ifdef STARPU_MPI_EXTRA_VERBOSE
  # define _STARPU_MPI_LOG_IN() \
      do \
      { \
          if (!_starpu_silent) \
          { \
              if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
              fprintf(stderr, "%*s[%d][starpu_mpi][%s:%d] -->\n", (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ , __LINE__); \
              fflush(stderr); \
          } \
      } while(0)
  # define _STARPU_MPI_LOG_OUT() \
      do \
      { \
          if (!_starpu_silent) \
          { \
              if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
              fprintf(stderr, "%*s[%d][starpu_mpi][%s:%d] <--\n", (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__, __LINE__ ); \
              fflush(stderr); \
          } \
      } while(0)
  #else
  # define _STARPU_MPI_LOG_IN() do { } while(0)
  # define _STARPU_MPI_LOG_OUT() do { } while(0)
  #endif
  151. #if defined(STARPU_USE_MPI_MPI)
  /* Base MPI tag; the three tags below are derived from it as base, base+1
   * and base+2. */
  extern int _starpu_mpi_tag;
  /* The replacement lists are parenthesized so that the macros behave as
   * single operands inside larger expressions (e.g. `2 * _STARPU_MPI_TAG_DATA`
   * or `x == _STARPU_MPI_TAG_DATA ? a : b`). */
  #define _STARPU_MPI_TAG_ENVELOPE (_starpu_mpi_tag)
  #define _STARPU_MPI_TAG_DATA (_starpu_mpi_tag+1)
  #define _STARPU_MPI_TAG_SYNC_DATA (_starpu_mpi_tag+2)
  /* NOTE(review): presumably the values stored in
   * struct _starpu_mpi_envelope::mode -- confirm against the users. */
  #define _STARPU_MPI_ENVELOPE_DATA 0
  #define _STARPU_MPI_ENVELOPE_SYNC_READY 1
  158. struct _starpu_mpi_envelope
  159. {
  160. int mode;
  161. starpu_ssize_t size;
  162. starpu_mpi_tag_t data_tag;
  163. unsigned sync;
  164. };
  165. #endif /* STARPU_USE_MPI_MPI */
  /* Kind of a request (see the request_type field of struct _starpu_mpi_req).
   * The numeric values are pinned explicitly. */
  enum _starpu_mpi_request_type
  {
      SEND_REQ    = 0,
      RECV_REQ    = 1,
      WAIT_REQ    = 2,
      TEST_REQ    = 3,
      BARRIER_REQ = 4,
      PROBE_REQ   = 5,
      UNKNOWN_REQ = 6,
  };
  176. struct _starpu_mpi_node_tag
  177. {
  178. MPI_Comm comm;
  179. int rank;
  180. starpu_mpi_tag_t data_tag;
  181. };
  182. MULTILIST_CREATE_TYPE(_starpu_mpi_req, coop_sends)
  183. /* One bag of cooperative sends */
  184. struct _starpu_mpi_coop_sends
  185. {
  186. /* List of send requests */
  187. struct _starpu_mpi_req_multilist_coop_sends reqs;
  188. struct _starpu_mpi_data *mpi_data;
  189. /* Array of send requests, after sorting out */
  190. struct _starpu_spinlock lock;
  191. struct _starpu_mpi_req **reqs_array;
  192. unsigned n;
  193. unsigned redirects_sent;
  194. };
  195. /* Initialized in starpu_mpi_data_register_comm */
  196. struct _starpu_mpi_data
  197. {
  198. int magic;
  199. struct _starpu_mpi_node_tag node_tag;
  200. int *cache_sent;
  201. int cache_received;
  202. /* Rendez-vous data for opportunistic cooperative sends */
  203. struct _starpu_spinlock coop_lock; /* Needed to synchronize between submit thread and workers */
  204. struct _starpu_mpi_coop_sends *coop_sends; /* Current cooperative send bag */
  205. };
  206. struct _starpu_mpi_data *_starpu_mpi_data_get(starpu_data_handle_t data_handle);
  207. struct _starpu_mpi_req;
  208. LIST_TYPE(_starpu_mpi_req,
  209. /* description of the data at StarPU level */
  210. starpu_data_handle_t data_handle;
  211. int prio;
  212. /* description of the data to be sent/received */
  213. MPI_Datatype datatype;
  214. char *datatype_name;
  215. void *ptr;
  216. starpu_ssize_t count;
  217. int registered_datatype;
  218. /* who are we talking to ? */
  219. struct _starpu_mpi_node_tag node_tag;
  220. #if defined(STARPU_USE_MPI_NMAD)
  221. nm_gate_t gate;
  222. nm_session_t session;
  223. #endif
  224. void (*func)(struct _starpu_mpi_req *);
  225. MPI_Status *status;
  226. #if defined(STARPU_USE_MPI_NMAD)
  227. nm_sr_request_t data_request;
  228. int waited;
  229. #elif defined(STARPU_USE_MPI_MPI)
  230. MPI_Request data_request;
  231. #endif
  232. struct _starpu_mpi_req_multilist_coop_sends coop_sends;
  233. struct _starpu_mpi_coop_sends *coop_sends_head;
  234. int *flag;
  235. unsigned sync;
  236. int ret;
  237. #if defined(STARPU_USE_MPI_NMAD)
  238. piom_cond_t req_cond;
  239. #elif defined(STARPU_USE_MPI_MPI)
  240. starpu_pthread_mutex_t req_mutex;
  241. starpu_pthread_cond_t req_cond;
  242. starpu_pthread_mutex_t posted_mutex;
  243. starpu_pthread_cond_t posted_cond;
  244. /* In the case of a Wait/Test request, we are going to post a request
  245. * to test the completion of another request */
  246. struct _starpu_mpi_req *other_request;
  247. #endif
  248. enum _starpu_mpi_request_type request_type; /* 0 send, 1 recv */
  249. unsigned submitted;
  250. unsigned completed;
  251. unsigned posted;
  252. /* in the case of detached requests */
  253. int detached;
  254. void *callback_arg;
  255. void (*callback)(void *);
  256. /* in the case of user-defined datatypes, we need to send the size of the data */
  257. #if defined(STARPU_USE_MPI_NMAD)
  258. nm_sr_request_t size_req;
  259. #elif defined(STARPU_USE_MPI_MPI)
  260. MPI_Request size_req;
  261. #endif
  262. #if defined(STARPU_USE_MPI_MPI)
  263. struct _starpu_mpi_envelope* envelope;
  264. unsigned is_internal_req:1;
  265. unsigned to_destroy:1;
  266. struct _starpu_mpi_req *internal_req;
  267. struct _starpu_mpi_early_data_handle *early_data_handle;
  268. UT_hash_handle hh;
  269. #endif
  270. int sequential_consistency;
  271. long pre_sync_jobid;
  272. long post_sync_jobid;
  273. #ifdef STARPU_SIMGRID
  274. MPI_Status status_store;
  275. starpu_pthread_queue_t queue;
  276. unsigned done;
  277. #endif
  278. );
  279. PRIO_LIST_TYPE(_starpu_mpi_req, prio)
  280. MULTILIST_CREATE_INLINES(struct _starpu_mpi_req, _starpu_mpi_req, coop_sends)
  281. /* To be called before actually queueing a request, so the communication layer knows it has something to look at */
  282. void _starpu_mpi_req_willpost(struct _starpu_mpi_req *req);
  283. /* To be called to actually submit the request */
  284. void _starpu_mpi_submit_ready_request(void *arg);
  285. /* To be called when request is completed */
  286. void _starpu_mpi_release_req_data(struct _starpu_mpi_req *req);
  287. /* Build a communication tree. Called before _starpu_mpi_coop_send is ever called. coop_sends->lock is held. */
  288. void _starpu_mpi_coop_sends_build_tree(struct _starpu_mpi_coop_sends *coop_sends);
  289. /* Try to merge with send request with other send requests */
  290. void _starpu_mpi_coop_send(starpu_data_handle_t data_handle, struct _starpu_mpi_req *req, enum starpu_data_access_mode mode, int sequential_consistency);
  291. /* Actually submit the coop_sends bag to MPI.
  292. * At least one of submit_control or submit_data is true.
  293. * _starpu_mpi_submit_coop_sends may be called either
  294. * - just once with both parameters being true,
  295. * - or once with submit_control being true (data is not available yet, but we
  296. * can send control messages), and a second time with submit_data being true. Or
  297. * the converse, possibly on different threads, etc.
  298. */
  299. void _starpu_mpi_submit_coop_sends(struct _starpu_mpi_coop_sends *coop_sends, int submit_control, int submit_data);
  300. void _starpu_mpi_submit_ready_request_inc(struct _starpu_mpi_req *req);
  301. void _starpu_mpi_request_init(struct _starpu_mpi_req **req);
  302. struct _starpu_mpi_req * _starpu_mpi_request_fill(starpu_data_handle_t data_handle,
  303. int srcdst, starpu_mpi_tag_t data_tag, MPI_Comm comm,
  304. unsigned detached, unsigned sync, int prio, void (*callback)(void *), void *arg,
  305. enum _starpu_mpi_request_type request_type, void (*func)(struct _starpu_mpi_req *),
  306. int sequential_consistency,
  307. int is_internal_req,
  308. starpu_ssize_t count);
  309. void _starpu_mpi_request_destroy(struct _starpu_mpi_req *req);
  310. void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req);
  311. void _starpu_mpi_irecv_size_func(struct _starpu_mpi_req *req);
  312. int _starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status);
  313. int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status);
  314. int _starpu_mpi_barrier(MPI_Comm comm);
  315. struct _starpu_mpi_argc_argv
  316. {
  317. int initialize_mpi;
  318. int *argc;
  319. char ***argv;
  320. MPI_Comm comm;
  321. int fargc; // Fortran argc
  322. char **fargv; // Fortran argv
  323. int rank;
  324. int world_size;
  325. };
  326. void _starpu_mpi_progress_shutdown(void **value);
  327. int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv);
  328. #ifdef STARPU_SIMGRID
  /* Declared with an explicit (void) parameter list: in C, empty parentheses
   * declare a function with unspecified parameters, not a prototype, so
   * calls with stray arguments would not be diagnosed. */
  void _starpu_mpi_wait_for_initialization(void);
  330. #endif
  331. void _starpu_mpi_data_flush(starpu_data_handle_t data_handle);
  332. #ifdef __cplusplus
  333. }
  334. #endif
  335. #endif // __STARPU_MPI_PRIVATE_H__