starpu_mpi_private.h 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2010-2021 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  4. *
  5. * StarPU is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU Lesser General Public License as published by
  7. * the Free Software Foundation; either version 2.1 of the License, or (at
  8. * your option) any later version.
  9. *
  10. * StarPU is distributed in the hope that it will be useful, but
  11. * WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  13. *
  14. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  15. */
  16. #ifndef __STARPU_MPI_PRIVATE_H__
  17. #define __STARPU_MPI_PRIVATE_H__
  18. #include <starpu.h>
  19. #include <common/config.h>
  20. #include <common/uthash.h>
  21. #include <starpu_mpi.h>
  22. #include <starpu_mpi_fxt.h>
  23. #include <common/list.h>
  24. #include <common/prio_list.h>
  25. #include <common/starpu_spinlock.h>
  26. #include <core/simgrid.h>
  27. /** @file */
  28. #ifdef __cplusplus
  29. extern "C"
  30. {
  31. #endif
#ifdef STARPU_SIMGRID
/* Declarations that only exist in SimGrid builds (STARPU_SIMGRID defined).
 * The two objects below are declared here and defined elsewhere. */
extern starpu_pthread_wait_t _starpu_mpi_thread_wait;
extern starpu_pthread_queue_t _starpu_mpi_thread_dontsleep;

/** Groups the same four values that _starpu_mpi_simgrid_wait_req() takes as
 * parameters.
 * NOTE(review): presumably used to hand them over to a helper thread as a
 * single argument -- confirm against the implementation. */
struct _starpu_simgrid_mpi_req
{
	MPI_Request *request;
	MPI_Status *status;
	starpu_pthread_queue_t *queue;
	unsigned *done;
};

/* NOTE(review): presumably reports through \p flag whether \p done has been
 * set -- the body is not visible from this header. */
int _starpu_mpi_simgrid_mpi_test(unsigned *done, int *flag);
void _starpu_mpi_simgrid_wait_req(MPI_Request *request, MPI_Status *status, starpu_pthread_queue_t *queue, unsigned *done);
#endif
/** Rank printed as a prefix by the logging and assertion macros of this
 * header. While it holds -1 those macros fill it in through
 * starpu_mpi_comm_rank(MPI_COMM_WORLD, ...). */
extern int _starpu_debug_rank;

/* NOTE(review): presumably returns a printable description of the MPI error
 * \p code; ownership of the returned string is not visible from here. */
char *_starpu_mpi_get_mpi_error_code(int code);

/** When non-zero, _STARPU_MPI_COMM_DEBUG prints its trace line. */
extern int _starpu_mpi_comm_debug;

#ifdef STARPU_MPI_VERBOSE
/** Inclusive bounds of the levels that _STARPU_MPI_DEBUG prints. */
extern int _starpu_debug_level_min;
extern int _starpu_debug_level_max;
void _starpu_mpi_set_debug_level_min(int level);
void _starpu_mpi_set_debug_level_max(int level);
#endif

/* Global settings of the MPI layer, declared here and defined elsewhere.
 * NOTE(review): presumably initialised by _starpu_mpi_env_init() -- confirm. */
extern int _starpu_mpi_fake_world_size;
extern int _starpu_mpi_fake_world_rank;
extern int _starpu_mpi_use_prio;
extern int _starpu_mpi_nobind;
extern int _starpu_mpi_thread_cpuid;
extern int _starpu_mpi_use_coop_sends;
extern int _starpu_mpi_mem_throttle;

void _starpu_mpi_env_init(void);
/**
 * STARPU_MPI_ASSERT_MSG(x, msg, ...): assertion with a printf-style message.
 *
 * \p msg is pasted between two string literals, so it has to be a string
 * literal itself; the variadic arguments feed its conversions.
 *
 * Three variants are selected at preprocessing time, see below.
 */
#ifdef STARPU_NO_ASSERT
/* Assertions disabled: \p x only appears inside a dead `if (0)` branch, so it
 * is still compiled but never evaluated; \p msg and its arguments are dropped. */
#  define STARPU_MPI_ASSERT_MSG(x, msg, ...)	do { if (0) { (void) (x); }} while(0)
#else
#  if defined(__CUDACC__) && defined(STARPU_HAVE_WINDOWS)
/* NOTE(review): this is a non-extern declaration of _starpu_debug_rank inside
 * a header, on top of the extern one above; presumably a workaround specific
 * to this toolchain -- confirm before touching it. */
int _starpu_debug_rank;
/* This variant does not call assert(): after printing the message it stops
 * the program by writing through a NULL pointer. */
#  define STARPU_MPI_ASSERT_MSG(x, msg, ...)									\
	do													\
	{ 													\
		if (STARPU_UNLIKELY(!(x))) 									\
		{												\
			if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
			fprintf(stderr, "\n[%d][starpu_mpi][%s][assert failure] " msg "\n\n", _starpu_debug_rank, __starpu_func__, ## __VA_ARGS__); *(int*)NULL = 0; \
		} \
	} while(0)
#  else
/* Default variant: prints the message when \p x is false, then hands \p x to
 * assert(). \p x is therefore evaluated a second time by assert(x) (when
 * assert is active), so it must be free of side effects. */
#  define STARPU_MPI_ASSERT_MSG(x, msg, ...)	\
	do \
	{ \
		if (STARPU_UNLIKELY(!(x))) \
		{ \
			if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
			fprintf(stderr, "\n[%d][starpu_mpi][%s][assert failure] " msg "\n\n", _starpu_debug_rank, __starpu_func__, ## __VA_ARGS__); \
		} \
		assert(x); \
	} while(0)
#  endif
#endif
  89. #define _STARPU_MPI_MALLOC(ptr, size) do { ptr = malloc(size); STARPU_MPI_ASSERT_MSG(ptr != NULL, "Cannot allocate %ld bytes\n", (long) (size)); } while (0)
  90. #define _STARPU_MPI_CALLOC(ptr, nmemb, size) do { ptr = calloc(nmemb, size); STARPU_MPI_ASSERT_MSG(ptr != NULL, "Cannot allocate %ld bytes\n", (long) (nmemb*size)); } while (0)
  91. #define _STARPU_MPI_REALLOC(ptr, size) do { void *_new_ptr = realloc(ptr, size); STARPU_MPI_ASSERT_MSG(_new_ptr != NULL, "Cannot reallocate %ld bytes\n", (long) (size)); ptr = _new_ptr; } while (0)
  92. #ifdef STARPU_MPI_VERBOSE
  93. # define _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, node, tag, utag, comm, way) \
  94. do \
  95. { \
  96. if (_starpu_mpi_comm_debug) \
  97. { \
  98. int __size; \
  99. char _comm_name[128]; \
  100. int _comm_name_len; \
  101. int _rank; \
  102. starpu_mpi_comm_rank(comm, &_rank); \
  103. MPI_Type_size(datatype, &__size); \
  104. MPI_Comm_get_name(comm, _comm_name, &_comm_name_len); \
  105. fprintf(stderr, "[%d][starpu_mpi] :%d:%s:%d:%d:%ld:%s:%p:%ld:%d:%s:%d\n", _rank, _rank, way, node, tag, utag, _comm_name, ptr, count, __size, __starpu_func__ , __LINE__); \
  106. fflush(stderr); \
  107. } \
  108. } while(0)
  109. # define _STARPU_MPI_COMM_TO_DEBUG(ptr, count, datatype, dest, tag, utag, comm) _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, dest, tag, utag, comm, "-->")
  110. # define _STARPU_MPI_COMM_FROM_DEBUG(ptr, count, datatype, source, tag, utag, comm) _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, source, tag, utag, comm, "<--")
  111. # define _STARPU_MPI_DEBUG(level, fmt, ...) \
  112. do \
  113. { \
  114. if (!_starpu_silent && _starpu_debug_level_min <= level && level <= _starpu_debug_level_max) \
  115. { \
  116. if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
  117. fprintf(stderr, "%*s[%d][starpu_mpi][%s:%d] " fmt , (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ , __LINE__,## __VA_ARGS__); \
  118. fflush(stderr); \
  119. } \
  120. } while(0)
  121. #else
  122. # define _STARPU_MPI_COMM_DEBUG(ptr, count, datatype, node, tag, utag, comm, way) do { } while(0)
  123. # define _STARPU_MPI_COMM_TO_DEBUG(ptr, count, datatype, dest, tag, utag, comm) do { } while(0)
  124. # define _STARPU_MPI_COMM_FROM_DEBUG(ptr, count, datatype, source, tag, utag, comm) do { } while(0)
  125. # define _STARPU_MPI_DEBUG(level, fmt, ...) do { } while(0)
  126. #endif
/**
 * Print a printf-style message on stderr unless _starpu_silent is set.
 * The line is indented by 4 columns per (rank + 1) and prefixed with the
 * rank, the calling function and the line number. \p fmt is pasted after a
 * string literal, so it must be a string literal itself.
 */
#define _STARPU_MPI_DISP(fmt, ...) do { if (!_starpu_silent) { \
	if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
	fprintf(stderr, "%*s[%d][starpu_mpi][%s:%d] " fmt , (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ , __LINE__ ,## __VA_ARGS__); \
	fflush(stderr); }} while(0)
/**
 * Same prefix as _STARPU_MPI_DISP but without the indentation, and printed
 * whatever the value of _starpu_silent.
 */
#define _STARPU_MPI_MSG(fmt, ...) do { if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
	fprintf(stderr, "[%d][starpu_mpi][%s:%d] " fmt , _starpu_debug_rank, __starpu_func__ , __LINE__ ,## __VA_ARGS__); \
	fflush(stderr); } while(0)
  134. #ifdef STARPU_MPI_EXTRA_VERBOSE
  135. # define _STARPU_MPI_LOG_IN() do { if (!_starpu_silent) { \
  136. if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
  137. fprintf(stderr, "%*s[%d][starpu_mpi][%s:%d] -->\n", (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__ , __LINE__); \
  138. fflush(stderr); }} while(0)
  139. # define _STARPU_MPI_LOG_OUT() do { if (!_starpu_silent) { \
  140. if (_starpu_debug_rank == -1) starpu_mpi_comm_rank(MPI_COMM_WORLD, &_starpu_debug_rank); \
  141. fprintf(stderr, "%*s[%d][starpu_mpi][%s:%d] <--\n", (_starpu_debug_rank+1)*4, "", _starpu_debug_rank, __starpu_func__, __LINE__ ); \
  142. fflush(stderr); }} while(0)
  143. #else
  144. # define _STARPU_MPI_LOG_IN()
  145. # define _STARPU_MPI_LOG_OUT()
  146. #endif
/** Kind of operation a struct _starpu_mpi_req stands for; stored in its
 * request_type field. The numeric values are fixed explicitly. */
enum _starpu_mpi_request_type
{
	SEND_REQ=0,
	RECV_REQ=1,
	WAIT_REQ=2,
	TEST_REQ=3,
	BARRIER_REQ=4,
	PROBE_REQ=5,
	UNKNOWN_REQ=6,
};
/** A communicator together with a rank.
 * NOTE(review): presumably the rank is relative to \c comm -- confirm. */
struct _starpu_mpi_node
{
	MPI_Comm comm;
	int rank;
};

/** A struct _starpu_mpi_node together with a data tag. */
struct _starpu_mpi_node_tag
{
	struct _starpu_mpi_node node;
	starpu_mpi_tag_t data_tag;
};
/* NOTE(review): presumably declares struct _starpu_mpi_req_multilist_coop_sends,
 * the list-link type used by the two structures below -- confirm against the
 * MULTILIST_CREATE_TYPE definition. */
MULTILIST_CREATE_TYPE(_starpu_mpi_req, coop_sends)

/** One bag of cooperative sends */
struct _starpu_mpi_coop_sends
{
	starpu_data_handle_t data_handle;

	/** List of send requests */
	struct _starpu_mpi_req_multilist_coop_sends reqs;
	struct _starpu_mpi_data *mpi_data;

	/* NOTE(review): presumably protects the fields of this bag -- confirm. */
	struct _starpu_spinlock lock;
	/** Array of send requests, after sorting out */
	struct _starpu_mpi_req **reqs_array;
	/* NOTE(review): presumably the number of entries of reqs_array -- confirm. */
	unsigned n;
	unsigned redirects_sent;

	/* Used to trace dependencies */
	long pre_sync_jobid;
};
/** Marker value for the redux_map field of struct _starpu_mpi_data: it is
 * stored in that field whenever a node contributes to the reduction of the
 * data.
 * Only the owning node keeps track of all the contributing nodes. */
#define REDUX_CONTRIB ((char*) -1)
/** MPI-specific state attached to a data handle.
 * Initialized in starpu_mpi_data_register_comm */
struct _starpu_mpi_data
{
	int magic;
	struct _starpu_mpi_node_tag node_tag;
	char *cache_sent;
	int cache_received;

	/** Array used to store the contributing nodes to this data
	 * when it is accessed in REDUX mode. May also hold the REDUX_CONTRIB
	 * marker instead of an array. */
	char* redux_map;

	/** Rendez-vous data for opportunistic cooperative sends,
	 * Needed to synchronize between submit thread and workers */
	struct _starpu_spinlock coop_lock;
	/** Current cooperative send bag */
	struct _starpu_mpi_coop_sends *coop_sends;
};

/* NOTE(review): presumably returns the struct _starpu_mpi_data attached to
 * \p data_handle -- the body is not visible from this header. */
struct _starpu_mpi_data *_starpu_mpi_data_get(starpu_data_handle_t data_handle);
/* Backend-private part of a request; only used through a pointer here. */
struct _starpu_mpi_req_backend;

struct _starpu_mpi_req;

/* One communication request. The fields listed below become the members of
 * struct _starpu_mpi_req.
 * NOTE(review): LIST_TYPE presumably also generates the list container and
 * its helper functions -- confirm against common/list.h. */
LIST_TYPE(_starpu_mpi_req,
	/** description of the data at StarPU level */
	starpu_data_handle_t data_handle;

	int prio;
	unsigned node; /* Which StarPU memory node this will read from / write to */

	/** description of the data to be sent/received */
	MPI_Datatype datatype;
	char *datatype_name;
	void *ptr;
	starpu_ssize_t count;
	/* = 0: datatype is not predefined by StarPU; = 1: otherwise; initialized with -1 */
	int registered_datatype;

	struct _starpu_mpi_req_backend *backend;

	/** who are we talking to ? */
	struct _starpu_mpi_node_tag node_tag;
	void (*func)(struct _starpu_mpi_req *);

	MPI_Status *status;
	/* Link of this request within the `reqs` list of a struct _starpu_mpi_coop_sends
	 * (NOTE(review): inferred from the shared multilist type -- confirm). */
	struct _starpu_mpi_req_multilist_coop_sends coop_sends;
	struct _starpu_mpi_coop_sends *coop_sends_head;

	int *flag;
	unsigned sync;

	/** Amount of memory pre-reserved for the reception buffer */
	size_t reserved_size;

	int ret;

	/** Kind of request, one of enum _starpu_mpi_request_type */
	enum _starpu_mpi_request_type request_type;

	unsigned submitted;
	unsigned completed;
	unsigned posted;

	/** in the case of detached requests */
	int detached;
	void *callback_arg;
	void (*callback)(void *);

	int sequential_consistency;

	long pre_sync_jobid;
	long post_sync_jobid;

	/* Extra per-request state that only exists in SimGrid builds. */
#ifdef STARPU_SIMGRID
	MPI_Status status_store;
	starpu_pthread_queue_t queue;
	unsigned done;
#endif
);
/* NOTE(review): presumably generates a priority list of requests ordered on
 * the `prio` field -- confirm against common/prio_list.h. */
PRIO_LIST_TYPE(_starpu_mpi_req, prio)

/* Companion of the MULTILIST_CREATE_TYPE(_starpu_mpi_req, coop_sends) call
 * made before struct _starpu_mpi_req was complete. */
MULTILIST_CREATE_INLINES(struct _starpu_mpi_req, _starpu_mpi_req, coop_sends)
/** To be called before actually queueing a request, so the communication layer knows it has something to look at */
void _starpu_mpi_req_willpost(struct _starpu_mpi_req *req);
/** To be called to actually submit the request.
 * NOTE(review): \p arg is presumably a struct _starpu_mpi_req * passed as
 * void * -- confirm. */
void _starpu_mpi_submit_ready_request(void *arg);
/** To be called when request is completed */
void _starpu_mpi_release_req_data(struct _starpu_mpi_req *req);

/* Disabled declaration, kept for reference. */
#if 0
/** Build a communication tree. Called before _starpu_mpi_coop_send is ever called. coop_sends->lock is held. */
void _starpu_mpi_coop_sends_build_tree(struct _starpu_mpi_coop_sends *coop_sends);
#endif

/** Try to merge this send request with other send requests */
void _starpu_mpi_coop_send(starpu_data_handle_t data_handle, struct _starpu_mpi_req *req, enum starpu_data_access_mode mode, int sequential_consistency);

/** Actually submit the coop_sends bag to MPI.
 * At least one of submit_control or submit_data is true.
 * _starpu_mpi_submit_coop_sends may be called either
 * - just once with both parameters being true,
 * - or once with submit_control being true (data is not available yet, but we
 * can send control messages), and a second time with submit_data being true. Or
 * the converse, possibly on different threads, etc.
 */
void _starpu_mpi_submit_coop_sends(struct _starpu_mpi_coop_sends *coop_sends, int submit_control, int submit_data);

/*
 * Fills post_sync_jobid with the reduction synchronization task jobid
 */
void _starpu_mpi_redux_fill_post_sync_jobid(const void * const redux_data_args, long * const post_sync_jobid);

void _starpu_mpi_submit_ready_request_inc(struct _starpu_mpi_req *req);
void _starpu_mpi_request_init(struct _starpu_mpi_req **req);
/* Most parameters carry the same names as fields of struct _starpu_mpi_req.
 * NOTE(review): presumably returns a request initialised from them; who
 * releases it is not visible from this header. */
struct _starpu_mpi_req * _starpu_mpi_request_fill(starpu_data_handle_t data_handle,
						  int srcdst, starpu_mpi_tag_t data_tag, MPI_Comm comm,
						  unsigned detached, unsigned sync, int prio, void (*callback)(void *), void *arg,
						  enum _starpu_mpi_request_type request_type, void (*func)(struct _starpu_mpi_req *),
						  int sequential_consistency,
						  int is_internal_req,
						  starpu_ssize_t count);

void _starpu_mpi_request_destroy(struct _starpu_mpi_req *req);
int _starpu_mpi_choose_node(starpu_data_handle_t data_handle, enum starpu_data_access_mode mode);
void _starpu_mpi_data_flush(starpu_data_handle_t data_handle);
/** Start-up parameters of the MPI layer; a pointer to this structure is what
 * the backend's _starpu_mpi_backend_progress_init entry point receives. */
struct _starpu_mpi_argc_argv
{
	/* NOTE(review): presumably non-zero when StarPU has to initialise MPI
	 * itself -- confirm. */
	int initialize_mpi;
	/* Pointers to the caller's argc / argv. */
	int *argc;
	char ***argv;
	MPI_Comm comm;
	/** Fortran argc */
	int fargc;
	/** Fortran argv */
	char **fargv;
	int rank;
	int world_size;
};
  300. /**
  301. * Specific functions to backend implementation
  302. */
  303. struct _starpu_mpi_backend
  304. {
  305. void (*_starpu_mpi_backend_init)(struct starpu_conf *conf);
  306. void (*_starpu_mpi_backend_shutdown)(void);
  307. int (*_starpu_mpi_backend_reserve_core)(void);
  308. void (*_starpu_mpi_backend_request_init)(struct _starpu_mpi_req *req);
  309. void (*_starpu_mpi_backend_request_fill)(struct _starpu_mpi_req *req, MPI_Comm comm, int is_internal_req);
  310. void (*_starpu_mpi_backend_request_destroy)(struct _starpu_mpi_req *req);
  311. void (*_starpu_mpi_backend_data_clear)(starpu_data_handle_t data_handle);
  312. void (*_starpu_mpi_backend_data_register)(starpu_data_handle_t data_handle, starpu_mpi_tag_t data_tag);
  313. void (*_starpu_mpi_backend_comm_register)(MPI_Comm comm);
  314. int (*_starpu_mpi_backend_progress_init)(struct _starpu_mpi_argc_argv *argc_argv);
  315. void (*_starpu_mpi_backend_progress_shutdown)(void **value);
  316. #ifdef STARPU_SIMGRID
  317. void (*_starpu_mpi_backend_wait_for_initialization)();
  318. #endif
  319. int (*_starpu_mpi_backend_barrier)(MPI_Comm comm);
  320. int (*_starpu_mpi_backend_wait_for_all)(MPI_Comm comm);
  321. int (*_starpu_mpi_backend_wait)(starpu_mpi_req *public_req, MPI_Status *status);
  322. int (*_starpu_mpi_backend_test)(starpu_mpi_req *public_req, int *flag, MPI_Status *status);
  323. void (*_starpu_mpi_backend_isend_size_func)(struct _starpu_mpi_req *req);
  324. void (*_starpu_mpi_backend_irecv_size_func)(struct _starpu_mpi_req *req);
  325. };
  326. extern struct _starpu_mpi_backend _mpi_backend;
  327. #ifdef __cplusplus
  328. }
  329. #endif
  330. #endif // __STARPU_MPI_PRIVATE_H__