starpu_mpi_nmad.c 27 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2017 Inria
  4. * Copyright (C) 2017 Guillaume Beauchamp
  5. * Copyright (C) 2010-2015,2017 CNRS
  6. * Copyright (C) 2009-2014,2017 Université de Bordeaux
  7. *
  8. * StarPU is free software; you can redistribute it and/or modify
  9. * it under the terms of the GNU Lesser General Public License as published by
  10. * the Free Software Foundation; either version 2.1 of the License, or (at
  11. * your option) any later version.
  12. *
  13. * StarPU is distributed in the hope that it will be useful, but
  14. * WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  16. *
  17. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  18. */
  19. #include <stdlib.h>
  20. #include <limits.h>
  21. #include <starpu_mpi.h>
  22. #include <starpu_mpi_datatype.h>
  23. #include <starpu_mpi_private.h>
  24. #include <starpu_mpi_cache.h>
  25. #include <starpu_profiling.h>
  26. #include <starpu_mpi_stats.h>
  27. #include <starpu_mpi_cache.h>
  28. #include <starpu_mpi_select_node.h>
  29. #include <starpu_mpi_init.h>
  30. #include <common/config.h>
  31. #include <common/thread.h>
  32. #include <datawizard/coherency.h>
  33. #include <core/task.h>
  34. #include <core/topology.h>
  35. #ifdef STARPU_USE_MPI_NMAD
  36. #include <nm_sendrecv_interface.h>
  37. #include <nm_mpi_nmad.h>
  38. static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req,nm_sr_event_t event);
  39. #ifdef STARPU_VERBOSE
  40. static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type);
  41. #endif
  42. static void _starpu_mpi_handle_new_request(void *arg);
  43. static void _starpu_mpi_handle_pending_request(struct _starpu_mpi_req *req);
  44. static void _starpu_mpi_add_sync_point_in_fxt(void);
  45. static int mpi_thread_cpuid = -1;
  46. static int use_prio = 1;
  47. int _starpu_mpi_fake_world_size = -1;
  48. int _starpu_mpi_fake_world_rank = -1;
  49. /* Condition to wake up waiting for all current MPI requests to finish */
  50. static starpu_pthread_t progress_thread;
  51. static starpu_pthread_cond_t progress_cond;
  52. static starpu_pthread_mutex_t progress_mutex;
  53. static volatile int running = 0;
  54. extern struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count);
  55. /* Count requests posted by the application and not yet submitted to MPI, i.e pushed into the new_requests list */
  56. static volatile int pending_request = 0;
  57. #define REQ_FINALIZED 0x1
  58. PUK_LFSTACK_TYPE(callback, struct _starpu_mpi_req *req;);
  59. static callback_lfstack_t callback_stack = NULL;
  60. static starpu_sem_t callback_sem;
  61. void _starpu_mpi_request_init(struct _starpu_mpi_req **req)
  62. {
  63. _STARPU_MPI_CALLOC(*req, 1, sizeof(struct _starpu_mpi_req));
  64. /* Initialize the request structure */
  65. (*req)->data_handle = NULL;
  66. (*req)->prio = 0;
  67. (*req)->completed = 0;
  68. (*req)->datatype = 0;
  69. (*req)->datatype_name = NULL;
  70. (*req)->ptr = NULL;
  71. (*req)->count = -1;
  72. (*req)->registered_datatype = -1;
  73. (*req)->node_tag.rank = -1;
  74. (*req)->node_tag.data_tag = -1;
  75. (*req)->node_tag.comm = 0;
  76. (*req)->func = NULL;
  77. (*req)->status = NULL;
  78. // (*req)->data_request = 0;
  79. (*req)->flag = NULL;
  80. (*req)->ret = -1;
  81. piom_cond_init(&((*req)->req_cond), 0);
  82. //STARPU_PTHREAD_MUTEX_INIT(&((*req)->req_mutex), NULL);
  83. //STARPU_PTHREAD_COND_INIT(&((*req)->req_cond), NULL);
  84. // STARPU_PTHREAD_MUTEX_INIT(&((*req)->posted_mutex), NULL);
  85. //STARPU_PTHREAD_COND_INIT(&((*req)->posted_cond), NULL);
  86. (*req)->request_type = UNKNOWN_REQ;
  87. (*req)->submitted = 0;
  88. (*req)->completed = 0;
  89. (*req)->posted = 0;
  90. //(*req)->other_request = NULL;
  91. (*req)->sync = 0;
  92. (*req)->detached = -1;
  93. (*req)->callback = NULL;
  94. (*req)->callback_arg = NULL;
  95. // (*req)->size_req = 0;
  96. //(*req)->internal_req = NULL;
  97. //(*req)->is_internal_req = 0;
  98. //(*req)->to_destroy = 1;
  99. //(*req)->early_data_handle = NULL;
  100. //(*req)->envelope = NULL;
  101. (*req)->sequential_consistency = 1;
  102. (*req)->pre_sync_jobid = -1;
  103. (*req)->post_sync_jobid = -1;
  104. #ifdef STARPU_SIMGRID
  105. starpu_pthread_queue_init(&((*req)->queue));
  106. starpu_pthread_queue_register(&wait, &((*req)->queue));
  107. (*req)->done = 0;
  108. #endif
  109. }
  110. void _starpu_mpi_request_destroy(struct _starpu_mpi_req *req)
  111. {
  112. piom_cond_destroy(&(req->req_cond));
  113. free(req);
  114. }
  115. /********************************************************/
  116. /* */
  117. /* Send/Receive functionalities */
  118. /* */
  119. /********************************************************/
  120. static void nop_acquire_cb(void *arg)
  121. {
  122. starpu_data_release(arg);
  123. }
  124. struct _starpu_mpi_req *_starpu_mpi_isend_irecv_common(starpu_data_handle_t data_handle,
  125. int srcdst, starpu_mpi_tag_t data_tag, MPI_Comm comm,
  126. unsigned detached, unsigned sync, int prio, void (*callback)(void *), void *arg,
  127. enum _starpu_mpi_request_type request_type, void (*func)(struct _starpu_mpi_req *),
  128. enum starpu_data_access_mode mode,
  129. int sequential_consistency,
  130. int is_internal_req,
  131. starpu_ssize_t count)
  132. {
  133. struct _starpu_mpi_req *req;
  134. if (_starpu_mpi_fake_world_size != -1)
  135. {
  136. /* Don't actually do the communication */
  137. starpu_data_acquire_on_node_cb_sequential_consistency(data_handle, STARPU_MAIN_RAM, mode, nop_acquire_cb, data_handle, sequential_consistency);
  138. return NULL;
  139. }
  140. _STARPU_MPI_LOG_IN();
  141. STARPU_ATOMIC_ADD( &pending_request, 1);
  142. /* Initialize the request structure */
  143. _starpu_mpi_request_init(&req);
  144. req->request_type = request_type;
  145. /* prio_list is sorted by increasing values */
  146. if (use_prio)
  147. req->prio = prio;
  148. req->data_handle = data_handle;
  149. req->node_tag.rank = srcdst;
  150. req->node_tag.data_tag = data_tag;
  151. req->node_tag.comm = comm;
  152. req->detached = detached;
  153. req->sync = sync;
  154. req->callback = callback;
  155. req->callback_arg = arg;
  156. req->func = func;
  157. req->sequential_consistency = sequential_consistency;
  158. nm_mpi_nmad_dest(&req->session, &req->gate, comm, req->node_tag.rank);
  159. /* Asynchronously request StarPU to fetch the data in main memory: when
  160. * it is available in main memory, _starpu_mpi_submit_new_mpi_request(req) is called and
  161. * the request is actually submitted */
  162. starpu_data_acquire_on_node_cb_sequential_consistency_sync_jobids(data_handle, STARPU_MAIN_RAM, mode, _starpu_mpi_handle_new_request, (void *)req, sequential_consistency, &req->pre_sync_jobid, &req->post_sync_jobid);
  163. _STARPU_MPI_LOG_OUT();
  164. return req;
  165. }
  166. /********************************************************/
  167. /* */
  168. /* Send functionalities */
  169. /* */
  170. /********************************************************/
  171. static void _starpu_mpi_isend_data_func(struct _starpu_mpi_req *req)
  172. {
  173. _STARPU_MPI_LOG_IN();
  174. _STARPU_MPI_DEBUG(30, "post NM isend request %p type %s tag %ld src %d data %p datasize %ld ptr %p datatype '%s' count %d registered_datatype %d sync %d\n", req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.rank, req->data_handle, starpu_data_get_size(req->data_handle), req->ptr, req->datatype_name, (int)req->count, req->registered_datatype, req->sync);
  175. _starpu_mpi_comm_amounts_inc(req->node_tag.comm, req->node_tag.rank, req->datatype, req->count);
  176. _STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(req->node_tag.rank, req->node_tag.data_tag, 0);
  177. struct nm_data_s data;
  178. nm_mpi_nmad_data_get(&data, (void*)req->ptr, req->datatype, req->count);
  179. nm_sr_send_init(req->session, &(req->data_request));
  180. nm_sr_send_pack_data(req->session, &(req->data_request), &data);
  181. nm_sr_send_set_priority(req->session, &req->data_request, req->prio);
  182. if (req->sync == 0)
  183. {
  184. req->ret = nm_sr_send_isend(req->session, &(req->data_request), req->gate, req->node_tag.data_tag);
  185. STARPU_ASSERT_MSG(req->ret == NM_ESUCCESS, "MPI_Isend returning %d", req->ret);
  186. }
  187. else
  188. {
  189. req->ret = nm_sr_send_issend(req->session, &(req->data_request), req->gate, req->node_tag.data_tag);
  190. STARPU_ASSERT_MSG(req->ret == NM_ESUCCESS, "MPI_Issend returning %d", req->ret);
  191. }
  192. _STARPU_MPI_TRACE_ISEND_SUBMIT_END(req->node_tag.rank, req->node_tag.data_tag, starpu_data_get_size(req->data_handle), req->pre_sync_jobid);
  193. _starpu_mpi_handle_pending_request(req);
  194. _STARPU_MPI_LOG_OUT();
  195. }
  196. void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
  197. {
  198. _starpu_mpi_datatype_allocate(req->data_handle, req);
  199. if (req->registered_datatype == 1)
  200. {
  201. req->waited = 1;
  202. req->count = 1;
  203. req->ptr = starpu_data_handle_to_pointer(req->data_handle, STARPU_MAIN_RAM);
  204. }
  205. else
  206. {
  207. starpu_ssize_t psize = -1;
  208. int ret;
  209. req->waited =2;
  210. // Do not pack the data, just try to find out the size
  211. starpu_data_pack(req->data_handle, NULL, &psize);
  212. if (psize != -1)
  213. {
  214. // We already know the size of the data, let's send it to overlap with the packing of the data
  215. _STARPU_MPI_DEBUG(20, "Sending size %ld (%ld %s) to node %d (first call to pack)\n", psize, sizeof(req->count), "MPI_BYTE", req->node_tag.rank);
  216. req->count = psize;
  217. //ret = nm_sr_isend(nm_mpi_communicator_get_session(p_req->p_comm),nm_mpi_communicator_get_gate(p_comm,req->srcdst), req->mpi_tag,&req->count, sizeof(req->count), &req->size_req);
  218. ret = nm_sr_isend(req->session,req->gate, req->node_tag.data_tag,&req->count, sizeof(req->count), &req->size_req);
  219. // ret = MPI_Isend(&req->count, sizeof(req->count), MPI_BYTE, req->srcdst, req->mpi_tag, req->comm, &req->size_req);
  220. STARPU_ASSERT_MSG(ret == NM_ESUCCESS, "when sending size, nm_sr_isend returning %d", ret);
  221. }
  222. // Pack the data
  223. starpu_data_pack(req->data_handle, &req->ptr, &req->count);
  224. if (psize == -1)
  225. {
  226. // We know the size now, let's send it
  227. _STARPU_MPI_DEBUG(1, "Sending size %ld (%ld %s) with tag %ld to node %d (second call to pack)\n", req->count, sizeof(req->count), "MPI_BYTE", req->node_tag.data_tag, req->node_tag.rank);
  228. ret = nm_sr_isend(req->session,req->gate, req->node_tag.data_tag,&req->count, sizeof(req->count), &req->size_req);
  229. STARPU_ASSERT_MSG(ret == NM_ESUCCESS, "when sending size, nm_sr_isend returning %d", ret);
  230. }
  231. else
  232. {
  233. // We check the size returned with the 2 calls to pack is the same
  234. STARPU_ASSERT_MSG(req->count == psize, "Calls to pack_data returned different sizes %ld != %ld", req->count, psize);
  235. }
  236. // We can send the data now
  237. }
  238. _starpu_mpi_isend_data_func(req);
  239. }
  240. /********************************************************/
  241. /* */
  242. /* Receive functionalities */
  243. /* */
  244. /********************************************************/
  245. static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
  246. {
  247. _STARPU_MPI_LOG_IN();
  248. _STARPU_MPI_DEBUG(20, "post NM irecv request %p type %s tag %ld src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
  249. _STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(req->node_tag.rank, req->node_tag.data_tag);
  250. //req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &req->request);
  251. struct nm_data_s data;
  252. nm_mpi_nmad_data_get(&data, (void*)req->ptr, req->datatype, req->count);
  253. nm_sr_recv_init(req->session, &(req->data_request));
  254. nm_sr_recv_unpack_data(req->session, &(req->data_request), &data);
  255. nm_sr_recv_irecv(req->session, &(req->data_request), req->gate, req->node_tag.data_tag, NM_TAG_MASK_FULL);
  256. _STARPU_MPI_TRACE_IRECV_SUBMIT_END(req->node_tag.rank, req->node_tag.data_tag);
  257. _starpu_mpi_handle_pending_request(req);
  258. _STARPU_MPI_LOG_OUT();
  259. }
  260. struct _starpu_mpi_irecv_size_callback
  261. {
  262. starpu_data_handle_t handle;
  263. struct _starpu_mpi_req *req;
  264. };
  265. static void _starpu_mpi_irecv_size_callback(void *arg)
  266. {
  267. struct _starpu_mpi_irecv_size_callback *callback = (struct _starpu_mpi_irecv_size_callback *)arg;
  268. starpu_data_unregister(callback->handle);
  269. callback->req->ptr = malloc(callback->req->count);
  270. STARPU_ASSERT_MSG(callback->req->ptr, "cannot allocate message of size %ld", callback->req->count);
  271. _starpu_mpi_irecv_data_func(callback->req);
  272. free(callback);
  273. }
  274. void _starpu_mpi_irecv_size_func(struct _starpu_mpi_req *req)
  275. {
  276. _STARPU_MPI_LOG_IN();
  277. _starpu_mpi_datatype_allocate(req->data_handle, req);
  278. if (req->registered_datatype == 1)
  279. {
  280. req->count = 1;
  281. req->ptr = starpu_data_handle_to_pointer(req->data_handle, STARPU_MAIN_RAM);
  282. _starpu_mpi_irecv_data_func(req);
  283. }
  284. else
  285. {
  286. struct _starpu_mpi_irecv_size_callback *callback = malloc(sizeof(struct _starpu_mpi_irecv_size_callback));
  287. callback->req = req;
  288. starpu_variable_data_register(&callback->handle, 0, (uintptr_t)&(callback->req->count), sizeof(callback->req->count));
  289. _STARPU_MPI_DEBUG(4, "Receiving size with tag %ld from node %d\n", req->node_tag.data_tag, req->node_tag.rank);
  290. _starpu_mpi_irecv_common(callback->handle, req->node_tag.rank, req->node_tag.data_tag, req->node_tag.comm, 1, 0, _starpu_mpi_irecv_size_callback, callback,1,0,0);
  291. }
  292. }
  293. /********************************************************/
  294. /* */
  295. /* Wait functionalities */
  296. /* */
  297. /********************************************************/
  /*
   * Fill the status pointed to by STATUS from the request pointed to by
   * PUBLIC_REQ: source rank, tag, return code, element count, and a
   * cleared cancellation flag.
   *
   * Both arguments are parenthesized in the expansion so that expressions
   * such as `&status` or `array + 1` can be passed. Each argument is
   * evaluated several times: do not pass expressions with side effects.
   */
  #define _starpu_mpi_req_status(PUBLIC_REQ,STATUS) do { \
      (STATUS)->MPI_SOURCE = (PUBLIC_REQ)->node_tag.rank;  /**< field name mandatory by spec */ \
      (STATUS)->MPI_TAG = (PUBLIC_REQ)->node_tag.data_tag; /**< field name mandatory by spec */ \
      (STATUS)->MPI_ERROR = (PUBLIC_REQ)->ret;             /**< field name mandatory by spec */ \
      (STATUS)->size = (PUBLIC_REQ)->count;                /**< size of data received */ \
      (STATUS)->cancelled = 0;                             /**< whether request was cancelled */ \
  } while(0)
  305. int _starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
  306. {
  307. _STARPU_MPI_LOG_IN();
  308. STARPU_MPI_ASSERT_MSG(public_req, "starpu_mpi_wait needs a valid starpu_mpi_req");
  309. struct _starpu_mpi_req *req = *public_req;
  310. STARPU_MPI_ASSERT_MSG(!req->detached, "MPI_Wait cannot be called on a detached request");
  311. /* we must do a test_locked to avoid race condition :
  312. * without req_cond could still be used and couldn't be freed)*/
  313. while (!req->completed || ! piom_cond_test_locked(&(req->req_cond),REQ_FINALIZED))
  314. {
  315. piom_cond_wait(&(req->req_cond),REQ_FINALIZED);
  316. }
  317. if (status!=MPI_STATUS_IGNORE)
  318. _starpu_mpi_req_status(req,status);
  319. _starpu_mpi_request_destroy(req);
  320. *public_req = NULL;
  321. _STARPU_MPI_LOG_OUT();
  322. return MPI_SUCCESS;
  323. }
  324. /********************************************************/
  325. /* */
  326. /* Test functionalities */
  327. /* */
  328. /********************************************************/
  329. int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
  330. {
  331. _STARPU_MPI_LOG_IN();
  332. STARPU_MPI_ASSERT_MSG(public_req, "starpu_mpi_test needs a valid starpu_mpi_req");
  333. struct _starpu_mpi_req *req = *public_req;
  334. STARPU_MPI_ASSERT_MSG(!req->detached, "MPI_Test cannot be called on a detached request");
  335. _STARPU_MPI_DEBUG(2, "Test request %p type %s tag %ld src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
  336. req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
  337. _STARPU_MPI_TRACE_UTESTING_BEGIN(req->node_tag.rank, req->node_tag.data_tag);
  338. /* we must do a test_locked to avoid race condition :
  339. * without req_cond could still be used and couldn't be freed)*/
  340. *flag = req->completed && piom_cond_test_locked(&(req->req_cond),REQ_FINALIZED);
  341. if (*flag && status!=MPI_STATUS_IGNORE)
  342. _starpu_mpi_req_status(req,status);
  343. _STARPU_MPI_TRACE_UTESTING_END(req->node_tag.rank, req->node_tag.data_tag);
  344. if(*flag)
  345. {
  346. _starpu_mpi_request_destroy(req);
  347. *public_req = NULL;
  348. }
  349. _STARPU_MPI_LOG_OUT();
  350. return MPI_SUCCESS;
  351. }
  352. /********************************************************/
  353. /* */
  354. /* Barrier functionalities */
  355. /* */
  356. /********************************************************/
  357. int _starpu_mpi_barrier(MPI_Comm comm)
  358. {
  359. _STARPU_MPI_LOG_IN();
  360. int ret;
  361. // STARPU_ASSERT_MSG(!barrier_running, "Concurrent starpu_mpi_barrier is not implemented, even on different communicators");
  362. ret = MPI_Barrier(comm);
  363. STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Barrier returning %d", ret);
  364. _STARPU_MPI_LOG_OUT();
  365. return ret;
  366. }
  367. /********************************************************/
  368. /* */
  369. /* Progression */
  370. /* */
  371. /********************************************************/
  #ifdef STARPU_VERBOSE
  /*
   * Return a printable name for a request type, for use in debug traces.
   * Values without a dedicated name map to "unknown request type".
   */
  static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type)
  {
      char *type_name = "unknown request type";

      switch (request_type) {
      case SEND_REQ:
          type_name = "SEND_REQ";
          break;
      case RECV_REQ:
          type_name = "RECV_REQ";
          break;
      case WAIT_REQ:
          type_name = "WAIT_REQ";
          break;
      case TEST_REQ:
          type_name = "TEST_REQ";
          break;
      case BARRIER_REQ:
          type_name = "BARRIER_REQ";
          break;
      default:
          break;
      }

      return type_name;
  }
  #endif
  386. static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req,nm_sr_event_t event)
  387. {
  388. _STARPU_MPI_LOG_IN();
  389. _STARPU_MPI_DEBUG(2, "complete MPI request %p type %s tag %ld src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
  390. req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
  391. if (req->request_type == RECV_REQ || req->request_type == SEND_REQ)
  392. {
  393. if (req->registered_datatype == 0)
  394. {
  395. if(req->waited == 1)
  396. nm_mpi_nmad_data_release(req->datatype);
  397. if (req->request_type == SEND_REQ)
  398. {
  399. req->waited--;
  400. // We need to make sure the communication for sending the size
  401. // has completed, as MPI can re-order messages, let's count
  402. // recerived message.
  403. // FIXME concurent access.
  404. STARPU_ASSERT_MSG(event == NM_SR_EVENT_FINALIZED, "Callback with event %d", event);
  405. if(req->waited>0)
  406. return;
  407. }
  408. if (req->request_type == RECV_REQ)
  409. // req->ptr is freed by starpu_data_unpack
  410. starpu_data_unpack(req->data_handle, req->ptr, req->count);
  411. else
  412. free(req->ptr);
  413. }
  414. else
  415. {
  416. nm_mpi_nmad_data_release(req->datatype);
  417. _starpu_mpi_datatype_free(req->data_handle, &req->datatype);
  418. }
  419. starpu_data_release(req->data_handle);
  420. }
  421. /* Execute the specified callback, if any */
  422. if (req->callback)
  423. {
  424. struct callback_lfstack_cell_s* c = padico_malloc(sizeof(struct callback_lfstack_cell_s));
  425. c->req = req;
  426. /* The main thread can exit without waiting
  427. * the end of the detached request. Callback thread
  428. * must then be kept alive if they have a callback.*/
  429. callback_lfstack_push(&callback_stack, c);
  430. starpu_sem_post(&callback_sem);
  431. }
  432. else
  433. {
  434. if(req->detached)
  435. {
  436. _starpu_mpi_request_destroy(req);
  437. // a detached request wont be wait/test (and freed inside).
  438. }
  439. else
  440. {
  441. /* tell anyone potentially waiting on the request that it is
  442. * terminated now (should be done after the callback)*/
  443. req->completed = 1;
  444. piom_cond_signal(&req->req_cond, REQ_FINALIZED);
  445. }
  446. int pending_remaining = STARPU_ATOMIC_ADD(&pending_request, -1);
  447. if (!running && !pending_remaining)
  448. starpu_sem_post(&callback_sem);
  449. }
  450. _STARPU_MPI_LOG_OUT();
  451. }
  452. void _starpu_mpi_handle_request_termination_callback(nm_sr_event_t event, const nm_sr_event_info_t*event_info, void*ref)
  453. {
  454. _starpu_mpi_handle_request_termination(ref,event);
  455. }
  456. static void _starpu_mpi_handle_pending_request(struct _starpu_mpi_req *req)
  457. {
  458. if(req->request_type == SEND_REQ && req->waited>1)
  459. {
  460. nm_sr_request_set_ref(&(req->size_req), req);
  461. nm_sr_request_monitor(req->session, &(req->size_req), NM_SR_EVENT_FINALIZED,_starpu_mpi_handle_request_termination_callback);
  462. }
  463. /* the if must be before, because the first callback can directly free
  464. * a detached request (the second callback free if req->waited>1). */
  465. nm_sr_request_set_ref(&(req->data_request), req);
  466. nm_sr_request_monitor(req->session, &(req->data_request), NM_SR_EVENT_FINALIZED,_starpu_mpi_handle_request_termination_callback);
  467. }
  468. static void _starpu_mpi_handle_new_request(void *arg)
  469. {
  470. _STARPU_MPI_LOG_IN();
  471. struct _starpu_mpi_req *req = arg;
  472. STARPU_ASSERT_MSG(req, "Invalid request");
  473. /* submit the request to MPI */
  474. _STARPU_MPI_DEBUG(2, "Handling new request %p type %s tag %ld src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
  475. req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
  476. req->func(req);
  477. _STARPU_MPI_LOG_OUT();
  478. }
  479. static void *_starpu_mpi_progress_thread_func(void *arg)
  480. {
  481. struct _starpu_mpi_argc_argv *argc_argv = (struct _starpu_mpi_argc_argv *) arg;
  482. starpu_pthread_setname("MPI");
  483. #ifndef STARPU_SIMGRID
  484. if (mpi_thread_cpuid >= 0)
  485. _starpu_bind_thread_on_cpu(mpi_thread_cpuid, STARPU_NOWORKERID);
  486. _starpu_mpi_do_initialize(argc_argv);
  487. if (mpi_thread_cpuid >= 0)
  488. /* In case MPI changed the binding */
  489. _starpu_bind_thread_on_cpu(mpi_thread_cpuid, STARPU_NOWORKERID);
  490. #endif
  491. _starpu_mpi_fake_world_size = starpu_get_env_number("STARPU_MPI_FAKE_SIZE");
  492. _starpu_mpi_fake_world_rank = starpu_get_env_number("STARPU_MPI_FAKE_RANK");
  493. #ifdef STARPU_SIMGRID
  494. /* Now that MPI is set up, let the rest of simgrid get initialized */
  495. char **argv_cpy;
  496. _STARPU_MPI_MALLOC(argv_cpy, *(argc_argv->argc) * sizeof(char*));
  497. int i;
  498. for (i = 0; i < *(argc_argv->argc); i++)
  499. argv_cpy[i] = strdup((*(argc_argv->argv))[i]);
  500. MSG_process_create_with_arguments("main", smpi_simulated_main_, NULL, _starpu_simgrid_get_host_by_name("MAIN"), *(argc_argv->argc), argv_cpy);
  501. /* And set TSD for us */
  502. void **tsd;
  503. _STARPU_CALLOC(tsd, MAX_TSD + 1, sizeof(void*));
  504. if (!smpi_process_set_user_data)
  505. {
  506. _STARPU_ERROR("Your version of simgrid does not provide smpi_process_set_user_data, we can not continue without it\n");
  507. }
  508. smpi_process_set_user_data(tsd);
  509. #endif
  510. _starpu_mpi_comm_amounts_init(argc_argv->comm);
  511. _starpu_mpi_cache_init(argc_argv->comm);
  512. _starpu_mpi_select_node_init();
  513. _starpu_mpi_datatype_init();
  514. /* notify the main thread that the progression thread is ready */
  515. STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
  516. running = 1;
  517. STARPU_PTHREAD_COND_SIGNAL(&progress_cond);
  518. STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
  519. #ifdef STARPU_USE_FXT
  520. _starpu_fxt_wait_initialisation();
  521. #endif //STARPU_USE_FXT
  522. {
  523. _STARPU_MPI_TRACE_START(argc_argv->rank, argc_argv->world_size);
  524. #ifdef STARPU_USE_FXT
  525. starpu_profiling_set_id(argc_argv->rank);
  526. #endif //STARPU_USE_FXT
  527. }
  528. _starpu_mpi_add_sync_point_in_fxt();
  529. while (1)
  530. {
  531. struct callback_lfstack_cell_s* c = callback_lfstack_pop(&callback_stack);
  532. int err=0;
  533. if(running || pending_request>0)
  534. {/* shall we block ? */
  535. err = starpu_sem_wait(&callback_sem);
  536. //running pending_request can change while waiting
  537. }
  538. if(c==NULL)
  539. {
  540. c = callback_lfstack_pop(&callback_stack);
  541. if (c == NULL)
  542. {
  543. if(running && pending_request>0)
  544. {
  545. STARPU_ASSERT_MSG(c!=NULL, "Callback thread awakened without callback ready with error %d.",err);
  546. }
  547. else
  548. {
  549. if (pending_request==0)
  550. break;
  551. }
  552. continue;
  553. }
  554. }
  555. c->req->callback(c->req->callback_arg);
  556. if (c->req->detached)
  557. {
  558. _starpu_mpi_request_destroy(c->req);
  559. }
  560. else
  561. {
  562. c->req->completed=1;
  563. piom_cond_signal(&(c->req->req_cond), REQ_FINALIZED);
  564. }
  565. STARPU_ATOMIC_ADD( &pending_request, -1);
  566. /* we signal that the request is completed.*/
  567. free(c);
  568. }
  569. STARPU_ASSERT_MSG(callback_lfstack_pop(&callback_stack)==NULL, "List of callback not empty.");
  570. STARPU_ASSERT_MSG(pending_request==0, "Request still pending.");
  571. if (argc_argv->initialize_mpi)
  572. {
  573. _STARPU_MPI_DEBUG(3, "Calling MPI_Finalize()\n");
  574. MPI_Finalize();
  575. }
  576. starpu_sem_destroy(&callback_sem);
  577. free(argc_argv);
  578. return NULL;
  579. }
  580. /********************************************************/
  581. /* */
  582. /* (De)Initialization methods */
  583. /* */
  584. /********************************************************/
  585. // #ifdef STARPU_MPI_ACTIVITY
  586. // static int hookid = - 1;
  587. // #endif /* STARPU_MPI_ACTIVITY */
  /*
   * When FxT tracing is built in, synchronize all the nodes of
   * MPI_COMM_WORLD with a barrier and record a trace event carrying a
   * random key chosen by rank 0 and broadcast to every node. Does nothing
   * otherwise.
   */
  static void _starpu_mpi_add_sync_point_in_fxt(void)
  {
  #ifdef STARPU_USE_FXT
      int my_rank;
      int nb_nodes;
      int mpi_ret;
      int random_number;

      starpu_mpi_comm_rank(MPI_COMM_WORLD, &my_rank);
      starpu_mpi_comm_size(MPI_COMM_WORLD, &nb_nodes);

      mpi_ret = MPI_Barrier(MPI_COMM_WORLD);
      STARPU_MPI_ASSERT_MSG(mpi_ret == MPI_SUCCESS, "MPI_Barrier returning %s", _starpu_mpi_get_mpi_error_code(mpi_ret));

      /* A "unique" key is generated so that one can make sure that
       * different FxT traces come from the same MPI run.
       * XXX perhaps a new seed should not be generated if the application
       * specified some reproducible behaviour? */
      if (my_rank == 0) {
          srand(time(NULL));
          random_number = rand();
      }

      /* Every node gets the key of rank 0 */
      mpi_ret = MPI_Bcast(&random_number, 1, MPI_INT, 0, MPI_COMM_WORLD);
      STARPU_MPI_ASSERT_MSG(mpi_ret == MPI_SUCCESS, "MPI_Bcast returning %s", _starpu_mpi_get_mpi_error_code(mpi_ret));

      _STARPU_MPI_TRACE_BARRIER(my_rank, nb_nodes, random_number);
      _STARPU_MPI_DEBUG(3, "unique key %x\n", random_number);
  #endif
  }
  614. int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
  615. {
  616. STARPU_PTHREAD_MUTEX_INIT(&progress_mutex, NULL);
  617. STARPU_PTHREAD_COND_INIT(&progress_cond, NULL);
  618. starpu_sem_init(&callback_sem, 0, 0);
  619. running = 0;
  620. mpi_thread_cpuid = starpu_get_env_number_default("STARPU_MPI_THREAD_CPUID", -1);
  621. use_prio = starpu_get_env_number_default("STARPU_MPI_PRIORITIES", 1);
  622. STARPU_PTHREAD_CREATE(&progress_thread, NULL, _starpu_mpi_progress_thread_func, argc_argv);
  623. STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
  624. while (!running)
  625. STARPU_PTHREAD_COND_WAIT(&progress_cond, &progress_mutex);
  626. STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
  627. return 0;
  628. }
  629. void _starpu_mpi_progress_shutdown(uintptr_t value)
  630. {
  631. /* kill the progression thread */
  632. STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
  633. running = 0;
  634. STARPU_PTHREAD_COND_BROADCAST(&progress_cond);
  635. STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
  636. starpu_sem_post(&callback_sem);
  637. STARPU_PTHREAD_JOIN(progress_thread, value);
  638. STARPU_PTHREAD_MUTEX_DESTROY(&progress_mutex);
  639. STARPU_PTHREAD_COND_DESTROY(&progress_cond);
  640. }
  641. int starpu_mpi_comm_get_attr(MPI_Comm comm, int keyval, void *attribute_val, int *flag)
  642. {
  643. (void) comm;
  644. if (keyval == STARPU_MPI_TAG_UB)
  645. {
  646. const int64_t starpu_tag_max = INT64_MAX;
  647. const nm_tag_t nm_tag_max = NM_TAG_MAX;
  648. /* manage case where nmad max tag causes overflow if represented as starpu tag */
  649. *(int64_t *)attribute_val = (nm_tag_max > starpu_tag_max) ? starpu_tag_max : nm_tag_max;
  650. *flag = 1;
  651. }
  652. else
  653. {
  654. *flag = 0;
  655. }
  656. return 0;
  657. }
  658. #endif /* STARPU_USE_MPI_NMAD*/