starpu_mpi_nmad.c
/* StarPU --- Runtime system for heterogeneous multicore architectures.
 *
 * Copyright (C) 2017                      Inria
 * Copyright (C) 2010-2015,2017,2018,2019  CNRS
 * Copyright (C) 2009-2014,2017,2018-2019  Université de Bordeaux
 * Copyright (C) 2017                      Guillaume Beauchamp
 *
 * StarPU is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * StarPU is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * See the GNU Lesser General Public License in COPYING.LGPL for more details.
 */
#include <stdlib.h>
#include <limits.h>
#include <starpu_mpi.h>
#include <starpu_mpi_datatype.h>
#include <starpu_mpi_private.h>
#include <starpu_mpi_cache.h>
#include <starpu_profiling.h>
#include <starpu_mpi_stats.h>
#include <starpu_mpi_cache.h>
#include <starpu_mpi_select_node.h>
#include <starpu_mpi_init.h>
#include <common/config.h>
#include <common/thread.h>
#include <datawizard/coherency.h>
#include <core/task.h>
#include <core/topology.h>

#ifdef STARPU_USE_MPI_NMAD

#include <nm_sendrecv_interface.h>
#include <nm_mpi_nmad.h>
#include "starpu_mpi_nmad_backend.h"
static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req, nm_sr_event_t event);
#ifdef STARPU_VERBOSE
static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type);
#endif
static void _starpu_mpi_handle_pending_request(struct _starpu_mpi_req *req);
static void _starpu_mpi_add_sync_point_in_fxt(void);

/* Condition used to wake up the thread waiting for all current MPI requests to finish */
static starpu_pthread_t progress_thread;
static starpu_pthread_cond_t progress_cond;
static starpu_pthread_mutex_t progress_mutex;
static volatile int running = 0;

extern struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count);

/* Count of requests posted by the application and not yet submitted to MPI, i.e. pushed into the new_requests list */
static volatile int pending_request = 0;

#define REQ_FINALIZED 0x1

PUK_LFSTACK_TYPE(callback, struct _starpu_mpi_req *req;);
static callback_lfstack_t callback_stack = NULL;
static starpu_sem_t callback_sem;
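/* Overview (descriptive note): in this NewMadeleine (nmad) backend, request
 * completion is notified through nm_sr_request_monitor() events; completed
 * requests that carry a user callback are pushed on callback_stack and
 * executed by the dedicated thread started in _starpu_mpi_progress_init(). */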
/********************************************************/
/*                                                      */
/*  Send/Receive functionalities                        */
/*                                                      */
/********************************************************/
void _starpu_mpi_req_willpost(struct _starpu_mpi_req *req STARPU_ATTRIBUTE_UNUSED)
{
	STARPU_ATOMIC_ADD(&pending_request, 1);
}
/********************************************************/
/*                                                      */
/*  Send functionalities                                */
/*                                                      */
/********************************************************/
static void _starpu_mpi_isend_data_func(struct _starpu_mpi_req *req)
{
	_STARPU_MPI_LOG_IN();

	_STARPU_MPI_DEBUG(30, "post NM isend request %p type %s tag %ld src %d data %p datasize %ld ptr %p datatype '%s' count %d registered_datatype %d sync %d\n", req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, starpu_data_get_size(req->data_handle), req->ptr, req->datatype_name, (int)req->count, req->registered_datatype, req->sync);

	_starpu_mpi_comm_amounts_inc(req->node_tag.node.comm, req->node_tag.node.rank, req->datatype, req->count);

	_STARPU_MPI_TRACE_ISEND_SUBMIT_BEGIN(req->node_tag.node.rank, req->node_tag.data_tag, 0);

	struct nm_data_s data;
	nm_mpi_nmad_data_get(&data, (void*)req->ptr, req->datatype, req->count);
	nm_sr_send_init(req->backend->session, &(req->backend->data_request));
	nm_sr_send_pack_data(req->backend->session, &(req->backend->data_request), &data);
	nm_sr_send_set_priority(req->backend->session, &req->backend->data_request, req->prio);

	if (req->sync == 0)
	{
		req->ret = nm_sr_send_isend(req->backend->session, &(req->backend->data_request), req->backend->gate, req->node_tag.data_tag);
		STARPU_ASSERT_MSG(req->ret == NM_ESUCCESS, "MPI_Isend returning %d", req->ret);
	}
	else
	{
		req->ret = nm_sr_send_issend(req->backend->session, &(req->backend->data_request), req->backend->gate, req->node_tag.data_tag);
		STARPU_ASSERT_MSG(req->ret == NM_ESUCCESS, "MPI_Issend returning %d", req->ret);
	}

	_STARPU_MPI_TRACE_ISEND_SUBMIT_END(req->node_tag.node.rank, req->node_tag.data_tag, starpu_data_get_size(req->data_handle), req->pre_sync_jobid);

	_starpu_mpi_handle_pending_request(req);

	_STARPU_MPI_LOG_OUT();
}
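/* Descriptive note on the send path for non-registered (packed) datatypes, as
 * implemented below: the packed size is first sent in a separate message (so
 * that the receiver can allocate its buffer), then the packed data itself is
 * sent by _starpu_mpi_isend_data_func(). backend->waited counts the nmad
 * requests that must complete: 1 for a registered datatype, 2 when a size
 * message is also in flight. */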
void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
{
	_starpu_mpi_datatype_allocate(req->data_handle, req);

	if (req->registered_datatype == 1)
	{
		req->backend->waited = 1;
		req->count = 1;
		req->ptr = starpu_data_handle_to_pointer(req->data_handle, STARPU_MAIN_RAM);
	}
	else
	{
		starpu_ssize_t psize = -1;
		int ret;
		req->backend->waited = 2;

		// Do not pack the data, just try to find out its size
		starpu_data_pack(req->data_handle, NULL, &psize);

		if (psize != -1)
		{
			// We already know the size of the data, send it now so that it overlaps with the packing of the data
			_STARPU_MPI_DEBUG(20, "Sending size %ld (%ld %s) to node %d (first call to pack)\n", psize, sizeof(req->count), "MPI_BYTE", req->node_tag.node.rank);
			req->count = psize;
			//ret = nm_sr_isend(nm_mpi_communicator_get_session(p_req->p_comm),nm_mpi_communicator_get_gate(p_comm,req->srcdst), req->mpi_tag,&req->count, sizeof(req->count), &req->backend->size_req);
			ret = nm_sr_isend(req->backend->session, req->backend->gate, req->node_tag.data_tag, &req->count, sizeof(req->count), &req->backend->size_req);
			//ret = MPI_Isend(&req->count, sizeof(req->count), MPI_BYTE, req->srcdst, req->mpi_tag, req->comm, &req->backend->size_req);
			STARPU_ASSERT_MSG(ret == NM_ESUCCESS, "when sending size, nm_sr_isend returning %d", ret);
		}

		// Pack the data
		starpu_data_pack(req->data_handle, &req->ptr, &req->count);
		if (psize == -1)
		{
			// We only know the size now, send it
			_STARPU_MPI_DEBUG(1, "Sending size %ld (%ld %s) with tag %ld to node %d (second call to pack)\n", req->count, sizeof(req->count), "MPI_BYTE", req->node_tag.data_tag, req->node_tag.node.rank);
			ret = nm_sr_isend(req->backend->session, req->backend->gate, req->node_tag.data_tag, &req->count, sizeof(req->count), &req->backend->size_req);
			STARPU_ASSERT_MSG(ret == NM_ESUCCESS, "when sending size, nm_sr_isend returning %d", ret);
		}
		else
		{
			// Check that the two calls to pack returned the same size
			STARPU_ASSERT_MSG(req->count == psize, "Calls to pack_data returned different sizes %ld != %ld", req->count, psize);
		}
		// We can send the data now
	}
	_starpu_mpi_isend_data_func(req);
}
/********************************************************/
/*                                                      */
/*  Receive functionalities                             */
/*                                                      */
/********************************************************/
static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
{
	_STARPU_MPI_LOG_IN();

	_STARPU_MPI_DEBUG(20, "post NM irecv request %p type %s tag %ld src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n", req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);

	_STARPU_MPI_TRACE_IRECV_SUBMIT_BEGIN(req->node_tag.node.rank, req->node_tag.data_tag);

	//req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->srcdst, req->mpi_tag, req->comm, &req->request);
	struct nm_data_s data;
	nm_mpi_nmad_data_get(&data, (void*)req->ptr, req->datatype, req->count);
	nm_sr_recv_init(req->backend->session, &(req->backend->data_request));
	nm_sr_recv_unpack_data(req->backend->session, &(req->backend->data_request), &data);
	nm_sr_recv_irecv(req->backend->session, &(req->backend->data_request), req->backend->gate, req->node_tag.data_tag, NM_TAG_MASK_FULL);

	_STARPU_MPI_TRACE_IRECV_SUBMIT_END(req->node_tag.node.rank, req->node_tag.data_tag);

	_starpu_mpi_handle_pending_request(req);

	_STARPU_MPI_LOG_OUT();
}
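/* Descriptive note on the receive path for non-registered (packed) datatypes:
 * the size is received first into a temporary variable handle registered on
 * req->count; once it arrives, the callback below allocates a buffer of that
 * size and posts the receive for the packed data itself. */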
struct _starpu_mpi_irecv_size_callback
{
	starpu_data_handle_t handle;
	struct _starpu_mpi_req *req;
};

static void _starpu_mpi_irecv_size_callback(void *arg)
{
	struct _starpu_mpi_irecv_size_callback *callback = (struct _starpu_mpi_irecv_size_callback *)arg;

	starpu_data_unregister(callback->handle);
	callback->req->ptr = malloc(callback->req->count);
	STARPU_ASSERT_MSG(callback->req->ptr, "cannot allocate message of size %ld", callback->req->count);
	_starpu_mpi_irecv_data_func(callback->req);
	free(callback);
}

void _starpu_mpi_irecv_size_func(struct _starpu_mpi_req *req)
{
	_STARPU_MPI_LOG_IN();

	_starpu_mpi_datatype_allocate(req->data_handle, req);
	if (req->registered_datatype == 1)
	{
		req->count = 1;
		req->ptr = starpu_data_handle_to_pointer(req->data_handle, STARPU_MAIN_RAM);
		_starpu_mpi_irecv_data_func(req);
	}
	else
	{
		struct _starpu_mpi_irecv_size_callback *callback = malloc(sizeof(struct _starpu_mpi_irecv_size_callback));
		callback->req = req;
		starpu_variable_data_register(&callback->handle, 0, (uintptr_t)&(callback->req->count), sizeof(callback->req->count));
		_STARPU_MPI_DEBUG(4, "Receiving size with tag %ld from node %d\n", req->node_tag.data_tag, req->node_tag.node.rank);
		_starpu_mpi_irecv_common(callback->handle, req->node_tag.node.rank, req->node_tag.data_tag, req->node_tag.node.comm, 1, 0, _starpu_mpi_irecv_size_callback, callback, 1, 0, 0);
	}
}
/********************************************************/
/*                                                      */
/*  Wait functionalities                                */
/*                                                      */
/********************************************************/
#define _starpu_mpi_req_status(PUBLIC_REQ,STATUS) do { \
	STATUS->MPI_SOURCE=PUBLIC_REQ->node_tag.node.rank; /**< field name mandated by the spec */ \
	STATUS->MPI_TAG=PUBLIC_REQ->node_tag.data_tag; /**< field name mandated by the spec */ \
	STATUS->MPI_ERROR=PUBLIC_REQ->ret; /**< field name mandated by the spec */ \
	STATUS->size=PUBLIC_REQ->count; /**< size of the data received */ \
	STATUS->cancelled=0; /**< whether the request was cancelled */ \
} while(0)

int _starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
{
	_STARPU_MPI_LOG_IN();
	STARPU_MPI_ASSERT_MSG(public_req, "starpu_mpi_wait needs a valid starpu_mpi_req");
	struct _starpu_mpi_req *req = *public_req;
	STARPU_MPI_ASSERT_MSG(!req->detached, "MPI_Wait cannot be called on a detached request");

	/* we must use test_locked to avoid a race condition:
	 * without it, req_cond could still be in use and could not be freed */
	while (!req->completed || !piom_cond_test_locked(&(req->backend->req_cond), REQ_FINALIZED))
	{
		piom_cond_wait(&(req->backend->req_cond), REQ_FINALIZED);
	}

	if (status != MPI_STATUS_IGNORE)
		_starpu_mpi_req_status(req, status);

	_starpu_mpi_request_destroy(req);
	*public_req = NULL;
	_STARPU_MPI_LOG_OUT();
	return MPI_SUCCESS;
}
/********************************************************/
/*                                                      */
/*  Test functionalities                                */
/*                                                      */
/********************************************************/
int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
{
	_STARPU_MPI_LOG_IN();
	STARPU_MPI_ASSERT_MSG(public_req, "starpu_mpi_test needs a valid starpu_mpi_req");
	struct _starpu_mpi_req *req = *public_req;
	STARPU_MPI_ASSERT_MSG(!req->detached, "MPI_Test cannot be called on a detached request");

	_STARPU_MPI_DEBUG(2, "Test request %p type %s tag %ld src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);

	_STARPU_MPI_TRACE_UTESTING_BEGIN(req->node_tag.node.rank, req->node_tag.data_tag);

	/* we must use test_locked to avoid a race condition:
	 * without it, req_cond could still be in use and could not be freed */
	*flag = req->completed && piom_cond_test_locked(&(req->backend->req_cond), REQ_FINALIZED);
	if (*flag && status != MPI_STATUS_IGNORE)
		_starpu_mpi_req_status(req, status);

	_STARPU_MPI_TRACE_UTESTING_END(req->node_tag.node.rank, req->node_tag.data_tag);

	if (*flag)
	{
		_starpu_mpi_request_destroy(req);
		*public_req = NULL;
	}
	_STARPU_MPI_LOG_OUT();
	return MPI_SUCCESS;
}
/********************************************************/
/*                                                      */
/*  Barrier functionalities                             */
/*                                                      */
/********************************************************/
int _starpu_mpi_barrier(MPI_Comm comm)
{
	_STARPU_MPI_LOG_IN();

	int ret;
	//STARPU_ASSERT_MSG(!barrier_running, "Concurrent starpu_mpi_barrier is not implemented, even on different communicators");
	ret = MPI_Barrier(comm);
	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Barrier returning %d", ret);

	_STARPU_MPI_LOG_OUT();
	return ret;
}
/********************************************************/
/*                                                      */
/*  Progression                                         */
/*                                                      */
/********************************************************/
#ifdef STARPU_VERBOSE
static char *_starpu_mpi_request_type(enum _starpu_mpi_request_type request_type)
{
	switch (request_type)
	{
	case SEND_REQ: return "SEND_REQ";
	case RECV_REQ: return "RECV_REQ";
	case WAIT_REQ: return "WAIT_REQ";
	case TEST_REQ: return "TEST_REQ";
	case BARRIER_REQ: return "BARRIER_REQ";
	default: return "unknown request type";
	}
}
#endif
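/* Descriptive note: this handler is called from the nmad event callbacks when
 * a request (data or size message) completes. It releases the datatype or
 * packed buffer, then either hands the user callback over to the callback
 * thread, or marks the request as finalized for starpu_mpi_wait() and
 * starpu_mpi_test(). */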
static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req, nm_sr_event_t event)
{
	_STARPU_MPI_LOG_IN();

	_STARPU_MPI_DEBUG(2, "complete MPI request %p type %s tag %ld src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);

	if (req->request_type == RECV_REQ || req->request_type == SEND_REQ)
	{
		if (req->registered_datatype == 0)
		{
			if (req->backend->waited == 1)
				nm_mpi_nmad_data_release(req->datatype);
			if (req->request_type == SEND_REQ)
			{
				req->backend->waited--;
				// We need to make sure the communication sending the size has
				// completed; as MPI can re-order messages, count the received
				// events.
				// FIXME: concurrent access.
				STARPU_ASSERT_MSG(event == NM_SR_EVENT_FINALIZED, "Callback with event %d", event);
				if (req->backend->waited > 0)
					return;
			}
			if (req->request_type == RECV_REQ)
				// req->ptr is freed by starpu_data_unpack
				starpu_data_unpack(req->data_handle, req->ptr, req->count);
			else
				free(req->ptr);
		}
		else
		{
			nm_mpi_nmad_data_release(req->datatype);
			_starpu_mpi_datatype_free(req->data_handle, &req->datatype);
		}
	}

	_STARPU_MPI_TRACE_TERMINATED(req, req->node_tag.node.rank, req->node_tag.data_tag);

	_starpu_mpi_release_req_data(req);

	/* Execute the specified callback, if any */
	if (req->callback)
	{
		struct callback_lfstack_cell_s *c = padico_malloc(sizeof(struct callback_lfstack_cell_s));
		c->req = req;
		/* The main thread may exit without waiting for the end of a detached
		 * request, so the callback thread must be kept alive as long as
		 * requests with a callback remain. */
		callback_lfstack_push(&callback_stack, c);
		starpu_sem_post(&callback_sem);
	}
	else
	{
		if (req->detached)
		{
			_starpu_mpi_request_destroy(req);
			// a detached request won't be waited/tested on (and freed there).
		}
		else
		{
			/* tell anyone potentially waiting on the request that it is
			 * terminated now (should be done after the callback) */
			req->completed = 1;
			piom_cond_signal(&req->backend->req_cond, REQ_FINALIZED);
		}
		int pending_remaining = STARPU_ATOMIC_ADD(&pending_request, -1);
		if (!running && !pending_remaining)
			starpu_sem_post(&callback_sem);
	}
	_STARPU_MPI_LOG_OUT();
}
void _starpu_mpi_handle_request_termination_callback(nm_sr_event_t event, const nm_sr_event_info_t *event_info, void *ref)
{
	_starpu_mpi_handle_request_termination(ref, event);
}

static void _starpu_mpi_handle_pending_request(struct _starpu_mpi_req *req)
{
	if (req->request_type == SEND_REQ && req->backend->waited > 1)
	{
		nm_sr_request_set_ref(&(req->backend->size_req), req);
		nm_sr_request_monitor(req->backend->session, &(req->backend->size_req), NM_SR_EVENT_FINALIZED, _starpu_mpi_handle_request_termination_callback);
	}
	/* the 'if' above must come first, because the first callback can directly
	 * free a detached request (the second callback frees it when
	 * req->backend->waited > 1). */
	nm_sr_request_set_ref(&(req->backend->data_request), req);
	nm_sr_request_monitor(req->backend->session, &(req->backend->data_request), NM_SR_EVENT_FINALIZED, _starpu_mpi_handle_request_termination_callback);
}
void _starpu_mpi_coop_sends_build_tree(struct _starpu_mpi_coop_sends *coop_sends)
{
	/* TODO: turn them into redirects & forwards */
}

void _starpu_mpi_submit_coop_sends(struct _starpu_mpi_coop_sends *coop_sends, int submit_control, int submit_data)
{
	unsigned i, n = coop_sends->n;
	/* Note: coop_sends might disappear very soon after the last request is submitted */
	for (i = 0; i < n; i++)
	{
		if (coop_sends->reqs_array[i]->request_type == SEND_REQ && submit_data)
		{
			_STARPU_MPI_DEBUG(0, "cooperative sends %p sending to %d\n", coop_sends, coop_sends->reqs_array[i]->node_tag.node.rank);
			_starpu_mpi_submit_ready_request(coop_sends->reqs_array[i]);
		}
		/* TODO: handle redirect requests */
	}
}

void _starpu_mpi_submit_ready_request(void *arg)
{
	_STARPU_MPI_LOG_IN();
	struct _starpu_mpi_req *req = arg;
	STARPU_ASSERT_MSG(req, "Invalid request");

	/* submit the request to MPI directly from the submitter */
	_STARPU_MPI_DEBUG(2, "Handling new request %p type %s tag %ld src %d data %p ptr %p datatype '%s' count %d registered_datatype %d \n",
			  req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, req->ptr, req->datatype_name, (int)req->count, req->registered_datatype);
	req->func(req);

	_STARPU_MPI_LOG_OUT();
}
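/* Descriptive note: the thread below does not poll the network itself
 * (communication progression is delegated to pioman, see
 * _starpu_mpi_progress_init()). It initializes MPI, then loops executing the
 * user callbacks pushed on callback_stack by
 * _starpu_mpi_handle_request_termination(), and exits once shutdown has been
 * requested and no request is pending. */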
static void *_starpu_mpi_progress_thread_func(void *arg)
{
	struct _starpu_mpi_argc_argv *argc_argv = (struct _starpu_mpi_argc_argv *) arg;

#ifndef STARPU_SIMGRID
	if (_starpu_mpi_thread_cpuid < 0)
	{
		_starpu_mpi_thread_cpuid = starpu_get_next_bindid(0, NULL, 0);
	}

	if (starpu_bind_thread_on(_starpu_mpi_thread_cpuid, 0, "MPI") < 0)
	{
		_STARPU_DISP("No core was available for the MPI thread. You should use STARPU_RESERVE_NCPU to leave one core available for MPI, or specify one core less in STARPU_NCPU\n");
	}

	_starpu_mpi_do_initialize(argc_argv);

	if (_starpu_mpi_thread_cpuid >= 0)
		/* In case MPI changed the binding */
		starpu_bind_thread_on(_starpu_mpi_thread_cpuid, STARPU_THREAD_ACTIVE, "MPI");
#endif

	_starpu_mpi_env_init();

#ifdef STARPU_SIMGRID
	/* Now that MPI is set up, let the rest of simgrid get initialized */
	char **argv_cpy;
	_STARPU_MPI_MALLOC(argv_cpy, *(argc_argv->argc) * sizeof(char*));
	int i;
	for (i = 0; i < *(argc_argv->argc); i++)
		argv_cpy[i] = strdup((*(argc_argv->argv))[i]);
	MSG_process_create_with_arguments("main", smpi_simulated_main_, NULL, _starpu_simgrid_get_host_by_name("MAIN"), *(argc_argv->argc), argv_cpy);
	/* And set TSD for us */
	void **tsd;
	_STARPU_CALLOC(tsd, MAX_TSD + 1, sizeof(void*));
	if (!smpi_process_set_user_data)
	{
		_STARPU_ERROR("Your version of simgrid does not provide smpi_process_set_user_data, we cannot continue without it\n");
	}
	smpi_process_set_user_data(tsd);
#endif

	_starpu_mpi_comm_amounts_init(argc_argv->comm);
	_starpu_mpi_cache_init(argc_argv->comm);
	_starpu_mpi_select_node_init();
	_starpu_mpi_datatype_init();

#ifdef STARPU_USE_FXT
	_starpu_fxt_wait_initialisation();
	/* We need to record our ID in the trace before the main thread makes any MPI call */
	_STARPU_MPI_TRACE_START(argc_argv->rank, argc_argv->world_size);
	starpu_profiling_set_id(argc_argv->rank);
#endif //STARPU_USE_FXT

	/* notify the main thread that the progression thread is ready */
	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
	running = 1;
	STARPU_PTHREAD_COND_SIGNAL(&progress_cond);
	STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);

	_starpu_mpi_add_sync_point_in_fxt();

	while (1)
	{
		struct callback_lfstack_cell_s *c = callback_lfstack_pop(&callback_stack);
		int err = 0;

		if (running || pending_request > 0)
		{
			/* shall we block? */
			err = starpu_sem_wait(&callback_sem);
			// running and pending_request can change while we are waiting
		}
		if (c == NULL)
		{
			c = callback_lfstack_pop(&callback_stack);
			if (c == NULL)
			{
				if (running && pending_request > 0)
				{
					STARPU_ASSERT_MSG(c != NULL, "Callback thread awakened without callback ready with error %d.", err);
				}
				else
				{
					if (pending_request == 0)
						break;
				}
				continue;
			}
		}

		c->req->callback(c->req->callback_arg);
		if (c->req->detached)
		{
			_starpu_mpi_request_destroy(c->req);
		}
		else
		{
			c->req->completed = 1;
			piom_cond_signal(&(c->req->backend->req_cond), REQ_FINALIZED);
		}
		STARPU_ATOMIC_ADD(&pending_request, -1);
		/* we signal that the request is completed */

		free(c);
	}
	STARPU_ASSERT_MSG(callback_lfstack_pop(&callback_stack) == NULL, "List of callbacks not empty.");
	STARPU_ASSERT_MSG(pending_request == 0, "Requests still pending.");
	if (argc_argv->initialize_mpi)
	{
		_STARPU_MPI_DEBUG(3, "Calling MPI_Finalize()\n");
		MPI_Finalize();
	}

	starpu_sem_destroy(&callback_sem);
	free(argc_argv);
	return NULL;
}
/********************************************************/
/*                                                      */
/*  (De)Initialization methods                          */
/*                                                      */
/********************************************************/

// #ifdef STARPU_MPI_ACTIVITY
// static int hookid = - 1;
// #endif /* STARPU_MPI_ACTIVITY */
static void _starpu_mpi_add_sync_point_in_fxt(void)
{
#ifdef STARPU_USE_FXT
	int rank;
	int worldsize;
	int ret;

	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
	starpu_mpi_comm_size(MPI_COMM_WORLD, &worldsize);

	ret = MPI_Barrier(MPI_COMM_WORLD);
	STARPU_MPI_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Barrier returning %s", _starpu_mpi_get_mpi_error_code(ret));

	/* We generate a "unique" key so that we can make sure that different
	 * FxT traces come from the same MPI run. */
	int random_number;

	/* XXX perhaps we don't want to generate a new seed if the application
	 * asked for some reproducible behaviour? */
	if (rank == 0)
	{
		srand(time(NULL));
		random_number = rand();
	}

	ret = MPI_Bcast(&random_number, 1, MPI_INT, 0, MPI_COMM_WORLD);
	STARPU_MPI_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Bcast returning %s", _starpu_mpi_get_mpi_error_code(ret));

	_STARPU_MPI_TRACE_BARRIER(rank, worldsize, random_number);

	_STARPU_MPI_DEBUG(3, "unique key %x\n", random_number);
#endif
}
int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
{
	STARPU_PTHREAD_MUTEX_INIT(&progress_mutex, NULL);
	STARPU_PTHREAD_COND_INIT(&progress_cond, NULL);

	starpu_sem_init(&callback_sem, 0, 0);
	running = 0;

	/* Tell pioman to use a bound thread for communication progression */
	unsigned piom_bindid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
	int indexes[1] = {piom_bindid};
	piom_ltask_set_bound_thread_indexes(HWLOC_OBJ_PU, indexes, 1);

	/* We force the "MPI" thread to share the same core as the pioman thread,
	 * to avoid binding it on the same core as a worker */
	_starpu_mpi_thread_cpuid = piom_bindid;

	/* Register some hooks for communication progress if needed */
	int polling_point_prog, polling_point_idle;
	char *s_prog_hooks = starpu_getenv("STARPU_MPI_NMAD_PROG_HOOKS");
	char *s_idle_hooks = starpu_getenv("STARPU_MPI_NMAD_IDLE_HOOKS");

	if (!s_prog_hooks)
	{
		polling_point_prog = 0;
	}
	else
	{
		polling_point_prog =
			(strcmp(s_prog_hooks, "FORCED") == 0) ? PIOM_POLL_POINT_FORCED :
			(strcmp(s_prog_hooks, "SINGLE") == 0) ? PIOM_POLL_POINT_SINGLE :
			(strcmp(s_prog_hooks, "HOOK") == 0) ? PIOM_POLL_POINT_HOOK :
			0;
	}

	if (!s_idle_hooks)
	{
		polling_point_idle = 0;
	}
	else
	{
		polling_point_idle =
			(strcmp(s_idle_hooks, "FORCED") == 0) ? PIOM_POLL_POINT_FORCED :
			(strcmp(s_idle_hooks, "SINGLE") == 0) ? PIOM_POLL_POINT_SINGLE :
			(strcmp(s_idle_hooks, "HOOK") == 0) ? PIOM_POLL_POINT_HOOK :
			0;
	}

	if (polling_point_prog)
	{
		starpu_progression_hook_register((unsigned (*)(void *))&piom_ltask_schedule, (void *)&polling_point_prog);
	}

	if (polling_point_idle)
	{
		starpu_idle_hook_register((unsigned (*)(void *))&piom_ltask_schedule, (void *)&polling_point_idle);
	}

	/* Launch the thread used for nmad callbacks */
	STARPU_PTHREAD_CREATE(&progress_thread, NULL, _starpu_mpi_progress_thread_func, argc_argv);

	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
	while (!running)
		STARPU_PTHREAD_COND_WAIT(&progress_cond, &progress_mutex);
	STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);

	return 0;
}
void _starpu_mpi_progress_shutdown(void **value)
{
	/* kill the progression thread */
	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
	running = 0;
	STARPU_PTHREAD_COND_BROADCAST(&progress_cond);
	STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);

	starpu_sem_post(&callback_sem);

	STARPU_PTHREAD_JOIN(progress_thread, value);

	STARPU_PTHREAD_MUTEX_DESTROY(&progress_mutex);
	STARPU_PTHREAD_COND_DESTROY(&progress_cond);
}
static int64_t _starpu_mpi_tag_max = INT64_MAX;

int starpu_mpi_comm_get_attr(MPI_Comm comm, int keyval, void *attribute_val, int *flag)
{
	(void) comm;
	if (keyval == STARPU_MPI_TAG_UB)
	{
		if ((uint64_t) _starpu_mpi_tag_max > NM_TAG_MAX)
			_starpu_mpi_tag_max = NM_TAG_MAX;
		/* handles the case where the nmad maximum tag would overflow when represented as a StarPU tag */
		*(int64_t **)attribute_val = &_starpu_mpi_tag_max;
		*flag = 1;
	}
	else
	{
		*flag = 0;
	}
	return 0;
}

#endif /* STARPU_USE_MPI_NMAD */