starpu_fxt_mpi.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2010-2021 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  4. * Copyright (C) 2017,2019 Federal University of Rio Grande do Sul (UFRGS)
  5. *
  6. * StarPU is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU Lesser General Public License as published by
  8. * the Free Software Foundation; either version 2.1 of the License, or (at
  9. * your option) any later version.
  10. *
  11. * StarPU is distributed in the hope that it will be useful, but
  12. * WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14. *
  15. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  16. */
  17. #include <starpu.h>
  18. #include <common/config.h>
  19. #ifdef STARPU_USE_FXT
  20. #include "starpu_fxt.h"
  21. #ifdef STARPU_HAVE_POTI
  22. #include <poti.h>
  23. #define STARPU_POTI_STR_LEN 200
  24. #endif
/* One side (send or receive) of an MPI transfer recorded from a trace.
 * The mpi_transfer_list helpers used further down in this file are presumably
 * generated by LIST_TYPE as well -- TODO confirm against the list header. */
LIST_TYPE(mpi_transfer,
	/* 0 when recorded; set to 1 on a receive once it has been paired with a send */
	unsigned matched;
	/* source node, as given by the caller that recorded the transfer */
	int src;
	/* destination node, as given by the caller that recorded the transfer */
	int dst;
	/* tag used to pair a send with a receive */
	long mpi_tag;
	/* payload size; only filled in for sends */
	size_t size;
	/* timestamp of the event: start date for a send, end date for a receive */
	float date;
	/* job associated with the transfer, or -1 when there is none */
	long jobid;
	/* computed on the receive when it gets matched: (0.001*size)/(end - start) */
	double bandwidth;
	/* data handle value, printed in hexadecimal in the outputs */
	unsigned long handle;
	/* send kind, decoded by get_mpi_type_str(); only filled in for sends */
	unsigned type;
	/* priority; only filled in for sends */
	int prio;
);
  38. /* Returns 0 if a barrier is found, -1 otherwise. In case of success, offset is
  39. * filled with the timestamp of the barrier */
  40. int _starpu_fxt_mpi_find_sync_point(char *filename_in, uint64_t *offset, int *key, int *rank)
  41. {
  42. STARPU_ASSERT(offset);
  43. /* Open the trace file */
  44. int fd_in;
  45. fd_in = open(filename_in, O_RDONLY);
  46. if (fd_in < 0)
  47. {
  48. perror("open failed :");
  49. exit(-1);
  50. }
  51. static fxt_t fut;
  52. fut = fxt_fdopen(fd_in);
  53. if (!fut)
  54. {
  55. perror("fxt_fdopen :");
  56. exit(-1);
  57. }
  58. fxt_blockev_t block;
  59. block = fxt_blockev_enter(fut);
  60. struct fxt_ev_64 ev;
  61. int func_ret = -1;
  62. unsigned found = 0;
  63. while(!found)
  64. {
  65. int ret = fxt_next_ev(block, FXT_EV_TYPE_64, (struct fxt_ev *)&ev);
  66. if (ret != FXT_EV_OK)
  67. {
  68. _STARPU_MSG("no more block ...\n");
  69. break;
  70. }
  71. if (ev.code == _STARPU_MPI_FUT_BARRIER)
  72. {
  73. /* We found the sync point */
  74. *offset = ev.time;
  75. *rank = ev.param[0];
  76. *key = ev.param[2];
  77. found = 1;
  78. func_ret = 0;
  79. }
  80. }
  81. /* Close the trace file */
  82. if (close(fd_in))
  83. {
  84. perror("close failed :");
  85. exit(-1);
  86. }
  87. return func_ret;
  88. }
/*
 * Deal with the actual MPI transfers performed with the MPI lib
 */

/* The lists of MPI transfers found in the different traces.
 * mpi_sends[] is indexed by the sending node and mpi_recvs[] by the receiving
 * node; each entry is a heap array grown on demand by the add functions. */
static struct mpi_transfer *mpi_sends[STARPU_FXT_MAX_FILES] = {NULL};
static struct mpi_transfer *mpi_recvs[STARPU_FXT_MAX_FILES] = {NULL};

/* Number of available (allocated) slots in each list; doubled whenever a
 * list is full */
unsigned mpi_sends_list_size[STARPU_FXT_MAX_FILES] = {0};
unsigned mpi_recvs_list_size[STARPU_FXT_MAX_FILES] = {0};

/* Number of slots actually used in each list */
unsigned mpi_sends_used[STARPU_FXT_MAX_FILES] = {0};
unsigned mpi_recvs_used[STARPU_FXT_MAX_FILES] = {0};

/* Number of slots already matched at the beginning of the list. This permits
 * going through the lists from the beginning to match each and every
 * transfer, thus avoiding a quadratic complexity.
 * Indexed as [src][dst]. Only mpi_recvs_matched is read and updated by the
 * code visible in this file; mpi_sends_matched is not referenced here. */
unsigned mpi_recvs_matched[STARPU_FXT_MAX_FILES][STARPU_FXT_MAX_FILES] = { {0} };
unsigned mpi_sends_matched[STARPU_FXT_MAX_FILES][STARPU_FXT_MAX_FILES] = { {0} };
  106. void _starpu_fxt_mpi_add_send_transfer(int src, int dst STARPU_ATTRIBUTE_UNUSED, long mpi_tag, size_t size, float date, long jobid, unsigned long handle, unsigned type, int prio)
  107. {
  108. STARPU_ASSERT(src >= 0);
  109. if (src >= STARPU_FXT_MAX_FILES)
  110. return;
  111. unsigned slot = mpi_sends_used[src]++;
  112. if (mpi_sends_used[src] > mpi_sends_list_size[src])
  113. {
  114. if (mpi_sends_list_size[src] > 0)
  115. {
  116. mpi_sends_list_size[src] *= 2;
  117. }
  118. else
  119. {
  120. mpi_sends_list_size[src] = 1;
  121. }
  122. _STARPU_REALLOC(mpi_sends[src], mpi_sends_list_size[src]*sizeof(struct mpi_transfer));
  123. }
  124. mpi_sends[src][slot].matched = 0;
  125. mpi_sends[src][slot].src = src;
  126. mpi_sends[src][slot].dst = dst;
  127. mpi_sends[src][slot].mpi_tag = mpi_tag;
  128. mpi_sends[src][slot].size = size;
  129. mpi_sends[src][slot].date = date;
  130. mpi_sends[src][slot].jobid = jobid;
  131. mpi_sends[src][slot].handle = handle;
  132. mpi_sends[src][slot].type = type;
  133. mpi_sends[src][slot].prio = prio;
  134. }
  135. void _starpu_fxt_mpi_add_recv_transfer(int src STARPU_ATTRIBUTE_UNUSED, int dst, long mpi_tag, float date, long jobid, unsigned long handle)
  136. {
  137. if (dst >= STARPU_FXT_MAX_FILES)
  138. return;
  139. unsigned slot = mpi_recvs_used[dst]++;
  140. if (mpi_recvs_used[dst] > mpi_recvs_list_size[dst])
  141. {
  142. if (mpi_recvs_list_size[dst] > 0)
  143. {
  144. mpi_recvs_list_size[dst] *= 2;
  145. }
  146. else
  147. {
  148. mpi_recvs_list_size[dst] = 1;
  149. }
  150. _STARPU_REALLOC(mpi_recvs[dst], mpi_recvs_list_size[dst]*sizeof(struct mpi_transfer));
  151. }
  152. mpi_recvs[dst][slot].matched = 0;
  153. mpi_recvs[dst][slot].src = src;
  154. mpi_recvs[dst][slot].dst = dst;
  155. mpi_recvs[dst][slot].mpi_tag = mpi_tag;
  156. mpi_recvs[dst][slot].date = date;
  157. mpi_recvs[dst][slot].jobid = jobid;
  158. mpi_recvs[dst][slot].handle = handle;
  159. }
  160. static
  161. struct mpi_transfer *try_to_match_send_transfer(int src STARPU_ATTRIBUTE_UNUSED, int dst, long mpi_tag)
  162. {
  163. unsigned slot;
  164. unsigned firstslot = mpi_recvs_matched[src][dst];
  165. unsigned all_previous_were_matched = 1;
  166. for (slot = firstslot; slot < mpi_recvs_used[dst]; slot++)
  167. {
  168. if (!mpi_recvs[dst][slot].matched)
  169. {
  170. if (mpi_recvs[dst][slot].mpi_tag == mpi_tag)
  171. {
  172. /* we found a match ! */
  173. mpi_recvs[dst][slot].matched = 1;
  174. return &mpi_recvs[dst][slot];
  175. }
  176. all_previous_were_matched = 0;
  177. }
  178. else
  179. {
  180. if (all_previous_were_matched)
  181. {
  182. /* All previous transfers are already matched,
  183. * we need not consider them anymore */
  184. mpi_recvs_matched[src][dst] = slot;
  185. }
  186. }
  187. }
  188. /* If we reached that point, we could not find a match */
  189. return NULL;
  190. }
/* Next identifier to hand out to a matched communication; it is incremented
 * for every match, used to build the "mpicom_<id>" link key and passed to the
 * DAG send/receive hooks. */
static unsigned long mpi_com_id = 0;
  192. static const char* get_mpi_type_str(unsigned mpi_type)
  193. {
  194. switch (mpi_type)
  195. {
  196. case _STARPU_MPI_FUT_POINT_TO_POINT_SEND:
  197. return "PointToPoint";
  198. case _STARPU_MPI_FUT_COLLECTIVE_SEND:
  199. return "Collective";
  200. default:
  201. return "Unknown";
  202. }
  203. }
  204. static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comms_file, unsigned n)
  205. {
  206. unsigned slot[STARPU_FXT_MAX_FILES] = { 0 }, node;
  207. unsigned nb_wrong_comm_timing = 0;
  208. struct mpi_transfer_list pending_receives; /* Sorted list of matches which have not happened yet */
  209. double current_out_bandwidth[STARPU_FXT_MAX_FILES] = { 0. };
  210. double current_in_bandwidth[STARPU_FXT_MAX_FILES] = { 0. };
  211. #ifdef STARPU_HAVE_POTI
  212. char mpi_container[STARPU_POTI_STR_LEN];
  213. #endif
  214. //bwi_mpi and bwo_mpi are set to zero when MPI thread containers are created
  215. mpi_transfer_list_init(&pending_receives);
  216. while (1)
  217. {
  218. float start_date;
  219. struct mpi_transfer *cur, *match;
  220. int src;
  221. /* Find out which event comes first: a pending receive, or a new send */
  222. if (mpi_transfer_list_empty(&pending_receives))
  223. start_date = INFINITY;
  224. else
  225. start_date = mpi_transfer_list_front(&pending_receives)->date;
  226. src = STARPU_FXT_MAX_FILES;
  227. for (node = 0; node < n; node++)
  228. {
  229. if (slot[node] < mpi_sends_used[node] && mpi_sends[node][slot[node]].date < start_date)
  230. {
  231. /* next send for node is earlier than others */
  232. src = node;
  233. start_date = mpi_sends[src][slot[src]].date;
  234. }
  235. }
  236. if (start_date == INFINITY)
  237. /* No event any more, we're finished! */
  238. break;
  239. if (src == STARPU_FXT_MAX_FILES)
  240. {
  241. /* Pending match is earlier than all new sends, finish its communication */
  242. match = mpi_transfer_list_pop_front(&pending_receives);
  243. current_out_bandwidth[match->src] -= match->bandwidth;
  244. current_in_bandwidth[match->dst] -= match->bandwidth;
  245. #ifdef STARPU_HAVE_POTI
  246. snprintf(mpi_container, sizeof(mpi_container), "%d_mpict", match->src);
  247. poti_SetVariable(match->date, mpi_container, "bwo_mpi", current_out_bandwidth[match->src]);
  248. snprintf(mpi_container, sizeof(mpi_container), "%d_mpict", match->dst);
  249. poti_SetVariable(match->date, mpi_container, "bwi_mpi", current_in_bandwidth[match->dst]);
  250. #else
  251. fprintf(out_paje_file, "13 %.9f %d_mpict bwo_mpi %f\n", match->date, match->src, current_out_bandwidth[match->src]);
  252. fprintf(out_paje_file, "13 %.9f %d_mpict bwi_mpi %f\n", match->date, match->dst, current_in_bandwidth[match->dst]);
  253. #endif
  254. continue;
  255. }
  256. cur = &mpi_sends[src][slot[src]];
  257. int dst = cur->dst;
  258. long mpi_tag = cur->mpi_tag;
  259. size_t size = cur->size;
  260. unsigned long send_handle = cur->handle;
  261. if (dst < STARPU_FXT_MAX_FILES)
  262. match = try_to_match_send_transfer(src, dst, mpi_tag);
  263. else
  264. match = NULL;
  265. if (match)
  266. {
  267. float end_date = match->date;
  268. unsigned long recv_handle = match->handle;
  269. struct mpi_transfer *prev;
  270. if (end_date <= start_date)
  271. nb_wrong_comm_timing++;
  272. match->bandwidth = (0.001*size)/(end_date - start_date);
  273. current_out_bandwidth[src] += match->bandwidth;
  274. current_in_bandwidth[dst] += match->bandwidth;
  275. /* Insert in sorted list, most probably at the end so let's use a mere insertion sort */
  276. for (prev = mpi_transfer_list_last(&pending_receives);
  277. prev != mpi_transfer_list_alpha(&pending_receives);
  278. prev = mpi_transfer_list_prev(prev))
  279. if (prev->date <= end_date)
  280. {
  281. /* Found its place */
  282. mpi_transfer_list_insert_after(&pending_receives, match, prev);
  283. break;
  284. }
  285. if (prev == mpi_transfer_list_alpha(&pending_receives))
  286. {
  287. /* No element earlier than this one, put it at the head */
  288. mpi_transfer_list_push_front(&pending_receives, match);
  289. }
  290. unsigned long id = mpi_com_id++;
  291. if (cur->jobid != -1)
  292. _starpu_fxt_dag_add_send(src, cur->jobid, mpi_tag, id);
  293. if (match->jobid != -1)
  294. _starpu_fxt_dag_add_receive(dst, match->jobid, mpi_tag, id);
  295. #ifdef STARPU_HAVE_POTI
  296. char paje_value[STARPU_POTI_STR_LEN], paje_key[STARPU_POTI_STR_LEN];
  297. snprintf(paje_value, sizeof(paje_value), "%lu", (long unsigned) size);
  298. snprintf(paje_key, sizeof(paje_key), "mpicom_%lu", id);
  299. snprintf(mpi_container, sizeof(mpi_container), "%d_mpict", src);
  300. char str_mpi_tag[STARPU_POTI_STR_LEN];
  301. snprintf(str_mpi_tag, sizeof(str_mpi_tag), "%ld", mpi_tag);
  302. char str_priority[STARPU_POTI_STR_LEN];
  303. snprintf(str_priority, sizeof(str_priority), "%d", cur->prio);
  304. char str_handle[STARPU_POTI_STR_LEN];
  305. snprintf(str_handle, sizeof(str_handle), "%lx", send_handle);
  306. poti_user_StartLink(_starpu_poti_MpiLinkStart, start_date, "MPIroot", "MPIL", mpi_container, paje_value, paje_key, 4, str_mpi_tag, get_mpi_type_str(cur->type), str_priority, str_handle);
  307. poti_SetVariable(start_date, mpi_container, "bwo_mpi", current_out_bandwidth[src]);
  308. snprintf(mpi_container, sizeof(mpi_container), "%d_mpict", dst);
  309. poti_EndLink(end_date, "MPIroot", "MPIL", mpi_container, paje_value, paje_key);
  310. poti_SetVariable(start_date, mpi_container, "bwo_mpi", current_in_bandwidth[dst]);
  311. #else
  312. fprintf(out_paje_file, "13 %.9f %d_mpict bwo_mpi %f\n", start_date, src, current_out_bandwidth[src]);
  313. fprintf(out_paje_file, "13 %.9f %d_mpict bwi_mpi %f\n", start_date, dst, current_in_bandwidth[dst]);
  314. fprintf(out_paje_file, "23 %.9f MPIL MPIroot %lu %d_mpict mpicom_%lu %ld %s %d %lx\n", start_date, (unsigned long)size, src, id, mpi_tag, get_mpi_type_str(cur->type), cur->prio, send_handle);
  315. fprintf(out_paje_file, "19 %.9f MPIL MPIroot %lu %d_mpict mpicom_%lu\n", end_date, (unsigned long)size, dst, id);
  316. #endif
  317. if (out_comms_file != NULL)
  318. {
  319. fprintf(out_comms_file, "Src: %d\n", src);
  320. fprintf(out_comms_file, "Dst: %d\n", dst);
  321. fprintf(out_comms_file, "Tag: %ld\n", mpi_tag);
  322. fprintf(out_comms_file, "SendTime: %.9f\n", start_date);
  323. fprintf(out_comms_file, "RecvTime: %.9f\n", end_date);
  324. fprintf(out_comms_file, "SendHandle: %lx\n", send_handle);
  325. fprintf(out_comms_file, "RecvHandle: %lx\n", recv_handle);
  326. if (cur->jobid != -1)
  327. fprintf(out_comms_file, "SendJobId: %d_%ld\n", src, cur->jobid);
  328. if (match->jobid != -1)
  329. fprintf(out_comms_file, "RecvJobId: %d_%ld\n", dst, match->jobid);
  330. fprintf(out_comms_file, "Size: %lu\n", (unsigned long)size);
  331. fprintf(out_comms_file, "Priority: %d\n", cur->prio);
  332. fprintf(out_comms_file, "Type: %s\n", get_mpi_type_str(cur->type));
  333. fprintf(out_comms_file, "\n");
  334. }
  335. }
  336. else
  337. {
  338. _STARPU_DISP("Warning, could not match MPI transfer from %d to %d (tag %lx) starting at %f\n", src, dst, mpi_tag, start_date);
  339. }
  340. slot[src]++;
  341. }
  342. if (nb_wrong_comm_timing == 1)
  343. _STARPU_MSG("Warning: a communication finished before it started !\n");
  344. else if (nb_wrong_comm_timing > 1)
  345. _STARPU_MSG("Warning: %u communications finished before they started !\n", nb_wrong_comm_timing);
  346. }
  347. void _starpu_fxt_display_mpi_transfers(struct starpu_fxt_options *options, int *ranks STARPU_ATTRIBUTE_UNUSED, FILE *out_paje_file, FILE* out_comms_file)
  348. {
  349. if (options->ninputfiles > STARPU_FXT_MAX_FILES)
  350. {
  351. _STARPU_DISP("Warning: %u files given, maximum %u supported, truncating to %u\n", options->ninputfiles, STARPU_FXT_MAX_FILES, STARPU_FXT_MAX_FILES);
  352. options->ninputfiles = STARPU_FXT_MAX_FILES;
  353. }
  354. /* display the MPI transfers if possible */
  355. if (out_paje_file)
  356. display_all_transfers_from_trace(out_paje_file, out_comms_file, options->ninputfiles);
  357. }
  358. #endif // STARPU_USE_FXT