starpu_fxt_mpi.c 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2012-2013, 2016-2017 Université Bordeaux
  4. * Copyright (C) 2010, 2011, 2014, 2016, 2017 CNRS
  5. *
  6. * StarPU is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU Lesser General Public License as published by
  8. * the Free Software Foundation; either version 2.1 of the License, or (at
  9. * your option) any later version.
  10. *
  11. * StarPU is distributed in the hope that it will be useful, but
  12. * WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14. *
  15. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  16. */
  17. #include <starpu.h>
  18. #include <common/config.h>
  19. #ifdef STARPU_USE_FXT
  20. #include "starpu_fxt.h"
  21. #ifdef STARPU_HAVE_POTI
  22. #include <poti.h>
  23. #define STARPU_POTI_STR_LEN 200
  24. #endif
  25. #define MAX_MPI_NODES 64
  /* One MPI transfer (send or receive) recorded from a trace.
   * Besides struct mpi_transfer, LIST_TYPE provides the mpi_transfer_list
   * container and the mpi_transfer_list_*() helpers used further down in this
   * file (presumably StarPU's generic list macro -- confirm in its list header). */
  26. LIST_TYPE(mpi_transfer,
  27. unsigned matched; /* 0 when recorded; set to 1 on a reception once it is paired with a send */
  28. int other_rank; /* src for a recv, dest for a send */
  29. int mpi_tag; /* tag compared to pair a send with a reception */
  30. size_t size; /* amount of data; only filled in for sends in this file */
  31. float date; /* timestamp given by the caller: start for a send, end for a reception */
  32. double bandwidth; /* computed on the reception when it is matched, used for the "bwo" variable */
  33. );
  34. /* Returns 0 if a barrier is found, -1 otherwise. In case of success, offset is
  35. * filled with the timestamp of the barrier */
  36. int _starpu_fxt_mpi_find_sync_point(char *filename_in, uint64_t *offset, int *key, int *rank)
  37. {
  38. STARPU_ASSERT(offset);
  39. /* Open the trace file */
  40. int fd_in;
  41. fd_in = open(filename_in, O_RDONLY);
  42. if (fd_in < 0)
  43. {
  44. perror("open failed :");
  45. exit(-1);
  46. }
  47. static fxt_t fut;
  48. fut = fxt_fdopen(fd_in);
  49. if (!fut)
  50. {
  51. perror("fxt_fdopen :");
  52. exit(-1);
  53. }
  54. fxt_blockev_t block;
  55. block = fxt_blockev_enter(fut);
  56. struct fxt_ev_64 ev;
  57. int func_ret = -1;
  58. unsigned found = 0;
  59. while(!found)
  60. {
  61. int ret = fxt_next_ev(block, FXT_EV_TYPE_64, (struct fxt_ev *)&ev);
  62. if (ret != FXT_EV_OK)
  63. {
  64. _STARPU_MSG("no more block ...\n");
  65. break;
  66. }
  67. if (ev.code == _STARPU_MPI_FUT_BARRIER)
  68. {
  69. /* We found the sync point */
  70. *offset = ev.time;
  71. *rank = ev.param[0];
  72. *key = ev.param[2];
  73. found = 1;
  74. func_ret = 0;
  75. }
  76. }
  77. /* Close the trace file */
  78. if (close(fd_in))
  79. {
  80. perror("close failed :");
  81. exit(-1);
  82. }
  83. return func_ret;
  84. }
  85. /*
  86. * Deal with the actual MPI transfers performed with the MPI lib
  87. */
  88. /* the list of MPI transfers found in the different traces */
  89. static struct mpi_transfer *mpi_sends[MAX_MPI_NODES] = {NULL}; /* indexed by the sending rank; grown on demand */
  90. static struct mpi_transfer *mpi_recvs[MAX_MPI_NODES] = {NULL}; /* indexed by the receiving rank; grown on demand */
  91. /* number of available slots in the lists */
  92. unsigned mpi_sends_list_size[MAX_MPI_NODES] = {0}; /* doubled (or set to 1) whenever a new send does not fit */
  93. unsigned mpi_recvs_list_size[MAX_MPI_NODES] = {0}; /* doubled (or set to 1) whenever a new reception does not fit */
  94. /* number of slots actually used in the list */
  95. unsigned mpi_sends_used[MAX_MPI_NODES] = {0}; /* also the index of the next send to record */
  96. unsigned mpi_recvs_used[MAX_MPI_NODES] = {0}; /* also the index of the next reception to record */
  97. /* number of slots already matched at the beginning of the list. This permits
  98. * going through the lists from the beginning to match each and every
  99. * transfer, thus avoiding a quadratic complexity. */
  100. unsigned mpi_recvs_matched[MAX_MPI_NODES][MAX_MPI_NODES] = {0}; /* indexed [src][dst]: where the search in mpi_recvs[dst] starts */
  101. unsigned mpi_sends_matched[MAX_MPI_NODES][MAX_MPI_NODES] = {0}; /* NOTE(review): not referenced in the visible part of this file */
  102. void _starpu_fxt_mpi_add_send_transfer(int src, int dst STARPU_ATTRIBUTE_UNUSED, int mpi_tag, size_t size, float date)
  103. {
  104. STARPU_ASSERT(src >= 0);
  105. if (src >= MAX_MPI_NODES)
  106. return;
  107. unsigned slot = mpi_sends_used[src]++;
  108. if (mpi_sends_used[src] > mpi_sends_list_size[src])
  109. {
  110. if (mpi_sends_list_size[src] > 0)
  111. {
  112. mpi_sends_list_size[src] *= 2;
  113. }
  114. else
  115. {
  116. mpi_sends_list_size[src] = 1;
  117. }
  118. _STARPU_REALLOC(mpi_sends[src], mpi_sends_list_size[src]*sizeof(struct mpi_transfer));
  119. }
  120. mpi_sends[src][slot].matched = 0;
  121. mpi_sends[src][slot].other_rank = dst;
  122. mpi_sends[src][slot].mpi_tag = mpi_tag;
  123. mpi_sends[src][slot].size = size;
  124. mpi_sends[src][slot].date = date;
  125. }
  126. void _starpu_fxt_mpi_add_recv_transfer(int src STARPU_ATTRIBUTE_UNUSED, int dst, int mpi_tag, float date)
  127. {
  128. if (dst >= MAX_MPI_NODES)
  129. return;
  130. unsigned slot = mpi_recvs_used[dst]++;
  131. if (mpi_recvs_used[dst] > mpi_recvs_list_size[dst])
  132. {
  133. if (mpi_recvs_list_size[dst] > 0)
  134. {
  135. mpi_recvs_list_size[dst] *= 2;
  136. }
  137. else
  138. {
  139. mpi_recvs_list_size[dst] = 1;
  140. }
  141. _STARPU_REALLOC(mpi_recvs[dst], mpi_recvs_list_size[dst]*sizeof(struct mpi_transfer));
  142. }
  143. mpi_recvs[dst][slot].matched = 0;
  144. mpi_recvs[dst][slot].other_rank = dst;
  145. mpi_recvs[dst][slot].mpi_tag = mpi_tag;
  146. mpi_recvs[dst][slot].date = date;
  147. }
  148. static
  149. struct mpi_transfer *try_to_match_send_transfer(int src STARPU_ATTRIBUTE_UNUSED, int dst, int mpi_tag)
  150. {
  151. unsigned slot;
  152. unsigned firstslot = mpi_recvs_matched[src][dst];
  153. unsigned all_previous_were_matched = 1;
  154. for (slot = firstslot; slot < mpi_recvs_used[dst]; slot++)
  155. {
  156. if (!mpi_recvs[dst][slot].matched)
  157. {
  158. if (mpi_recvs[dst][slot].mpi_tag == mpi_tag)
  159. {
  160. /* we found a match ! */
  161. mpi_recvs[dst][slot].matched = 1;
  162. return &mpi_recvs[dst][slot];
  163. }
  164. all_previous_were_matched = 0;
  165. }
  166. else
  167. {
  168. if (all_previous_were_matched)
  169. {
  170. /* All previous transfers are already matched,
  171. * we need not consider them anymore */
  172. mpi_recvs_matched[src][dst] = slot;
  173. }
  174. }
  175. }
  176. /* If we reached that point, we could not find a match */
  177. return NULL;
  178. }
  179. static unsigned long mpi_com_id = 0;
  180. static void display_all_transfers_from_trace(FILE *out_paje_file, int src)
  181. {
  182. unsigned slot;
  183. struct mpi_transfer_list pending_matches; /* Sorted list of matches which have not happened yet */
  184. double current_bandwidth = 0.;
  185. #ifdef STARPU_HAVE_POTI
  186. char mpi_local_container[STARPU_POTI_STR_LEN];
  187. snprintf(mpi_local_container, sizeof(mpi_local_container), "%d_mpict", /* XXX */src);
  188. poti_SetVariable(0., mpi_local_container, "bwo", current_bandwidth);
  189. #else
  190. fprintf(out_paje_file, "13 %.9f %d_mpict bwo %f\n", 0., src, current_bandwidth);
  191. #endif
  192. mpi_transfer_list_init(&pending_matches);
  193. slot = 0;
  194. /* Parse sends to display communications and compute outbound bandwidth */
  195. while (slot < mpi_sends_used[src] || !mpi_transfer_list_empty(&pending_matches))
  196. {
  197. float start_date = INFINITY;
  198. struct mpi_transfer *match;
  199. if (slot < mpi_sends_used[src])
  200. start_date = mpi_sends[src][slot].date;
  201. if (!mpi_transfer_list_empty(&pending_matches) &&
  202. mpi_transfer_list_front(&pending_matches)->date < start_date)
  203. {
  204. match = mpi_transfer_list_pop_front(&pending_matches);
  205. current_bandwidth -= match->bandwidth;
  206. if (out_paje_file)
  207. {
  208. #ifdef STARPU_HAVE_POTI
  209. poti_SetVariable(match->date, mpi_local_container, "bwo", current_bandwidth);
  210. #else
  211. fprintf(out_paje_file, "13 %.9f %d_mpict bwo %f\n", match->date, src, current_bandwidth);
  212. #endif
  213. }
  214. continue;
  215. }
  216. int dst = mpi_sends[src][slot].other_rank;
  217. int mpi_tag = mpi_sends[src][slot].mpi_tag;
  218. size_t size = mpi_sends[src][slot].size;
  219. if (dst < MAX_MPI_NODES)
  220. match = try_to_match_send_transfer(src, dst, mpi_tag);
  221. else
  222. match = NULL;
  223. if (match)
  224. {
  225. float end_date = match->date;
  226. struct mpi_transfer *prev;
  227. match->bandwidth = (0.001*size)/(end_date - start_date);
  228. current_bandwidth += match->bandwidth;
  229. /* Insert in sorted list, most probably at the end so a mere insertion sort */
  230. for (prev = mpi_transfer_list_last(&pending_matches);
  231. prev != mpi_transfer_list_alpha(&pending_matches);
  232. prev = mpi_transfer_list_prev(prev))
  233. if (prev->date <= end_date)
  234. {
  235. /* Found its place */
  236. mpi_transfer_list_insert_after(&pending_matches, match, prev);
  237. break;
  238. }
  239. if (prev == mpi_transfer_list_alpha(&pending_matches))
  240. {
  241. /* No element earlier than this one, put it at the head */
  242. mpi_transfer_list_push_front(&pending_matches, match);
  243. }
  244. unsigned long id = mpi_com_id++;
  245. /* TODO replace 0 by a MPI program ? */
  246. if (out_paje_file)
  247. {
  248. #ifdef STARPU_HAVE_POTI
  249. char paje_value[STARPU_POTI_STR_LEN], paje_key[STARPU_POTI_STR_LEN];
  250. snprintf(paje_value, STARPU_POTI_STR_LEN, "%lu", (long unsigned) size);
  251. snprintf(paje_key, STARPU_POTI_STR_LEN, "mpicom_%lu", id);
  252. poti_StartLink(start_date, "MPICt", "MPIL", mpi_local_container, paje_value, paje_key);
  253. char mpi_container[STARPU_POTI_STR_LEN];
  254. snprintf(mpi_container, sizeof(mpi_container), "%d_mpict", /* XXX */dst);
  255. poti_EndLink(end_date, "MPICt", "MPIL", mpi_container, paje_value, paje_key);
  256. poti_SetVariable(start_date, mpi_local_container, "bwo", current_bandwidth);
  257. #else
  258. fprintf(out_paje_file, "18 %.9f MPIL MPIroot %lu %d_mpict mpicom_%lu\n", start_date, (unsigned long)size, /* XXX */src, id);
  259. fprintf(out_paje_file, "19 %.9f MPIL MPIroot %lu %d_mpict mpicom_%lu\n", end_date, (unsigned long)size, /* XXX */dst, id);
  260. fprintf(out_paje_file, "13 %.9f %d_mpict bwo %f\n", start_date, src, current_bandwidth);
  261. #endif
  262. }
  263. }
  264. else
  265. {
  266. _STARPU_DISP("Warning, could not match MPI transfer from %d to %d (tag %x) starting at %f\n", src, dst, mpi_tag, start_date);
  267. }
  268. slot++;
  269. }
  270. }
  271. void _starpu_fxt_display_mpi_transfers(struct starpu_fxt_options *options, int *ranks, FILE *out_paje_file)
  272. {
  273. unsigned inputfile;
  274. if (options->ninputfiles > MAX_MPI_NODES)
  275. {
  276. _STARPU_DISP("Warning: %u files given, maximum %u supported, truncating to %u\n", options->ninputfiles, MAX_MPI_NODES, MAX_MPI_NODES);
  277. options->ninputfiles = MAX_MPI_NODES;
  278. }
  279. /* display the MPI transfers if possible */
  280. for (inputfile = 0; inputfile < options->ninputfiles; inputfile++)
  281. {
  282. int filerank = ranks[inputfile];
  283. display_all_transfers_from_trace(out_paje_file, filerank);
  284. }
  285. }
  286. #endif // STARPU_USE_FXT