starpu_fxt_mpi.c 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2012-2013, 2016-2017 Université Bordeaux
  4. * Copyright (C) 2010, 2011, 2014, 2016, 2017 CNRS
  5. *
  6. * StarPU is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU Lesser General Public License as published by
  8. * the Free Software Foundation; either version 2.1 of the License, or (at
  9. * your option) any later version.
  10. *
  11. * StarPU is distributed in the hope that it will be useful, but
  12. * WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14. *
  15. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  16. */
  17. #include <starpu.h>
  18. #include <common/config.h>
  19. #ifdef STARPU_USE_FXT
  20. #include "starpu_fxt.h"
  21. #ifdef STARPU_HAVE_POTI
  22. #include <poti.h>
  23. #define STARPU_POTI_STR_LEN 200
  24. #endif
  25. #define MAX_MPI_NODES 64
  26. LIST_TYPE(mpi_transfer,
  27. unsigned matched;
  28. int src;
  29. int dst;
  30. int mpi_tag;
  31. size_t size;
  32. float date;
  33. double bandwidth;
  34. );
  35. /* Returns 0 if a barrier is found, -1 otherwise. In case of success, offset is
  36. * filled with the timestamp of the barrier */
  37. int _starpu_fxt_mpi_find_sync_point(char *filename_in, uint64_t *offset, int *key, int *rank)
  38. {
  39. STARPU_ASSERT(offset);
  40. /* Open the trace file */
  41. int fd_in;
  42. fd_in = open(filename_in, O_RDONLY);
  43. if (fd_in < 0)
  44. {
  45. perror("open failed :");
  46. exit(-1);
  47. }
  48. static fxt_t fut;
  49. fut = fxt_fdopen(fd_in);
  50. if (!fut)
  51. {
  52. perror("fxt_fdopen :");
  53. exit(-1);
  54. }
  55. fxt_blockev_t block;
  56. block = fxt_blockev_enter(fut);
  57. struct fxt_ev_64 ev;
  58. int func_ret = -1;
  59. unsigned found = 0;
  60. while(!found)
  61. {
  62. int ret = fxt_next_ev(block, FXT_EV_TYPE_64, (struct fxt_ev *)&ev);
  63. if (ret != FXT_EV_OK)
  64. {
  65. _STARPU_MSG("no more block ...\n");
  66. break;
  67. }
  68. if (ev.code == _STARPU_MPI_FUT_BARRIER)
  69. {
  70. /* We found the sync point */
  71. *offset = ev.time;
  72. *rank = ev.param[0];
  73. *key = ev.param[2];
  74. found = 1;
  75. func_ret = 0;
  76. }
  77. }
  78. /* Close the trace file */
  79. if (close(fd_in))
  80. {
  81. perror("close failed :");
  82. exit(-1);
  83. }
  84. return func_ret;
  85. }
  86. /*
  87. * Deal with the actual MPI transfers performed with the MPI lib
  88. */
  89. /* the list of MPI transfers found in the different traces */
  90. static struct mpi_transfer *mpi_sends[MAX_MPI_NODES] = {NULL};
  91. static struct mpi_transfer *mpi_recvs[MAX_MPI_NODES] = {NULL};
  92. /* number of available slots in the lists */
  93. unsigned mpi_sends_list_size[MAX_MPI_NODES] = {0};
  94. unsigned mpi_recvs_list_size[MAX_MPI_NODES] = {0};
  95. /* number of slots actually used in the list */
  96. unsigned mpi_sends_used[MAX_MPI_NODES] = {0};
  97. unsigned mpi_recvs_used[MAX_MPI_NODES] = {0};
  98. /* number of slots already matched at the beginning of the list. This permits
  99. * going through the lists from the beginning to match each and every
  100. * transfer, thus avoiding a quadratic complexity. */
  101. unsigned mpi_recvs_matched[MAX_MPI_NODES][MAX_MPI_NODES] = { {0} };
  102. unsigned mpi_sends_matched[MAX_MPI_NODES][MAX_MPI_NODES] = { {0} };
  103. void _starpu_fxt_mpi_add_send_transfer(int src, int dst STARPU_ATTRIBUTE_UNUSED, int mpi_tag, size_t size, float date)
  104. {
  105. STARPU_ASSERT(src >= 0);
  106. if (src >= MAX_MPI_NODES)
  107. return;
  108. unsigned slot = mpi_sends_used[src]++;
  109. if (mpi_sends_used[src] > mpi_sends_list_size[src])
  110. {
  111. if (mpi_sends_list_size[src] > 0)
  112. {
  113. mpi_sends_list_size[src] *= 2;
  114. }
  115. else
  116. {
  117. mpi_sends_list_size[src] = 1;
  118. }
  119. _STARPU_REALLOC(mpi_sends[src], mpi_sends_list_size[src]*sizeof(struct mpi_transfer));
  120. }
  121. mpi_sends[src][slot].matched = 0;
  122. mpi_sends[src][slot].src = src;
  123. mpi_sends[src][slot].dst = dst;
  124. mpi_sends[src][slot].mpi_tag = mpi_tag;
  125. mpi_sends[src][slot].size = size;
  126. mpi_sends[src][slot].date = date;
  127. }
  128. void _starpu_fxt_mpi_add_recv_transfer(int src STARPU_ATTRIBUTE_UNUSED, int dst, int mpi_tag, float date)
  129. {
  130. if (dst >= MAX_MPI_NODES)
  131. return;
  132. unsigned slot = mpi_recvs_used[dst]++;
  133. if (mpi_recvs_used[dst] > mpi_recvs_list_size[dst])
  134. {
  135. if (mpi_recvs_list_size[dst] > 0)
  136. {
  137. mpi_recvs_list_size[dst] *= 2;
  138. }
  139. else
  140. {
  141. mpi_recvs_list_size[dst] = 1;
  142. }
  143. _STARPU_REALLOC(mpi_recvs[dst], mpi_recvs_list_size[dst]*sizeof(struct mpi_transfer));
  144. }
  145. mpi_recvs[dst][slot].matched = 0;
  146. mpi_recvs[dst][slot].src = src;
  147. mpi_recvs[dst][slot].dst = dst;
  148. mpi_recvs[dst][slot].mpi_tag = mpi_tag;
  149. mpi_recvs[dst][slot].date = date;
  150. }
  151. static
  152. struct mpi_transfer *try_to_match_send_transfer(int src STARPU_ATTRIBUTE_UNUSED, int dst, int mpi_tag)
  153. {
  154. unsigned slot;
  155. unsigned firstslot = mpi_recvs_matched[src][dst];
  156. unsigned all_previous_were_matched = 1;
  157. for (slot = firstslot; slot < mpi_recvs_used[dst]; slot++)
  158. {
  159. if (!mpi_recvs[dst][slot].matched)
  160. {
  161. if (mpi_recvs[dst][slot].mpi_tag == mpi_tag)
  162. {
  163. /* we found a match ! */
  164. mpi_recvs[dst][slot].matched = 1;
  165. return &mpi_recvs[dst][slot];
  166. }
  167. all_previous_were_matched = 0;
  168. }
  169. else
  170. {
  171. if (all_previous_were_matched)
  172. {
  173. /* All previous transfers are already matched,
  174. * we need not consider them anymore */
  175. mpi_recvs_matched[src][dst] = slot;
  176. }
  177. }
  178. }
  179. /* If we reached that point, we could not find a match */
  180. return NULL;
  181. }
  182. static unsigned long mpi_com_id = 0;
  183. static void display_all_transfers_from_trace(FILE *out_paje_file, unsigned n)
  184. {
  185. unsigned slot[MAX_MPI_NODES] = { 0 }, node, src;
  186. struct mpi_transfer_list pending_receives; /* Sorted list of matches which have not happened yet */
  187. double current_out_bandwidth[MAX_MPI_NODES] = { 0. };
  188. double current_in_bandwidth[MAX_MPI_NODES] = { 0. };
  189. #ifdef STARPU_HAVE_POTI
  190. char mpi_container[STARPU_POTI_STR_LEN];
  191. #endif
  192. for (node = 0; node < n ; node++)
  193. {
  194. #ifdef STARPU_HAVE_POTI
  195. snprintf(mpi_container, sizeof(mpi_container), "%u_mpict", node);
  196. poti_SetVariable(0., mpi_container, "bwi", 0.);
  197. poti_SetVariable(0., mpi_container, "bwo", 0.);
  198. #else
  199. fprintf(out_paje_file, "13 %.9f %u_mpict bwi %f\n", 0., node, 0.);
  200. fprintf(out_paje_file, "13 %.9f %u_mpict bwo %f\n", 0., node, 0.);
  201. #endif
  202. }
  203. mpi_transfer_list_init(&pending_receives);
  204. while (1)
  205. {
  206. float start_date;
  207. struct mpi_transfer *cur, *match;
  208. /* Find out which event comes first: a pending receive, or a new send */
  209. if (mpi_transfer_list_empty(&pending_receives))
  210. start_date = INFINITY;
  211. else
  212. start_date = mpi_transfer_list_front(&pending_receives)->date;
  213. src = MAX_MPI_NODES;
  214. for (node = 0; node < n; node++) {
  215. if (slot[node] < mpi_sends_used[node] && mpi_sends[node][slot[node]].date < start_date)
  216. {
  217. /* next send for node is earlier than others */
  218. src = node;
  219. start_date = mpi_sends[src][slot[src]].date;
  220. }
  221. }
  222. if (start_date == INFINITY)
  223. /* No event any more, we're finished! */
  224. break;
  225. if (src == MAX_MPI_NODES)
  226. {
  227. /* Pending match is earlier than all new sends, finish its communication */
  228. match = mpi_transfer_list_pop_front(&pending_receives);
  229. current_out_bandwidth[match->src] -= match->bandwidth;
  230. current_in_bandwidth[match->dst] -= match->bandwidth;
  231. #ifdef STARPU_HAVE_POTI
  232. snprintf(mpi_container, sizeof(mpi_container), "%d_mpict", match->src);
  233. poti_SetVariable(match->date, mpi_container, "bwo", current_out_bandwidth[match->src]);
  234. snprintf(mpi_container, sizeof(mpi_container), "%d_mpict", match->dst);
  235. poti_SetVariable(match->date, mpi_container, "bwi", current_in_bandwidth[match->dst]);
  236. #else
  237. fprintf(out_paje_file, "13 %.9f %d_mpict bwo %f\n", match->date, match->src, current_out_bandwidth[match->src]);
  238. fprintf(out_paje_file, "13 %.9f %d_mpict bwi %f\n", match->date, match->dst, current_in_bandwidth[match->dst]);
  239. #endif
  240. continue;
  241. }
  242. cur = &mpi_sends[src][slot[src]];
  243. int dst = cur->dst;
  244. int mpi_tag = cur->mpi_tag;
  245. size_t size = cur->size;
  246. if (dst < MAX_MPI_NODES)
  247. match = try_to_match_send_transfer(src, dst, mpi_tag);
  248. else
  249. match = NULL;
  250. if (match)
  251. {
  252. float end_date = match->date;
  253. struct mpi_transfer *prev;
  254. match->bandwidth = (0.001*size)/(end_date - start_date);
  255. current_out_bandwidth[src] += match->bandwidth;
  256. current_in_bandwidth[dst] += match->bandwidth;
  257. /* Insert in sorted list, most probably at the end so let's use a mere insertion sort */
  258. for (prev = mpi_transfer_list_last(&pending_receives);
  259. prev != mpi_transfer_list_alpha(&pending_receives);
  260. prev = mpi_transfer_list_prev(prev))
  261. if (prev->date <= end_date)
  262. {
  263. /* Found its place */
  264. mpi_transfer_list_insert_after(&pending_receives, match, prev);
  265. break;
  266. }
  267. if (prev == mpi_transfer_list_alpha(&pending_receives))
  268. {
  269. /* No element earlier than this one, put it at the head */
  270. mpi_transfer_list_push_front(&pending_receives, match);
  271. }
  272. unsigned long id = mpi_com_id++;
  273. #ifdef STARPU_HAVE_POTI
  274. char paje_value[STARPU_POTI_STR_LEN], paje_key[STARPU_POTI_STR_LEN];
  275. snprintf(paje_value, STARPU_POTI_STR_LEN, "%lu", (long unsigned) size);
  276. snprintf(paje_key, STARPU_POTI_STR_LEN, "mpicom_%lu", id);
  277. snprintf(mpi_container, sizeof(mpi_container), "%u_mpict", src);
  278. poti_StartLink(start_date, "MPICt", "MPIL", mpi_container, paje_value, paje_key);
  279. poti_SetVariable(start_date, mpi_container, "bwo", current_out_bandwidth[src]);
  280. snprintf(mpi_container, sizeof(mpi_container), "%d_mpict", dst);
  281. poti_EndLink(end_date, "MPICt", "MPIL", mpi_container, paje_value, paje_key);
  282. poti_SetVariable(start_date, mpi_container, "bwo", current_in_bandwidth[dst]);
  283. #else
  284. fprintf(out_paje_file, "18 %.9f MPIL MPIroot %lu %u_mpict mpicom_%lu\n", start_date, (unsigned long)size, src, id);
  285. fprintf(out_paje_file, "19 %.9f MPIL MPIroot %lu %u_mpict mpicom_%lu\n", end_date, (unsigned long)size, dst, id);
  286. fprintf(out_paje_file, "13 %.9f %u_mpict bwo %f\n", start_date, src, current_out_bandwidth[src]);
  287. fprintf(out_paje_file, "13 %.9f %u_mpict bwi %f\n", start_date, dst, current_in_bandwidth[dst]);
  288. #endif
  289. }
  290. else
  291. {
  292. _STARPU_DISP("Warning, could not match MPI transfer from %d to %d (tag %x) starting at %f\n", src, dst, mpi_tag, start_date);
  293. }
  294. slot[src]++;
  295. }
  296. }
  297. void _starpu_fxt_display_mpi_transfers(struct starpu_fxt_options *options, int *ranks STARPU_ATTRIBUTE_UNUSED, FILE *out_paje_file)
  298. {
  299. if (options->ninputfiles > MAX_MPI_NODES)
  300. {
  301. _STARPU_DISP("Warning: %u files given, maximum %u supported, truncating to %u\n", options->ninputfiles, MAX_MPI_NODES, MAX_MPI_NODES);
  302. options->ninputfiles = MAX_MPI_NODES;
  303. }
  304. /* display the MPI transfers if possible */
  305. if (out_paje_file)
  306. display_all_transfers_from_trace(out_paje_file, options->ninputfiles);
  307. }
  308. #endif // STARPU_USE_FXT