123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365 |
- /* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2011-2014,2016,2017 Université de Bordeaux
- * Copyright (C) 2011-2013,2017 Inria
- * Copyright (C) 2010-2012,2014-2018 CNRS
- * Copyright (C) 2017 Federal University of Rio Grande do Sul (UFRGS)
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
- #include <starpu.h>
- #include <common/config.h>
- #ifdef STARPU_USE_FXT
- #include "starpu_fxt.h"
- #ifdef STARPU_HAVE_POTI
- #include <poti.h>
- #define STARPU_POTI_STR_LEN 200
- #endif
- #define MAX_MPI_NODES 64
- LIST_TYPE(mpi_transfer, /* one MPI send or receive read from a trace; can be chained in a struct mpi_transfer_list */
- unsigned matched; /* reset to 0 when recorded; set to 1 on a receive once it has been paired with a send */
- int src; /* source node of the transfer */
- int dst; /* destination node of the transfer */
- long mpi_tag; /* tag compared to pair a send with a receive */
- size_t size; /* size of the transfer; only filled in for sends */
- float date; /* date of the event: start date for a send, end date for a receive */
- long jobid; /* job attached to the transfer; -1 means no DAG entry is generated */
- double bandwidth; /* set on a receive when it gets matched: (0.001*size)/(end date - start date) */
- );
- /* Returns 0 if a barrier is found, -1 otherwise. In case of success, offset is
- * filled with the timestamp of the barrier */
- int _starpu_fxt_mpi_find_sync_point(char *filename_in, uint64_t *offset, int *key, int *rank)
- {
- STARPU_ASSERT(offset);
- /* Open the trace file */
- int fd_in;
- fd_in = open(filename_in, O_RDONLY);
- if (fd_in < 0)
- {
- perror("open failed :");
- exit(-1);
- }
- static fxt_t fut;
- fut = fxt_fdopen(fd_in);
- if (!fut)
- {
- perror("fxt_fdopen :");
- exit(-1);
- }
- fxt_blockev_t block;
- block = fxt_blockev_enter(fut);
- struct fxt_ev_64 ev;
- int func_ret = -1;
- unsigned found = 0;
- while(!found)
- {
- int ret = fxt_next_ev(block, FXT_EV_TYPE_64, (struct fxt_ev *)&ev);
- if (ret != FXT_EV_OK)
- {
- _STARPU_MSG("no more block ...\n");
- break;
- }
- if (ev.code == _STARPU_MPI_FUT_BARRIER)
- {
- /* We found the sync point */
- *offset = ev.time;
- *rank = ev.param[0];
- *key = ev.param[2];
- found = 1;
- func_ret = 0;
- }
- }
- /* Close the trace file */
- if (close(fd_in))
- {
- perror("close failed :");
- exit(-1);
- }
- return func_ret;
- }
- /*
- * Deal with the actual MPI transfers performed with the MPI lib
- */
- /* Per-node growable arrays of the MPI transfers found in the different
- * traces: sends are stored under their source node, receives under their
- * destination node. */
- static struct mpi_transfer *mpi_sends[MAX_MPI_NODES] = {NULL};
- static struct mpi_transfer *mpi_recvs[MAX_MPI_NODES] = {NULL};
- /* number of slots currently allocated in each array (starts at 1, then
- * doubles each time the array is full) */
- unsigned mpi_sends_list_size[MAX_MPI_NODES] = {0};
- unsigned mpi_recvs_list_size[MAX_MPI_NODES] = {0};
- /* number of slots actually filled in each array */
- unsigned mpi_sends_used[MAX_MPI_NODES] = {0};
- unsigned mpi_recvs_used[MAX_MPI_NODES] = {0};
- /* number of slots already matched at the beginning of the list, indexed by
- * [source node][destination node]. This permits resuming the search after
- * the already-matched prefix instead of going through the lists from the
- * beginning for each and every transfer, thus avoiding a quadratic
- * complexity. Only mpi_recvs_matched is referenced in the visible code. */
- unsigned mpi_recvs_matched[MAX_MPI_NODES][MAX_MPI_NODES] = { {0} };
- unsigned mpi_sends_matched[MAX_MPI_NODES][MAX_MPI_NODES] = { {0} };
- void _starpu_fxt_mpi_add_send_transfer(int src, int dst STARPU_ATTRIBUTE_UNUSED, long mpi_tag, size_t size, float date, long jobid)
- {
- STARPU_ASSERT(src >= 0);
- if (src >= MAX_MPI_NODES)
- return;
- unsigned slot = mpi_sends_used[src]++;
- if (mpi_sends_used[src] > mpi_sends_list_size[src])
- {
- if (mpi_sends_list_size[src] > 0)
- {
- mpi_sends_list_size[src] *= 2;
- }
- else
- {
- mpi_sends_list_size[src] = 1;
- }
- _STARPU_REALLOC(mpi_sends[src], mpi_sends_list_size[src]*sizeof(struct mpi_transfer));
- }
- mpi_sends[src][slot].matched = 0;
- mpi_sends[src][slot].src = src;
- mpi_sends[src][slot].dst = dst;
- mpi_sends[src][slot].mpi_tag = mpi_tag;
- mpi_sends[src][slot].size = size;
- mpi_sends[src][slot].date = date;
- mpi_sends[src][slot].jobid = jobid;
- }
- void _starpu_fxt_mpi_add_recv_transfer(int src STARPU_ATTRIBUTE_UNUSED, int dst, long mpi_tag, float date, long jobid)
- {
- if (dst >= MAX_MPI_NODES)
- return;
- unsigned slot = mpi_recvs_used[dst]++;
- if (mpi_recvs_used[dst] > mpi_recvs_list_size[dst])
- {
- if (mpi_recvs_list_size[dst] > 0)
- {
- mpi_recvs_list_size[dst] *= 2;
- }
- else
- {
- mpi_recvs_list_size[dst] = 1;
- }
- _STARPU_REALLOC(mpi_recvs[dst], mpi_recvs_list_size[dst]*sizeof(struct mpi_transfer));
- }
- mpi_recvs[dst][slot].matched = 0;
- mpi_recvs[dst][slot].src = src;
- mpi_recvs[dst][slot].dst = dst;
- mpi_recvs[dst][slot].mpi_tag = mpi_tag;
- mpi_recvs[dst][slot].date = date;
- mpi_recvs[dst][slot].jobid = jobid;
- }
- static
- struct mpi_transfer *try_to_match_send_transfer(int src STARPU_ATTRIBUTE_UNUSED, int dst, long mpi_tag)
- {
- unsigned slot;
- unsigned firstslot = mpi_recvs_matched[src][dst];
- unsigned all_previous_were_matched = 1;
- for (slot = firstslot; slot < mpi_recvs_used[dst]; slot++)
- {
- if (!mpi_recvs[dst][slot].matched)
- {
- if (mpi_recvs[dst][slot].mpi_tag == mpi_tag)
- {
- /* we found a match ! */
- mpi_recvs[dst][slot].matched = 1;
- return &mpi_recvs[dst][slot];
- }
- all_previous_were_matched = 0;
- }
- else
- {
- if (all_previous_were_matched)
- {
- /* All previous transfers are already matched,
- * we need not consider them anymore */
- mpi_recvs_matched[src][dst] = slot;
- }
- }
- }
- /* If we reached that point, we could not find a match */
- return NULL;
- }
- static unsigned long mpi_com_id = 0;
- static void display_all_transfers_from_trace(FILE *out_paje_file, unsigned n)
- {
- unsigned slot[MAX_MPI_NODES] = { 0 }, node;
- struct mpi_transfer_list pending_receives; /* Sorted list of matches which have not happened yet */
- double current_out_bandwidth[MAX_MPI_NODES] = { 0. };
- double current_in_bandwidth[MAX_MPI_NODES] = { 0. };
- #ifdef STARPU_HAVE_POTI
- char mpi_container[STARPU_POTI_STR_LEN];
- #endif
- //bwi_mpi and bwo_mpi are set to zero when MPI thread containers are created
- mpi_transfer_list_init(&pending_receives);
- while (1)
- {
- float start_date;
- struct mpi_transfer *cur, *match;
- int src;
- /* Find out which event comes first: a pending receive, or a new send */
- if (mpi_transfer_list_empty(&pending_receives))
- start_date = INFINITY;
- else
- start_date = mpi_transfer_list_front(&pending_receives)->date;
- src = MAX_MPI_NODES;
- for (node = 0; node < n; node++)
- {
- if (slot[node] < mpi_sends_used[node] && mpi_sends[node][slot[node]].date < start_date)
- {
- /* next send for node is earlier than others */
- src = node;
- start_date = mpi_sends[src][slot[src]].date;
- }
- }
- if (start_date == INFINITY)
- /* No event any more, we're finished! */
- break;
- if (src == MAX_MPI_NODES)
- {
- /* Pending match is earlier than all new sends, finish its communication */
- match = mpi_transfer_list_pop_front(&pending_receives);
- current_out_bandwidth[match->src] -= match->bandwidth;
- current_in_bandwidth[match->dst] -= match->bandwidth;
- #ifdef STARPU_HAVE_POTI
- snprintf(mpi_container, sizeof(mpi_container), "%d_mpict", match->src);
- poti_SetVariable(match->date, mpi_container, "bwo_mpi", current_out_bandwidth[match->src]);
- snprintf(mpi_container, sizeof(mpi_container), "%d_mpict", match->dst);
- poti_SetVariable(match->date, mpi_container, "bwi_mpi", current_in_bandwidth[match->dst]);
- #else
- fprintf(out_paje_file, "13 %.9f %d_mpict bwo_mpi %f\n", match->date, match->src, current_out_bandwidth[match->src]);
- fprintf(out_paje_file, "13 %.9f %d_mpict bwi_mpi %f\n", match->date, match->dst, current_in_bandwidth[match->dst]);
- #endif
- continue;
- }
- cur = &mpi_sends[src][slot[src]];
- int dst = cur->dst;
- long mpi_tag = cur->mpi_tag;
- size_t size = cur->size;
- if (dst < MAX_MPI_NODES)
- match = try_to_match_send_transfer(src, dst, mpi_tag);
- else
- match = NULL;
- if (match)
- {
- float end_date = match->date;
- struct mpi_transfer *prev;
- match->bandwidth = (0.001*size)/(end_date - start_date);
- current_out_bandwidth[src] += match->bandwidth;
- current_in_bandwidth[dst] += match->bandwidth;
- /* Insert in sorted list, most probably at the end so let's use a mere insertion sort */
- for (prev = mpi_transfer_list_last(&pending_receives);
- prev != mpi_transfer_list_alpha(&pending_receives);
- prev = mpi_transfer_list_prev(prev))
- if (prev->date <= end_date)
- {
- /* Found its place */
- mpi_transfer_list_insert_after(&pending_receives, match, prev);
- break;
- }
- if (prev == mpi_transfer_list_alpha(&pending_receives))
- {
- /* No element earlier than this one, put it at the head */
- mpi_transfer_list_push_front(&pending_receives, match);
- }
- unsigned long id = mpi_com_id++;
- if (cur->jobid != -1)
- _starpu_fxt_dag_add_send(src, cur->jobid, mpi_tag, id);
- if (match->jobid != -1)
- _starpu_fxt_dag_add_receive(dst, match->jobid, mpi_tag, id);
- #ifdef STARPU_HAVE_POTI
- char paje_value[STARPU_POTI_STR_LEN], paje_key[STARPU_POTI_STR_LEN];
- snprintf(paje_value, sizeof(paje_value), "%lu", (long unsigned) size);
- snprintf(paje_key, sizeof(paje_key), "mpicom_%lu", id);
- snprintf(mpi_container, sizeof(mpi_container), "%d_mpict", src);
- char str_mpi_tag[STARPU_POTI_STR_LEN];
- snprintf(str_mpi_tag, sizeof(str_mpi_tag), "%ld", mpi_tag);
- poti_user_StartLink(_starpu_poti_MpiLinkStart, start_date, "MPIroot", "MPIL", mpi_container, paje_value, paje_key, 1, str_mpi_tag);
- poti_SetVariable(start_date, mpi_container, "bwo_mpi", current_out_bandwidth[src]);
- snprintf(mpi_container, sizeof(mpi_container), "%d_mpict", dst);
- poti_EndLink(end_date, "MPIroot", "MPIL", mpi_container, paje_value, paje_key);
- poti_SetVariable(start_date, mpi_container, "bwo_mpi", current_in_bandwidth[dst]);
- #else
- fprintf(out_paje_file, "13 %.9f %d_mpict bwo_mpi %f\n", start_date, src, current_out_bandwidth[src]);
- fprintf(out_paje_file, "13 %.9f %d_mpict bwi_mpi %f\n", start_date, dst, current_in_bandwidth[dst]);
- fprintf(out_paje_file, "23 %.9f MPIL MPIroot %lu %d_mpict mpicom_%lu %ld\n", start_date, (unsigned long)size, src, id, mpi_tag);
- fprintf(out_paje_file, "19 %.9f MPIL MPIroot %lu %d_mpict mpicom_%lu\n", end_date, (unsigned long)size, dst, id);
- #endif
- }
- else
- {
- _STARPU_DISP("Warning, could not match MPI transfer from %d to %d (tag %x) starting at %f\n", src, dst, mpi_tag, start_date);
- }
- slot[src]++;
- }
- }
- void _starpu_fxt_display_mpi_transfers(struct starpu_fxt_options *options, int *ranks STARPU_ATTRIBUTE_UNUSED, FILE *out_paje_file)
- {
- if (options->ninputfiles > MAX_MPI_NODES)
- {
- _STARPU_DISP("Warning: %u files given, maximum %u supported, truncating to %u\n", options->ninputfiles, MAX_MPI_NODES, MAX_MPI_NODES);
- options->ninputfiles = MAX_MPI_NODES;
- }
- /* display the MPI transfers if possible */
- if (out_paje_file)
- display_all_transfers_from_trace(out_paje_file, options->ninputfiles);
- }
- #endif // STARPU_USE_FXT
|