Browse Source

new tool starpu_mpi_comm_matrix.py to plot heatmaps for MPI communications

Nathalie Furmento 6 years ago
parent
commit
6fee0d788e

+ 2 - 1
ChangeLog

@@ -123,7 +123,8 @@ Small features:
   * Add starpu_task_declare_deps()
   * New function starpu_data_unpartition_submit_sequential_consistency_cb()
     to specify a callback for the task submitting the unpartitioning
-
+  * New tool starpu_mpi_comm_matrix.py to draw heatmaps of MPI
+    communications
 
 Changes:
   * Vastly improve simgrid simulation time.

+ 2 - 0
configure.ac

@@ -3594,6 +3594,7 @@ AC_CONFIG_COMMANDS([executable-scripts], [
   chmod +x gcc-plugin/tests/run-test
   chmod +x tools/starpu_codelet_profile
   chmod +x tools/starpu_codelet_histo_profile
+  chmod +x tools/starpu_mpi_comm_matrix.py
   chmod +x tools/starpu_workers_activity
   chmod +x tools/starpu_paje_draw_histogram
   chmod +x tools/starpu_paje_state_stats
@@ -3659,6 +3660,7 @@ AC_OUTPUT([
 	tools/Makefile
 	tools/starpu_codelet_profile
 	tools/starpu_codelet_histo_profile
+	tools/starpu_mpi_comm_matrix.py
 	tools/starpu_workers_activity
 	tools/starpu_paje_draw_histogram
 	tools/starpu_paje_state_stats

+ 7 - 4
doc/doxygen/chapters/370_online_performance_tools.doxy

@@ -176,12 +176,15 @@ obtained by setting the environment variable \ref STARPU_COMM_STATS to <c>1</c>;
 a summary will then be displayed at program termination:
 
 \verbatim
-[starpu_comm_stats][0] TOTAL:	4.000000 GB	4.000000 GB
-[starpu_comm_stats][0->1]	4.000000 GB	4.000000 GB
-[starpu_comm_stats][1] TOTAL:	8.000000 GB	8.000000 GB
-[starpu_comm_stats][1->0]	8.000000 GB	8.000000 GB
+[starpu_comm_stats][1] TOTAL:	456.000000 B	0.000435 MB	 0.000188 B/s	 0.000000 MB/s
+[starpu_comm_stats][1:0]	456.000000 B	0.000435 MB	 0.000188 B/s	 0.000000 MB/s
+
+[starpu_comm_stats][0] TOTAL:	456.000000 B	0.000435 MB	 0.000188 B/s	 0.000000 MB/s
+[starpu_comm_stats][0:1]	456.000000 B	0.000435 MB	 0.000188 B/s	 0.000000 MB/s
 \endverbatim
 
+These statistics can be plotted as heatmaps using the StarPU tool <c>starpu_mpi_comm_matrix.py</c>.
+
 \subsection StarPU-TopInterface StarPU-Top Interface
 
 StarPU-Top is an interface which remotely displays the on-line state of a StarPU

+ 1 - 1
doc/doxygen/chapters/410_mpi_support.doxy

@@ -897,7 +897,7 @@ from the received communication cache.
 
 When the environment variable \ref STARPU_COMM_STATS is set to \c 1,
 StarPU will display at the end of the execution for each node the
-volume and the bandwidth of data sent to each other nodes.
+volume and the bandwidth of data sent to all the other nodes.
 
 \section MPIExamples More MPI examples
 

+ 4 - 3
mpi/src/starpu_mpi_stats.c

@@ -95,9 +95,10 @@ void _starpu_mpi_comm_amounts_display(FILE *stream, int node)
 
 	for (dst = 0; dst < world_size; dst++)
 	{
-		fprintf(stream, "[starpu_comm_stats][%d->%d]\t%f B\t%f MB\t %f B/s\t %f MB/s\n",
-			node, dst, (float)comm_amount[dst], ((float)comm_amount[dst])/(1024*1024),
-			(float)comm_amount[dst]/(float)time, ((float)comm_amount[dst])/(1024*1024)/(float)time);
+		if (comm_amount[dst])
+			fprintf(stream, "[starpu_comm_stats][%d:%d]\t%f B\t%f MB\t %f B/s\t %f MB/s\n",
+				node, dst, (float)comm_amount[dst], ((float)comm_amount[dst])/(1024*1024),
+				(float)comm_amount[dst]/(float)time, ((float)comm_amount[dst])/(1024*1024)/(float)time);
 	}
 }
 

+ 5 - 0
tools/Makefile.am

@@ -281,6 +281,7 @@ dist_bin_SCRIPTS +=			\
 	starpu_workers_activity		\
 	starpu_codelet_histo_profile	\
 	starpu_codelet_profile		\
+	starpu_mpi_comm_matrix.py	\
 	starpu_paje_draw_histogram	\
 	starpu_paje_draw_histogram.R	\
 	starpu_paje_summary		\
@@ -330,6 +331,9 @@ starpu_codelet_profile.1: starpu_codelet_profile
 starpu_codelet_histo_profile.1: starpu_codelet_histo_profile
 	chmod +x $<
 	help2man --no-discard-stderr -N --output=$@ ./$<
+starpu_mpi_comm_matrix.1: starpu_mpi_comm_matrix.py
+	chmod +x $<
+	help2man --no-discard-stderr -N --output=$@ ./$<
 starpu_paje_draw_histogram.1: starpu_paje_draw_histogram
 	chmod +x $<
 	help2man --no-discard-stderr -N --output=$@ ./$<
@@ -356,6 +360,7 @@ dist_man1_MANS =\
 	starpu_workers_activity.1 \
 	starpu_codelet_profile.1 \
 	starpu_codelet_histo_profile.1 \
+	starpu_mpi_comm_matrix.1 \
 	starpu_paje_draw_histogram.1 \
 	starpu_paje_state_stats.1
 

+ 106 - 0
tools/starpu_mpi_comm_matrix.py.in

@@ -0,0 +1,106 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2019                                      CNRS
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+import os
+import re
+import subprocess
+import sys
+
+PROGNAME=sys.argv[0]
+
+def usage():
+    """Print the command-line help text on stdout, then exit with status 1."""
+    # One entry per output line; empty strings produce the blank separator lines.
+    help_lines = (
+        "Offline tool to draw a communication matrix",
+        "",
+        "Usage: %s <output_execution>" % PROGNAME,
+        "",
+        "Options:",
+        "	-h, --help          display this help and exit",
+        "	-v, --version       output version information and exit",
+        "",
+        "Report bugs to <@PACKAGE_BUGREPORT@>",
+    )
+    for text in help_lines:
+        print(text)
+    sys.exit(1)
+
+# Command-line handling: -v/--version prints the version and exits with
+# status 0; -h/--help or a missing argument prints the usage text.
+if len(sys.argv) >= 2:
+    if sys.argv[1] in ('-v', '--version'):
+        print("%s (@PACKAGE_NAME@) @PACKAGE_VERSION@" % PROGNAME)
+        sys.exit(0)
+    if sys.argv[1] in ('-h', '--help'):
+        usage()
+if len(sys.argv) == 1:
+    usage()
+
+# file holding the output of an execution run with STARPU_COMM_STATS=1
+outputfile=sys.argv[1]
+
+# Read the execution output once; both passes below work on these lines.
+with open(outputfile, "r") as file:
+    lines = file.readlines()
+
+# find the number of nodes: highest rank found in a
+# "[starpu_comm_stats][<rank>] TOTAL:" line, plus one
+nodes=0
+for line in lines:
+    # Plain substring tests are used on purpose: the regular expression
+    # '\TOTAL' is rejected by the re module ("bad escape \T"), and lines of
+    # unrelated program output that merely contain "TOTAL" must be ignored.
+    if '[starpu_comm_stats]' in line and 'TOTAL' in line:
+        node = line.split(sep="[")[2].split("]")[0]
+        nodes = max(nodes, int(node))
+nodes=nodes+1
+
+# extract volume of comm and bandwidth between all pair of nodes
+# (both matrices are indexed as [source][destination])
+volumes = [[0 for x in range(nodes)] for y in range(nodes)]
+bandwidth = [[0 for x in range(nodes)] for y in range(nodes)]
+for line in lines:
+    if '[starpu_comm_stats]' in line and 'TOTAL' not in line:
+        # Expected fields: "[starpu_comm_stats][src:dst]", then four
+        # "<value> <unit>" pairs (B, MB, B/s, MB/s); only the byte-based
+        # values are plotted.
+        (head, volB, _unitB, _volMB, _unitMB, bwB, _unitBs, _bwMB, _unitMBs) = line.split()
+        (src,dst)=head.split(sep="[")[2].split(sep="]")[0].split(sep=":")
+        volumes[int(src)][int(dst)] = float(volB)
+        bandwidth[int(src)][int(dst)] = float(bwB)
+
+def writeData(filename, nodes, data):
+    """Write the top-left nodes x nodes part of a matrix to a text file.
+
+    filename -- path of the file to create (overwritten if it exists)
+    nodes    -- number of rows and columns to write
+    data     -- matrix indexed as data[row][column]
+
+    Each row becomes one line; every value is formatted with "%f" and
+    followed by a single space.
+    """
+    # "with" guarantees the file is closed even if formatting a value fails.
+    with open(filename, "w") as ofile:
+        for src in range(nodes):
+            ofile.write("".join("%f " % data[src][dst] for dst in range(nodes)))
+            ofile.write("\n")
+
+def generateGnuplotScript(filename, datafilename, outputfile, nodes):
+    """Write a gnuplot script that renders a matrix data file as a PDF heatmap.
+
+    filename     -- path of the gnuplot script to create
+    datafilename -- matrix data file plotted by the script
+    outputfile   -- PDF file the script asks gnuplot to produce
+    nodes        -- number of nodes, i.e. number of tics on each axis
+    """
+    srctics = ", ".join("\"src%d\" %d" % (node, node) for node in range(nodes))
+    dsttics = ", ".join("\"dst%d\" %d" % (node, node) for node in range(nodes))
+    # "with" guarantees the script file is closed even if a write fails.
+    with open(filename, "w") as ofile:
+        ofile.write("set term pdf color\n")
+        ofile.write("set output \"%s\"\n" % outputfile)
+        ofile.write("set view map scale 1\nset style data lines\nset palette rgbformulae 22,13,-31\n")
+        # gnuplot maps matrix columns to x and matrix rows to y. The data
+        # file holds one row per source node and one column per destination
+        # node, so x is the destination axis and y the source axis.
+        ofile.write("set xtics (%s)\n" % dsttics)
+        ofile.write("set ytics (%s)\n" % srctics)
+        ofile.write("plot '%s' matrix with image\n" % datafilename)
+
+# For each metric (communication volume, then bandwidth): write the gnuplot
+# data file and script file, then run gnuplot to produce the heatmap PDF.
+for (suffix, matrix) in (("volume", volumes), ("bw", bandwidth)):
+    datafile = outputfile + "_" + suffix + ".data"
+    scriptfile = outputfile + "_" + suffix + ".gp"
+    pdffile = outputfile + "_" + suffix + "_heatmap.pdf"
+    writeData(datafile, nodes, matrix)
+    generateGnuplotScript(scriptfile, datafile, pdffile, nodes)
+    # Pass the script path as a separate argument instead of building a shell
+    # command line, so paths with spaces or shell metacharacters work.
+    try:
+        subprocess.call(["gnuplot", scriptfile])
+    except OSError as err:
+        # e.g. gnuplot is not installed: report it instead of a traceback
+        sys.exit("%s: cannot run gnuplot: %s" % (PROGNAME, err))