Browse Source

new tool starpu_mpi_comm_matrix.py to plot heatmaps for MPI communications

Nathalie Furmento 6 years ago
parent
commit
6fee0d788e

+ 2 - 1
ChangeLog

@@ -123,7 +123,8 @@ Small features:
   * Add starpu_task_declare_deps()
   * Add starpu_task_declare_deps()
   * New function starpu_data_unpartition_submit_sequential_consistency_cb()
   * New function starpu_data_unpartition_submit_sequential_consistency_cb()
     to specify a callback for the task submitting the unpartitioning
     to specify a callback for the task submitting the unpartitioning
-
+  * New tool starpu_mpi_comm_matrix.py to draw heatmaps of MPI
+    communications
 
 
 Changes:
 Changes:
   * Vastly improve simgrid simulation time.
   * Vastly improve simgrid simulation time.

+ 2 - 0
configure.ac

@@ -3594,6 +3594,7 @@ AC_CONFIG_COMMANDS([executable-scripts], [
   chmod +x gcc-plugin/tests/run-test
   chmod +x gcc-plugin/tests/run-test
   chmod +x tools/starpu_codelet_profile
   chmod +x tools/starpu_codelet_profile
   chmod +x tools/starpu_codelet_histo_profile
   chmod +x tools/starpu_codelet_histo_profile
+  chmod +x tools/starpu_mpi_comm_matrix.py
   chmod +x tools/starpu_workers_activity
   chmod +x tools/starpu_workers_activity
   chmod +x tools/starpu_paje_draw_histogram
   chmod +x tools/starpu_paje_draw_histogram
   chmod +x tools/starpu_paje_state_stats
   chmod +x tools/starpu_paje_state_stats
@@ -3659,6 +3660,7 @@ AC_OUTPUT([
 	tools/Makefile
 	tools/Makefile
 	tools/starpu_codelet_profile
 	tools/starpu_codelet_profile
 	tools/starpu_codelet_histo_profile
 	tools/starpu_codelet_histo_profile
+	tools/starpu_mpi_comm_matrix.py
 	tools/starpu_workers_activity
 	tools/starpu_workers_activity
 	tools/starpu_paje_draw_histogram
 	tools/starpu_paje_draw_histogram
 	tools/starpu_paje_state_stats
 	tools/starpu_paje_state_stats

+ 7 - 4
doc/doxygen/chapters/370_online_performance_tools.doxy

@@ -176,12 +176,15 @@ obtained by setting the environment variable \ref STARPU_COMM_STATS to <c>1</c>;
 a summary will then be displayed at program termination:
 a summary will then be displayed at program termination:
 
 
 \verbatim
 \verbatim
-[starpu_comm_stats][0] TOTAL:	4.000000 GB	4.000000 GB
-[starpu_comm_stats][0->1]	4.000000 GB	4.000000 GB
-[starpu_comm_stats][1] TOTAL:	8.000000 GB	8.000000 GB
-[starpu_comm_stats][1->0]	8.000000 GB	8.000000 GB
+[starpu_comm_stats][1] TOTAL:	456.000000 B	0.000435 MB	 0.000188 B/s	 0.000000 MB/s
+[starpu_comm_stats][1:0]	456.000000 B	0.000435 MB	 0.000188 B/s	 0.000000 MB/s
+
+[starpu_comm_stats][0] TOTAL:	456.000000 B	0.000435 MB	 0.000188 B/s	 0.000000 MB/s
+[starpu_comm_stats][0:1]	456.000000 B	0.000435 MB	 0.000188 B/s	 0.000000 MB/s
 \endverbatim
 \endverbatim
 
 
+These statistics can be plotted as heatmaps using the StarPU tool <c>starpu_mpi_comm_matrix.py</c>.
+
 \subsection StarPU-TopInterface StarPU-Top Interface
 \subsection StarPU-TopInterface StarPU-Top Interface
 
 
 StarPU-Top is an interface which remotely displays the on-line state of a StarPU
 StarPU-Top is an interface which remotely displays the on-line state of a StarPU

+ 1 - 1
doc/doxygen/chapters/410_mpi_support.doxy

@@ -897,7 +897,7 @@ from the received communication cache.
 
 
 When the environment variable \ref STARPU_COMM_STATS is set to \c 1,
 When the environment variable \ref STARPU_COMM_STATS is set to \c 1,
 StarPU will display at the end of the execution for each node the
 StarPU will display at the end of the execution for each node the
-volume and the bandwidth of data sent to each other nodes.
+volume and the bandwidth of data sent to all the other nodes.
 
 
 \section MPIExamples More MPI examples
 \section MPIExamples More MPI examples
 
 

+ 4 - 3
mpi/src/starpu_mpi_stats.c

@@ -95,9 +95,10 @@ void _starpu_mpi_comm_amounts_display(FILE *stream, int node)
 
 
 	for (dst = 0; dst < world_size; dst++)
 	for (dst = 0; dst < world_size; dst++)
 	{
 	{
-		fprintf(stream, "[starpu_comm_stats][%d->%d]\t%f B\t%f MB\t %f B/s\t %f MB/s\n",
-			node, dst, (float)comm_amount[dst], ((float)comm_amount[dst])/(1024*1024),
-			(float)comm_amount[dst]/(float)time, ((float)comm_amount[dst])/(1024*1024)/(float)time);
+		if (comm_amount[dst])
+			fprintf(stream, "[starpu_comm_stats][%d:%d]\t%f B\t%f MB\t %f B/s\t %f MB/s\n",
+				node, dst, (float)comm_amount[dst], ((float)comm_amount[dst])/(1024*1024),
+				(float)comm_amount[dst]/(float)time, ((float)comm_amount[dst])/(1024*1024)/(float)time);
 	}
 	}
 }
 }
 
 

+ 5 - 0
tools/Makefile.am

@@ -281,6 +281,7 @@ dist_bin_SCRIPTS +=			\
 	starpu_workers_activity		\
 	starpu_workers_activity		\
 	starpu_codelet_histo_profile	\
 	starpu_codelet_histo_profile	\
 	starpu_codelet_profile		\
 	starpu_codelet_profile		\
+	starpu_mpi_comm_matrix.py	\
 	starpu_paje_draw_histogram	\
 	starpu_paje_draw_histogram	\
 	starpu_paje_draw_histogram.R	\
 	starpu_paje_draw_histogram.R	\
 	starpu_paje_summary		\
 	starpu_paje_summary		\
@@ -330,6 +331,9 @@ starpu_codelet_profile.1: starpu_codelet_profile
 starpu_codelet_histo_profile.1: starpu_codelet_histo_profile
 starpu_codelet_histo_profile.1: starpu_codelet_histo_profile
 	chmod +x $<
 	chmod +x $<
 	help2man --no-discard-stderr -N --output=$@ ./$<
 	help2man --no-discard-stderr -N --output=$@ ./$<
+starpu_mpi_comm_matrix.1: starpu_mpi_comm_matrix.py
+	chmod +x $<
+	help2man --no-discard-stderr -N --output=$@ ./$<
 starpu_paje_draw_histogram.1: starpu_paje_draw_histogram
 starpu_paje_draw_histogram.1: starpu_paje_draw_histogram
 	chmod +x $<
 	chmod +x $<
 	help2man --no-discard-stderr -N --output=$@ ./$<
 	help2man --no-discard-stderr -N --output=$@ ./$<
@@ -356,6 +360,7 @@ dist_man1_MANS =\
 	starpu_workers_activity.1 \
 	starpu_workers_activity.1 \
 	starpu_codelet_profile.1 \
 	starpu_codelet_profile.1 \
 	starpu_codelet_histo_profile.1 \
 	starpu_codelet_histo_profile.1 \
+	starpu_mpi_comm_matrix.1 \
 	starpu_paje_draw_histogram.1 \
 	starpu_paje_draw_histogram.1 \
 	starpu_paje_state_stats.1
 	starpu_paje_state_stats.1
 
 

+ 106 - 0
tools/starpu_mpi_comm_matrix.py.in

@@ -0,0 +1,106 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2019                                      CNRS
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+import os
+import re
+import subprocess
+import sys
+
+# Name under which the script was invoked; interpolated into the usage and
+# version messages.
+PROGNAME=sys.argv[0]
+
+def usage():
+    """Print the command-line help text, then exit with status 1."""
+    help_lines = (
+        "Offline tool to draw a communication matrix",
+        "",
+        "Usage: %s <output_execution>" % PROGNAME,
+        "",
+        "Options:",
+        "	-h, --help          display this help and exit",
+        "	-v, --version       output version information and exit",
+        "",
+        "Report bugs to <@PACKAGE_BUGREPORT@>",
+    )
+    for text in help_lines:
+        print(text)
+    sys.exit(1)
+
+# Handle the informational flags first; anything else given as the first
+# argument is taken as the path of the execution output to analyse.
+if len(sys.argv) >= 2:
+    if sys.argv[1] in ('-v', '--version'):
+        print("%s (@PACKAGE_NAME@) @PACKAGE_VERSION@" % PROGNAME)
+        sys.exit(0)
+    if sys.argv[1] in ('-h', '--help'):
+        usage()
+# No argument at all: show the help text (usage() exits).
+if len(sys.argv) == 1:
+    usage()
+
+outputfile=sys.argv[1]
+
+# Find the number of nodes: each "[starpu_comm_stats][<rank>] TOTAL: ..."
+# line carries a rank between the second pair of brackets; the node count is
+# the highest rank seen plus one.
+nodes=0
+with open(outputfile, "r") as file:
+    for line in file:
+        # Plain substring tests are used here: the former regex '\TOTAL'
+        # contains the unknown escape "\T", which the re module rejects with
+        # "bad escape". Requiring the statistics tag as well keeps unrelated
+        # program output that merely mentions TOTAL from being parsed.
+        if '[starpu_comm_stats]' in line and 'TOTAL' in line:
+            node = int(line.split(sep="[")[2].split("]")[0])
+            if node > nodes:
+                nodes = node
+nodes=nodes+1
+
+# Extract the volume of communication and the bandwidth between all pairs of
+# nodes. Both matrices are indexed [src][dst] and default to 0 for pairs that
+# have no line in the input.
+volumes = [[0 for x in range(nodes)] for y in range(nodes)]
+bandwidth = [[0 for x in range(nodes)] for y in range(nodes)]
+with open(outputfile, "r") as file:
+    for line in file:
+        # Only the per-pair lines "[starpu_comm_stats][src:dst] ..." are of
+        # interest here; the TOTAL lines are skipped.
+        if '[starpu_comm_stats]' not in line or 'TOTAL' in line:
+            continue
+        # Whitespace-separated fields of a per-pair line:
+        #   tag, volume, "B", volume, "MB", bandwidth, "B/s", bandwidth, "MB/s"
+        fields = line.split()
+        (src,dst) = fields[0].split(sep="[")[2].split(sep="]")[0].split(sep=":")
+        volumes[int(src)][int(dst)] = float(fields[1])
+        bandwidth[int(src)][int(dst)] = float(fields[5])
+
+def writeData(filename, nodes, data):
+    """Write the nodes x nodes matrix `data` to the text file `filename`.
+
+    One line is written per source node; it holds data[src][dst] for every
+    destination node in increasing order, each value formatted with "%f" and
+    followed by a single space.
+    """
+    # The with-statement closes the file even if a write raises.
+    with open(filename, "w") as ofile:
+        for src in range(nodes):
+            for dst in range(nodes):
+                ofile.write("%f "% data[src][dst])
+            ofile.write("\n")
+
+def generateGnuplotScript(filename, datafilename, outputfile, nodes):
+    """Write a gnuplot script that renders a matrix data file as a PDF heatmap.
+
+    filename     -- path of the gnuplot script to create
+    datafilename -- path of the matrix data file the script plots
+    outputfile   -- path of the PDF the script produces when run
+    nodes        -- number of nodes, i.e. number of rows and columns
+    """
+    srctics = ", ".join("\"src%d\" %d" % (node, node) for node in range(nodes))
+    dsttics = ", ".join("\"dst%d\" %d" % (node, node) for node in range(nodes))
+    # The with-statement closes the script file even if a write raises.
+    with open(filename, "w") as ofile:
+        ofile.write("set term pdf color\n")
+        ofile.write("set output \"%s\"\n" % outputfile)
+        ofile.write("set view map scale 1\nset style data lines\nset palette rgbformulae 22,13,-31\n")
+        # gnuplot's "matrix" input maps matrix columns to x and matrix rows
+        # to y. writeData() emits one row per source node, so the x axis
+        # enumerates destinations and the y axis enumerates sources.
+        ofile.write("set xtics (%s)\n" % dsttics)
+        ofile.write("set ytics (%s)\n" % srctics)
+        ofile.write("plot '%s' matrix with image\n" % datafilename)
+
+# Generate the gnuplot data and script files for the communication volume and
+# for the bandwidth, and run gnuplot on each script to produce the heatmaps.
+for (suffix, matrix) in (("_volume", volumes), ("_bw", bandwidth)):
+    writeData(outputfile+suffix+".data", nodes, matrix)
+    generateGnuplotScript(outputfile+suffix+".gp", outputfile+suffix+".data", outputfile+suffix+"_heatmap.pdf", nodes)
+    # The script path is passed as its own argument, without going through a
+    # shell, so paths containing spaces or shell metacharacters work.
+    try:
+        subprocess.call(["gnuplot", outputfile+suffix+".gp"])
+    except OSError as err:
+        # Keep going, as os.system() did when gnuplot could not be started:
+        # the data and script files remain available for manual plotting.
+        print("%s: cannot run gnuplot: %s" % (PROGNAME, err), file=sys.stderr)