
mpi: STARPU_COMM_STATS also displays the bandwidth along with the data amount

Nathalie Furmento, 6 years ago
parent
commit
f08505850b
3 files changed, 20 insertions and 13 deletions
  1. ChangeLog (+1, -0)
  2. doc/doxygen/chapters/410_mpi_support.doxy (+9, -6)
  3. mpi/src/starpu_mpi_stats.c (+10, -7)

+ 1 - 0
ChangeLog

@@ -141,6 +141,7 @@ Small changes:
     scheduler context
   * Fonction starpu_is_initialized() is moved to the public API.
   * Fix code to allow to submit tasks to empty contexts
+  * STARPU_COMM_STATS also displays the bandwidth
 
 StarPU 1.2.7 (git revision xxx)
 ==============================================

+ 9 - 6
doc/doxygen/chapters/410_mpi_support.doxy

@@ -31,7 +31,7 @@ An MPI Insert Task function provides an even more seamless transition to a
 distributed application, by automatically issuing all required data transfers
 according to the task graph and an application-provided distribution.
 
-\section ExampleDocumentation Example used in this documentation
+\section ExampleDocumentation Example Used In This Documentation
 
 The example below will be used as the base for this documentation. It
 initializes a token on node 0, and the token is passed from node to node,
@@ -65,7 +65,7 @@ for (loop = 0; loop < nloops; loop++)
 }
 \endcode
 
-\section NotUsingMPISupport About not using the MPI support
+\section NotUsingMPISupport About Not Using The MPI Support
 
 Although StarPU provides MPI support, the application programmer may want to
 keep his MPI communications as they are for a start, and only delegate task
@@ -518,7 +518,7 @@ starpu_mpi_task_post_build(MPI_COMM_WORLD, &cl,
                            0);
 \endcode
 
-\section MPIInsertPruning Pruning MPI task insertion
+\section MPIInsertPruning Pruning MPI Task Insertion
 
 Making all MPI nodes process the whole graph can be a concern with a growing
 number of nodes. To avoid this, the
@@ -684,7 +684,7 @@ To test how much MPI priorities have a good effect on performance, you can
 set the environment variable \ref STARPU_MPI_PRIORITIES to \c 0 to disable the use of
 priorities in StarPU-MPI.
 
-\section MPICache MPI cache support
+\section MPICache MPI Cache Support
 
 StarPU-MPI automatically optimizes duplicate data transmissions: if an MPI
 node \c B needs a piece of data \c D from MPI node \c A for several tasks, only one
@@ -721,7 +721,7 @@ environment variable. The variable \ref STARPU_MPI_CACHE_STATS can be set to <c>
 to enable the runtime to display messages when data are added or removed
 from the cache holding the received data.
 
-\section MPIMigration MPI Data migration
+\section MPIMigration MPI Data Migration
 
 The application can dynamically change its mind about the data distribution, to
 balance the load over MPI nodes for instance. This can be done very simply by
@@ -844,7 +844,7 @@ starpu_mpi_gather_detached(data_handles, nblocks, 0, MPI_COMM_WORLD, NULL, NULL,
 Other collective operations would be easy to define, just ask starpu-devel for
 them!
 
-\section MPIDriver Make StarPU-MPI progression thread execute tasks
+\section MPIDriver Make StarPU-MPI Progression Thread Execute Tasks
 
 The default behaviour of StarPU-MPI is to spawn an MPI thread to take care only
 of MPI communications in an active fashion (i.e the StarPU-MPI thread sleeps
@@ -895,6 +895,9 @@ environment variable \ref STARPU_MPI_CACHE_STATS is set to \c 1. It
 prints messages on the standard output when data are added or removed
 from the received communication cache.
 
+When the environment variable \ref STARPU_COMM_STATS is set to \c 1,
+StarPU displays, at the end of the execution, for each node, the
+volume and the bandwidth of the data sent to every other node.
 
 \section MPIExamples More MPI examples
 

+ 10 - 7
mpi/src/starpu_mpi_stats.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2012-2017                                CNRS
+ * Copyright (C) 2012-2017, 2019                          CNRS
  * Copyright (C) 2015                                     Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -19,11 +19,13 @@
 #include <common/config.h>
 #include <stdio.h>
 #include <starpu_mpi_private.h>
+#include <starpu_util.h>
 
 /* measure the amount of data transfers between each pair of MPI nodes */
 static size_t *comm_amount;
 static int world_size;
 static int stats_enabled=0;
+static double time_init;
 
 void _starpu_mpi_comm_amounts_init(MPI_Comm comm)
 {
@@ -42,6 +44,7 @@ void _starpu_mpi_comm_amounts_init(MPI_Comm comm)
 	_STARPU_MPI_DEBUG(1, "allocating for %d nodes\n", world_size);
 
 	_STARPU_MPI_CALLOC(comm_amount, world_size, sizeof(size_t));
+	time_init = starpu_timing_now();
 }
 
 void _starpu_mpi_comm_amounts_shutdown()
@@ -81,20 +84,20 @@ void _starpu_mpi_comm_amounts_display(FILE *stream, int node)
 	if (stats_enabled == 0)
 		return;
 
+	double time = starpu_timing_now() - time_init;
+
 	for (dst = 0; dst < world_size; dst++)
 	{
 		sum += comm_amount[dst];
 	}
 
-	fprintf(stream, "\n[starpu_comm_stats][%d] TOTAL:\t%f B\t%f MB\n", node, (float)sum, (float)sum/1024/1024);
+	fprintf(stream, "\n[starpu_comm_stats][%d] TOTAL:\t%f B\t%f MB\t %f B/s\t %f MB/s\n", node, (float)sum, (float)sum/1024/1024, (float)sum/(float)time, (float)sum/1024/1024/(float)time);
 
 	for (dst = 0; dst < world_size; dst++)
 	{
-		if (comm_amount[dst])
-		{
-			fprintf(stream, "[starpu_comm_stats][%d->%d]\t%f B\t%f MB\n",
-				node, dst, (float)comm_amount[dst], ((float)comm_amount[dst])/(1024*1024));
-		}
+		fprintf(stream, "[starpu_comm_stats][%d->%d]\t%f B\t%f MB\t %f B/s\t %f MB/s\n",
+			node, dst, (float)comm_amount[dst], ((float)comm_amount[dst])/(1024*1024),
+			(float)comm_amount[dst]/(float)time, ((float)comm_amount[dst])/(1024*1024)/(float)time);
 	}
 }