Browse Source

Add profiling capabilities on the bus too. This permits to monitor bus
consumption online and should replace the DATASTATS option at some point.

Cédric Augonnet 15 years ago
parent
commit
a3b97df7f8

+ 16 - 0
include/starpu_profiling.h

@@ -41,6 +41,13 @@ struct starpu_worker_profiling_info {
 	int executed_tasks;
 };
 
+struct starpu_bus_profiling_info {
+	struct timespec start_time;
+	struct timespec total_time;
+	int long long transferred_bytes;
+	int transfer_count;
+};
+
 /* This function sets the profiling status:
  * - enable with STARPU_PROFILING_ENABLE
  * - disable with STARPU_PROFILING_DISABLE 
@@ -56,6 +63,13 @@ int starpu_profiling_status_get(void);
  * measurements. If worker_info is NULL, we only reset the counters. */
 int starpu_worker_get_profiling_info(int workerid, struct starpu_worker_profiling_info *worker_info);
 
+int starpu_bus_get_count(void);
+int starpu_bus_get_id(int src, int dst);
+int starpu_bus_get_src(int busid);
+int starpu_bus_get_dst(int busid);
+
+int starpu_bus_get_profiling_info(int busid, struct starpu_bus_profiling_info *bus_info);
+
 /* Some helper functions to manipulate profiling API output */
 /* Reset timespec */
 static inline void starpu_timespec_clear(struct timespec *tsp)
@@ -115,4 +129,6 @@ static inline void starpu_timespec_sub(struct timespec *a,
 double starpu_timing_timespec_delay_us(struct timespec *start, struct timespec *end);
 double starpu_timing_timespec_to_us(struct timespec *ts);
 
+void starpu_bus_profiling_helper_display_summary(void);
+
 #endif // __STARPU_PROFILING_H__

+ 2 - 1
src/Makefile.am

@@ -176,7 +176,8 @@ libstarpu_la_SOURCES = 						\
 	util/starpu_cublas.c					\
 	util/file.c						\
 	debug/latency.c						\
-	profiling/profiling.c
+	profiling/profiling.c					\
+	profiling/bus_profiling_helpers.c
 
 if STARPU_USE_CPU
 libstarpu_la_SOURCES += drivers/cpu/driver_cpu.c

+ 2 - 2
src/common/timing.c

@@ -14,11 +14,11 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "timing.h"
-
+#include <sys/time.h>
 #include <starpu.h>
 #include <common/config.h>
 #include <profiling/profiling.h>
+#include <common/timing.h>
 
 #ifdef HAVE_CLOCK_GETTIME
 #include <time.h>

+ 0 - 1
src/common/timing.h

@@ -28,7 +28,6 @@
 #include <common/config.h>
 #include <starpu.h>
 
-
 void _starpu_timing_init(void);
 void starpu_clock_gettime(struct timespec *ts);
 double _starpu_timing_now(void);

+ 10 - 0
src/core/topology.c

@@ -22,6 +22,7 @@
 #include <core/topology.h>
 #include <drivers/cuda/driver_cuda.h>
 #include <common/hash.h>
+#include <profiling/profiling.h>
 
 #ifdef STARPU_HAVE_HWLOC
 #include <hwloc.h>
@@ -610,6 +611,10 @@ static void _starpu_init_workers_binding(struct starpu_machine_config_s *config)
 	/* TODO : support NUMA  ;) */
 	ram_memory_node = _starpu_register_memory_node(STARPU_CPU_RAM);
 
+	/* We will store all the busid of the different (src, dst) combinations
+	 * in a matrix which we initialize here. */
+	_starpu_initialize_busid_matrix();
+
 	unsigned worker;
 	for (worker = 0; worker < config->nworkers; worker++)
 	{
@@ -644,6 +649,9 @@ static void _starpu_init_workers_binding(struct starpu_machine_config_s *config)
 				}
 				is_a_set_of_accelerators = 0;
 				memory_node = _starpu_register_memory_node(STARPU_CUDA_RAM);
+
+				_starpu_register_bus(0, memory_node);
+				_starpu_register_bus(memory_node, 0);
 				break;
 #endif
 
@@ -657,6 +665,8 @@ static void _starpu_init_workers_binding(struct starpu_machine_config_s *config)
 				}
 				is_a_set_of_accelerators = 0;
 				memory_node = _starpu_register_memory_node(STARPU_OPENCL_RAM);
+				_starpu_register_bus(0, memory_node);
+				_starpu_register_bus(memory_node, 0);
 				break;
 #endif
 

+ 3 - 4
src/datawizard/copy_driver.c

@@ -14,7 +14,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include <pthread.h>
+#include <starpu.h>
 #include <common/config.h>
 #include <common/utils.h>
 #include <core/policies/sched_policy.h>
@@ -23,6 +23,7 @@
 #include "copy_driver.h"
 #include "memalloc.h"
 #include "starpu_opencl.h"
+#include <profiling/profiling.h>
 
 void _starpu_wake_all_blocked_workers_on_node(unsigned nodeid)
 {
@@ -216,10 +217,8 @@ int __attribute__((warn_unused_result)) _starpu_driver_copy_data_1_to_1(starpu_d
 		STARPU_ASSERT(handle->ops);
 		//STARPU_ASSERT(handle->ops->copy_data_1_to_1);
 
-#ifdef STARPU_DATA_STATS
 		size_t size = handle->ops->get_size(handle);
-		_starpu_update_comm_amount(src_node, dst_node, size);
-#endif
+		_starpu_bus_update_profiling_info((int)src_node, (int)dst_node, size);
 		
 #ifdef STARPU_USE_FXT
 		com_id = STARPU_ATOMIC_ADD(&communication_cnt, 1);

+ 2 - 10
src/datawizard/datastats.c

@@ -14,14 +14,12 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include <stdio.h>
+#include <starpu.h>
 #include <datawizard/datastats.h>
 #include <common/config.h>
-#include <starpu.h>
-
-/* measure the cache hit ratio for each node */
 
 #ifdef STARPU_DATA_STATS
+/* measure the cache hit ratio for each node */
 static unsigned hit_cnt[STARPU_MAXNODES];
 static unsigned miss_cnt[STARPU_MAXNODES];
 #endif
@@ -138,11 +136,6 @@ void _starpu_display_comm_amounts(void)
 	}
 }
 
-inline void _starpu_update_comm_amount(uint32_t src_node, uint32_t dst_node, size_t size)
-{
-	comm_ammount[src_node][dst_node] += size;
-}
-
 #else
 
 inline void _starpu_display_comm_amounts(void)
@@ -150,4 +143,3 @@ inline void _starpu_display_comm_amounts(void)
 }
 
 #endif
-

+ 3 - 6
src/datawizard/datastats.h

@@ -17,10 +17,11 @@
 #ifndef __DATASTATS_H__
 #define __DATASTATS_H__
 
+#include <starpu.h>
+#include <common/config.h>
 #include <stdint.h>
 #include <stdlib.h>
 
-
 inline void _starpu_msi_cache_hit(unsigned node);
 inline void _starpu_msi_cache_miss(unsigned node);
 
@@ -33,8 +34,4 @@ inline void _starpu_data_allocation_inc_stats(unsigned node __attribute__ ((unus
 void _starpu_display_comm_amounts(void);
 void _starpu_display_alloc_cache_stats(void);
 
-#ifdef STARPU_DATA_STATS
-inline void _starpu_update_comm_amount(uint32_t src_node, uint32_t dst_node, size_t size);
-#endif
-
-#endif
+#endif // __DATASTATS_H__

+ 49 - 0
src/profiling/bus_profiling_helpers.c

@@ -0,0 +1,49 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_profiling.h>
+#include <profiling/profiling.h>
+
+void starpu_bus_profiling_helper_display_summary(void)
+{
+	int long long sum_transferred = 0;
+
+	fprintf(stderr, "Data transfer statistics:\n");
+
+	int busid;
+	int bus_cnt = starpu_bus_get_count();
+	for (busid = 0; busid < bus_cnt; busid++)
+	{
+		int src, dst;
+	
+		src = starpu_bus_get_src(busid);
+		dst = starpu_bus_get_dst(busid);
+
+		struct starpu_bus_profiling_info bus_info;
+		starpu_bus_get_profiling_info(busid, &bus_info);
+
+		int long long transferred = bus_info.transferred_bytes;
+		int transfer_cnt =  bus_info.transfer_count;
+		double elapsed_time = starpu_timing_timespec_to_us(&bus_info.total_time);
+
+		fprintf(stderr, "\t%d -> %d\t%.2lf MB\t%.2lfMB/s\t(transfers : %d - avg %.2lf MB)\n", src, dst, (1.0*transferred)/(1024*1024), transferred/elapsed_time, transfer_cnt, (1.0*transferred)/(transfer_cnt*1024*1024));
+
+		sum_transferred += transferred;
+	}
+
+	fprintf(stderr, "Total transfers: %.2lf MB\n", (1.0*sum_transferred)/(1024*1024));
+}

+ 115 - 0
src/profiling/profiling.c

@@ -35,6 +35,22 @@ static struct timespec sleeping_start_date[STARPU_NMAXWORKERS];
 static unsigned worker_registered_executing_start[STARPU_NMAXWORKERS];
 static struct timespec executing_start_date[STARPU_NMAXWORKERS];
 
+
+/* Store the busid of the different (src, dst) pairs. busid_matrix[src][dst]
+ * contains the busid of (src, dst) or -1 if the bus was not registered. */
+struct node_pair {
+	int src;
+	int dst;
+	struct starpu_bus_profiling_info *bus_info;
+};
+
+static int busid_matrix[STARPU_MAXNODES][STARPU_MAXNODES];
+static struct starpu_bus_profiling_info bus_profiling_info[STARPU_MAXNODES][STARPU_MAXNODES];
+static struct node_pair busid_to_node_pair[STARPU_MAXNODES*STARPU_MAXNODES]; 
+static int busid_cnt = 0;
+
+static void _do_starpu_bus_reset_profiling_info(struct starpu_bus_profiling_info *bus_info);
+
 /*
  *	Global control of profiling
  */
@@ -55,6 +71,16 @@ int starpu_profiling_status_set(int status)
 		int worker;
 		for (worker = 0; worker < STARPU_NMAXWORKERS; worker++)
 			_starpu_worker_reset_profiling_info(worker);
+
+		int busid;
+		int bus_cnt = starpu_bus_get_count();
+		for (busid = 0; busid < bus_cnt; busid++)
+		{
+			struct starpu_bus_profiling_info *bus_info;
+			bus_info = busid_to_node_pair[busid].bus_info;
+
+			_do_starpu_bus_reset_profiling_info(bus_info);;
+		}
 	}
 
 	return prev_value;
@@ -255,3 +281,92 @@ int starpu_worker_get_profiling_info(int workerid, struct starpu_worker_profilin
 
 	return 0;
 }
+
+/*
+ *	Bus profiling
+ */
+
+void _starpu_initialize_busid_matrix(void)
+{
+	int i, j;
+	for (j = 0; j < STARPU_MAXNODES; j++)
+	for (i = 0; i < STARPU_MAXNODES; i++)
+		busid_matrix[i][j] = -1;	
+
+	busid_cnt = 0;
+}
+
+static void _do_starpu_bus_reset_profiling_info(struct starpu_bus_profiling_info *bus_info)
+{
+	starpu_clock_gettime(&bus_info->start_time);
+	bus_info->transferred_bytes = 0;
+	bus_info->transfer_count = 0;
+}
+
+int _starpu_register_bus(int src_node, int dst_node)
+{
+	if (busid_matrix[src_node][dst_node] != -1)
+		return -EBUSY;
+
+	int busid = STARPU_ATOMIC_ADD(&busid_cnt, 1) - 1;
+
+	busid_matrix[src_node][dst_node] = busid;
+
+	busid_to_node_pair[busid].src = src_node;
+	busid_to_node_pair[busid].dst = dst_node;
+	busid_to_node_pair[busid].bus_info = &bus_profiling_info[src_node][dst_node];
+
+	_do_starpu_bus_reset_profiling_info(&bus_profiling_info[src_node][dst_node]);
+
+	return busid;
+}
+
+int starpu_bus_get_count(void)
+{
+	return busid_cnt;
+}
+
+int starpu_bus_get_id(int src, int dst)
+{
+	return busid_matrix[src][dst];
+}
+
+int starpu_bus_get_src(int busid)
+{
+	return busid_to_node_pair[busid].src;
+}
+
+int starpu_bus_get_dst(int busid)
+{
+	return busid_to_node_pair[busid].dst;
+}
+
+int starpu_bus_get_profiling_info(int busid, struct starpu_bus_profiling_info *bus_info)
+{
+	int src_node = busid_to_node_pair[busid].src;
+	int dst_node = busid_to_node_pair[busid].dst;
+
+	/* XXX protect all this  method with a mutex */
+	if (bus_info)
+	{
+		struct timespec now;
+		starpu_clock_gettime(&now);
+
+		/* total_time = now - start_time */
+		starpu_timespec_sub(&now, &bus_profiling_info[src_node][dst_node].start_time,
+					  &bus_profiling_info[src_node][dst_node].total_time);
+
+		memcpy(bus_info, &bus_profiling_info[src_node][dst_node], sizeof(struct starpu_bus_profiling_info));
+	}
+
+	_do_starpu_bus_reset_profiling_info(&bus_profiling_info[src_node][dst_node]);
+
+	return 0;
+} 
+
+void _starpu_bus_update_profiling_info(int src_node, int dst_node, size_t size)
+{
+	bus_profiling_info[src_node][dst_node].transferred_bytes += size;
+	bus_profiling_info[src_node][dst_node].transfer_count++;
+//	fprintf(stderr, "PROFILE %d -> %d : %d (cnt %d)\n", src_node, dst_node, size, bus_profiling_info[src_node][dst_node].transfer_count);
+}

+ 4 - 0
src/profiling/profiling.h

@@ -29,4 +29,8 @@ void _starpu_worker_update_profiling_info_sleeping(int workerid, struct timespec
 void _starpu_worker_register_sleeping_start_date(int workerid, struct timespec *sleeping_start);
 void _starpu_worker_register_executing_start_date(int workerid, struct timespec *executing_start);
 
+void _starpu_initialize_busid_matrix(void);
+int _starpu_register_bus(int src_node, int dst_node);
+void _starpu_bus_update_profiling_info(int src_node, int dst_node, size_t size);
+
 #endif // __PROFILING_H__