Просмотр исходного кода

Add profiling capabilities on the bus too. This permits to monitor bus
consumption online and should replace the DATASTATS option at some point.

Cédric Augonnet лет назад: 15
Родитель
Сommit
a3b97df7f8

+ 16 - 0
include/starpu_profiling.h

@@ -41,6 +41,13 @@ struct starpu_worker_profiling_info {
 	int executed_tasks;
 };
 
+struct starpu_bus_profiling_info {
+	struct timespec start_time;
+	struct timespec total_time;
+	int long long transferred_bytes;
+	int transfer_count;
+};
+
 /* This function sets the profiling status:
  * - enable with STARPU_PROFILING_ENABLE
  * - disable with STARPU_PROFILING_DISABLE 
@@ -56,6 +63,13 @@ int starpu_profiling_status_get(void);
  * measurements. If worker_info is NULL, we only reset the counters. */
 int starpu_worker_get_profiling_info(int workerid, struct starpu_worker_profiling_info *worker_info);
 
+int starpu_bus_get_count(void);
+int starpu_bus_get_id(int src, int dst);
+int starpu_bus_get_src(int busid);
+int starpu_bus_get_dst(int busid);
+
+int starpu_bus_get_profiling_info(int busid, struct starpu_bus_profiling_info *bus_info);
+
 /* Some helper functions to manipulate profiling API output */
 /* Reset timespec */
 static inline void starpu_timespec_clear(struct timespec *tsp)
@@ -115,4 +129,6 @@ static inline void starpu_timespec_sub(struct timespec *a,
 double starpu_timing_timespec_delay_us(struct timespec *start, struct timespec *end);
 double starpu_timing_timespec_to_us(struct timespec *ts);
 
+void starpu_bus_profiling_helper_display_summary(void);
+
 #endif // __STARPU_PROFILING_H__

+ 2 - 1
src/Makefile.am

@@ -176,7 +176,8 @@ libstarpu_la_SOURCES = 						\
 	util/starpu_cublas.c					\
 	util/file.c						\
 	debug/latency.c						\
-	profiling/profiling.c
+	profiling/profiling.c					\
+	profiling/bus_profiling_helpers.c
 
 if STARPU_USE_CPU
 libstarpu_la_SOURCES += drivers/cpu/driver_cpu.c

+ 2 - 2
src/common/timing.c

@@ -14,11 +14,11 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include "timing.h"
-
+#include <sys/time.h>
 #include <starpu.h>
 #include <common/config.h>
 #include <profiling/profiling.h>
+#include <common/timing.h>
 
 #ifdef HAVE_CLOCK_GETTIME
 #include <time.h>

+ 0 - 1
src/common/timing.h

@@ -28,7 +28,6 @@
 #include <common/config.h>
 #include <starpu.h>
 
-
 void _starpu_timing_init(void);
 void starpu_clock_gettime(struct timespec *ts);
 double _starpu_timing_now(void);

+ 10 - 0
src/core/topology.c

@@ -22,6 +22,7 @@
 #include <core/topology.h>
 #include <drivers/cuda/driver_cuda.h>
 #include <common/hash.h>
+#include <profiling/profiling.h>
 
 #ifdef STARPU_HAVE_HWLOC
 #include <hwloc.h>
@@ -610,6 +611,10 @@ static void _starpu_init_workers_binding(struct starpu_machine_config_s *config)
 	/* TODO : support NUMA  ;) */
 	ram_memory_node = _starpu_register_memory_node(STARPU_CPU_RAM);
 
+	/* We will store all the busid of the different (src, dst) combinations
+	 * in a matrix which we initialize here. */
+	_starpu_initialize_busid_matrix();
+
 	unsigned worker;
 	for (worker = 0; worker < config->nworkers; worker++)
 	{
@@ -644,6 +649,9 @@ static void _starpu_init_workers_binding(struct starpu_machine_config_s *config)
 				}
 				is_a_set_of_accelerators = 0;
 				memory_node = _starpu_register_memory_node(STARPU_CUDA_RAM);
+
+				_starpu_register_bus(0, memory_node);
+				_starpu_register_bus(memory_node, 0);
 				break;
 #endif
 
@@ -657,6 +665,8 @@ static void _starpu_init_workers_binding(struct starpu_machine_config_s *config)
 				}
 				is_a_set_of_accelerators = 0;
 				memory_node = _starpu_register_memory_node(STARPU_OPENCL_RAM);
+				_starpu_register_bus(0, memory_node);
+				_starpu_register_bus(memory_node, 0);
 				break;
 #endif
 

+ 3 - 4
src/datawizard/copy_driver.c

@@ -14,7 +14,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include <pthread.h>
+#include <starpu.h>
 #include <common/config.h>
 #include <common/utils.h>
 #include <core/policies/sched_policy.h>
@@ -23,6 +23,7 @@
 #include "copy_driver.h"
 #include "memalloc.h"
 #include "starpu_opencl.h"
+#include <profiling/profiling.h>
 
 void _starpu_wake_all_blocked_workers_on_node(unsigned nodeid)
 {
@@ -216,10 +217,8 @@ int __attribute__((warn_unused_result)) _starpu_driver_copy_data_1_to_1(starpu_d
 		STARPU_ASSERT(handle->ops);
 		//STARPU_ASSERT(handle->ops->copy_data_1_to_1);
 
-#ifdef STARPU_DATA_STATS
 		size_t size = handle->ops->get_size(handle);
-		_starpu_update_comm_amount(src_node, dst_node, size);
-#endif
+		_starpu_bus_update_profiling_info((int)src_node, (int)dst_node, size);
 		
 #ifdef STARPU_USE_FXT
 		com_id = STARPU_ATOMIC_ADD(&communication_cnt, 1);

+ 2 - 10
src/datawizard/datastats.c

@@ -14,14 +14,12 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-#include <stdio.h>
+#include <starpu.h>
 #include <datawizard/datastats.h>
 #include <common/config.h>
-#include <starpu.h>
-
-/* measure the cache hit ratio for each node */
 
 #ifdef STARPU_DATA_STATS
+/* measure the cache hit ratio for each node */
 static unsigned hit_cnt[STARPU_MAXNODES];
 static unsigned miss_cnt[STARPU_MAXNODES];
 #endif
@@ -138,11 +136,6 @@ void _starpu_display_comm_amounts(void)
 	}
 }
 
-inline void _starpu_update_comm_amount(uint32_t src_node, uint32_t dst_node, size_t size)
-{
-	comm_ammount[src_node][dst_node] += size;
-}
-
 #else
 
 inline void _starpu_display_comm_amounts(void)
@@ -150,4 +143,3 @@ inline void _starpu_display_comm_amounts(void)
 }
 
 #endif
-

+ 3 - 6
src/datawizard/datastats.h

@@ -17,10 +17,11 @@
 #ifndef __DATASTATS_H__
 #define __DATASTATS_H__
 
+#include <starpu.h>
+#include <common/config.h>
 #include <stdint.h>
 #include <stdlib.h>
 
-
 inline void _starpu_msi_cache_hit(unsigned node);
 inline void _starpu_msi_cache_miss(unsigned node);
 
@@ -33,8 +34,4 @@ inline void _starpu_data_allocation_inc_stats(unsigned node __attribute__ ((unus
 void _starpu_display_comm_amounts(void);
 void _starpu_display_alloc_cache_stats(void);
 
-#ifdef STARPU_DATA_STATS
-inline void _starpu_update_comm_amount(uint32_t src_node, uint32_t dst_node, size_t size);
-#endif
-
-#endif
+#endif // __DATASTATS_H__

+ 49 - 0
src/profiling/bus_profiling_helpers.c

@@ -0,0 +1,49 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2010 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <starpu_profiling.h>
+#include <profiling/profiling.h>
+
+void starpu_bus_profiling_helper_display_summary(void)
+{
+	int long long sum_transferred = 0;
+
+	fprintf(stderr, "Data transfer statistics:\n");
+
+	int busid;
+	int bus_cnt = starpu_bus_get_count();
+	for (busid = 0; busid < bus_cnt; busid++)
+	{
+		int src, dst;
+	
+		src = starpu_bus_get_src(busid);
+		dst = starpu_bus_get_dst(busid);
+
+		struct starpu_bus_profiling_info bus_info;
+		starpu_bus_get_profiling_info(busid, &bus_info);
+
+		int long long transferred = bus_info.transferred_bytes;
+		int transfer_cnt =  bus_info.transfer_count;
+		double elapsed_time = starpu_timing_timespec_to_us(&bus_info.total_time);
+
+		fprintf(stderr, "\t%d -> %d\t%.2lf MB\t%.2lfMB/s\t(transfers : %d - avg %.2lf MB)\n", src, dst, (1.0*transferred)/(1024*1024), transferred/elapsed_time, transfer_cnt, (1.0*transferred)/(transfer_cnt*1024*1024));
+
+		sum_transferred += transferred;
+	}
+
+	fprintf(stderr, "Total transfers: %.2lf MB\n", (1.0*sum_transferred)/(1024*1024));
+}

+ 115 - 0
src/profiling/profiling.c

@@ -35,6 +35,22 @@ static struct timespec sleeping_start_date[STARPU_NMAXWORKERS];
 static unsigned worker_registered_executing_start[STARPU_NMAXWORKERS];
 static struct timespec executing_start_date[STARPU_NMAXWORKERS];
 
+
+/* Store the busid of the different (src, dst) pairs. busid_matrix[src][dst]
+ * contains the busid of (src, dst) or -1 if the bus was not registered. */
+struct node_pair {
+	int src;
+	int dst;
+	struct starpu_bus_profiling_info *bus_info;
+};
+
+static int busid_matrix[STARPU_MAXNODES][STARPU_MAXNODES];
+static struct starpu_bus_profiling_info bus_profiling_info[STARPU_MAXNODES][STARPU_MAXNODES];
+static struct node_pair busid_to_node_pair[STARPU_MAXNODES*STARPU_MAXNODES]; 
+static int busid_cnt = 0;
+
+static void _do_starpu_bus_reset_profiling_info(struct starpu_bus_profiling_info *bus_info);
+
 /*
  *	Global control of profiling
  */
@@ -55,6 +71,16 @@ int starpu_profiling_status_set(int status)
 		int worker;
 		for (worker = 0; worker < STARPU_NMAXWORKERS; worker++)
 			_starpu_worker_reset_profiling_info(worker);
+
+		int busid;
+		int bus_cnt = starpu_bus_get_count();
+		for (busid = 0; busid < bus_cnt; busid++)
+		{
+			struct starpu_bus_profiling_info *bus_info;
+			bus_info = busid_to_node_pair[busid].bus_info;
+
+			_do_starpu_bus_reset_profiling_info(bus_info);;
+		}
 	}
 
 	return prev_value;
@@ -255,3 +281,92 @@ int starpu_worker_get_profiling_info(int workerid, struct starpu_worker_profilin
 
 	return 0;
 }
+
+/*
+ *	Bus profiling
+ */
+
+void _starpu_initialize_busid_matrix(void)
+{
+	int i, j;
+	for (j = 0; j < STARPU_MAXNODES; j++)
+	for (i = 0; i < STARPU_MAXNODES; i++)
+		busid_matrix[i][j] = -1;	
+
+	busid_cnt = 0;
+}
+
+static void _do_starpu_bus_reset_profiling_info(struct starpu_bus_profiling_info *bus_info)
+{
+	starpu_clock_gettime(&bus_info->start_time);
+	bus_info->transferred_bytes = 0;
+	bus_info->transfer_count = 0;
+}
+
+int _starpu_register_bus(int src_node, int dst_node)
+{
+	if (busid_matrix[src_node][dst_node] != -1)
+		return -EBUSY;
+
+	int busid = STARPU_ATOMIC_ADD(&busid_cnt, 1) - 1;
+
+	busid_matrix[src_node][dst_node] = busid;
+
+	busid_to_node_pair[busid].src = src_node;
+	busid_to_node_pair[busid].dst = dst_node;
+	busid_to_node_pair[busid].bus_info = &bus_profiling_info[src_node][dst_node];
+
+	_do_starpu_bus_reset_profiling_info(&bus_profiling_info[src_node][dst_node]);
+
+	return busid;
+}
+
+int starpu_bus_get_count(void)
+{
+	return busid_cnt;
+}
+
+int starpu_bus_get_id(int src, int dst)
+{
+	return busid_matrix[src][dst];
+}
+
+int starpu_bus_get_src(int busid)
+{
+	return busid_to_node_pair[busid].src;
+}
+
+int starpu_bus_get_dst(int busid)
+{
+	return busid_to_node_pair[busid].dst;
+}
+
+int starpu_bus_get_profiling_info(int busid, struct starpu_bus_profiling_info *bus_info)
+{
+	int src_node = busid_to_node_pair[busid].src;
+	int dst_node = busid_to_node_pair[busid].dst;
+
+	/* XXX protect all this  method with a mutex */
+	if (bus_info)
+	{
+		struct timespec now;
+		starpu_clock_gettime(&now);
+
+		/* total_time = now - start_time */
+		starpu_timespec_sub(&now, &bus_profiling_info[src_node][dst_node].start_time,
+					  &bus_profiling_info[src_node][dst_node].total_time);
+
+		memcpy(bus_info, &bus_profiling_info[src_node][dst_node], sizeof(struct starpu_bus_profiling_info));
+	}
+
+	_do_starpu_bus_reset_profiling_info(&bus_profiling_info[src_node][dst_node]);
+
+	return 0;
+} 
+
+void _starpu_bus_update_profiling_info(int src_node, int dst_node, size_t size)
+{
+	bus_profiling_info[src_node][dst_node].transferred_bytes += size;
+	bus_profiling_info[src_node][dst_node].transfer_count++;
+//	fprintf(stderr, "PROFILE %d -> %d : %d (cnt %d)\n", src_node, dst_node, size, bus_profiling_info[src_node][dst_node].transfer_count);
+}

+ 4 - 0
src/profiling/profiling.h

@@ -29,4 +29,8 @@ void _starpu_worker_update_profiling_info_sleeping(int workerid, struct timespec
 void _starpu_worker_register_sleeping_start_date(int workerid, struct timespec *sleeping_start);
 void _starpu_worker_register_executing_start_date(int workerid, struct timespec *executing_start);
 
+void _starpu_initialize_busid_matrix(void);
+int _starpu_register_bus(int src_node, int dst_node);
+void _starpu_bus_update_profiling_info(int src_node, int dst_node, size_t size);
+
 #endif // __PROFILING_H__