浏览代码

Implement bus performance sampling: this is no longer hardcoded.

Cédric Augonnet 15 年之前
父节点
当前提交
c054754a75
共有 5 个文件被更改,包括 361 次插入24 次删除
  1. 1 0
      src/Makefile.am
  2. 1 24
      src/core/perfmodel/perfmodel.c
  3. 3 0
      src/core/perfmodel/perfmodel.h
  4. 354 0
      src/core/perfmodel/perfmodel_bus.c
  5. 2 0
      src/core/workers.c

+ 1 - 0
src/Makefile.am

@@ -106,6 +106,7 @@ libstarpu_la_SOURCES = 						\
 	core/mechanisms/priority_queues.c			\
 	core/mechanisms/fifo_queues.c				\
 	core/perfmodel/perfmodel_history.c			\
+	core/perfmodel/perfmodel_bus.c				\
 	core/perfmodel/perfmodel.c				\
 	core/perfmodel/regression.c				\
 	core/policies/no-prio-policy.c				\

+ 1 - 24
src/core/perfmodel/perfmodel.c

@@ -112,25 +112,6 @@ double job_expected_length(uint32_t who, struct job_s *j, enum starpu_perf_archt
 	return 0.0;
 }
 
-/* hand hard coded, does not really reflect anything real ! */
-static double bandwith_matrix[4][4] = 
-{
-	{0.0, 5700.0, 3000.0, 2000.0},
-	{5300.0, 0.0, 500.0, 500.0},
-	{3000.0, 500.0, 0.0, 500.0},
-	{2000.0, 500.0, 500.0, 0.0}
-};
-
-static double latency_matrix[4][4] = 
-{
-	{0.0, 500.0, 500.0, 500.0},
-	{500.0, 0.0, 2000.0, 2000.0},
-	{500.0, 2000.0, 0.0, 2000.0},
-	{500.0, 2000.0, 2000.0, 0.0}
-};
-
-
-
 /* Data transfer performance modeling */
 double data_expected_penalty(struct jobq_s *q, struct job_s *j)
 {
@@ -153,11 +134,7 @@ double data_expected_penalty(struct jobq_s *q, struct job_s *j)
 
 			uint32_t src_node = select_src_node(state);
 
-			/* TODO */
-			double bandwith = bandwith_matrix[src_node][memory_node];
-			double latency = latency_matrix[src_node][memory_node];
-
-			penalty += latency + size/bandwith;
+			penalty += predict_transfer_time(src_node, memory_node, size);
 		}
 	}
 

+ 3 - 0
src/core/perfmodel/perfmodel.h

@@ -97,5 +97,8 @@ void update_perfmodel_history(struct job_s *j, enum starpu_perf_archtype arch,
 double data_expected_penalty(struct jobq_s *q, struct job_s *j);
 
 void create_sampling_directory_if_needed(void);
+
+void load_bus_performance_files(void);
+double predict_transfer_time(unsigned src_node, unsigned dst_node, size_t size);
  
 #endif // __PERFMODEL_H__

+ 354 - 0
src/core/perfmodel/perfmodel_bus.c

@@ -0,0 +1,354 @@
+/*
+ * StarPU
+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <unistd.h>
+#include <sys/time.h>
+
+#include <starpu.h>
+#include <common/config.h>
+#include <core/perfmodel/perfmodel.h>
+#include <datawizard/data_parameters.h>
+
+
+#define SIZE	(32*1024*1024*sizeof(char))
+#define NITER	128
+
+static double bandwith_matrix[MAXNODES][MAXNODES] = {{-1.0}};
+static double latency_matrix[MAXNODES][MAXNODES] = {{ -1.0}};
+
+/* Benchmarking the performance of the bus */
+
+static double cudadev_timing_htod[MAXNODES] = {0.0};
+static double cudadev_timing_dtoh[MAXNODES] = {0.0};
+static int ncuda;
+static unsigned was_benchmarked = 0;
+
+static void measure_bandwith_between_host_and_dev(int dev)
+{
+	/* Initiliaze CUDA context on the device */
+	cudaSetDevice(dev);
+
+	/* hack to force the initialization */
+	cudaFree(0);
+
+	/* Allocate a buffer on the device */
+	unsigned char *d_buffer;
+	cudaMalloc((void **)&d_buffer, SIZE);
+	assert(d_buffer);
+
+	/* Allocate a buffer on the host */
+	unsigned char *h_buffer;
+	cudaHostAlloc((void **)&h_buffer, SIZE, 0); 
+	assert(h_buffer);
+
+	/* Fill them */
+	memset(h_buffer, 0, SIZE);
+	cudaMemset(d_buffer, 0, SIZE);
+
+	unsigned iter;
+	double timing;
+	struct timeval start;
+	struct timeval end;
+
+	/* Measure upload bandwith */
+	gettimeofday(&start, NULL);
+	for (iter = 0; iter < NITER; iter++)
+	{
+		cudaMemcpy(d_buffer, h_buffer, SIZE, cudaMemcpyHostToDevice);
+		cudaThreadSynchronize();
+	}
+	gettimeofday(&end, NULL);
+	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+
+	cudadev_timing_htod[dev+1] = timing/NITER;
+
+	/* Measure download bandwith */
+	gettimeofday(&start, NULL);
+	for (iter = 0; iter < NITER; iter++)
+	{
+		cudaMemcpy(h_buffer, d_buffer, SIZE, cudaMemcpyDeviceToHost);
+		cudaThreadSynchronize();
+	}
+	gettimeofday(&end, NULL);
+	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
+
+	cudadev_timing_dtoh[dev+1] = timing/NITER;
+
+	/* Free buffers */
+	cudaFreeHost(h_buffer);
+	cudaFree(d_buffer);
+
+	cudaThreadExit();
+}
+
+static void benchmark_all_cuda_devices(void)
+{
+#ifdef VERBOSE
+	fprintf(stderr, "Benchmarking the speed of the bus\n");
+#endif
+
+        cudaGetDeviceCount(&ncuda);
+
+	int i;
+	for (i = 0; i < ncuda; i++)
+	{
+		/* measure bandwith between Host and Device i */
+		measure_bandwith_between_host_and_dev(i);
+	}
+
+	was_benchmarked = 1;
+
+#ifdef VERBOSE
+	fprintf(stderr, "Benchmarking the speed of the bus is done.\n");
+#endif
+}
+
+static void get_bus_path(const char *type, char *path, size_t maxlen)
+{
+	strncpy(path, PERF_MODEL_DIR_BUS, maxlen);
+	strncat(path, type, maxlen);
+	
+	char hostname[32];
+	gethostname(hostname, 32);
+	strncat(path, ".", maxlen);
+	strncat(path, hostname, maxlen);
+}
+
+/*
+ *	Latency
+ */
+
+static void get_latency_path(char *path, size_t maxlen)
+{
+	get_bus_path("latency", path, maxlen);
+}
+
+static void load_bus_latency_file_content(void)
+{
+	int n;
+	unsigned src, dst;
+	FILE *f;
+
+	char path[256];
+	get_latency_path(path, 256);
+
+	f = fopen(path, "r");
+	STARPU_ASSERT(f);
+
+	for (src = 0; src < MAXNODES; src++)
+	{
+		for (dst = 0; dst < MAXNODES; dst++)
+		{
+			double latency;
+
+			n = fscanf(f, "%lf ", &latency);
+			STARPU_ASSERT(n == 1);
+
+			latency_matrix[src][dst] = latency;
+		}
+
+		n = fscanf(f, "\n");
+		STARPU_ASSERT(n == 0);
+	}
+
+	fclose(f);
+}
+
+static void write_bus_latency_file_content(void)
+{
+	int src, dst;
+	FILE *f;
+
+	STARPU_ASSERT(was_benchmarked);
+
+	char path[256];
+	get_latency_path(path, 256);
+
+	f = fopen(path, "w+");
+	STARPU_ASSERT(f);
+
+	for (src = 0; src < MAXNODES; src++)
+	{
+		for (dst = 0; dst < MAXNODES; dst++)
+		{
+			double latency;
+
+			if ((src > ncuda) || (dst > ncuda))
+			{
+				/* convention */
+				latency = -1.0;
+			}
+			else if (src == dst)
+			{
+				latency = 0.0;
+			}
+			else {
+                                latency = ((src && dst)?2000.0:500.0);
+			}
+
+			fprintf(f, "%lf ", latency);
+		}
+
+		fprintf(f, "\n");
+	}
+
+	fclose(f);
+}
+
+static void load_bus_latency_file(void)
+{
+	int res;
+
+	char path[256];
+	get_latency_path(path, 256);
+
+	res = access(path, F_OK);
+	if (res)
+	{
+		/* File does not exist yet */
+		if (!was_benchmarked)
+			benchmark_all_cuda_devices();
+
+		write_bus_latency_file_content();
+	}
+
+	load_bus_latency_file_content();
+}
+
+
+/* 
+ *	Bandwith
+ */
+static void get_bandwith_path(char *path, size_t maxlen)
+{
+	get_bus_path("bandwith", path, maxlen);
+}
+
+static void load_bus_bandwith_file_content(void)
+{
+	int n;
+	unsigned src, dst;
+	FILE *f;
+
+	char path[256];
+	get_bandwith_path(path, 256);
+
+	f = fopen(path, "r");
+	STARPU_ASSERT(f);
+
+	for (src = 0; src < MAXNODES; src++)
+	{
+		for (dst = 0; dst < MAXNODES; dst++)
+		{
+			double bandwith;
+
+			n = fscanf(f, "%lf ", &bandwith);
+			STARPU_ASSERT(n == 1);
+
+			bandwith_matrix[src][dst] = bandwith;
+		}
+
+		n = fscanf(f, "\n");
+		STARPU_ASSERT(n == 0);
+	}
+
+	fclose(f);
+}
+
+static void write_bus_bandwith_file_content(void)
+{
+	int src, dst;
+	FILE *f;
+
+	STARPU_ASSERT(was_benchmarked);
+
+	char path[256];
+	get_bandwith_path(path, 256);
+
+	f = fopen(path, "w+");
+	STARPU_ASSERT(f);
+
+	for (src = 0; src < MAXNODES; src++)
+	{
+		for (dst = 0; dst < MAXNODES; dst++)
+		{
+			double bandwith;
+			
+			if ((src > ncuda) || (dst > ncuda))
+			{
+				bandwith = -1.0;
+			}
+			else if (src != dst)
+			{
+				/* Bandwith = (SIZE)/(time i -> ram + time ram -> j)*/
+				double time_src_to_ram = (src==0)?0.0:cudadev_timing_dtoh[src];
+				double time_ram_to_dst = (dst==0)?0.0:cudadev_timing_htod[dst];
+				
+				double timing =time_src_to_ram + time_ram_to_dst;
+				
+				bandwith = 1.0*SIZE/timing;
+			}
+			else {
+			        /* convention */
+			        bandwith = 0.0;
+			}
+			
+			fprintf(f, "%lf ", bandwith);
+		}
+
+		fprintf(f, "\n");
+	}
+
+	fclose(f);
+}
+
+static void load_bus_bandwith_file(void)
+{
+	int res;
+
+	char path[256];
+	get_bandwith_path(path, 256);
+
+	res = access(path, F_OK);
+	if (res)
+	{
+		/* File does not exist yet */
+		if (!was_benchmarked)
+			benchmark_all_cuda_devices();
+
+		write_bus_bandwith_file_content();
+	}
+
+	load_bus_bandwith_file_content();
+}
+
+/*
+ *	Generic
+ */
+
+void load_bus_performance_files(void)
+{
+	load_bus_latency_file();
+	load_bus_bandwith_file();
+}
+
+double predict_transfer_time(unsigned src_node, unsigned dst_node, size_t size)
+{
+	double bandwith = bandwith_matrix[src_node][dst_node];
+	double latency = latency_matrix[src_node][dst_node];
+
+	return latency + size/bandwith;
+}

+ 2 - 0
src/core/workers.c

@@ -188,6 +188,8 @@ int starpu_init(struct starpu_conf *user_conf)
 
 	timing_init();
 
+	load_bus_performance_files();
+
 	/* store the pointer to the user explicit configuration during the
 	 * initialization */
 	config.user_conf = user_conf;