15 年之前 · c054754a75
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -106,6 +106,7 @@ libstarpu_la_SOURCES = 						\
 
				 	core/mechanisms/priority_queues.c			\
			
 
				 	core/mechanisms/fifo_queues.c				\
			
 
				 	core/perfmodel/perfmodel_history.c			\
			
 
				+	core/perfmodel/perfmodel_bus.c				\
			
 
				 	core/perfmodel/perfmodel.c				\
			
 
				 	core/perfmodel/regression.c				\
			
 
				 	core/policies/no-prio-policy.c				\
			
--- a/src/core/perfmodel/perfmodel.c
+++ b/src/core/perfmodel/perfmodel.c
@@ -112,25 +112,6 @@ double job_expected_length(uint32_t who, struct job_s *j, enum starpu_perf_archt
 
				 	return 0.0;
			
 
				 }
			
 
				 
			
 
				-/* hand hard coded, does not really reflect anything real ! */
			
 
				-static double bandwith_matrix[4][4] = 
			
 
				-{
			
 
				-	{0.0, 5700.0, 3000.0, 2000.0},
			
 
				-	{5300.0, 0.0, 500.0, 500.0},
			
 
				-	{3000.0, 500.0, 0.0, 500.0},
			
 
				-	{2000.0, 500.0, 500.0, 0.0}
			
 
				-};
			
 
				-
			
 
				-static double latency_matrix[4][4] = 
			
 
				-{
			
 
				-	{0.0, 500.0, 500.0, 500.0},
			
 
				-	{500.0, 0.0, 2000.0, 2000.0},
			
 
				-	{500.0, 2000.0, 0.0, 2000.0},
			
 
				-	{500.0, 2000.0, 2000.0, 0.0}
			
 
				-};
			
 
				-
			
 
				-
			
 
				-
			
 
				 /* Data transfer performance modeling */
			
 
				 double data_expected_penalty(struct jobq_s *q, struct job_s *j)
			
 
				 {
			
@@ -153,11 +134,7 @@ double data_expected_penalty(struct jobq_s *q, struct job_s *j)
 
				 
			
 
				 			uint32_t src_node = select_src_node(state);
			
 
				 
			
 
				-			/* TODO */
			
 
				-			double bandwith = bandwith_matrix[src_node][memory_node];
			
 
				-			double latency = latency_matrix[src_node][memory_node];
			
 
				-
			
 
				-			penalty += latency + size/bandwith;
			
 
				+			penalty += predict_transfer_time(src_node, memory_node, size);
			
 
				 		}
			
 
				 	}
			
 
				 
			
--- a/src/core/perfmodel/perfmodel.h
+++ b/src/core/perfmodel/perfmodel.h
@@ -97,5 +97,8 @@ void update_perfmodel_history(struct job_s *j, enum starpu_perf_archtype arch,
 
				 double data_expected_penalty(struct jobq_s *q, struct job_s *j);
			
 
				 
			
 
				 void create_sampling_directory_if_needed(void);
			
 
				+
			
 
				+void load_bus_performance_files(void);
			
 
				+double predict_transfer_time(unsigned src_node, unsigned dst_node, size_t size);
			
 
				  
			
 
				 #endif // __PERFMODEL_H__
			
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -0,0 +1,354 @@
 
				+/*
			
 
				+ * StarPU
			
 
				+ * Copyright (C) INRIA 2008-2009 (see AUTHORS file)
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify
			
 
				+ * it under the terms of the GNU Lesser General Public License as published by
			
 
				+ * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				+ * your option) any later version.
			
 
				+ *
			
 
				+ * This program is distributed in the hope that it will be useful, but
			
 
				+ * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				+ *
			
 
				+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				+ */
			
 
				+
			
 
				+#include <unistd.h>
			
 
				+#include <sys/time.h>
			
 
				+
			
 
				+#include <starpu.h>
			
 
				+#include <common/config.h>
			
 
				+#include <core/perfmodel/perfmodel.h>
			
 
				+#include <datawizard/data_parameters.h>
			
 
				+
			
 
				+
			
 
				+#define SIZE	(32*1024*1024*sizeof(char))
			
 
				+#define NITER	128
			
 
				+
			
 
				+static double bandwith_matrix[MAXNODES][MAXNODES] = {{-1.0}};
			
 
				+static double latency_matrix[MAXNODES][MAXNODES] = {{ -1.0}};
			
 
				+
			
 
				+/* Benchmarking the performance of the bus */
			
 
				+
			
 
				+static double cudadev_timing_htod[MAXNODES] = {0.0};
			
 
				+static double cudadev_timing_dtoh[MAXNODES] = {0.0};
			
 
				+static int ncuda;
			
 
				+static unsigned was_benchmarked = 0;
			
 
				+
			
 
				+static void measure_bandwith_between_host_and_dev(int dev)
			
 
				+{
			
 
				+	/* Initiliaze CUDA context on the device */
			
 
				+	cudaSetDevice(dev);
			
 
				+
			
 
				+	/* hack to force the initialization */
			
 
				+	cudaFree(0);
			
 
				+
			
 
				+	/* Allocate a buffer on the device */
			
 
				+	unsigned char *d_buffer;
			
 
				+	cudaMalloc((void **)&d_buffer, SIZE);
			
 
				+	assert(d_buffer);
			
 
				+
			
 
				+	/* Allocate a buffer on the host */
			
 
				+	unsigned char *h_buffer;
			
 
				+	cudaHostAlloc((void **)&h_buffer, SIZE, 0); 
			
 
				+	assert(h_buffer);
			
 
				+
			
 
				+	/* Fill them */
			
 
				+	memset(h_buffer, 0, SIZE);
			
 
				+	cudaMemset(d_buffer, 0, SIZE);
			
 
				+
			
 
				+	unsigned iter;
			
 
				+	double timing;
			
 
				+	struct timeval start;
			
 
				+	struct timeval end;
			
 
				+
			
 
				+	/* Measure upload bandwith */
			
 
				+	gettimeofday(&start, NULL);
			
 
				+	for (iter = 0; iter < NITER; iter++)
			
 
				+	{
			
 
				+		cudaMemcpy(d_buffer, h_buffer, SIZE, cudaMemcpyHostToDevice);
			
 
				+		cudaThreadSynchronize();
			
 
				+	}
			
 
				+	gettimeofday(&end, NULL);
			
 
				+	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				+
			
 
				+	cudadev_timing_htod[dev+1] = timing/NITER;
			
 
				+
			
 
				+	/* Measure download bandwith */
			
 
				+	gettimeofday(&start, NULL);
			
 
				+	for (iter = 0; iter < NITER; iter++)
			
 
				+	{
			
 
				+		cudaMemcpy(h_buffer, d_buffer, SIZE, cudaMemcpyDeviceToHost);
			
 
				+		cudaThreadSynchronize();
			
 
				+	}
			
 
				+	gettimeofday(&end, NULL);
			
 
				+	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				+
			
 
				+	cudadev_timing_dtoh[dev+1] = timing/NITER;
			
 
				+
			
 
				+	/* Free buffers */
			
 
				+	cudaFreeHost(h_buffer);
			
 
				+	cudaFree(d_buffer);
			
 
				+
			
 
				+	cudaThreadExit();
			
 
				+}
			
 
				+
			
 
				+static void benchmark_all_cuda_devices(void)
			
 
				+{
			
 
				+#ifdef VERBOSE
			
 
				+	fprintf(stderr, "Benchmarking the speed of the bus\n");
			
 
				+#endif
			
 
				+
			
 
				+        cudaGetDeviceCount(&ncuda);
			
 
				+
			
 
				+	int i;
			
 
				+	for (i = 0; i < ncuda; i++)
			
 
				+	{
			
 
				+		/* measure bandwith between Host and Device i */
			
 
				+		measure_bandwith_between_host_and_dev(i);
			
 
				+	}
			
 
				+
			
 
				+	was_benchmarked = 1;
			
 
				+
			
 
				+#ifdef VERBOSE
			
 
				+	fprintf(stderr, "Benchmarking the speed of the bus is done.\n");
			
 
				+#endif
			
 
				+}
			
 
				+
			
 
				+static void get_bus_path(const char *type, char *path, size_t maxlen)
			
 
				+{
			
 
				+	strncpy(path, PERF_MODEL_DIR_BUS, maxlen);
			
 
				+	strncat(path, type, maxlen);
			
 
				+	
			
 
				+	char hostname[32];
			
 
				+	gethostname(hostname, 32);
			
 
				+	strncat(path, ".", maxlen);
			
 
				+	strncat(path, hostname, maxlen);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ *	Latency
			
 
				+ */
			
 
				+
			
 
				+static void get_latency_path(char *path, size_t maxlen)
			
 
				+{
			
 
				+	get_bus_path("latency", path, maxlen);
			
 
				+}
			
 
				+
			
 
				+static void load_bus_latency_file_content(void)
			
 
				+{
			
 
				+	int n;
			
 
				+	unsigned src, dst;
			
 
				+	FILE *f;
			
 
				+
			
 
				+	char path[256];
			
 
				+	get_latency_path(path, 256);
			
 
				+
			
 
				+	f = fopen(path, "r");
			
 
				+	STARPU_ASSERT(f);
			
 
				+
			
 
				+	for (src = 0; src < MAXNODES; src++)
			
 
				+	{
			
 
				+		for (dst = 0; dst < MAXNODES; dst++)
			
 
				+		{
			
 
				+			double latency;
			
 
				+
			
 
				+			n = fscanf(f, "%lf ", &latency);
			
 
				+			STARPU_ASSERT(n == 1);
			
 
				+
			
 
				+			latency_matrix[src][dst] = latency;
			
 
				+		}
			
 
				+
			
 
				+		n = fscanf(f, "\n");
			
 
				+		STARPU_ASSERT(n == 0);
			
 
				+	}
			
 
				+
			
 
				+	fclose(f);
			
 
				+}
			
 
				+
			
 
				+static void write_bus_latency_file_content(void)
			
 
				+{
			
 
				+	int src, dst;
			
 
				+	FILE *f;
			
 
				+
			
 
				+	STARPU_ASSERT(was_benchmarked);
			
 
				+
			
 
				+	char path[256];
			
 
				+	get_latency_path(path, 256);
			
 
				+
			
 
				+	f = fopen(path, "w+");
			
 
				+	STARPU_ASSERT(f);
			
 
				+
			
 
				+	for (src = 0; src < MAXNODES; src++)
			
 
				+	{
			
 
				+		for (dst = 0; dst < MAXNODES; dst++)
			
 
				+		{
			
 
				+			double latency;
			
 
				+
			
 
				+			if ((src > ncuda) || (dst > ncuda))
			
 
				+			{
			
 
				+				/* convention */
			
 
				+				latency = -1.0;
			
 
				+			}
			
 
				+			else if (src == dst)
			
 
				+			{
			
 
				+				latency = 0.0;
			
 
				+			}
			
 
				+			else {
			
 
				+                                latency = ((src && dst)?2000.0:500.0);
			
 
				+			}
			
 
				+
			
 
				+			fprintf(f, "%lf ", latency);
			
 
				+		}
			
 
				+
			
 
				+		fprintf(f, "\n");
			
 
				+	}
			
 
				+
			
 
				+	fclose(f);
			
 
				+}
			
 
				+
			
 
				+static void load_bus_latency_file(void)
			
 
				+{
			
 
				+	int res;
			
 
				+
			
 
				+	char path[256];
			
 
				+	get_latency_path(path, 256);
			
 
				+
			
 
				+	res = access(path, F_OK);
			
 
				+	if (res)
			
 
				+	{
			
 
				+		/* File does not exist yet */
			
 
				+		if (!was_benchmarked)
			
 
				+			benchmark_all_cuda_devices();
			
 
				+
			
 
				+		write_bus_latency_file_content();
			
 
				+	}
			
 
				+
			
 
				+	load_bus_latency_file_content();
			
 
				+}
			
 
				+
			
 
				+
			
 
				+/* 
			
 
				+ *	Bandwith
			
 
				+ */
			
 
				+static void get_bandwith_path(char *path, size_t maxlen)
			
 
				+{
			
 
				+	get_bus_path("bandwith", path, maxlen);
			
 
				+}
			
 
				+
			
 
				+static void load_bus_bandwith_file_content(void)
			
 
				+{
			
 
				+	int n;
			
 
				+	unsigned src, dst;
			
 
				+	FILE *f;
			
 
				+
			
 
				+	char path[256];
			
 
				+	get_bandwith_path(path, 256);
			
 
				+
			
 
				+	f = fopen(path, "r");
			
 
				+	STARPU_ASSERT(f);
			
 
				+
			
 
				+	for (src = 0; src < MAXNODES; src++)
			
 
				+	{
			
 
				+		for (dst = 0; dst < MAXNODES; dst++)
			
 
				+		{
			
 
				+			double bandwith;
			
 
				+
			
 
				+			n = fscanf(f, "%lf ", &bandwith);
			
 
				+			STARPU_ASSERT(n == 1);
			
 
				+
			
 
				+			bandwith_matrix[src][dst] = bandwith;
			
 
				+		}
			
 
				+
			
 
				+		n = fscanf(f, "\n");
			
 
				+		STARPU_ASSERT(n == 0);
			
 
				+	}
			
 
				+
			
 
				+	fclose(f);
			
 
				+}
			
 
				+
			
 
				+static void write_bus_bandwith_file_content(void)
			
 
				+{
			
 
				+	int src, dst;
			
 
				+	FILE *f;
			
 
				+
			
 
				+	STARPU_ASSERT(was_benchmarked);
			
 
				+
			
 
				+	char path[256];
			
 
				+	get_bandwith_path(path, 256);
			
 
				+
			
 
				+	f = fopen(path, "w+");
			
 
				+	STARPU_ASSERT(f);
			
 
				+
			
 
				+	for (src = 0; src < MAXNODES; src++)
			
 
				+	{
			
 
				+		for (dst = 0; dst < MAXNODES; dst++)
			
 
				+		{
			
 
				+			double bandwith;
			
 
				+			
			
 
				+			if ((src > ncuda) || (dst > ncuda))
			
 
				+			{
			
 
				+				bandwith = -1.0;
			
 
				+			}
			
 
				+			else if (src != dst)
			
 
				+			{
			
 
				+				/* Bandwith = (SIZE)/(time i -> ram + time ram -> j)*/
			
 
				+				double time_src_to_ram = (src==0)?0.0:cudadev_timing_dtoh[src];
			
 
				+				double time_ram_to_dst = (dst==0)?0.0:cudadev_timing_htod[dst];
			
 
				+				
			
 
				+				double timing =time_src_to_ram + time_ram_to_dst;
			
 
				+				
			
 
				+				bandwith = 1.0*SIZE/timing;
			
 
				+			}
			
 
				+			else {
			
 
				+			        /* convention */
			
 
				+			        bandwith = 0.0;
			
 
				+			}
			
 
				+			
			
 
				+			fprintf(f, "%lf ", bandwith);
			
 
				+		}
			
 
				+
			
 
				+		fprintf(f, "\n");
			
 
				+	}
			
 
				+
			
 
				+	fclose(f);
			
 
				+}
			
 
				+
			
 
				+static void load_bus_bandwith_file(void)
			
 
				+{
			
 
				+	int res;
			
 
				+
			
 
				+	char path[256];
			
 
				+	get_bandwith_path(path, 256);
			
 
				+
			
 
				+	res = access(path, F_OK);
			
 
				+	if (res)
			
 
				+	{
			
 
				+		/* File does not exist yet */
			
 
				+		if (!was_benchmarked)
			
 
				+			benchmark_all_cuda_devices();
			
 
				+
			
 
				+		write_bus_bandwith_file_content();
			
 
				+	}
			
 
				+
			
 
				+	load_bus_bandwith_file_content();
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ *	Generic
			
 
				+ */
			
 
				+
			
 
				+void load_bus_performance_files(void)
			
 
				+{
			
 
				+	load_bus_latency_file();
			
 
				+	load_bus_bandwith_file();
			
 
				+}
			
 
				+
			
 
				+double predict_transfer_time(unsigned src_node, unsigned dst_node, size_t size)
			
 
				+{
			
 
				+	double bandwith = bandwith_matrix[src_node][dst_node];
			
 
				+	double latency = latency_matrix[src_node][dst_node];
			
 
				+
			
 
				+	return latency + size/bandwith;
			
 
				+}
			
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -188,6 +188,8 @@ int starpu_init(struct starpu_conf *user_conf)
 
				 
			
 
				 	timing_init();
			
 
				 
			
 
				+	load_bus_performance_files();
			
 
				+
			
 
				 	/* store the pointer to the user explicit configuration during the
			
 
				 	 * initialization */
			
 
				 	config.user_conf = user_conf;