15 years ago · 7a6af6bb8e
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
@@ -40,7 +40,7 @@ struct cudadev_timing {
 
				 	double timing_dtoh;
			
 
				 };
			
 
				 
			
 
				-static double bandwith_matrix[STARPU_MAXNODES][STARPU_MAXNODES] = {{-1.0}};
			
 
				+static double bandwidth_matrix[STARPU_MAXNODES][STARPU_MAXNODES] = {{-1.0}};
			
 
				 static double latency_matrix[STARPU_MAXNODES][STARPU_MAXNODES] = {{ -1.0}};
			
 
				 static unsigned was_benchmarked = 0;
			
 
				 static int ncuda = 0;
			
@@ -55,7 +55,7 @@ static double cudadev_timing_dtoh[STARPU_MAXNODES] = {0.0};
 
				 
			
 
				 static struct cudadev_timing cudadev_timing_per_cpu[STARPU_MAXNODES][MAXCPUS];
			
 
				 
			
 
				-static void measure_bandwith_between_host_and_dev_on_cpu(int dev, int cpu)
			
 
				+static void measure_bandwidth_between_host_and_dev_on_cpu(int dev, int cpu)
			
 
				 {
			
 
				 	struct starpu_machine_config_s *config = _starpu_get_machine_config();
			
 
				 	_starpu_bind_thread_on_cpu(config, cpu);
			
@@ -106,7 +106,7 @@ static void measure_bandwith_between_host_and_dev_on_cpu(int dev, int cpu)
 
				 
			
 
				 	cudadev_timing_per_cpu[dev+1][cpu].cpu_id = cpu;
			
 
				 
			
 
				-	/* Measure upload bandwith */
			
 
				+	/* Measure upload bandwidth */
			
 
				 	gettimeofday(&start, NULL);
			
 
				 	for (iter = 0; iter < NITER; iter++)
			
 
				 	{
			
@@ -118,7 +118,7 @@ static void measure_bandwith_between_host_and_dev_on_cpu(int dev, int cpu)
 
				 
			
 
				 	cudadev_timing_per_cpu[dev+1][cpu].timing_htod = timing/NITER;
			
 
				 
			
 
				-	/* Measure download bandwith */
			
 
				+	/* Measure download bandwidth */
			
 
				 	gettimeofday(&start, NULL);
			
 
				 	for (iter = 0; iter < NITER; iter++)
			
 
				 	{
			
@@ -138,7 +138,7 @@ static void measure_bandwith_between_host_and_dev_on_cpu(int dev, int cpu)
 
				 
			
 
				 }
			
 
				 
			
 
				-/* NB: we want to sort the bandwith by DECREASING order */
			
 
				+/* NB: we want to sort the bandwidth by DECREASING order */
			
 
				 static int compar_cudadev_timing(const void *left_cudadev_timing, const void *right_cudadev_timing)
			
 
				 {
			
 
				 	const struct cudadev_timing *left = left_cudadev_timing;
			
@@ -149,19 +149,19 @@ static int compar_cudadev_timing(const void *left_cudadev_timing, const void *ri
 
				 	double right_dtoh = right->timing_dtoh;
			
 
				 	double right_htod = right->timing_htod;
			
 
				 	
			
 
				-	double bandwith_sum2_left = left_dtoh*left_dtoh + left_htod*left_htod;
			
 
				-	double bandwith_sum2_right = right_dtoh*right_dtoh + right_htod*right_htod;
			
 
				+	double bandwidth_sum2_left = left_dtoh*left_dtoh + left_htod*left_htod;
			
 
				+	double bandwidth_sum2_right = right_dtoh*right_dtoh + right_htod*right_htod;
			
 
				 
			
 
				 	/* it's for a decreasing sorting */
			
 
				-	return (bandwith_sum2_left < bandwith_sum2_right);
			
 
				+	return (bandwidth_sum2_left < bandwidth_sum2_right);
			
 
				 }
			
 
				 
			
 
				-static void measure_bandwith_between_host_and_dev(int dev, unsigned ncpus)
			
 
				+static void measure_bandwidth_between_host_and_dev(int dev, unsigned ncpus)
			
 
				 {
			
 
				 	unsigned cpu;
			
 
				 	for (cpu = 0; cpu < ncpus; cpu++)
			
 
				 	{
			
 
				-		measure_bandwith_between_host_and_dev_on_cpu(dev, cpu);
			
 
				+		measure_bandwidth_between_host_and_dev_on_cpu(dev, cpu);
			
 
				 	}
			
 
				 
			
 
				 	/* sort the results */
			
@@ -173,17 +173,17 @@ static void measure_bandwith_between_host_and_dev(int dev, unsigned ncpus)
 
				 	for (cpu = 0; cpu < ncpus; cpu++)
			
 
				 	{
			
 
				 		unsigned current_cpu = cudadev_timing_per_cpu[dev+1][cpu].cpu_id;
			
 
				-		double bandwith_dtoh = cudadev_timing_per_cpu[dev+1][cpu].timing_dtoh;
			
 
				-		double bandwith_htod = cudadev_timing_per_cpu[dev+1][cpu].timing_htod;
			
 
				+		double bandwidth_dtoh = cudadev_timing_per_cpu[dev+1][cpu].timing_dtoh;
			
 
				+		double bandwidth_htod = cudadev_timing_per_cpu[dev+1][cpu].timing_htod;
			
 
				 
			
 
				-		double bandwith_sum2 = bandwith_dtoh*bandwith_dtoh + bandwith_htod*bandwith_htod;
			
 
				+		double bandwidth_sum2 = bandwidth_dtoh*bandwidth_dtoh + bandwidth_htod*bandwidth_htod;
			
 
				 
			
 
				-		fprintf(stderr, "BANDWITH GPU %d CPU %d - htod %lf - dtoh %lf - %lf\n", dev, current_cpu, bandwith_htod, bandwith_dtoh, sqrt(bandwith_sum2));
			
 
				+		fprintf(stderr, "BANDWIDTH GPU %d CPU %d - htod %lf - dtoh %lf - %lf\n", dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
			
 
				 	}
			
 
				 
			
 
				 	unsigned best_cpu = cudadev_timing_per_cpu[dev+1][0].cpu_id;
			
 
				 
			
 
				-	fprintf(stderr, "BANDWITH GPU %d BEST CPU %d\n", dev, best_cpu);
			
 
				+	fprintf(stderr, "BANDWIDTH GPU %d BEST CPU %d\n", dev, best_cpu);
			
 
				 #endif
			
 
				 
			
 
				 	/* The results are sorted in a decreasing order, so that the best
			
@@ -219,8 +219,8 @@ static void benchmark_all_cuda_devices(void)
 
				 	int i;
			
 
				 	for (i = 0; i < ncuda; i++)
			
 
				 	{
			
 
				-		/* measure bandwith between Host and Device i */
			
 
				-		measure_bandwith_between_host_and_dev(i, ncpus);
			
 
				+		/* measure bandwidth between Host and Device i */
			
 
				+		measure_bandwidth_between_host_and_dev(i, ncpus);
			
 
				 	}
			
 
				 
			
 
				 	/* FIXME: use hwloc */
			
@@ -495,21 +495,21 @@ static void load_bus_latency_file(void)
 
				 
			
 
				 
			
 
				 /* 
			
 
				- *	Bandwith
			
 
				+ *	Bandwidth
			
 
				  */
			
 
				-static void get_bandwith_path(char *path, size_t maxlen)
			
 
				+static void get_bandwidth_path(char *path, size_t maxlen)
			
 
				 {
			
 
				-	get_bus_path("bandwith", path, maxlen);
			
 
				+	get_bus_path("bandwidth", path, maxlen);
			
 
				 }
			
 
				 
			
 
				-static void load_bus_bandwith_file_content(void)
			
 
				+static void load_bus_bandwidth_file_content(void)
			
 
				 {
			
 
				 	int n;
			
 
				 	unsigned src, dst;
			
 
				 	FILE *f;
			
 
				 
			
 
				 	char path[256];
			
 
				-	get_bandwith_path(path, 256);
			
 
				+	get_bandwidth_path(path, 256);
			
 
				 
			
 
				 	f = fopen(path, "r");
			
 
				 	if (!f)
			
@@ -523,12 +523,12 @@ static void load_bus_bandwith_file_content(void)
 
				 		starpu_drop_comments(f);
			
 
				 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
			
 
				 		{
			
 
				-			double bandwith;
			
 
				+			double bandwidth;
			
 
				 
			
 
				-			n = fscanf(f, "%lf\t", &bandwith);
			
 
				+			n = fscanf(f, "%lf\t", &bandwidth);
			
 
				 			STARPU_ASSERT(n == 1);
			
 
				 
			
 
				-			bandwith_matrix[src][dst] = bandwith;
			
 
				+			bandwidth_matrix[src][dst] = bandwidth;
			
 
				 		}
			
 
				 
			
 
				 		n = fscanf(f, "\n");
			
@@ -538,7 +538,7 @@ static void load_bus_bandwith_file_content(void)
 
				 	fclose(f);
			
 
				 }
			
 
				 
			
 
				-static void write_bus_bandwith_file_content(void)
			
 
				+static void write_bus_bandwidth_file_content(void)
			
 
				 {
			
 
				 	int src, dst;
			
 
				 	FILE *f;
			
@@ -546,7 +546,7 @@ static void write_bus_bandwith_file_content(void)
 
				 	STARPU_ASSERT(was_benchmarked);
			
 
				 
			
 
				 	char path[256];
			
 
				-	get_bandwith_path(path, 256);
			
 
				+	get_bandwidth_path(path, 256);
			
 
				 
			
 
				 	f = fopen(path, "w+");
			
 
				 	STARPU_ASSERT(f);
			
@@ -560,30 +560,30 @@ static void write_bus_bandwith_file_content(void)
 
				 	{
			
 
				 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
			
 
				 		{
			
 
				-			double bandwith;
			
 
				+			double bandwidth;
			
 
				 			
			
 
				 			if ((src > ncuda) || (dst > ncuda))
			
 
				 			{
			
 
				-				bandwith = -1.0;
			
 
				+				bandwidth = -1.0;
			
 
				 			}
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 			else if (src != dst)
			
 
				 			{
			
 
				-			/* Bandwith = (SIZE)/(time i -> ram + time ram -> j)*/
			
 
				+			/* Bandwidth = (SIZE)/(time i -> ram + time ram -> j)*/
			
 
				 				double time_src_to_ram = (src==0)?0.0:cudadev_timing_dtoh[src];
			
 
				 				double time_ram_to_dst = (dst==0)?0.0:cudadev_timing_htod[dst];
			
 
				 				
			
 
				 				double timing =time_src_to_ram + time_ram_to_dst;
			
 
				 				
			
 
				-				bandwith = 1.0*SIZE/timing;
			
 
				+				bandwidth = 1.0*SIZE/timing;
			
 
				 			}
			
 
				 #endif
			
 
				 			else {
			
 
				 			        /* convention */
			
 
				-			        bandwith = 0.0;
			
 
				+			        bandwidth = 0.0;
			
 
				 			}
			
 
				 			
			
 
				-			fprintf(f, "%lf\t", bandwith);
			
 
				+			fprintf(f, "%lf\t", bandwidth);
			
 
				 		}
			
 
				 
			
 
				 		fprintf(f, "\n");
			
@@ -592,29 +592,29 @@ static void write_bus_bandwith_file_content(void)
 
				 	fclose(f);
			
 
				 }
			
 
				 
			
 
				-static void generate_bus_bandwith_file(void)
			
 
				+static void generate_bus_bandwidth_file(void)
			
 
				 {
			
 
				 	if (!was_benchmarked)
			
 
				 		benchmark_all_cuda_devices();
			
 
				 
			
 
				-	write_bus_bandwith_file_content();
			
 
				+	write_bus_bandwidth_file_content();
			
 
				 }
			
 
				 
			
 
				-static void load_bus_bandwith_file(void)
			
 
				+static void load_bus_bandwidth_file(void)
			
 
				 {
			
 
				 	int res;
			
 
				 
			
 
				 	char path[256];
			
 
				-	get_bandwith_path(path, 256);
			
 
				+	get_bandwidth_path(path, 256);
			
 
				 
			
 
				 	res = access(path, F_OK);
			
 
				 	if (res)
			
 
				 	{
			
 
				 		/* File does not exist yet */
			
 
				-		generate_bus_bandwith_file();
			
 
				+		generate_bus_bandwidth_file();
			
 
				 	}
			
 
				 
			
 
				-	load_bus_bandwith_file_content();
			
 
				+	load_bus_bandwidth_file_content();
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -627,7 +627,7 @@ void starpu_force_bus_sampling(void)
 
				 
			
 
				 	generate_bus_affinity_file();
			
 
				 	generate_bus_latency_file();
			
 
				-	generate_bus_bandwith_file();
			
 
				+	generate_bus_bandwidth_file();
			
 
				 }
			
 
				 
			
 
				 void _starpu_load_bus_performance_files(void)
			
@@ -636,13 +636,13 @@ void _starpu_load_bus_performance_files(void)
 
				 
			
 
				 	load_bus_affinity_file();
			
 
				 	load_bus_latency_file();
			
 
				-	load_bus_bandwith_file();
			
 
				+	load_bus_bandwidth_file();
			
 
				 }
			
 
				 
			
 
				 double _starpu_predict_transfer_time(unsigned src_node, unsigned dst_node, size_t size)
			
 
				 {
			
 
				-	double bandwith = bandwith_matrix[src_node][dst_node];
			
 
				+	double bandwidth = bandwidth_matrix[src_node][dst_node];
			
 
				 	double latency = latency_matrix[src_node][dst_node];
			
 
				 
			
 
				-	return latency + size/bandwith;
			
 
				+	return latency + size/bandwidth;
			
 
				 }
			
--- a/tests/experiments/bandwidth-cuda/Makefile
+++ b/tests/experiments/bandwidth-cuda/Makefile
@@ -28,11 +28,11 @@ LDFLAGS+= $(ATLASDIR)/lib/$(BLASARCH)/libatlas.a
 
				 CFLAGS+= -g #-pg
			
 
				 #LDFLAGS+= -pg
			
 
				 
			
 
				-all:  cuda-bandwith
			
 
				+all:  cuda-bandwidth
			
 
				 
			
 
				-cuda-bandwith:  cuda-bandwith.o
			
 
				-	$(CC)  cuda-bandwith.o -o  cuda-bandwith $(LDFLAGS)
			
 
				+cuda-bandwidth:  cuda-bandwidth.o
			
 
				+	$(CC)  cuda-bandwidth.o -o  cuda-bandwidth $(LDFLAGS)
			
 
				 
			
 
				 clean:
			
 
				-	rm -f cuda-bandwith
			
 
				+	rm -f cuda-bandwidth
			
 
				 	rm -f *.o
			
--- a/tests/experiments/bandwidth-cuda/bench-bandwidth-strided.gp
+++ b/tests/experiments/bandwidth-cuda/bench-bandwidth-strided.gp
@@ -18,11 +18,11 @@
 
				 
			
 
				 
			
 
				 set term postscript eps enhanced color
			
 
				-set output "bench-bandwith-strided.eps"
			
 
				-set title "CUDA Bandwith"
			
 
				+set output "bench-bandwidth-strided.eps"
			
 
				+set title "CUDA Bandwidth"
			
 
				 set logscale x
			
 
				 set xlabel "Size (Bytes/4)"
			
 
				-set ylabel "Bandwith (MB/s)"
			
 
				+set ylabel "Bandwidth (MB/s)"
			
 
				 
			
 
				 # plot ".results/htod-pin.data" with linespoint	title "Host to Device (pinned)" ,\
			
 
				 #      ".results/htod-pin.32.data"  with linespoint   title "stride 32" ,\
			
--- a/tests/experiments/bandwidth-cuda/bench-bandwidth.gp
+++ b/tests/experiments/bandwidth-cuda/bench-bandwidth.gp
@@ -18,11 +18,11 @@
 
				 
			
 
				 
			
 
				 set term postscript eps enhanced color
			
 
				-set output "bench-bandwith.eps"
			
 
				-set title "CUDA Bandwith"
			
 
				+set output "bench-bandwidth.eps"
			
 
				+set title "CUDA Bandwidth"
			
 
				 set logscale x
			
 
				 set xlabel "Size (Bytes)"
			
 
				-set ylabel "Bandwith (MB/s)"
			
 
				+set ylabel "Bandwidth (MB/s)"
			
 
				 
			
 
				 plot ".results/htod-pin.data" with linespoint	title "Host to Device (pinned)",\
			
 
				      ".results/dtoh-pin.data" with linespoint   title "Device to Host (pinned)"
			
--- a/tests/experiments/bandwidth-cuda/bench-bandwidth.sh
+++ b/tests/experiments/bandwidth-cuda/bench-bandwidth.sh
@@ -27,7 +27,7 @@ echo "H -> D"
 
				 for log in `seq 1 13`
			
 
				 do
			
 
				 	size=$((2**$log))
			
 
				-	echo "$size	`./cuda-bandwith -pin -HtoD -size $size -cpu-ld $size -gpu-ld $size -iter 50`" >> .results/htod-pin.data 
			
 
				+	echo "$size	`./cuda-bandwidth -pin -HtoD -size $size -cpu-ld $size -gpu-ld $size -iter 50`" >> .results/htod-pin.data 
			
 
				 done
			
 
				 
			
 
				 echo "D -> H"
			
@@ -35,10 +35,10 @@ echo "D -> H"
 
				 for log in `seq 1 13`
			
 
				 do
			
 
				 	size=$((2**$log))
			
 
				-	echo "$size	`./cuda-bandwith -pin -size $size -cpu-ld $size -gpu-ld $size -iter 50`" >> .results/dtoh-pin.data 
			
 
				+	echo "$size	`./cuda-bandwidth -pin -size $size -cpu-ld $size -gpu-ld $size -iter 50`" >> .results/dtoh-pin.data 
			
 
				 done
			
 
				 
			
 
				-./bench-bandwith.gp
			
 
				+./bench-bandwidth.gp
			
 
				 
			
 
				 echo "STRIDED H -> D"
			
 
				 
			
@@ -50,7 +50,7 @@ do
 
				 	for log in `seq 1 $stridelog`
			
 
				 	do
			
 
				 		size=$((2**$log))
			
 
				-		echo "$size	`./cuda-bandwith -pin -HtoD -size $size -cpu-ld $stridesize -gpu-ld $stridesize -iter 50`" >> .results/htod-pin.$stridesize.data 
			
 
				+		echo "$size	`./cuda-bandwidth -pin -HtoD -size $size -cpu-ld $stridesize -gpu-ld $stridesize -iter 50`" >> .results/htod-pin.$stridesize.data 
			
 
				 	done
			
 
				 done
			
 
				 
			
@@ -64,6 +64,6 @@ do
 
				 	for log in `seq 1 $stridelog`
			
 
				 	do
			
 
				 		size=$((2**$log))
			
 
				-		echo "$size	`./cuda-bandwith -pin -size $size -cpu-ld $stridesize -gpu-ld $stridesize -iter 50`" >> .results/dtoh-pin.$stridesize.data 
			
 
				+		echo "$size	`./cuda-bandwidth -pin -size $size -cpu-ld $stridesize -gpu-ld $stridesize -iter 50`" >> .results/dtoh-pin.$stridesize.data 
			
 
				 	done
			
 
				 done
			
--- a/tools/fxt-tool.c
+++ b/tools/fxt-tool.c
@@ -52,7 +52,7 @@ static symbol_name_list_t symbol_list;
 
				 LIST_TYPE(communication,
			
 
				 	unsigned comid;
			
 
				 	float comm_start;	
			
 
				-	float bandwith;
			
 
				+	float bandwidth;
			
 
				 	unsigned node;
			
 
				 );
			
 
				 
			
@@ -85,7 +85,7 @@ static void paje_output_file_init(void)
 
				 	3       S       T       \"Thread State\"                        \n \
			
 
				 	3       MS       Mn       \"Memory Node State\"                        \n \
			
 
				 	4       ntask    Sc       \"Number of tasks\"                        \n \
			
 
				-	4       bw      Mn       \"Bandwith\"                        \n \
			
 
				+	4       bw      Mn       \"Bandwidth\"                        \n \
			
 
				 	6       I       S      Initializing       \"0.0 .7 1.0\"            \n \
			
 
				 	6       D       S      Deinitializing       \"0.0 .1 .7\"            \n \
			
 
				 	6       Fi       S      FetchingInput       \"1.0 .1 1.0\"            \n \
			
@@ -403,14 +403,14 @@ static void handle_end_driver_copy(void)
 
				 			if (itor->comid == comid)
			
 
				 			{
			
 
				 				float comm_end = get_event_time_stamp();
			
 
				-				float bandwith = (float)((0.001*size)/(comm_end - itor->comm_start));
			
 
				+				float bandwidth = (float)((0.001*size)/(comm_end - itor->comm_start));
			
 
				 
			
 
				-				itor->bandwith = bandwith;
			
 
				+				itor->bandwidth = bandwidth;
			
 
				 
			
 
				 				communication_t com = communication_new();
			
 
				 				com->comid = comid;
			
 
				 				com->comm_start = get_event_time_stamp();
			
 
				-				com->bandwith = -bandwith;
			
 
				+				com->bandwidth = -bandwidth;
			
 
				 
			
 
				 				com->node = itor->node;
			
 
				 
			
@@ -422,23 +422,23 @@ static void handle_end_driver_copy(void)
 
				 	}
			
 
				 }
			
 
				 
			
 
				-static void display_bandwith_evolution(void)
			
 
				+static void display_bandwidth_evolution(void)
			
 
				 {
			
 
				-	float current_bandwith = 0.0;
			
 
				-	float current_bandwith_per_node[32] = {0.0};
			
 
				+	float current_bandwidth = 0.0;
			
 
				+	float current_bandwidth_per_node[32] = {0.0};
			
 
				 
			
 
				 	communication_itor_t itor;
			
 
				 	for (itor = communication_list_begin(communication_list);
			
 
				 		itor != communication_list_end(communication_list);
			
 
				 		itor = communication_list_next(itor))
			
 
				 	{
			
 
				-		current_bandwith += itor->bandwith;
			
 
				+		current_bandwidth += itor->bandwidth;
			
 
				 		fprintf(out_paje_file, "13  %f bw %sMEMNODE0 %f\n",
			
 
				-				itor->comm_start, prefix, current_bandwith);
			
 
				+				itor->comm_start, prefix, current_bandwidth);
			
 
				 
			
 
				-		current_bandwith_per_node[itor->node] +=  itor->bandwith;
			
 
				+		current_bandwidth_per_node[itor->node] +=  itor->bandwidth;
			
 
				 		fprintf(out_paje_file, "13  %f bw %sMEMNODE%d %f\n",
			
 
				-				itor->comm_start, prefix, itor->node, current_bandwith_per_node[itor->node]);
			
 
				+				itor->comm_start, prefix, itor->node, current_bandwidth_per_node[itor->node]);
			
 
				 	}
			
 
				 }
			
 
				 
			
@@ -925,7 +925,7 @@ int main(int argc, char **argv)
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	display_bandwith_evolution();
			
 
				+	display_bandwidth_evolution();
			
 
				 
			
 
				 	/* close the different files */
			
 
				 	fclose(out_paje_file);