Procházet zdrojové kódy

Forward-port r6996 from 1.0: fix bus calibration on machines with more than 32 CPUs

Samuel Thibault před 13 roky
rodič
revize
8f8194dfa6
1 změnil soubory, kde provedl 28 přidání a 28 odebrání
  1. 28 28
      src/core/perfmodel/perfmodel_bus.c

+ 28 - 28
src/core/perfmodel/perfmodel_bus.c

@@ -64,19 +64,19 @@ static int nopencl = 0;
 /* Benchmarking the performance of the bus */
 
 #ifdef STARPU_USE_CUDA
-static int cuda_affinity_matrix[STARPU_MAXCUDADEVS][MAXCPUS];
+static int cuda_affinity_matrix[STARPU_MAXCUDADEVS][STARPU_MAXCPUS];
 static double cudadev_timing_htod[STARPU_MAXNODES] = {0.0};
 static double cudadev_timing_dtoh[STARPU_MAXNODES] = {0.0};
 #ifdef HAVE_CUDA_MEMCPY_PEER
 static double cudadev_timing_dtod[STARPU_MAXNODES][STARPU_MAXNODES] = {{0.0}};
 #endif
-static struct dev_timing cudadev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
+static struct dev_timing cudadev_timing_per_cpu[STARPU_MAXNODES*STARPU_MAXCPUS];
 #endif
 #ifdef STARPU_USE_OPENCL
-static int opencl_affinity_matrix[STARPU_MAXOPENCLDEVS][MAXCPUS];
+static int opencl_affinity_matrix[STARPU_MAXOPENCLDEVS][STARPU_MAXCPUS];
 static double opencldev_timing_htod[STARPU_MAXNODES] = {0.0};
 static double opencldev_timing_dtoh[STARPU_MAXNODES] = {0.0};
-static struct dev_timing opencldev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
+static struct dev_timing opencldev_timing_per_cpu[STARPU_MAXNODES*STARPU_MAXCPUS];
 #endif
 
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
@@ -150,7 +150,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 	gettimeofday(&end, NULL);
 	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 
-	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod = timing/NITER/size;
+	dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod = timing/NITER/size;
 
 	/* Measure download bandwidth */
 	gettimeofday(&start, NULL);
@@ -162,7 +162,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 	gettimeofday(&end, NULL);
 	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 
-	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh = timing/NITER/size;
+	dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh = timing/NITER/size;
 
 	/* Free buffers */
 	cudaFreeHost(h_buffer);
@@ -323,7 +323,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, i
 	gettimeofday(&end, NULL);
 	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 
-	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod = timing/NITER/size;
+	dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod = timing/NITER/size;
 
 	/* Measure download bandwidth */
 	gettimeofday(&start, NULL);
@@ -335,7 +335,7 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, i
 	gettimeofday(&end, NULL);
 	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
 
-	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh = timing/NITER/size;
+	dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh = timing/NITER/size;
 
 	/* Free buffers */
 	clReleaseMemObject(d_buffer);
@@ -423,7 +423,7 @@ static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *d
 	unsigned cpu;
 	for (cpu = 0; cpu < ncpus; cpu++)
 	{
-		dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].cpu_id = cpu;
+		dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].cpu_id = cpu;
 
 #ifdef STARPU_HAVE_HWLOC
 		int numa_id = 0;
@@ -437,9 +437,9 @@ static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *d
 			if (is_available_per_numa_node[numa_id])
 			{
 				/* We reuse the previous numbers for that NUMA node */
-				dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod =
+				dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod =
 					dev_timing_htod_per_numa_node[numa_id];
-				dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh =
+				dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh =
 					dev_timing_dtoh_per_numa_node[numa_id];
 				continue;
 			}
@@ -460,9 +460,9 @@ static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *d
 		{
 			/* Save the results for that NUMA node */
 			dev_timing_htod_per_numa_node[numa_id] =
-				dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod;
+				dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod;
 			dev_timing_dtoh_per_numa_node[numa_id] =
-				dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh;
+				dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh;
 
 			is_available_per_numa_node[numa_id] = 1;
 		}
@@ -485,7 +485,7 @@ static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_h
 	measure_bandwidth_between_cpus_and_dev(dev, dev_timing_per_cpu, type);
 
 	/* sort the results */
-	qsort(&(dev_timing_per_cpu[(dev+1)*MAXCPUS]), ncpus,
+	qsort(&(dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS]), ncpus,
               sizeof(struct dev_timing),
 			compar_dev_timing);
 
@@ -493,24 +493,24 @@ static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_h
         unsigned cpu;
 	for (cpu = 0; cpu < ncpus; cpu++)
 	{
-		unsigned current_cpu = dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].cpu_id;
-		double bandwidth_dtoh = dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh;
-		double bandwidth_htod = dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod;
+		unsigned current_cpu = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].cpu_id;
+		double bandwidth_dtoh = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh;
+		double bandwidth_htod = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod;
 
 		double bandwidth_sum2 = bandwidth_dtoh*bandwidth_dtoh + bandwidth_htod*bandwidth_htod;
 
 		_STARPU_DISP("(%10s) BANDWIDTH GPU %d CPU %u - htod %f - dtoh %f - %f\n", type, dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
 	}
 
-	unsigned best_cpu = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].cpu_id;
+	unsigned best_cpu = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+0].cpu_id;
 
 	_STARPU_DISP("(%10s) BANDWIDTH GPU %d BEST CPU %u\n", type, dev, best_cpu);
 #endif
 
 	/* The results are sorted in a decreasing order, so that the best
 	 * measurement is currently the first entry. */
-	dev_timing_dtoh[dev+1] = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].timing_dtoh;
-	dev_timing_htod[dev+1] = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].timing_htod;
+	dev_timing_dtoh[dev+1] = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+0].timing_dtoh;
+	dev_timing_htod[dev+1] = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+0].timing_htod;
 }
 #endif /* defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) */
 
@@ -727,7 +727,7 @@ static void write_bus_affinity_file_content(void)
 
 		for (cpu = 0; cpu < ncpus; cpu++)
 		{
-			fprintf(f, "%d\t", cudadev_timing_per_cpu[(gpu+1)*MAXCPUS+cpu].cpu_id);
+			fprintf(f, "%d\t", cudadev_timing_per_cpu[(gpu+1)*STARPU_MAXCPUS+cpu].cpu_id);
 		}
 
 		fprintf(f, "\n");
@@ -740,7 +740,7 @@ static void write_bus_affinity_file_content(void)
 
 		for (cpu = 0; cpu < ncpus; cpu++)
 		{
-                        fprintf(f, "%d\t", opencldev_timing_per_cpu[(gpu+1)*MAXCPUS+cpu].cpu_id);
+                        fprintf(f, "%d\t", opencldev_timing_per_cpu[(gpu+1)*STARPU_MAXCPUS+cpu].cpu_id);
 		}
 
 		fprintf(f, "\n");
@@ -1108,12 +1108,12 @@ void starpu_bus_print_bandwidth(FILE *f)
         maxnode += nopencl;
 #endif
 
-	fprintf(f, "from\t");
-	fprintf(f, "to RAM\t");
+	fprintf(f, "from to\t");
+	fprintf(f, "RAM\t");
 	for (dst = 0; dst < ncuda; dst++)
-		fprintf(f, "to CUDA %d\t", dst);
+		fprintf(f, "CUDA %d\t", dst);
 	for (dst = 0; dst < nopencl; dst++)
-		fprintf(f, "to OpenCL %d\t", dst);
+		fprintf(f, "OpenCL %d\t", dst);
 	fprintf(f, "\n");
 
 	for (src = 0; src <= maxnode; src++)
@@ -1143,7 +1143,7 @@ void starpu_bus_print_bandwidth(FILE *f)
 			fprintf(f, "CUDA %d\t", src-1);
 			for (cpu = 0; cpu < ncpus; cpu++)
 			{
-				timing = &cudadev_timing_per_cpu[src*MAXCPUS+cpu];
+				timing = &cudadev_timing_per_cpu[src*STARPU_MAXCPUS+cpu];
 				if (timing->timing_htod)
 					fprintf(f, "%d %.0f %.0f\t", timing->cpu_id, 1/timing->timing_htod, 1/timing->timing_dtoh);
 				else
@@ -1155,7 +1155,7 @@ void starpu_bus_print_bandwidth(FILE *f)
 			fprintf(f, "OpenCL%d\t", src-ncuda-1);
 			for (cpu = 0; cpu < ncpus; cpu++)
 			{
-				timing = &opencldev_timing_per_cpu[(src-ncuda)*MAXCPUS+cpu];
+				timing = &opencldev_timing_per_cpu[(src-ncuda)*STARPU_MAXCPUS+cpu];
 				if (timing->timing_htod)
 					fprintf(f, "%d %.0f %.0f\t", timing->cpu_id, 1/timing->timing_htod, 1/timing->timing_dtoh);
 				else