Browse Source

If hwloc is available, we accelerate the bus sampling by only benchmarking the
bus between the different NUMA nodes and the different GPUs, rather than
testing all CPU/GPU pairs.

Cédric Augonnet 15 years ago
parent
commit
8e890458ad
1 changed files with 75 additions and 7 deletions
  1. 75 7
      src/core/perfmodel/perfmodel_bus.c

+ 75 - 7
src/core/perfmodel/perfmodel_bus.c

@@ -64,6 +64,10 @@ static double opencldev_timing_dtoh[STARPU_MAXNODES] = {0.0};
 static struct dev_timing opencldev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
 static struct dev_timing opencldev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
 #endif
 #endif
 
 
+#ifdef STARPU_HAVE_HWLOC
+static hwloc_topology_t hwtopology;
+#endif
+
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
 
 
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
@@ -116,8 +120,6 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 	struct timeval start;
 	struct timeval start;
 	struct timeval end;
 	struct timeval end;
 
 
-	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].cpu_id = cpu;
-
 	/* Measure upload bandwidth */
 	/* Measure upload bandwidth */
 	gettimeofday(&start, NULL);
 	gettimeofday(&start, NULL);
 	for (iter = 0; iter < NITER; iter++)
 	for (iter = 0; iter < NITER; iter++)
@@ -147,7 +149,6 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int
 	cudaFree(d_buffer);
 	cudaFree(d_buffer);
 
 
 	cudaThreadExit();
 	cudaThreadExit();
-
 }
 }
 #endif
 #endif
 
 
@@ -198,8 +199,6 @@ static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, i
 	struct timeval start;
 	struct timeval start;
 	struct timeval end;
 	struct timeval end;
 
 
-	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].cpu_id = cpu;
-
 	/* Measure upload bandwidth */
 	/* Measure upload bandwidth */
 	gettimeofday(&start, NULL);
 	gettimeofday(&start, NULL);
 	for (iter = 0; iter < NITER; iter++)
 	for (iter = 0; iter < NITER; iter++)
@@ -251,12 +250,57 @@ static int compar_dev_timing(const void *left_dev_timing, const void *right_dev_
 	return (bandwidth_sum2_left < bandwidth_sum2_right);
 	return (bandwidth_sum2_left < bandwidth_sum2_right);
 }
 }
 
 
-static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_htod, double *dev_timing_dtoh,
-                                                   struct dev_timing *dev_timing_per_cpu, char type)
+static int find_numa_node(hwloc_obj_t obj)
 {
 {
+	STARPU_ASSERT(obj);
+	hwloc_obj_t current = obj;
+
+	while (current->depth != HWLOC_OBJ_NODE)
+		current = current->parent;
+
+	STARPU_ASSERT(current->depth == HWLOC_OBJ_NODE);
+
+	return current->logical_index; 
+}
+
+static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *dev_timing_per_cpu, char type)
+{
+	/* Either we have hwloc and we measure the bandwith between each GPU
+	 * and each NUMA node, or we don't have such NUMA information and we
+	 * measure the bandwith for each pair of (CPU, GPU), which is slower.
+	 * */
+#ifdef STARPU_HAVE_HWLOC
+	int cpu_depth = hwloc_get_type_depth(hwtopology, HWLOC_OBJ_CORE);
+	int nnuma_nodes = hwloc_get_nbobjs_by_depth(hwtopology, HWLOC_OBJ_NODE);
+	
+	unsigned is_available_per_numa_node[nnuma_nodes];
+	double dev_timing_htod_per_numa_node[nnuma_nodes];
+	double dev_timing_dtoh_per_numa_node[nnuma_nodes];
+
+	memset(is_available_per_numa_node, 0, nnuma_nodes*sizeof(unsigned));
+#endif
+
 	unsigned cpu;
 	unsigned cpu;
 	for (cpu = 0; cpu < ncpus; cpu++)
 	for (cpu = 0; cpu < ncpus; cpu++)
 	{
 	{
+		dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].cpu_id = cpu;
+
+#ifdef STARPU_HAVE_HWLOC
+		hwloc_obj_t obj = hwloc_get_obj_by_depth(hwtopology, cpu_depth, cpu);
+
+		int numa_id = find_numa_node(obj);
+
+		if (is_available_per_numa_node[numa_id])
+		{
+			/* We reuse the previous numbers for that NUMA node */
+			dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod =
+				dev_timing_htod_per_numa_node[numa_id];
+			dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh =
+				dev_timing_dtoh_per_numa_node[numa_id];
+			continue;
+		}
+#endif
+
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
                 if (type == 'C')
                 if (type == 'C')
                         measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(dev, cpu, dev_timing_per_cpu);
                         measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(dev, cpu, dev_timing_per_cpu);
@@ -265,7 +309,26 @@ static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_h
                 if (type == 'O')
                 if (type == 'O')
                         measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(dev, cpu, dev_timing_per_cpu);
                         measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(dev, cpu, dev_timing_per_cpu);
 #endif
 #endif
+
+#ifdef STARPU_HAVE_HWLOC
+		if (!is_available_per_numa_node[numa_id])
+		{
+			/* Save the results for that NUMA node */
+			dev_timing_htod_per_numa_node[numa_id] =
+				dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod;
+			dev_timing_dtoh_per_numa_node[numa_id] =
+				dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh;
+
+			is_available_per_numa_node[numa_id] = 1;
+		}
+#endif
         }
         }
+}
+
+static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_htod, double *dev_timing_dtoh,
+                                                   struct dev_timing *dev_timing_per_cpu, char type)
+{
+	measure_bandwidth_between_cpus_and_dev(dev, dev_timing_per_cpu, type);
 
 
 	/* sort the results */
 	/* sort the results */
 	qsort(&(dev_timing_per_cpu[(dev+1)*MAXCPUS]), ncpus,
 	qsort(&(dev_timing_per_cpu[(dev+1)*MAXCPUS]), ncpus,
@@ -305,6 +368,11 @@ static void benchmark_all_gpu_devices(void)
 	fprintf(stderr, "Benchmarking the speed of the bus\n");
 	fprintf(stderr, "Benchmarking the speed of the bus\n");
 #endif
 #endif
 
 
+#ifdef STARPU_HAVE_HWLOC
+	hwloc_topology_init(&hwtopology);
+	hwloc_topology_load(hwtopology);
+#endif
+
 	/* TODO: use hwloc */
 	/* TODO: use hwloc */
 	/* Save the current cpu binding */
 	/* Save the current cpu binding */
 	cpu_set_t former_process_affinity;
 	cpu_set_t former_process_affinity;