Browse Source

In case hwloc does not know whether there are NUMA nodes or not, we do not make
a per-numa node benchmarking and use the default per CPU approach.

Cédric Augonnet 15 years ago
parent
commit
f86959ff77
1 changed files with 26 additions and 12 deletions
  1. 26 12
      src/core/perfmodel/perfmodel_bus.c

+ 26 - 12
src/core/perfmodel/perfmodel_bus.c

@@ -256,8 +256,15 @@ static int find_numa_node(hwloc_obj_t obj)
 	hwloc_obj_t current = obj;
 	hwloc_obj_t current = obj;
 
 
 	while (current->depth != HWLOC_OBJ_NODE)
 	while (current->depth != HWLOC_OBJ_NODE)
+	{
 		current = current->parent;
 		current = current->parent;
 
 
+		/* If we don't find a "node" obj before the root, this means
+		 * hwloc does not know whether there are numa nodes or not, so
+		 * we should not use a per-node sampling in that case. */
+		STARPU_ASSERT(current);
+	}
+
 	STARPU_ASSERT(current->depth == HWLOC_OBJ_NODE);
 	STARPU_ASSERT(current->depth == HWLOC_OBJ_NODE);
 
 
 	return current->logical_index; 
 	return current->logical_index; 
@@ -272,6 +279,10 @@ static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *d
 #ifdef STARPU_HAVE_HWLOC
 #ifdef STARPU_HAVE_HWLOC
 	int cpu_depth = hwloc_get_type_depth(hwtopology, HWLOC_OBJ_CORE);
 	int cpu_depth = hwloc_get_type_depth(hwtopology, HWLOC_OBJ_CORE);
 	int nnuma_nodes = hwloc_get_nbobjs_by_depth(hwtopology, HWLOC_OBJ_NODE);
 	int nnuma_nodes = hwloc_get_nbobjs_by_depth(hwtopology, HWLOC_OBJ_NODE);
+
+	/* If no NUMA node was found, we assume that we have a single memory
+	 * bank. */
+	const unsigned no_node_obj_was_found = (nnuma_nodes == 0);
 	
 	
 	unsigned is_available_per_numa_node[nnuma_nodes];
 	unsigned is_available_per_numa_node[nnuma_nodes];
 	double dev_timing_htod_per_numa_node[nnuma_nodes];
 	double dev_timing_htod_per_numa_node[nnuma_nodes];
@@ -286,18 +297,21 @@ static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *d
 		dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].cpu_id = cpu;
 		dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].cpu_id = cpu;
 
 
 #ifdef STARPU_HAVE_HWLOC
 #ifdef STARPU_HAVE_HWLOC
-		hwloc_obj_t obj = hwloc_get_obj_by_depth(hwtopology, cpu_depth, cpu);
-
-		int numa_id = find_numa_node(obj);
-
-		if (is_available_per_numa_node[numa_id])
+		if (!no_node_obj_was_found)
 		{
 		{
-			/* We reuse the previous numbers for that NUMA node */
-			dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod =
-				dev_timing_htod_per_numa_node[numa_id];
-			dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh =
-				dev_timing_dtoh_per_numa_node[numa_id];
-			continue;
+			hwloc_obj_t obj = hwloc_get_obj_by_depth(hwtopology, cpu_depth, cpu);
+	
+			int numa_id = find_numa_node(obj);
+	
+			if (is_available_per_numa_node[numa_id])
+			{
+				/* We reuse the previous numbers for that NUMA node */
+				dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod =
+					dev_timing_htod_per_numa_node[numa_id];
+				dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh =
+					dev_timing_dtoh_per_numa_node[numa_id];
+				continue;
+			}
 		}
 		}
 #endif
 #endif
 
 
@@ -311,7 +325,7 @@ static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *d
 #endif
 #endif
 
 
 #ifdef STARPU_HAVE_HWLOC
 #ifdef STARPU_HAVE_HWLOC
-		if (!is_available_per_numa_node[numa_id])
+		if (!no_node_obj_was_found && !is_available_per_numa_node[numa_id])
 		{
 		{
 			/* Save the results for that NUMA node */
 			/* Save the results for that NUMA node */
 			dev_timing_htod_per_numa_node[numa_id] =
 			dev_timing_htod_per_numa_node[numa_id] =