
Use libnvidia-ml to detect NVLinks for simgrid platforms

Samuel Thibault 4 years ago
parent commit e955e3fc3f
2 changed files with 110 additions and 12 deletions
  1. src/core/perfmodel/perfmodel_bus.c (+108 -10)
  2. src/core/topology.c (+2 -2)

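The core of the change in src/core/perfmodel/perfmodel_bus.c asks NVML, for each CUDA device, whether each of its possible NVLinks is active and, if so, which PCI device sits at the remote end; that remote end is then classified as a direct CPU link (IBM OpenCAPI port), an NVIDIA NVSwitch, or a peer GPU. The following standalone sketch is not part of the commit; it only illustrates the NVML calls involved and prints the remote PCI address of every active link. Note that NVML enumeration order is not guaranteed to match CUDA device order, which is why the patch resolves CUDA devices to NVML handles through _starpu_cuda_get_nvmldev instead of indexing NVML directly.

/* nvlink_probe.c: minimal sketch of the NVML NVLink queries (illustration only).
 * Build roughly with: gcc nvlink_probe.c -lnvidia-ml (exact flags depend on
 * the driver installation). */
#include <stdio.h>
#include <nvml.h>

int main(void)
{
	unsigned int ndev, i, l;

	if (nvmlInit() != NVML_SUCCESS)
		return 1;
	if (nvmlDeviceGetCount(&ndev) != NVML_SUCCESS)
		ndev = 0;

	for (i = 0; i < ndev; i++)
	{
		nvmlDevice_t dev;
		if (nvmlDeviceGetHandleByIndex(i, &dev) != NVML_SUCCESS)
			continue;

		for (l = 0; l < NVML_NVLINK_MAX_LINKS; l++)
		{
			nvmlEnableState_t active;
			nvmlPciInfo_t pci;

			/* Skip links that are absent or not enabled */
			if (nvmlDeviceGetNvLinkState(dev, l, &active) != NVML_SUCCESS
			 || active != NVML_FEATURE_ENABLED)
				continue;
			/* The remote end may be another GPU, an NVSwitch, or a
			 * CPU-side port (e.g. IBM OpenCAPI on POWER9) */
			if (nvmlDeviceGetNvLinkRemotePciInfo(dev, l, &pci) != NVML_SUCCESS)
				continue;

			printf("GPU %u link %u -> %04x:%02x:%02x.0\n",
			       i, l, pci.domain, pci.bus, pci.device);
		}
	}

	nvmlShutdown();
	return 0;
}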
src/core/perfmodel/perfmodel_bus.c (+108 -10)

@@ -2859,19 +2859,97 @@ static void write_bus_platform_file_content(int version)
 #warning TODO: use libnvml to get NVLink links, otherwise numbers will be bogusly propagated through PCI topology
 #endif
 	/* If we have enough hwloc information, write PCI bandwidths and routes */
-	if (!starpu_get_env_number_default("STARPU_PCI_FLAT", 0))
+	if (!starpu_get_env_number_default("STARPU_PCI_FLAT", 0) && ncuda > 0)
 	{
 		hwloc_topology_t topology;
 		hwloc_topology_init(&topology);
 		_starpu_topology_filter(topology);
 		hwloc_topology_load(topology);
 
-		/* First find paths and record measured bandwidth along the path */
+		char nvlink[ncuda][ncuda];
+		char nvlinkhost[ncuda];
+		memset(nvlink, 0, sizeof(nvlink));
+		memset(nvlinkhost, 0, sizeof(nvlinkhost));
+
+#ifdef STARPU_HAVE_LIBNVIDIA_ML
+		/* First find NVLinks */
+		struct cudaDeviceProp props[ncuda];
+
+		for (i = 0; i < ncuda; i++)
+		{
+			cudaError_t cures = cudaGetDeviceProperties(&props[i], i);
+			if (cures != cudaSuccess)
+				props[i].name[0] = 0;
+		}
+
+		for (i = 0; i < ncuda; i++)
+		{
+			unsigned j;
+
+			if (!props[i].name[0])
+				continue;
+
+			nvmlDevice_t nvmldev;
+			nvmldev = _starpu_cuda_get_nvmldev(&props[i]);
+			if (!nvmldev)
+				continue;
+
+			for (j = 0; j < NVML_NVLINK_MAX_LINKS; j++)
+			{
+				nvmlEnableState_t active;
+				nvmlReturn_t ret;
+				nvmlPciInfo_t pci;
+				unsigned k;
+
+				ret = nvmlDeviceGetNvLinkState(nvmldev, j, &active);
+				if (ret != NVML_SUCCESS)
+					continue;
+				if (active != NVML_FEATURE_ENABLED)
+					continue;
+				ret = nvmlDeviceGetNvLinkRemotePciInfo(nvmldev, j, &pci);
+				if (ret != NVML_SUCCESS)
+					continue;
+
+				hwloc_obj_t obj = hwloc_get_pcidev_by_busid(topology,
+						pci.domain, pci.bus, pci.device, 0);
+				if (obj && obj->type == HWLOC_OBJ_PCI_DEVICE && (obj->attr->pcidev.class_id >> 8 == 0x06))
+				{
+					switch (obj->attr->pcidev.vendor_id)
+					{
+					case 0x1014:
+						/* IBM OpenCAPI port, direct CPU-GPU NVLink */
+						/* TODO: NUMA affinity */
+						nvlinkhost[i] = 1;
+						continue;
+					case 0x10de:
+						/* TODO: NVIDIA NVSwitch */
+						continue;
+					}
+				}
+
+				/* Otherwise, link to another GPU? */
+				for (k = i+1; k < ncuda; k++)
+				{
+					if (pci.domain == props[k].pciDomainID
+					 && pci.bus == props[k].pciBusID
+					 && pci.device == props[k].pciDeviceID)
+					{
+						nvlink[i][k] = 1;
+						nvlink[k][i] = 1;
+						break;
+					}
+				}
+			}
+		}
+#endif
+
+		/* Find paths and record measured bandwidth along the path */
 		for (i = 0; i < ncuda; i++)
 		{
 			unsigned j;
+
 			for (j = 0; j < ncuda; j++)
-				if (i != j)
+				if (i != j && !nvlink[i][j] && !nvlinkhost[i] && !nvlinkhost[j])
 					if (!find_platform_cuda_path(topology, i, j, 1000000. / cudadev_timing_dtod[i][j]))
 					{
 						_STARPU_DISP("Warning: could not get CUDA location from hwloc\n");
@@ -2879,15 +2957,20 @@ static void write_bus_platform_file_content(int version)
 						hwloc_topology_destroy(topology);
 						goto flat_cuda;
 					}
+
 			/* Record RAM/CUDA bandwidths */
-			find_platform_forward_path(hwloc_cuda_get_device_osdev_by_index(topology, i), 1000000. / search_bus_best_timing(i, "CUDA", 0));
-			find_platform_backward_path(hwloc_cuda_get_device_osdev_by_index(topology, i), 1000000. / search_bus_best_timing(i, "CUDA", 1));
+			if (!nvlinkhost[i])
+			{
+				find_platform_forward_path(hwloc_cuda_get_device_osdev_by_index(topology, i), 1000000. / search_bus_best_timing(i, "CUDA", 0));
+				find_platform_backward_path(hwloc_cuda_get_device_osdev_by_index(topology, i), 1000000. / search_bus_best_timing(i, "CUDA", 1));
+			}
 		}
 
 		/* Ok, found path in all cases, can emit advanced platform routes */
 		fprintf(f, "\n");
 		emit_topology_bandwidths(f, hwloc_get_root_obj(topology), Bps, s);
 		fprintf(f, "\n");
+
 		for (i = 0; i < ncuda; i++)
 		{
 			unsigned j;
@@ -2896,20 +2979,35 @@ static void write_bus_platform_file_content(int version)
 				{
 					fprintf(f, "   <route src=\"CUDA%u\" dst=\"CUDA%u\" symmetrical=\"NO\">\n", i, j);
 					fprintf(f, "    <link_ctn id=\"CUDA%u-CUDA%u\"/>\n", i, j);
-					emit_platform_path_up(f,
-							hwloc_cuda_get_device_osdev_by_index(topology, i),
-							hwloc_cuda_get_device_osdev_by_index(topology, j));
+					if (!nvlink[i][j])
+					{
+						if (nvlinkhost[i] && nvlinkhost[j])
+							/* TODO: NUMA affinity */
+							fprintf(f, "    <link_ctn id=\"Host\"/>\n");
+						else
+							emit_platform_path_up(f,
+									hwloc_cuda_get_device_osdev_by_index(topology, i),
+									hwloc_cuda_get_device_osdev_by_index(topology, j));
+					}
 					fprintf(f, "   </route>\n");
 				}
 
 			fprintf(f, "   <route src=\"CUDA%u\" dst=\"RAM\" symmetrical=\"NO\">\n", i);
 			fprintf(f, "    <link_ctn id=\"CUDA%u-RAM\"/>\n", i);
-			emit_platform_forward_path(f, hwloc_cuda_get_device_osdev_by_index(topology, i));
+			if (nvlinkhost[i])
+				/* TODO: NUMA affinity */
+				fprintf(f, "    <link_ctn id=\"Host\"/>\n");
+			else
+				emit_platform_forward_path(f, hwloc_cuda_get_device_osdev_by_index(topology, i));
 			fprintf(f, "   </route>\n");
 
 			fprintf(f, "   <route src=\"RAM\" dst=\"CUDA%u\" symmetrical=\"NO\">\n", i);
 			fprintf(f, "    <link_ctn id=\"RAM-CUDA%u\"/>\n", i);
-			emit_platform_backward_path(f, hwloc_cuda_get_device_osdev_by_index(topology, i));
+			if (nvlinkhost[i])
+				/* TODO: NUMA affinity */
+				fprintf(f, "    <link_ctn id=\"Host\"/>\n");
+			else
+				emit_platform_backward_path(f, hwloc_cuda_get_device_osdev_by_index(topology, i));
 			fprintf(f, "   </route>\n");
 		}
 

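To illustrate what the route emission above produces (the device numbers here are made up): when two GPUs are joined by a direct NVLink, only their dedicated link appears in the route, and when a GPU's NVLink goes to the host, its RAM routes use the shared "Host" link instead of the reconstructed PCI path:

   <route src="CUDA0" dst="CUDA1" symmetrical="NO">
    <link_ctn id="CUDA0-CUDA1"/>
   </route>
   <route src="CUDA0" dst="RAM" symmetrical="NO">
    <link_ctn id="CUDA0-RAM"/>
    <link_ctn id="Host"/>
   </route>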
src/core/topology.c (+2 -2)

@@ -1247,10 +1247,10 @@ unsigned _starpu_topology_get_nnumanodes(struct _starpu_machine_config *config S
 void _starpu_topology_filter(hwloc_topology_t topology)
 {
 #if HWLOC_API_VERSION >= 0x20000
-	hwloc_topology_set_io_types_filter(topology, HWLOC_TYPE_FILTER_KEEP_IMPORTANT);
+	hwloc_topology_set_io_types_filter(topology, HWLOC_TYPE_FILTER_KEEP_ALL);
 	hwloc_topology_set_flags(topology, HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM);
 #else
-	hwloc_topology_set_flags(topology, HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM | HWLOC_TOPOLOGY_FLAG_IO_DEVICES | HWLOC_TOPOLOGY_FLAG_IO_BRIDGES);
+	hwloc_topology_set_flags(topology, HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM | HWLOC_TOPOLOGY_FLAG_WHOLE_IO);
 #endif
 #ifdef HAVE_HWLOC_TOPOLOGY_SET_COMPONENTS
 #  ifndef STARPU_USE_CUDA
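The src/core/topology.c hunk relaxes the hwloc I/O filtering, presumably because the remote end of an NVLink reported by NVML can be a bridge-class PCI device (an IBM OpenCAPI port or an NVIDIA NVSwitch) that HWLOC_TYPE_FILTER_KEEP_IMPORTANT would discard, in which case the hwloc_get_pcidev_by_busid() lookup in perfmodel_bus.c would fail. A minimal sketch of that dependency, assuming hwloc >= 2.0 and a made-up PCI location:

/* Illustration only: the lookup of the NVLink remote end only succeeds if the
 * topology kept all PCI devices when it was loaded. */
#include <stdio.h>
#include <hwloc.h>

int main(void)
{
	hwloc_topology_t topology;
	hwloc_obj_t obj;

	hwloc_topology_init(&topology);
	/* Keep every I/O object, including bridge-class devices, not only the
	 * "important" ones (GPUs, NICs, storage, ...) */
	hwloc_topology_set_io_types_filter(topology, HWLOC_TYPE_FILTER_KEEP_ALL);
	hwloc_topology_load(topology);

	/* Hypothetical PCI location, as nvmlDeviceGetNvLinkRemotePciInfo()
	 * would report it */
	obj = hwloc_get_pcidev_by_busid(topology, 0x0004, 0x04, 0x00, 0);
	if (obj)
		printf("remote end found: vendor %04x device %04x class 0x%04x\n",
		       obj->attr->pcidev.vendor_id, obj->attr->pcidev.device_id,
		       obj->attr->pcidev.class_id);
	else
		printf("remote end not in the topology\n");

	hwloc_topology_destroy(topology);
	return 0;
}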