
Improve the simgrid simulation on big machines by modeling bandwidths through PCI links and hubs

Samuel Thibault 10 years ago
parent
commit
af5c5b002e
2 changed files with 500 additions and 28 deletions
  1. configure.ac (+1, -0)
  2. src/core/perfmodel/perfmodel_bus.c (+499, -28)

+ 1 - 0
configure.ac

@@ -2397,6 +2397,7 @@ AS_IF([test "$have_valid_hwloc" = "yes"],
       [AC_DEFINE([STARPU_HAVE_HWLOC], [1], [Define to 1 if you have the hwloc library.])
        HWLOC_REQUIRES=hwloc
        AC_SUBST([STARPU_HAVE_HWLOC], [1])
+       AC_CHECK_DECLS([hwloc_cuda_get_device_osdev_by_index], [], [], [[#include <hwloc/cuda.h>]])
        AS_IF([test "$have_pkgconfig_hwloc" = "no"],
              [HWLOC_CFLAGS="-I$hwloc_dir/include"
 	      HWLOC_LIBS="-L$hwloc_dir/lib -lhwloc"]
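(Note: unlike AC_CHECK_FUNCS, AC_CHECK_DECLS always defines HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX, to 1 or 0, which is why the C code below tests it with a plain #if rather than #ifdef.)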

+ 499 - 28
src/core/perfmodel/perfmodel_bus.c

@@ -45,6 +45,10 @@
 #include <windows.h>
 #endif
 
+#if HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX
+#include <hwloc/cuda.h>
+#endif
+
 #define SIZE	(32*1024*1024*sizeof(char))
 #define NITER	128
 
@@ -1071,7 +1075,7 @@ static int load_bus_latency_file_content(void)
 			n = _starpu_read_double(f, "%lf", &latency);
 			if (n && !isnan(latency))
 			{
-				_STARPU_DISP("Too many nodes in latency file %s for this configuration (%d), use --enable-maxnodes to increase it\n", path, STARPU_MAXNODES);
+				_STARPU_DISP("Too many nodes in latency file %s for this configuration (%d)\n", path, STARPU_MAXNODES);
 				fclose(f);
 				return 0;
 			}
@@ -1293,7 +1297,7 @@ static int load_bus_bandwidth_file_content(void)
 			n = _starpu_read_double(f, "%lf", &bandwidth);
 			if (n && !isnan(bandwidth))
 			{
-				_STARPU_DISP("Too many nodes in bandwidth file %s for this configuration (%d), use --enable-maxnodes to increase it\n", path, STARPU_MAXNODES);
+				_STARPU_DISP("Too many nodes in bandwidth file %s for this configuration (%d)\n", path, STARPU_MAXNODES);
 				fclose(f);
 				return 0;
 			}
@@ -1676,6 +1680,395 @@ void _starpu_simgrid_get_platform_path(char *path, size_t maxlen)
 }
 
 #ifndef STARPU_SIMGRID
+/*
+ * Compute the precise PCI tree bandwidth and link shares
+ *
+ * We only have measurements from one leaf to another. We assume that the
+ * available bandwidth is greater at lower levels, and thus measurements from
+ * increasingly far GPUs provide the PCI bridges bandwidths at each level.
+ *
+ * The bandwidth of a PCI bridge is thus computed as the maximum of the speed
+ * of the various transfers that we have achieved through it.  We thus browse
+ * the PCI tree three times:
+ *
+ * - first through all CUDA-CUDA possible transfers to compute the maximum
+ *   measured bandwidth on each PCI link and hub used for that.
+ * - then through the whole tree to emit links for each PCI link and hub.
+ * - then through all CUDA-CUDA possible transfers again to emit routes.
+ */
+
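As a hypothetical illustration of this inference: if CUDA0 and CUDA1 sit behind the same PCI hub and reach 10 GB/s between each other, while CUDA0 only reaches 6 GB/s to a remoter GPU through the hub's uplink, the hub's internal bandwidth gets recorded as max(10, 6) = 10 GB/s but its uplink only as 6 GB/s; far transfers thus reveal upper-level capacities without understating lower-level ones.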
+#if defined(STARPU_USE_CUDA) && HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX && defined(HAVE_CUDA_MEMCPY_PEER)
+
+/* Records, for each PCI link and hub, the maximum bandwidth seen through it */
+struct pci_userdata {
+	/* Uplink max measurement */
+	double bw_up;
+	double bw_down;
+
+	/* Hub max measurement */
+	double bw;
+};
+
+/* Allocate a pci_userdata structure for the given object */
+static void allocate_userdata(hwloc_obj_t obj)
+{
+	struct pci_userdata *data;
+
+	if (obj->userdata)
+		return;
+
+	data = obj->userdata = malloc(sizeof(*data));
+	data->bw_up = 0.0;
+	data->bw_down = 0.0;
+	data->bw = 0.0;
+}
+
+/* Update the maximum bandwidth seen going up towards upstream */
+static void update_bandwidth_up(hwloc_obj_t obj, double bandwidth)
+{
+	struct pci_userdata *data;
+	if (obj->type != HWLOC_OBJ_BRIDGE && obj->type != HWLOC_OBJ_PCI_DEVICE)
+		return;
+	allocate_userdata(obj);
+
+	data = obj->userdata;
+	if (data->bw_up < bandwidth)
+		data->bw_up = bandwidth;
+}
+
+/* Update the maximum bandwidth seen coming down from upstream */
+static void update_bandwidth_down(hwloc_obj_t obj, double bandwidth)
+{
+	struct pci_userdata *data;
+	if (obj->type != HWLOC_OBJ_BRIDGE && obj->type != HWLOC_OBJ_PCI_DEVICE)
+		return;
+	allocate_userdata(obj);
+
+	data = obj->userdata;
+	if (data->bw_down < bandwidth)
+		data->bw_down = bandwidth;
+}
+
+/* Update the maximum bandwidth seen going through this Hub */
+static void update_bandwidth_through(hwloc_obj_t obj, double bandwidth)
+{
+	struct pci_userdata *data;
+	allocate_userdata(obj);
+
+	data = obj->userdata;
+	if (data->bw < bandwidth)
+		data->bw = bandwidth;
+}
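A minimal usage sketch of these three recorders (hub is a hypothetical hwloc_obj_t for a PCI bridge; the units are simply whatever the callers below pass):

	update_bandwidth_through(hub, 6000.);	/* allocates hub->userdata, bw = 6000 */
	update_bandwidth_through(hub, 10000.);	/* bw raised to 10000 */
	update_bandwidth_through(hub, 8000.);	/* bw stays 10000: only the maximum is kept */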
+
+/* find_* functions perform the first step: computing maximum bandwidths */
+
+/* Our traffic had to go through the host: walk back from the target up to the
+ * host, updating the uplink downstream bandwidth along the way */
+static void find_platform_backward_path(hwloc_obj_t obj, double bandwidth)
+{
+	/* Update uplink bandwidth of PCI Hub */
+	update_bandwidth_down(obj, bandwidth);
+	/* Update internal bandwidth of PCI Hub */
+	update_bandwidth_through(obj, bandwidth);
+
+	if (obj->type == HWLOC_OBJ_BRIDGE && obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST)
+		/* Finished */
+		return;
+
+	/* Continue up */
+	find_platform_backward_path(obj->parent, bandwidth);
+}
+/* Same, but update uplink upstream bandwidth */
+static void find_platform_forward_path(hwloc_obj_t obj, double bandwidth)
+{
+	/* Update uplink bandwidth of PCI Hub */
+	update_bandwidth_up(obj, bandwidth);
+	/* Update internal bandwidth of PCI Hub */
+	update_bandwidth_through(obj, bandwidth);
+
+	if (obj->type == HWLOC_OBJ_BRIDGE && obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST)
+		/* Finished */
+		return;
+
+	/* Continue up */
+	find_platform_forward_path(obj->parent, bandwidth);
+}
+
+/* Find the path from obj1 through parent down to obj2 (without ever going up),
+ * and update the maximum bandwidth along the path */
+static int find_platform_path_down(hwloc_obj_t parent, hwloc_obj_t obj1, hwloc_obj_t obj2, double bandwidth)
+{
+	unsigned i;
+
+	/* Base case, path is empty */
+	if (parent == obj2)
+		return 1;
+
+	/* Try to go down from parent */
+	for (i = 0; i < parent->arity; i++)
+		if (parent->children[i] != obj1 && find_platform_path_down(parent->children[i], NULL, obj2, bandwidth))
+		{
+			/* Found it down there, update bandwidth of parent */
+			update_bandwidth_down(parent->children[i], bandwidth);
+			update_bandwidth_through(parent, bandwidth);
+			return 1;
+		}
+	return 0;
+}
+
+/* Find the path from obj1 to obj2, and update the maximum bandwidth along the
+ * path */
+static int find_platform_path_up(hwloc_obj_t obj1, hwloc_obj_t obj2, double bandwidth)
+{
+	int ret;
+	hwloc_obj_t parent = obj1->parent;
+
+	if (find_platform_path_down(parent, obj1, obj2, bandwidth))
+		/* obj2 was a mere (sub)child of our parent */
+		return 1;
+
+	/* obj2 is not a (sub)child of our parent, we have to go up through the parent */
+	if (parent->type == HWLOC_OBJ_BRIDGE && parent->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST)
+	{
+	/* We have to go up to the Host, so obj2 is not in the same PCI
+	 * tree: we account here for the path from obj1 up to the Host, and
+	 * just find the path from obj2 to the Host too.
+	 */
+		find_platform_backward_path(obj2, bandwidth);
+
+		update_bandwidth_up(parent, bandwidth);
+		update_bandwidth_through(parent, bandwidth);
+
+		return 1;
+	}
+
+	/* Not at host yet, just go up */
+	ret = find_platform_path_up(parent, obj2, bandwidth);
+	update_bandwidth_up(parent, bandwidth);
+	update_bandwidth_through(parent, bandwidth);
+	return ret;
+}
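Concretely (a hypothetical walk-through): for two GPUs behind the same PCI switch, find_platform_path_down() succeeds at their common ancestor and only the links below it are updated; for GPUs hanging off different host bridges, as on typical dual-socket machines, the recursion reaches obj1's host bridge and find_platform_backward_path() charges the same measurement to obj2's whole chain, modeling the transfer as crossing the Host.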
+
+/* Find the path between CUDA device i and CUDA device j, and update the maximum bandwidth along the path */
+static int find_platform_cuda_path(hwloc_topology_t topology, unsigned i, unsigned j, double bandwidth)
+{
+	hwloc_obj_t cudai, cudaj;
+	cudai = hwloc_cuda_get_device_osdev_by_index(topology, i);
+	cudaj = hwloc_cuda_get_device_osdev_by_index(topology, j);
+
+	if (!cudai || !cudaj)
+		return 0;
+
+	return find_platform_path_up(cudai, cudaj, bandwidth);
+}
+
+/* emit_topology_bandwidths performs the second step: emitting the links */
+
+/* Emit the link name of the object */
+static void emit_pci_hub(FILE *f, hwloc_obj_t obj)
+{
+	STARPU_ASSERT(obj->type == HWLOC_OBJ_BRIDGE);
+	fprintf(f, "PCI:%04x:[%02x-%02x]", obj->attr->bridge.downstream.pci.domain, obj->attr->bridge.downstream.pci.secondary_bus, obj->attr->bridge.downstream.pci.subordinate_bus);
+}
+
+static void emit_pci_dev(FILE *f, struct hwloc_pcidev_attr_s *pcidev)
+{
+	fprintf(f, "PCI:%04x:%02x:%02x.%1x", pcidev->domain, pcidev->bus, pcidev->dev, pcidev->func);
+}
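With these format strings, a hub whose downstream covers secondary buses 02 through 04 of domain 0000 gets named PCI:0000:[02-04], and function 0 of a device on bus 02 gets named PCI:0000:02:00.0 (made-up example values).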
+
+/* Emit the links of the object */
+static void emit_topology_bandwidths(FILE *f, hwloc_obj_t obj)
+{
+	unsigned i;
+	if (obj->userdata) {
+		struct pci_userdata *data = obj->userdata;
+
+		if (obj->type == HWLOC_OBJ_BRIDGE)
+		{
+			/* Uplink */
+			fprintf(f, "   <link id='");
+			emit_pci_hub(f, obj);
+			fprintf(f, " up' bandwidth='%f' latency='0.000000'/>\n", data->bw_up);
+			fprintf(f, "   <link id='");
+			emit_pci_hub(f, obj);
+			fprintf(f, " down' bandwidth='%f' latency='0.000000'/>\n", data->bw_down);
+
+			/* PCI Switches are assumed to have infinite internal bandwidth */
+			if (!obj->name || !strstr(obj->name, "Switch"))
+			{
+				/* We assume that PCI Hubs have double bandwidth in
+				 * order to support full duplex but not more */
+				fprintf(f, "   <link id='");
+				emit_pci_hub(f, obj);
+				fprintf(f, " through' bandwidth='%f' latency='0.000000'/>\n", data->bw * 2);
+			}
+		}
+		else if (obj->type == HWLOC_OBJ_PCI_DEVICE)
+		{
+			fprintf(f, "   <link id='");
+			emit_pci_dev(f, &obj->attr->pcidev);
+			fprintf(f, " up' bandwidth='%f' latency='0.000000'/>\n", data->bw_up);
+			fprintf(f, "   <link id='");
+			emit_pci_dev(f, &obj->attr->pcidev);
+			fprintf(f, " down' bandwidth='%f' latency='0.000000'/>\n", data->bw_down);
+		}
+	}
+
+	for (i = 0; i < obj->arity; i++)
+		emit_topology_bandwidths(f, obj->children[i]);
+}
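For a non-switch bridge this emits three links; a hypothetical excerpt (made-up id and figures, with a measured maximum of 10000 through the hub doubled to 20000 per the full-duplex assumption above):

   <link id='PCI:0000:[02-04] up' bandwidth='6000.000000' latency='0.000000'/>
   <link id='PCI:0000:[02-04] down' bandwidth='5500.000000' latency='0.000000'/>
   <link id='PCI:0000:[02-04] through' bandwidth='20000.000000' latency='0.000000'/>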
+
+/* emit_pci_link_* functions perform the third step: emitting the routes */
+
+static void emit_pci_link(FILE *f, hwloc_obj_t obj, const char *suffix)
+{
+	if (obj->type == HWLOC_OBJ_BRIDGE)
+	{
+		fprintf(f, "    <link_ctn id='");
+		emit_pci_hub(f, obj);
+		fprintf(f, " %s'/>\n", suffix);
+	}
+	else if (obj->type == HWLOC_OBJ_PCI_DEVICE)
+	{
+		fprintf(f, "    <link_ctn id='");
+		emit_pci_dev(f, &obj->attr->pcidev);
+		fprintf(f, " %s'/>\n", suffix);
+	}
+}
+
+/* Go to upstream */
+static void emit_pci_link_up(FILE *f, hwloc_obj_t obj)
+{
+	emit_pci_link(f, obj, "up");
+}
+
+/* Go from upstream */
+static void emit_pci_link_down(FILE *f, hwloc_obj_t obj)
+{
+	emit_pci_link(f, obj, "down");
+}
+
+/* Go through PCI hub */
+static void emit_pci_link_through(FILE *f, hwloc_obj_t obj)
+{
+	/* We don't care about traffic going through PCI switches */
+	if (obj->type == HWLOC_OBJ_BRIDGE)
+	{
+		if (!obj->name || !strstr(obj->name, "Switch"))
+			emit_pci_link(f, obj, "through");
+		else
+		{
+			fprintf(f, "    <!--   Switch ");
+			emit_pci_hub(f, obj);
+			fprintf(f, " through -->\n");
+		}
+	}
+}
+
+/* Our traffic has to go through the host: walk back from the target up to the
+ * host, using the uplink downstream links along the way */
+static void emit_platform_backward_path(FILE *f, hwloc_obj_t obj)
+{
+	/* Go through PCI Hub */
+	emit_pci_link_through(f, obj);
+	/* Go through uplink */
+	emit_pci_link_down(f, obj);
+
+	if (obj->type == HWLOC_OBJ_BRIDGE && obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST)
+	{
+		/* Finished, go through host */
+		fprintf(f, "    <link_ctn id='Host'/>\n");
+		return;
+	}
+
+	/* Continue up */
+	emit_platform_backward_path(f, obj->parent);
+}
+/* Same, but use upstream link */
+static void emit_platform_forward_path(FILE *f, hwloc_obj_t obj)
+{
+	/* Go through PCI Hub */
+	emit_pci_link_through(f, obj);
+	/* Go through uplink */
+	emit_pci_link_up(f, obj);
+
+	if (obj->type == HWLOC_OBJ_BRIDGE && obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST)
+	{
+		/* Finished, go through host */
+		fprintf(f, "    <link_ctn id='Host'/>\n");
+		return;
+	}
+
+	/* Continue up */
+	emit_platform_forward_path(f, obj->parent);
+}
+
+/* Find the path from obj1 through parent down to obj2 (without ever going up),
+ * and use the links along the path */
+static int emit_platform_path_down(FILE *f, hwloc_obj_t parent, hwloc_obj_t obj1, hwloc_obj_t obj2)
+{
+	unsigned i;
+
+	/* Base case, path is empty */
+	if (parent == obj2)
+		return 1;
+
+	/* Try to go down from parent */
+	for (i = 0; i < parent->arity; i++)
+		if (parent->children[i] != obj1 && emit_platform_path_down(f, parent->children[i], NULL, obj2))
+		{
+			/* Found it down there, path goes through this hub */
+			emit_pci_link_down(f, parent->children[i]);
+			emit_pci_link_through(f, parent);
+			return 1;
+		}
+	return 0;
+}
+
+/* Find the path from obj1 to obj2, and use the links along the path */
+static int emit_platform_path_up(FILE *f, hwloc_obj_t obj1, hwloc_obj_t obj2)
+{
+	int ret;
+	hwloc_obj_t parent = obj1->parent;
+
+	if (emit_platform_path_down(f, parent, obj1, obj2))
+		/* obj2 was a mere (sub)child of our parent */
+		return 1;
+
+	/* obj2 is not a (sub)child of our parent, we have to go up through the parent */
+	if (parent->type == HWLOC_OBJ_BRIDGE && parent->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST)
+	{
+	/* We have to go up to the Host, so obj2 is not in the same PCI
+	 * tree: we emit here the links from obj1 up to the Host, and just
+	 * emit the path from obj2 to the Host too.
+	 */
+		emit_platform_backward_path(f, obj2);
+		fprintf(f, "    <link_ctn id='Host'/>\n");
+
+		emit_pci_link_up(f, parent);
+		emit_pci_link_through(f, parent);
+
+		return 1;
+	}
+
+	/* Not at host yet, just go up */
+	ret = emit_platform_path_up(f, parent, obj2);
+	emit_pci_link_up(f, parent);
+	emit_pci_link_through(f, parent);
+	return ret;
+}
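Putting the third step together, for two GPUs under a common non-switch hub these functions would emit a route body like the following hypothetical excerpt (the caller below wraps it in a <route> element and prepends the direct CUDA0-CUDA1 link; the link_ctn order follows the recursion's unwinding rather than the physical path):

    <link_ctn id='PCI:0000:03:00.0 down'/>
    <link_ctn id='PCI:0000:[02-04] through'/>
    <link_ctn id='PCI:0000:02:00.0 up'/>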
+
+/* Clean our mess in the topology before destroying it */
+static void clean_topology(hwloc_obj_t obj)
+{
+	unsigned i;
+	if (obj->userdata)
+		free(obj->userdata);
+	for (i = 0; i < obj->arity; i++)
+		clean_topology(obj->children[i]);
+}
+#endif
+
 static void write_bus_platform_file_content(void)
 {
 	FILE *f;
@@ -1735,7 +2128,9 @@ static void write_bus_platform_file_content(void)
 
 	fprintf(f, "\n   <host id='RAM' power='1'/>\n");
 
-	/* Compute maximum bandwidth, taken as machine bandwidth */
+	/*
+	 * Compute maximum bandwidth, taken as host bandwidth
+	 */
 	double max_bandwidth = 0;
 #ifdef STARPU_USE_CUDA
 	for (i = 0; i < ncuda; i++)
@@ -1759,10 +2154,35 @@ static void write_bus_platform_file_content(void)
 			max_bandwidth = up_bw;
 	}
 #endif
-	fprintf(f, "\n   <link id='Share' bandwidth='%f' latency='0.000000'/>\n\n", max_bandwidth*1000000);
+	fprintf(f, "\n   <link id='Host' bandwidth='%f' latency='0.000000'/>\n\n", max_bandwidth*1000000);
+
+	/*
+	 * OpenCL links
+	 */
+
+#ifdef STARPU_USE_OPENCL
+	for (i = 0; i < nopencl; i++)
+	{
+		char i_name[16];
+		snprintf(i_name, sizeof(i_name), "OpenCL%d", i);
+		fprintf(f, "   <link id='RAM-%s' bandwidth='%f' latency='%f'/>\n",
+			i_name,
+			1000000 / opencldev_timing_htod[1+i],
+			opencldev_latency_htod[1+i]/1000000.);
+		fprintf(f, "   <link id='%s-RAM' bandwidth='%f' latency='%f'/>\n",
+			i_name,
+			1000000 / opencldev_timing_dtoh[1+i],
+			opencldev_latency_dtoh[1+i]/1000000.);
+	}
+	fprintf(f, "\n");
+#endif
+
+	/*
+	 * CUDA links and routes
+	 */
 
-	/* Write bandwidths & latencies */
 #ifdef STARPU_USE_CUDA
+	/* Write RAM/CUDA bandwidths and latencies */
 	for (i = 0; i < ncuda; i++)
 	{
 		char i_name[16];
@@ -1776,7 +2196,9 @@ static void write_bus_platform_file_content(void)
 			1000000. / cudadev_timing_dtoh[1+i],
 			cudadev_latency_dtoh[1+i]/1000000.);
 	}
+	fprintf(f, "\n");
 #ifdef HAVE_CUDA_MEMCPY_PEER
+	/* Write CUDA/CUDA bandwidths and latencies */
 	for (i = 0; i < ncuda; i++)
 	{
 		unsigned j;
@@ -1795,32 +2217,75 @@ static void write_bus_platform_file_content(void)
 		}
 	}
 #endif
-#endif
 
-#ifdef STARPU_USE_OPENCL
-	for (i = 0; i < nopencl; i++)
+#if HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX && defined(HAVE_CUDA_MEMCPY_PEER)
+	/* If we have enough hwloc information, write PCI bandwidths and routes */
+	if (!starpu_get_env_number_default("STARPU_PCI_FLAT", 0)) {
+		hwloc_topology_t topology;
+		hwloc_topology_init(&topology);
+		hwloc_topology_set_flags(topology, HWLOC_TOPOLOGY_FLAG_IO_DEVICES | HWLOC_TOPOLOGY_FLAG_IO_BRIDGES);
+		hwloc_topology_load(topology);
+
+		/* First find paths and record measured bandwidth along the path */
+		for (i = 0; i < ncuda; i++)
+		{
+			unsigned j;
+			for (j = 0; j < ncuda; j++)
+				if (i != j)
+					if (!find_platform_cuda_path(topology, i, j, 1000000. / cudadev_timing_dtod[1+i][1+j]))
+					{
+						clean_topology(hwloc_get_root_obj(topology));
+						hwloc_topology_destroy(topology);
+						goto flat_cuda;
+					}
+			/* Record RAM/CUDA bandwidths */
+			find_platform_forward_path(hwloc_cuda_get_device_osdev_by_index(topology, i), 1000000. / cudadev_timing_dtoh[1+i]);
+			find_platform_backward_path(hwloc_cuda_get_device_osdev_by_index(topology, i), 1000000. / cudadev_timing_htod[1+i]);
+		}
+
+		/* Ok, found path in all cases, can emit advanced platform routes */
+		fprintf(f, "\n");
+		emit_topology_bandwidths(f, hwloc_get_root_obj(topology));
+		fprintf(f, "\n");
+		for (i = 0; i < ncuda; i++)
+		{
+			unsigned j;
+			for (j = 0; j < ncuda; j++)
+				if (i != j)
+				{
+					fprintf(f, "   <route src='CUDA%u' dst='CUDA%u' symmetrical='NO'>\n", i, j);
+					fprintf(f, "    <link_ctn id='CUDA%d-CUDA%d'/>\n", i, j);
+					emit_platform_path_up(f,
+						hwloc_cuda_get_device_osdev_by_index(topology, i),
+						hwloc_cuda_get_device_osdev_by_index(topology, j));
+					fprintf(f, "   </route>\n");
+				}
+
+			fprintf(f, "   <route src='CUDA%d' dst='RAM' symmetrical='NO'>\n", i);
+			fprintf(f, "    <link_ctn id='CUDA%d-RAM'/>\n", i);
+			emit_platform_forward_path(f, hwloc_cuda_get_device_osdev_by_index(topology, i));
+			fprintf(f, "   </route>\n");
+
+			fprintf(f, "   <route src='RAM' dst='CUDA%d' symmetrical='NO'>\n", i);
+			fprintf(f, "    <link_ctn id='RAM-CUDA%d'/>\n", i);
+			emit_platform_backward_path(f, hwloc_cuda_get_device_osdev_by_index(topology, i));
+			fprintf(f, "   </route>\n");
+		}
+
+		clean_topology(hwloc_get_root_obj(topology));
+		hwloc_topology_destroy(topology);
+	} else {
+flat_cuda:
+#else
 	{
-		char i_name[16];
-		snprintf(i_name, sizeof(i_name), "OpenCL%d", i);
-		fprintf(f, "   <link id='RAM-%s' bandwidth='%f' latency='%f'/>\n",
-			i_name,
-			1000000 / opencldev_timing_htod[1+i],
-			opencldev_latency_htod[1+i]/1000000.);
-		fprintf(f, "   <link id='%s-RAM' bandwidth='%f' latency='%f'/>\n",
-			i_name,
-			1000000 / opencldev_timing_dtoh[1+i],
-			opencldev_latency_dtoh[1+i]/1000000.);
-	}
 #endif
-
-	/* Write routes */
-#ifdef STARPU_USE_CUDA
+	/* If we don't have enough hwloc information, write trivial routes that always go through the host */
 	for (i = 0; i < ncuda; i++)
 	{
 		char i_name[16];
 		snprintf(i_name, sizeof(i_name), "CUDA%d", i);
-		fprintf(f, "   <route src='RAM' dst='%s' symmetrical='NO'><link_ctn id='RAM-%s'/><link_ctn id='Share'/></route>\n", i_name, i_name);
-		fprintf(f, "   <route src='%s' dst='RAM' symmetrical='NO'><link_ctn id='%s-RAM'/><link_ctn id='Share'/></route>\n", i_name, i_name);
+		fprintf(f, "   <route src='RAM' dst='%s' symmetrical='NO'><link_ctn id='RAM-%s'/><link_ctn id='Host'/></route>\n", i_name, i_name);
+		fprintf(f, "   <route src='%s' dst='RAM' symmetrical='NO'><link_ctn id='%s-RAM'/><link_ctn id='Host'/></route>\n", i_name, i_name);
 	}
 #ifdef HAVE_CUDA_MEMCPY_PEER
 	for (i = 0; i < ncuda; i++)
@@ -1834,19 +2299,25 @@ static void write_bus_platform_file_content(void)
 			if (j == i)
 				continue;
 			snprintf(j_name, sizeof(j_name), "CUDA%d", j);
-			fprintf(f, "   <route src='%s' dst='%s' symmetrical='NO'><link_ctn id='%s-%s'/><link_ctn id='Share'/></route>\n", i_name, j_name, i_name, j_name);
+			fprintf(f, "   <route src='%s' dst='%s' symmetrical='NO'><link_ctn id='%s-%s'/><link_ctn id='Host'/></route>\n", i_name, j_name, i_name, j_name);
 		}
 	}
 #endif
-#endif
+	} /* HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX && defined(HAVE_CUDA_MEMCPY_PEER) */
+	fprintf(f, "\n");
+#endif /* STARPU_USE_CUDA */
+
+	/*
+	 * OpenCL routes
+	 */
 
 #ifdef STARPU_USE_OPENCL
 	for (i = 0; i < nopencl; i++)
 	{
 		char i_name[16];
 		snprintf(i_name, sizeof(i_name), "OpenCL%d", i);
-		fprintf(f, "   <route src='RAM' dst='%s' symmetrical='NO'><link_ctn id='RAM-%s'/><link_ctn id='Share'/></route>\n", i_name, i_name);
-		fprintf(f, "   <route src='%s' dst='RAM' symmetrical='NO'><link_ctn id='%s-RAM'/><link_ctn id='Share'/></route>\n", i_name, i_name);
+		fprintf(f, "   <route src='RAM' dst='%s' symmetrical='NO'><link_ctn id='RAM-%s'/><link_ctn id='Host'/></route>\n", i_name, i_name);
+		fprintf(f, "   <route src='%s' dst='RAM' symmetrical='NO'><link_ctn id='%s-RAM'/><link_ctn id='Host'/></route>\n", i_name, i_name);
 	}
 #endif