
Add code that takes PCI topology into account to determine contention. But disable it for now, since it seems to actually decrease performance...

Samuel Thibault 8 years ago
parent
commit
2510657bc9

+ 4 - 0
include/starpu_profiling.h

@@ -108,6 +108,10 @@ int starpu_bus_get_count(void);
 int starpu_bus_get_id(int src, int dst);
 int starpu_bus_get_src(int busid);
 int starpu_bus_get_dst(int busid);
+void starpu_bus_set_direct(int busid, int direct);
+int starpu_bus_get_direct(int busid);
+void starpu_bus_set_ngpus(int busid, int ngpus);
+int starpu_bus_get_ngpus(int busid);
 
 int starpu_bus_get_profiling_info(int busid, struct starpu_profiling_bus_info *bus_info);
 

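The four new accessors expose, for each registered bus, whether GPU-Direct is enabled on it and how many GPUs contend on its PCI link. A minimal usage sketch (a hypothetical standalone program, assuming only this header plus the existing starpu_bus_get_count/src/dst API):

	#include <stdio.h>
	#include <starpu.h>
	#include <starpu_profiling.h>

	int main(void)
	{
		int busid;
		if (starpu_init(NULL) != 0)
			return 1;
		/* Walk every registered bus and dump the new topology information */
		for (busid = 0; busid < starpu_bus_get_count(); busid++)
			printf("bus %d (%d -> %d): direct=%d ngpus=%d\n", busid,
			       starpu_bus_get_src(busid), starpu_bus_get_dst(busid),
			       starpu_bus_get_direct(busid), starpu_bus_get_ngpus(busid));
		starpu_shutdown();
		return 0;
	}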
+ 25 - 1
src/core/perfmodel/perfmodel_bus.c

@@ -97,6 +97,7 @@ static double cudadev_latency_dtod[STARPU_MAXNODES][STARPU_MAXNODES] = {{0.0}};
 #endif
 #endif
 static struct dev_timing cudadev_timing_per_cpu[STARPU_MAXNODES*STARPU_MAXCPUS];
+static char cudadev_direct[STARPU_MAXNODES][STARPU_MAXNODES];
 #endif
 
 #ifndef STARPU_SIMGRID
@@ -267,7 +268,10 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 		{
 			cures = cudaDeviceEnablePeerAccess(dst, 0);
 			if (!cures)
+			{
 				_STARPU_DISP("GPU-Direct %d -> %d\n", dst, src);
+				cudadev_direct[src][dst] = 1;
+			}
 		}
 	}
 
@@ -289,7 +293,10 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 		{
 			cures = cudaDeviceEnablePeerAccess(src, 0);
 			if (!cures)
+			{
 				_STARPU_DISP("GPU-Direct %d -> %d\n", src, dst);
+				cudadev_direct[dst][src] = 1;
+			}
 		}
 	}
 
@@ -2195,6 +2202,7 @@ static void write_bus_platform_file_content(int version)
 #ifdef HAVE_CUDA_MEMCPY_PEER
 		fprintf(f, "     <prop id=\"memcpy_peer\" value=\"1\"/>\n");
 #endif
+		/* TODO: record cudadev_direct instead of assuming it's NUMA nodes */
 		fprintf(f, "   </host>\n");
 	}
 
@@ -2504,8 +2512,24 @@ double starpu_transfer_predict(unsigned src_node, unsigned dst_node, size_t size
 	double bandwidth = bandwidth_matrix[src_node][dst_node];
 	double latency = latency_matrix[src_node][dst_node];
 	struct _starpu_machine_topology *topology = &_starpu_get_machine_config()->topology;
+	int busid = starpu_bus_get_id(src_node, dst_node);
+	int direct = starpu_bus_get_direct(busid);
+	float ngpus = topology->ncudagpus+topology->nopenclgpus;
+
+#if 0
+	/* Ideally we should take into account that some GPUs are directly
+	 * connected through a PCI switch, which has less contention than the
+	 * Host bridge, but doing that seems to *decrease* performance... */
+	if (direct)
+	{
+		float neighbours = starpu_bus_get_ngpus(busid);
+		/* Count transfers of these GPUs, and count transfers between
+		 * other GPUs and these GPUs */
+		ngpus = neighbours + (ngpus - neighbours) * neighbours / ngpus;
+	}
+#endif
 
-	return latency + (size/bandwidth)*2*(topology->ncudagpus+topology->nopenclgpus);
+	return latency + (size/bandwidth)*2*ngpus;
 }
 
 

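The #if 0 block above refines the flat contention factor (every GPU contends with every other one) into a per-bus factor. A worked example under assumed numbers, for a machine with 4 GPUs of which 2 sit behind the same PCI switch:

	#include <stdio.h>

	int main(void)
	{
		/* Hypothetical machine: 4 GPUs total, 2 behind one PCI switch */
		float ngpus = 4, neighbours = 2;
		/* Transfers among the neighbours themselves, plus the share of
		 * the other GPUs' transfers that target them (formula above) */
		float effective = neighbours + (ngpus - neighbours) * neighbours / ngpus;
		printf("effective contention: %g GPUs instead of %g\n", effective, ngpus);
		/* prints 3 instead of 4: the switch shields part of the traffic */
		return 0;
	}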
+ 108 - 0
src/core/simgrid.c

@@ -677,4 +677,112 @@ _starpu_simgrid_thread_start(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[])
 	f(arg);
 	return 0;
 }
+
+msg_host_t
+_starpu_simgrid_get_memnode_host(unsigned node)
+{
+	const char *fmt;
+	char name[16];
+
+	switch (starpu_node_get_kind(node))
+	{
+		case STARPU_CPU_RAM:
+			fmt = "RAM";
+			break;
+		case STARPU_CUDA_RAM:
+			fmt = "CUDA%u";
+			break;
+		case STARPU_OPENCL_RAM:
+			fmt = "OpenCL%u";
+			break;
+		default:
+			STARPU_ABORT();
+			break;
+	}
+	snprintf(name, sizeof(name), fmt, _starpu_memory_node_get_devid(node));
+
+	return _starpu_simgrid_get_host_by_name(name);
+}
+
+void _starpu_simgrid_count_ngpus(void)
+{
+	unsigned src, dst;
+	msg_host_t ramhost = _starpu_simgrid_get_host_by_name("RAM");
+
+	/* For each pair of memory nodes, get the route */
+	for (src = 1; src < STARPU_MAXNODES; src++)
+		for (dst = 1; dst < STARPU_MAXNODES; dst++)
+		{
+			int busid;
+			msg_host_t srchost, dsthost;
+			const SD_link_t *route;
+			int i, routesize;
+			int through;
+			unsigned src2;
+			unsigned ngpus;
+			const char *name;
+
+			if (dst == src)
+				continue;
+			busid = starpu_bus_get_id(src, dst);
+			if (busid == -1)
+				continue;
+
+			srchost = _starpu_simgrid_get_memnode_host(src);
+			dsthost = _starpu_simgrid_get_memnode_host(dst);
+			routesize = SD_route_get_size(srchost, dsthost);
+			route = SD_route_get_list(srchost, dsthost);
+
+			/* If it goes through "Host", skip it: there is no
+			 * direct transfer support */
+			for (i = 0; i < routesize; i++)
+				if (!strcmp(sg_link_name(route[i]), "Host"))
+					break;
+			if (i < routesize)
+				continue;
+
+			/* Get the PCI bridge between down and up links */
+			through = -1;
+			for (i = 0; i < routesize; i++)
+			{
+				name = sg_link_name(route[i]);
+				size_t len = strlen(name);
+				if (!strcmp(" through", name+len-8))
+					through = i;
+				else if (!strcmp(" up", name+len-3))
+					break;
+			}
+			/* Didn't find it?! */
+			if (through == -1)
+			{
+				_STARPU_DEBUG("Didn't find through-link for %d->%d\n", src, dst);
+				continue;
+			}
+			name = sg_link_name(route[through]);
+
+			/*
+			 * count how many direct routes go through it between
+			 * GPUs and RAM
+			 */
+			ngpus = 0;
+			for (src2 = 1; src2 < STARPU_MAXNODES; src2++)
+			{
+				if (starpu_bus_get_id(src2, STARPU_MAIN_RAM) == -1)
+					continue;
+				msg_host_t srchost2 = _starpu_simgrid_get_memnode_host(src2);
+				int routesize2 = SD_route_get_size(srchost2, ramhost);
+				const SD_link_t *route2 = SD_route_get_list(srchost2, ramhost);
+
+				for (i = 0; i < routesize2; i++)
+					if (!strcmp(name, sg_link_name(route2[i])))
+					{
+						/* This GPU goes through this PCI bridge to access RAM */
+						ngpus++;
+						break;
+					}
+			}
+			_STARPU_DEBUG("%d->%d through %s, %u GPUs\n", src, dst, name, ngpus);
+			starpu_bus_set_ngpus(busid, ngpus);
+		}
+}
 #endif

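_starpu_simgrid_count_ngpus relies on the naming convention of the generated platform file: a PCI bridge's down link ends in " through" and its up link in " up". A standalone sketch of that suffix test (the link names are invented for illustration; note that the patch compares name+len-8 directly, which assumes every link name is at least eight characters long, while the sketch adds a length guard):

	#include <stdio.h>
	#include <string.h>

	/* Does name end with suffix? Safe even when name is shorter. */
	static int ends_with(const char *name, const char *suffix)
	{
		size_t n = strlen(name), s = strlen(suffix);
		return n >= s && !strcmp(name + n - s, suffix);
	}

	int main(void)
	{
		const char *links[] = { "PCI0 through", "PCI0 up", "Host" };
		unsigned i;
		for (i = 0; i < sizeof(links)/sizeof(links[0]); i++)
			printf("%-12s: through=%d up=%d\n", links[i],
			       ends_with(links[i], " through"), ends_with(links[i], " up"));
		return 0;
	}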
+ 5 - 0
src/core/simgrid.h

@@ -47,6 +47,7 @@ int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node,
 int _starpu_simgrid_get_nbhosts(const char *prefix);
 unsigned long long _starpu_simgrid_get_memsize(const char *prefix, unsigned devid);
 msg_host_t _starpu_simgrid_get_host_by_name(const char *name);
+msg_host_t _starpu_simgrid_get_memnode_host(unsigned node);
 struct _starpu_worker;
 msg_host_t _starpu_simgrid_get_host_by_worker(struct _starpu_worker *worker);
 void _starpu_simgrid_get_platform_path(int version, char *path, size_t maxlen);
@@ -63,6 +64,10 @@ starpu_pthread_queue_t _starpu_simgrid_task_queue[STARPU_NMAXWORKERS];
 #define _starpu_simgrid_queue_malloc_cost() starpu_get_env_number_default("STARPU_SIMGRID_QUEUE_MALLOC_COST", 1)
 #define _starpu_simgrid_task_submit_cost() starpu_get_env_number_default("STARPU_SIMGRID_TASK_SUBMIT_COST", 1)
 
+/* Called at initialization to count how many GPUs are interfering with each
+ * bus */
+void _starpu_simgrid_count_ngpus(void);
+
 #endif
 
 #endif // __SIMGRID_H__

+ 81 - 4
src/core/topology.c

@@ -47,6 +47,10 @@
 #include <core/simgrid.h>
 #endif
 
+#if defined(HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX) && HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX
+#include <hwloc/cuda.h>
+#endif
+
 static unsigned topology_is_initialized = 0;
 static int nobind;
 
@@ -516,6 +520,7 @@ _starpu_init_topology (struct _starpu_machine_config *config)
 #ifndef STARPU_SIMGRID
 #ifdef STARPU_HAVE_HWLOC
 	hwloc_topology_init(&topology->hwtopology);
+	hwloc_topology_set_flags(topology->hwtopology, HWLOC_TOPOLOGY_FLAG_IO_DEVICES | HWLOC_TOPOLOGY_FLAG_IO_BRIDGES);
 	hwloc_topology_load(topology->hwtopology);
 	_starpu_allocate_topology_userdata(hwloc_get_root_obj(topology->hwtopology));
 #endif
@@ -930,6 +935,29 @@ _starpu_deinit_mp_config (struct _starpu_machine_config *config)
 }
 #endif
 
+#ifdef STARPU_HAVE_HWLOC
+static unsigned
+_starpu_topology_count_ngpus(hwloc_obj_t obj)
+{
+	struct _starpu_hwloc_userdata *data = obj->userdata;
+	unsigned n = data->ngpus;
+	unsigned i;
+
+	for (i = 0; i < obj->arity; i++)
+		n += _starpu_topology_count_ngpus(obj->children[i]);
+
+	data->ngpus = n;
+#ifdef STARPU_VERBOSE
+	{
+		char name[64];
+		hwloc_obj_type_snprintf(name, sizeof(name), obj, 0);
+		_STARPU_DEBUG("hwloc obj %s has %u GPUs below\n", name, n);
+	}
+#endif
+	return n;
+}
+#endif
+
 static int
 _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_config STARPU_ATTRIBUTE_UNUSED)
 {
@@ -1039,6 +1067,23 @@ _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_con
 			entry->gpuid = devid;
 			HASH_ADD_INT(devices_using_cuda, gpuid, entry);
 		}
+
+#ifndef STARPU_SIMGRID
+#if defined(HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX) && HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX
+		{
+			hwloc_obj_t obj = hwloc_cuda_get_device_osdev_by_index(topology->hwtopology, devid);
+			if (obj)
+			{
+				struct _starpu_hwloc_userdata *data = obj->userdata;
+				data->ngpus++;
+			}
+			else
+			{
+				_STARPU_DISP("Warning: could not find location of CUDA%u, do you have the hwloc CUDA plugin installed?\n", devid);
+			}
+		}
+#endif
+#endif
         }
 
 	topology->nworkers += topology->ncudagpus * nworker_per_cuda;
@@ -1565,8 +1610,8 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 					workerarg->bindid = cuda_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
 					memory_node = cuda_memory_nodes[devid] = _starpu_memory_node_register(STARPU_CUDA_RAM, devid);
 
-					_starpu_register_bus(STARPU_MAIN_RAM, memory_node);
-					_starpu_register_bus(memory_node, STARPU_MAIN_RAM);
+					_starpu_cuda_bus_ids[0][devid+1] = _starpu_register_bus(STARPU_MAIN_RAM, memory_node);
+					_starpu_cuda_bus_ids[devid+1][0] = _starpu_register_bus(memory_node, STARPU_MAIN_RAM);
 #ifdef STARPU_SIMGRID
 					const char* cuda_memcpy_peer;
 					snprintf(name, sizeof(name), "CUDA%d", devid);
@@ -1589,11 +1634,35 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 						for (worker2 = 0; worker2 < worker; worker2++)
 						{
 							struct _starpu_worker *workerarg2 = &config->workers[worker2];
+							int devid2 = workerarg2->devid;
 							if (workerarg2->arch == STARPU_CUDA_WORKER)
 							{
 								unsigned memory_node2 = starpu_worker_get_memory_node(worker2);
-								_starpu_register_bus(memory_node2, memory_node);
-								_starpu_register_bus(memory_node, memory_node2);
+								_starpu_cuda_bus_ids[devid2][devid] = _starpu_register_bus(memory_node2, memory_node);
+								_starpu_cuda_bus_ids[devid][devid2] = _starpu_register_bus(memory_node, memory_node2);
+#ifndef STARPU_SIMGRID
+#if defined(HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX) && HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX
+								{
+									hwloc_obj_t obj, obj2, ancestor;
+									obj = hwloc_cuda_get_device_osdev_by_index(config->topology.hwtopology, devid);
+									obj2 = hwloc_cuda_get_device_osdev_by_index(config->topology.hwtopology, devid2);
+									ancestor = hwloc_get_common_ancestor_obj(config->topology.hwtopology, obj, obj2);
+									if (ancestor)
+									{
+										struct _starpu_hwloc_userdata *data = ancestor->userdata;
+#ifdef STARPU_VERBOSE
+										{
+											char name[64];
+											hwloc_obj_type_snprintf(name, sizeof(name), ancestor, 0);
+											_STARPU_DEBUG("CUDA%u and CUDA%u are linked through %s, along %u GPUs\n", devid, devid2, name, data->ngpus);
+										}
+#endif
+										starpu_bus_set_ngpus(_starpu_cuda_bus_ids[devid2][devid], data->ngpus);
+										starpu_bus_set_ngpus(_starpu_cuda_bus_ids[devid][devid2], data->ngpus);
+									}
+								}
+#endif
+#endif
 							}
 						}
 					}
@@ -1752,6 +1821,14 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 			config->bindid_workers[bindid].workerids[config->bindid_workers[bindid].nworkers-1] = worker;
 		}
 	}
+
+#ifdef STARPU_SIMGRID
+	_starpu_simgrid_count_ngpus();
+#else
+#ifdef STARPU_HAVE_HWLOC
+	_starpu_topology_count_ngpus(hwloc_get_root_obj(config->topology.hwtopology));
+#endif
+#endif
 }
 
 int

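On the native (non-simgrid) side, each CUDA OS device object gets its ngpus count bumped, _starpu_topology_count_ngpus accumulates those counts bottom-up, and for every GPU pair the deepest common ancestor's count becomes the contention factor of their bus. A minimal standalone sketch of the ancestor lookup (hwloc 1.x API, matching the flags used above; HWLOC_TOPOLOGY_FLAG_IO_DEVICES/IO_BRIDGES were replaced by type filters in hwloc 2.x; the device indices are hypothetical):

	#include <stdio.h>
	#include <hwloc.h>
	#include <hwloc/cuda.h>

	int main(void)
	{
		hwloc_topology_t topo;
		hwloc_obj_t a, b, ancestor;
		char type[64];

		hwloc_topology_init(&topo);
		/* Same flags as the patch: include PCI devices and bridges */
		hwloc_topology_set_flags(topo, HWLOC_TOPOLOGY_FLAG_IO_DEVICES | HWLOC_TOPOLOGY_FLAG_IO_BRIDGES);
		hwloc_topology_load(topo);

		a = hwloc_cuda_get_device_osdev_by_index(topo, 0);
		b = hwloc_cuda_get_device_osdev_by_index(topo, 1);
		if (a && b)
		{
			ancestor = hwloc_get_common_ancestor_obj(topo, a, b);
			hwloc_obj_type_snprintf(type, sizeof(type), ancestor, 0);
			printf("CUDA0 and CUDA1 are linked through a %s\n", type);
		}
		hwloc_topology_destroy(topo);
		return 0;
	}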
+ 1 - 0
src/core/topology.h

@@ -31,6 +31,7 @@ struct _starpu_machine_config;
 /* This is allocated for each hwloc object */
 struct _starpu_hwloc_userdata {
 	struct _starpu_worker_list *worker_list; /* List of workers running on this obj */
+	unsigned ngpus; /* Number of GPUs sharing this PCI link */
 };
 #endif
 #endif

+ 2 - 6
src/datawizard/coherency.c

@@ -239,12 +239,8 @@ static int worker_supports_direct_access(unsigned node, unsigned handling_node)
 #ifdef STARPU_SIMGRID
 			if (starpu_node_get_kind(handling_node) == STARPU_CUDA_RAM)
 			{
-				char name[16];
-				msg_host_t host;
-				const char* cuda_memcpy_peer;
-				snprintf(name, sizeof(name), "CUDA%d", _starpu_memory_node_get_devid(handling_node));
-				host = _starpu_simgrid_get_host_by_name(name);
-				cuda_memcpy_peer = MSG_host_get_property_value(host, "memcpy_peer");
+				msg_host_t host = _starpu_simgrid_get_memnode_host(handling_node);
+				const char* cuda_memcpy_peer = MSG_host_get_property_value(host, "memcpy_peer");
 				return cuda_memcpy_peer && atoll(cuda_memcpy_peer);
 			}
 			else

+ 5 - 0
src/drivers/cuda/driver_cuda.c

@@ -50,6 +50,7 @@
 static unsigned ncudagpus;
 
 static size_t global_mem[STARPU_MAXCUDADEVS];
+int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+1][STARPU_MAXCUDADEVS+1];
 #ifdef STARPU_USE_CUDA
 static cudaStream_t streams[STARPU_NMAXWORKERS];
 static cudaStream_t out_transfer_streams[STARPU_MAXCUDADEVS];
@@ -270,7 +271,11 @@ static void init_device_context(unsigned devid)
 				{
 					cures = cudaDeviceEnablePeerAccess(worker->devid, 0);
 					if (!cures)
+					{
 						_STARPU_DEBUG("Enabled GPU-Direct %d -> %d\n", worker->devid, devid);
+						/* direct copies are made from the destination, see link_supports_direct_transfers */
+						starpu_bus_set_direct(_starpu_cuda_bus_ids[worker->devid][devid], 1);
+					}
 				}
 			}
 		}

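init_device_context now records a bus as direct only when peer access could actually be enabled. A minimal standalone illustration of the CUDA runtime calls involved (a hypothetical two-GPU machine; error handling elided):

	#include <stdio.h>
	#include <cuda_runtime.h>

	int main(void)
	{
		int can = 0;
		/* Can device 1 read/write device 0's memory directly? */
		cudaDeviceCanAccessPeer(&can, 1, 0);
		if (can)
		{
			cudaSetDevice(1);
			/* Direct copies are made from the destination (see
			 * link_supports_direct_transfers), so enabling access on
			 * device 1 lets 0 -> 1 transfers bypass host RAM */
			if (cudaDeviceEnablePeerAccess(0, 0) == cudaSuccess)
				printf("Enabled GPU-Direct 0 -> 1\n");
		}
		return 0;
	}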
+ 2 - 1
src/drivers/cuda/driver_cuda.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2012-2014  Université de Bordeaux
+ * Copyright (C) 2009, 2010, 2012-2014, 2016  Université de Bordeaux
  * Copyright (C) 2010, 2012  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -35,6 +35,7 @@
 #include <common/fxt.h>
 
 unsigned _starpu_get_cuda_device_count(void);
+extern int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+1][STARPU_MAXCUDADEVS+1];
 
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 void _starpu_cuda_discover_devices (struct _starpu_machine_config *);

+ 27 - 0
src/profiling/profiling.c

@@ -49,6 +49,8 @@ struct node_pair
 static int busid_matrix[STARPU_MAXNODES][STARPU_MAXNODES];
 static struct starpu_profiling_bus_info bus_profiling_info[STARPU_MAXNODES][STARPU_MAXNODES];
 static struct node_pair busid_to_node_pair[STARPU_MAXNODES*STARPU_MAXNODES];
+static char bus_direct[STARPU_MAXNODES*STARPU_MAXNODES];
+static int bus_ngpus[STARPU_MAXNODES*STARPU_MAXNODES];
 static unsigned busid_cnt = 0;
 
 static void _starpu_bus_reset_profiling_info(struct starpu_profiling_bus_info *bus_info);
@@ -429,6 +431,31 @@ int starpu_bus_get_dst(int busid)
 	return busid_to_node_pair[busid].dst;
 }
 
+void starpu_bus_set_direct(int busid, int direct)
+{
+	bus_direct[busid] = direct;
+}
+
+int starpu_bus_get_direct(int busid)
+{
+	return bus_direct[busid];
+}
+
+void starpu_bus_set_ngpus(int busid, int ngpus)
+{
+	bus_ngpus[busid] = ngpus;
+}
+
+int starpu_bus_get_ngpus(int busid)
+{
+	struct _starpu_machine_topology *topology = &_starpu_get_machine_config()->topology;
+	int ngpus = bus_ngpus[busid];
+	if (!ngpus)
+		/* Unknown number of GPUs, assume it's shared by all GPUs */
+		ngpus = topology->ncudagpus+topology->nopenclgpus;
+	return ngpus;
+}
+
 int starpu_bus_get_profiling_info(int busid, struct starpu_profiling_bus_info *bus_info)
 {
 	int src_node = starpu_bus_get_src(busid);