
Add code that takes PCI topology into account to determine contention. But disable it for now, since it seems to actually decrease performance...

Samuel Thibault 8 years ago
parent
commit
2510657bc9

+ 4 - 0
include/starpu_profiling.h

@@ -108,6 +108,10 @@ int starpu_bus_get_count(void);
 int starpu_bus_get_id(int src, int dst);
 int starpu_bus_get_src(int busid);
 int starpu_bus_get_dst(int busid);
+void starpu_bus_set_direct(int busid, int direct);
+int starpu_bus_get_direct(int busid);
+void starpu_bus_set_ngpus(int busid, int ngpus);
+int starpu_bus_get_ngpus(int busid);
 
 int starpu_bus_get_profiling_info(int busid, struct starpu_profiling_bus_info *bus_info);
 

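The four new accessors expose, for each registered bus, whether GPU-Direct is enabled on it and how many GPUs contend on its PCI link. A minimal usage sketch (a hypothetical standalone program, assuming only this header plus the existing starpu_bus_get_count/src/dst API):

	#include <stdio.h>
	#include <starpu.h>
	#include <starpu_profiling.h>

	int main(void)
	{
		int busid;
		if (starpu_init(NULL) != 0)
			return 1;
		/* Walk every registered bus and dump the new topology information */
		for (busid = 0; busid < starpu_bus_get_count(); busid++)
			printf("bus %d (%d -> %d): direct=%d ngpus=%d\n", busid,
			       starpu_bus_get_src(busid), starpu_bus_get_dst(busid),
			       starpu_bus_get_direct(busid), starpu_bus_get_ngpus(busid));
		starpu_shutdown();
		return 0;
	}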
+ 25 - 1
src/core/perfmodel/perfmodel_bus.c

@@ -97,6 +97,7 @@ static double cudadev_latency_dtod[STARPU_MAXNODES][STARPU_MAXNODES] = {{0.0}};
 #endif
 #endif
 static struct dev_timing cudadev_timing_per_cpu[STARPU_MAXNODES*STARPU_MAXCPUS];
+static char cudadev_direct[STARPU_MAXNODES][STARPU_MAXNODES];
 #endif
 
 #ifndef STARPU_SIMGRID
@@ -267,7 +268,10 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 		{
 			cures = cudaDeviceEnablePeerAccess(dst, 0);
 			if (!cures)
+			{
 				_STARPU_DISP("GPU-Direct %d -> %d\n", dst, src);
+				cudadev_direct[src][dst] = 1;
+			}
 		}
 	}
 
@@ -289,7 +293,10 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 		{
 			cures = cudaDeviceEnablePeerAccess(src, 0);
 			if (!cures)
+			{
 				_STARPU_DISP("GPU-Direct %d -> %d\n", src, dst);
+				cudadev_direct[dst][src] = 1;
+			}
 		}
 	}
 
@@ -2195,6 +2202,7 @@ static void write_bus_platform_file_content(int version)
 #ifdef HAVE_CUDA_MEMCPY_PEER
 		fprintf(f, "     <prop id=\"memcpy_peer\" value=\"1\"/>\n");
 #endif
+		/* TODO: record cudadev_direct instead of assuming it's NUMA nodes */
 		fprintf(f, "   </host>\n");
 	}
 
@@ -2504,8 +2512,24 @@ double starpu_transfer_predict(unsigned src_node, unsigned dst_node, size_t size
 	double bandwidth = bandwidth_matrix[src_node][dst_node];
 	double latency = latency_matrix[src_node][dst_node];
 	struct _starpu_machine_topology *topology = &_starpu_get_machine_config()->topology;
+	int busid = starpu_bus_get_id(src_node, dst_node);
+	int direct = starpu_bus_get_direct(busid);
+	float ngpus = topology->ncudagpus+topology->nopenclgpus;
+
+#if 0
+	/* Ideally we should take into account that some GPUs are directly
+	 * connected through a PCI switch, which has less contention than the
+	 * Host bridge, but doing that seems to *decrease* performance... */
+	if (direct)
+	{
+		float neighbours = starpu_bus_get_ngpus(busid);
+		/* Count transfers of these GPUs, and count transfers between
+		 * other GPUs and these GPUs */
+		ngpus = neighbours + (ngpus - neighbours) * neighbours / ngpus;
+	}
+#endif
 
-	return latency + (size/bandwidth)*2*(topology->ncudagpus+topology->nopenclgpus);
+	return latency + (size/bandwidth)*2*ngpus;
 }
 
 

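The #if 0 block above refines the flat contention factor (every GPU contends with every other one) into a per-bus factor. A worked example under assumed numbers, for a machine with 4 GPUs of which 2 sit behind the same PCI switch:

	#include <stdio.h>

	int main(void)
	{
		/* Hypothetical machine: 4 GPUs total, 2 behind one PCI switch */
		float ngpus = 4, neighbours = 2;
		/* Transfers among the neighbours themselves, plus the share of
		 * the other GPUs' transfers that target them (formula above) */
		float effective = neighbours + (ngpus - neighbours) * neighbours / ngpus;
		printf("effective contention: %g GPUs instead of %g\n", effective, ngpus);
		/* prints 3 instead of 4: the switch shields part of the traffic */
		return 0;
	}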
+ 108 - 0
src/core/simgrid.c

@@ -677,4 +677,112 @@ _starpu_simgrid_thread_start(int argc STARPU_ATTRIBUTE_UNUSED, char *argv[])
 	f(arg);
 	return 0;
 }
+
+msg_host_t
+_starpu_simgrid_get_memnode_host(unsigned node)
+{
+	const char *fmt;
+	char name[16];
+
+	switch (starpu_node_get_kind(node))
+	{
+		case STARPU_CPU_RAM:
+			fmt = "RAM";
+			break;
+		case STARPU_CUDA_RAM:
+			fmt = "CUDA%u";
+			break;
+		case STARPU_OPENCL_RAM:
+			fmt = "OpenCL%u";
+			break;
+		default:
+			STARPU_ABORT();
+			break;
+	}
+	snprintf(name, sizeof(name), fmt, _starpu_memory_node_get_devid(node));
+
+	return _starpu_simgrid_get_host_by_name(name);
+}
+
+void _starpu_simgrid_count_ngpus(void)
+{
+	unsigned src, dst;
+	msg_host_t ramhost = _starpu_simgrid_get_host_by_name("RAM");
+
+	/* For each pair of memory nodes, get the route */
+	for (src = 1; src < STARPU_MAXNODES; src++)
+		for (dst = 1; dst < STARPU_MAXNODES; dst++)
+		{
+			int busid;
+			msg_host_t srchost, dsthost;
+			const SD_link_t *route;
+			int i, routesize;
+			int through;
+			unsigned src2;
+			unsigned ngpus;
+			const char *name;
+
+			if (dst == src)
+				continue;
+			busid = starpu_bus_get_id(src, dst);
+			if (busid == -1)
+				continue;
+
+			srchost = _starpu_simgrid_get_memnode_host(src);
+			dsthost = _starpu_simgrid_get_memnode_host(dst);
+			routesize = SD_route_get_size(srchost, dsthost);
+			route = SD_route_get_list(srchost, dsthost);
+
+			/* If it goes through "Host", skip it: there is no
+			 * direct transfer support */
+			for (i = 0; i < routesize; i++)
+				if (!strcmp(sg_link_name(route[i]), "Host"))
+					break;
+			if (i < routesize)
+				continue;
+
+			/* Get the PCI bridge between down and up links */
+			through = -1;
+			for (i = 0; i < routesize; i++)
+			{
+				name = sg_link_name(route[i]);
+				size_t len = strlen(name);
+				if (!strcmp(" through", name+len-8))
+					through = i;
+				else if (!strcmp(" up", name+len-3))
+					break;
+			}
+			/* Didn't find it?! */
+			if (through == -1)
+			{
+				_STARPU_DEBUG("Didn't find through-link for %d->%d\n", src, dst);
+				continue;
+			}
+			name = sg_link_name(route[through]);
+
+			/*
+			 * count how many direct routes go through it between
+			 * GPUs and RAM
+			 */
+			ngpus = 0;
+			for (src2 = 1; src2 < STARPU_MAXNODES; src2++)
+			{
+				if (starpu_bus_get_id(src2, STARPU_MAIN_RAM) == -1)
+					continue;
+				msg_host_t srchost2 = _starpu_simgrid_get_memnode_host(src2);
+				int routesize2 = SD_route_get_size(srchost2, ramhost);
+				const SD_link_t *route2 = SD_route_get_list(srchost2, ramhost);
+
+				for (i = 0; i < routesize2; i++)
+					if (!strcmp(name, sg_link_name(route2[i])))
+					{
+						/* This GPU goes through this PCI bridge to access RAM */
+						ngpus++;
+						break;
+					}
+			}
+			_STARPU_DEBUG("%d->%d through %s, %u GPUs\n", src, dst, name, ngpus);
+			starpu_bus_set_ngpus(busid, ngpus);
+		}
+}
 #endif

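_starpu_simgrid_count_ngpus relies on the naming convention of the generated platform file: a PCI bridge's down link ends in " through" and its up link in " up". A standalone sketch of that suffix test (the link names are invented for illustration; note that the patch compares name+len-8 directly, which assumes every link name is at least eight characters long, while the sketch adds a length guard):

	#include <stdio.h>
	#include <string.h>

	/* Does name end with suffix? Safe even when name is shorter. */
	static int ends_with(const char *name, const char *suffix)
	{
		size_t n = strlen(name), s = strlen(suffix);
		return n >= s && !strcmp(name + n - s, suffix);
	}

	int main(void)
	{
		const char *links[] = { "PCI0 through", "PCI0 up", "Host" };
		unsigned i;
		for (i = 0; i < sizeof(links)/sizeof(links[0]); i++)
			printf("%-12s: through=%d up=%d\n", links[i],
			       ends_with(links[i], " through"), ends_with(links[i], " up"));
		return 0;
	}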
+ 5 - 0
src/core/simgrid.h

@@ -47,6 +47,7 @@ int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node,
 int _starpu_simgrid_get_nbhosts(const char *prefix);
 unsigned long long _starpu_simgrid_get_memsize(const char *prefix, unsigned devid);
 msg_host_t _starpu_simgrid_get_host_by_name(const char *name);
+msg_host_t _starpu_simgrid_get_memnode_host(unsigned node);
 struct _starpu_worker;
 msg_host_t _starpu_simgrid_get_host_by_worker(struct _starpu_worker *worker);
 void _starpu_simgrid_get_platform_path(int version, char *path, size_t maxlen);
@@ -63,6 +64,10 @@ starpu_pthread_queue_t _starpu_simgrid_task_queue[STARPU_NMAXWORKERS];
 #define _starpu_simgrid_queue_malloc_cost() starpu_get_env_number_default("STARPU_SIMGRID_QUEUE_MALLOC_COST", 1)
 #define _starpu_simgrid_task_submit_cost() starpu_get_env_number_default("STARPU_SIMGRID_TASK_SUBMIT_COST", 1)
 
+/* Called at initialization to count how many GPUs are interfering with each
+ * bus */
+void _starpu_simgrid_count_ngpus(void);
+
 #endif
 
 #endif // __SIMGRID_H__

+ 81 - 4
src/core/topology.c

@@ -47,6 +47,10 @@
 #include <core/simgrid.h>
 #endif
 
+#if defined(HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX) && HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX
+#include <hwloc/cuda.h>
+#endif
+
 static unsigned topology_is_initialized = 0;
 static int nobind;
 
@@ -516,6 +520,7 @@ _starpu_init_topology (struct _starpu_machine_config *config)
 #ifndef STARPU_SIMGRID
 #ifdef STARPU_HAVE_HWLOC
 	hwloc_topology_init(&topology->hwtopology);
+	hwloc_topology_set_flags(topology->hwtopology, HWLOC_TOPOLOGY_FLAG_IO_DEVICES | HWLOC_TOPOLOGY_FLAG_IO_BRIDGES);
 	hwloc_topology_load(topology->hwtopology);
 	_starpu_allocate_topology_userdata(hwloc_get_root_obj(topology->hwtopology));
 #endif
@@ -930,6 +935,29 @@ _starpu_deinit_mp_config (struct _starpu_machine_config *config)
 }
 #endif
 
+#ifdef STARPU_HAVE_HWLOC
+static unsigned
+_starpu_topology_count_ngpus(hwloc_obj_t obj)
+{
+	struct _starpu_hwloc_userdata *data = obj->userdata;
+	unsigned n = data->ngpus;
+	unsigned i;
+
+	for (i = 0; i < obj->arity; i++)
+		n += _starpu_topology_count_ngpus(obj->children[i]);
+
+	data->ngpus = n;
+#ifdef STARPU_VERBOSE
+	{
+		char name[64];
+		hwloc_obj_type_snprintf(name, sizeof(name), obj, 0);
+		_STARPU_DEBUG("hwloc obj %s has %u GPUs below\n", name, n);
+	}
+#endif
+	return n;
+}
+#endif
+
 static int
 _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_config STARPU_ATTRIBUTE_UNUSED)
 {
@@ -1039,6 +1067,23 @@ _starpu_init_machine_config(struct _starpu_machine_config *config, int no_mp_con
 			entry->gpuid = devid;
 			HASH_ADD_INT(devices_using_cuda, gpuid, entry);
 		}
+
+#ifndef STARPU_SIMGRID
+#if defined(HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX) && HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX
+		{
+			hwloc_obj_t obj = hwloc_cuda_get_device_osdev_by_index(topology->hwtopology, devid);
+			if (obj)
+			{
+				struct _starpu_hwloc_userdata *data = obj->userdata;
+				data->ngpus++;
+			}
+			else
+			{
+				_STARPU_DISP("Warning: could not find location of CUDA%u, do you have the hwloc CUDA plugin installed?\n", devid);
+			}
+		}
+#endif
+#endif
         }
 
 	topology->nworkers += topology->ncudagpus * nworker_per_cuda;
@@ -1565,8 +1610,8 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 					workerarg->bindid = cuda_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
 					memory_node = cuda_memory_nodes[devid] = _starpu_memory_node_register(STARPU_CUDA_RAM, devid);
 
-					_starpu_register_bus(STARPU_MAIN_RAM, memory_node);
-					_starpu_register_bus(memory_node, STARPU_MAIN_RAM);
+					_starpu_cuda_bus_ids[0][devid+1] = _starpu_register_bus(STARPU_MAIN_RAM, memory_node);
+					_starpu_cuda_bus_ids[devid+1][0] = _starpu_register_bus(memory_node, STARPU_MAIN_RAM);
 #ifdef STARPU_SIMGRID
 					const char* cuda_memcpy_peer;
 					snprintf(name, sizeof(name), "CUDA%d", devid);
@@ -1589,11 +1634,35 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 						for (worker2 = 0; worker2 < worker; worker2++)
 						{
 							struct _starpu_worker *workerarg2 = &config->workers[worker2];
+							int devid2 = workerarg2->devid;
 							if (workerarg2->arch == STARPU_CUDA_WORKER)
 							{
 								unsigned memory_node2 = starpu_worker_get_memory_node(worker2);
-								_starpu_register_bus(memory_node2, memory_node);
-								_starpu_register_bus(memory_node, memory_node2);
+								_starpu_cuda_bus_ids[devid2][devid] = _starpu_register_bus(memory_node2, memory_node);
+								_starpu_cuda_bus_ids[devid][devid2] = _starpu_register_bus(memory_node, memory_node2);
+#ifndef STARPU_SIMGRID
+#if defined(HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX) && HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX
+								{
+									hwloc_obj_t obj, obj2, ancestor;
+									obj = hwloc_cuda_get_device_osdev_by_index(config->topology.hwtopology, devid);
+									obj2 = hwloc_cuda_get_device_osdev_by_index(config->topology.hwtopology, devid2);
+									ancestor = hwloc_get_common_ancestor_obj(config->topology.hwtopology, obj, obj2);
+									if (ancestor)
+									{
+										struct _starpu_hwloc_userdata *data = ancestor->userdata;
+#ifdef STARPU_VERBOSE
+										{
+											char name[64];
+											hwloc_obj_type_snprintf(name, sizeof(name), ancestor, 0);
+											_STARPU_DEBUG("CUDA%u and CUDA%u are linked through %s, along %u GPUs\n", devid, devid2, name, data->ngpus);
+										}
+#endif
+										starpu_bus_set_ngpus(_starpu_cuda_bus_ids[devid2][devid], data->ngpus);
+										starpu_bus_set_ngpus(_starpu_cuda_bus_ids[devid][devid2], data->ngpus);
+									}
+								}
+#endif
+#endif
 							}
 						}
 					}
@@ -1752,6 +1821,14 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 			config->bindid_workers[bindid].workerids[config->bindid_workers[bindid].nworkers-1] = worker;
 		}
 	}
+
+#ifdef STARPU_SIMGRID
+	_starpu_simgrid_count_ngpus();
+#else
+#ifdef STARPU_HAVE_HWLOC
+	_starpu_topology_count_ngpus(hwloc_get_root_obj(config->topology.hwtopology));
+#endif
+#endif
 }
 
 int

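On the native (non-simgrid) side, each CUDA OS device object gets its ngpus count bumped, _starpu_topology_count_ngpus accumulates those counts bottom-up, and for every GPU pair the deepest common ancestor's count becomes the contention factor of their bus. A minimal standalone sketch of the ancestor lookup (hwloc 1.x API, matching the flags used above; HWLOC_TOPOLOGY_FLAG_IO_DEVICES/IO_BRIDGES were replaced by type filters in hwloc 2.x; the device indices are hypothetical):

	#include <stdio.h>
	#include <hwloc.h>
	#include <hwloc/cuda.h>

	int main(void)
	{
		hwloc_topology_t topo;
		hwloc_obj_t a, b, ancestor;
		char type[64];

		hwloc_topology_init(&topo);
		/* Same flags as the patch: include PCI devices and bridges */
		hwloc_topology_set_flags(topo, HWLOC_TOPOLOGY_FLAG_IO_DEVICES | HWLOC_TOPOLOGY_FLAG_IO_BRIDGES);
		hwloc_topology_load(topo);

		a = hwloc_cuda_get_device_osdev_by_index(topo, 0);
		b = hwloc_cuda_get_device_osdev_by_index(topo, 1);
		if (a && b)
		{
			ancestor = hwloc_get_common_ancestor_obj(topo, a, b);
			hwloc_obj_type_snprintf(type, sizeof(type), ancestor, 0);
			printf("CUDA0 and CUDA1 are linked through a %s\n", type);
		}
		hwloc_topology_destroy(topo);
		return 0;
	}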
+ 1 - 0
src/core/topology.h

@@ -31,6 +31,7 @@ struct _starpu_machine_config;
 /* This is allocated for each hwloc object */
 struct _starpu_hwloc_userdata {
 	struct _starpu_worker_list *worker_list; /* List of workers running on this obj */
+	unsigned ngpus; /* Number of GPUs sharing this PCI link */
 };
 #endif
 #endif

+ 2 - 6
src/datawizard/coherency.c

@@ -239,12 +239,8 @@ static int worker_supports_direct_access(unsigned node, unsigned handling_node)
 #ifdef STARPU_SIMGRID
 			if (starpu_node_get_kind(handling_node) == STARPU_CUDA_RAM)
 			{
-				char name[16];
-				msg_host_t host;
-				const char* cuda_memcpy_peer;
-				snprintf(name, sizeof(name), "CUDA%d", _starpu_memory_node_get_devid(handling_node));
-				host = _starpu_simgrid_get_host_by_name(name);
-				cuda_memcpy_peer = MSG_host_get_property_value(host, "memcpy_peer");
+				msg_host_t host = _starpu_simgrid_get_memnode_host(handling_node);
+				const char* cuda_memcpy_peer = MSG_host_get_property_value(host, "memcpy_peer");
 				return cuda_memcpy_peer && atoll(cuda_memcpy_peer);
 			}
 			else

+ 5 - 0
src/drivers/cuda/driver_cuda.c

@@ -50,6 +50,7 @@
 static unsigned ncudagpus;
 
 static size_t global_mem[STARPU_MAXCUDADEVS];
+int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+1][STARPU_MAXCUDADEVS+1];
 #ifdef STARPU_USE_CUDA
 static cudaStream_t streams[STARPU_NMAXWORKERS];
 static cudaStream_t out_transfer_streams[STARPU_MAXCUDADEVS];
@@ -270,7 +271,11 @@ static void init_device_context(unsigned devid)
 				{
 					cures = cudaDeviceEnablePeerAccess(worker->devid, 0);
 					if (!cures)
+					{
 						_STARPU_DEBUG("Enabled GPU-Direct %d -> %d\n", worker->devid, devid);
+						/* direct copies are made from the destination, see link_supports_direct_transfers */
+						starpu_bus_set_direct(_starpu_cuda_bus_ids[worker->devid][devid], 1);
+					}
 				}
 			}
 		}

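init_device_context now records a bus as direct only when peer access could actually be enabled. A minimal standalone illustration of the CUDA runtime calls involved (a hypothetical two-GPU machine; error handling elided):

	#include <stdio.h>
	#include <cuda_runtime.h>

	int main(void)
	{
		int can = 0;
		/* Can device 1 read/write device 0's memory directly? */
		cudaDeviceCanAccessPeer(&can, 1, 0);
		if (can)
		{
			cudaSetDevice(1);
			/* Direct copies are made from the destination (see
			 * link_supports_direct_transfers), so enabling access on
			 * device 1 lets 0 -> 1 transfers bypass host RAM */
			if (cudaDeviceEnablePeerAccess(0, 0) == cudaSuccess)
				printf("Enabled GPU-Direct 0 -> 1\n");
		}
		return 0;
	}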
+ 2 - 1
src/drivers/cuda/driver_cuda.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010, 2012-2014  Université de Bordeaux
+ * Copyright (C) 2009, 2010, 2012-2014, 2016  Université de Bordeaux
  * Copyright (C) 2010, 2012  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -35,6 +35,7 @@
 #include <common/fxt.h>
 
 unsigned _starpu_get_cuda_device_count(void);
+extern int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+1][STARPU_MAXCUDADEVS+1];
 
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 void _starpu_cuda_discover_devices (struct _starpu_machine_config *);

+ 27 - 0
src/profiling/profiling.c

@@ -49,6 +49,8 @@ struct node_pair
 static int busid_matrix[STARPU_MAXNODES][STARPU_MAXNODES];
 static struct starpu_profiling_bus_info bus_profiling_info[STARPU_MAXNODES][STARPU_MAXNODES];
 static struct node_pair busid_to_node_pair[STARPU_MAXNODES*STARPU_MAXNODES];
+static char bus_direct[STARPU_MAXNODES*STARPU_MAXNODES];
+static int bus_ngpus[STARPU_MAXNODES*STARPU_MAXNODES];
 static unsigned busid_cnt = 0;
 
 static void _starpu_bus_reset_profiling_info(struct starpu_profiling_bus_info *bus_info);
@@ -429,6 +431,31 @@ int starpu_bus_get_dst(int busid)
 	return busid_to_node_pair[busid].dst;
 }
 
+void starpu_bus_set_direct(int busid, int direct)
+{
+	bus_direct[busid] = direct;
+}
+
+int starpu_bus_get_direct(int busid)
+{
+	return bus_direct[busid];
+}
+
+void starpu_bus_set_ngpus(int busid, int ngpus)
+{
+	bus_ngpus[busid] = ngpus;
+}
+
+int starpu_bus_get_ngpus(int busid)
+{
+	struct _starpu_machine_topology *topology = &_starpu_get_machine_config()->topology;
+	int ngpus = bus_ngpus[busid];
+	if (!ngpus)
+		/* Unknown number of GPUs, assume it's shared by all GPUs */
+		ngpus = topology->ncudagpus+topology->nopenclgpus;
+	return ngpus;
+}
+
 int starpu_bus_get_profiling_info(int busid, struct starpu_profiling_bus_info *bus_info)
 {
 	int src_node = starpu_bus_get_src(busid);