Forráskód Böngészése

Take transfer time into account when choosing the source node. Also check that a direct transfer is possible. Register GPU-GPU buses so that they are accounted for in the statistics.

Samuel Thibault 13 éve
szülő
commit
09d5790083

+ 1 - 1
src/core/perfmodel/perfmodel.c

@@ -215,7 +215,7 @@ double starpu_data_expected_transfer_time(starpu_data_handle handle, unsigned me
 	if (size == 0)
 		return 0.0;
 
-	uint32_t src_node = _starpu_select_src_node(handle);
+	uint32_t src_node = _starpu_select_src_node(handle, memory_node);
 	return _starpu_predict_transfer_time(src_node, memory_node, size);
 }
 

+ 12 - 0
src/core/topology.c

@@ -693,6 +693,18 @@ static void _starpu_init_workers_binding(struct starpu_machine_config_s *config)
 
 				_starpu_register_bus(0, memory_node);
 				_starpu_register_bus(memory_node, 0);
+#ifdef HAVE_CUDA_MEMCPY_PEER
+				unsigned worker2;
+				for (worker2 = 0; worker2 < worker; worker2++)
+				{
+					struct starpu_worker_s *workerarg = &config->workers[worker];
+					if (workerarg->arch == STARPU_CUDA_WORKER) {
+						unsigned memory_node2 = starpu_worker_get_memory_node(worker2);
+						_starpu_register_bus(memory_node2, memory_node);
+						_starpu_register_bus(memory_node, memory_node2);
+					}
+				}
+#endif
 				break;
 #endif
 

+ 48 - 9
src/datawizard/coherency.c

@@ -22,7 +22,8 @@
 #include <core/dependencies/data_concurrency.h>
 #include <profiling/profiling.h>
 
-uint32_t _starpu_select_src_node(starpu_data_handle handle)
+static int link_supports_direct_transfers(starpu_data_handle handle, unsigned src_node, unsigned dst_node, unsigned *handling_node);
+uint32_t _starpu_select_src_node(starpu_data_handle handle, unsigned destination)
 {
 	unsigned src_node = 0;
 	unsigned i;
@@ -33,6 +34,9 @@ uint32_t _starpu_select_src_node(starpu_data_handle handle)
 	uint32_t node;
 
 	uint32_t src_node_mask = 0;
+	size_t size = _starpu_data_get_size(handle);
+	double cost = INFINITY;
+
 	for (node = 0; node < nnodes; node++)
 	{
 		if (handle->per_node[node].state != STARPU_INVALID) {
@@ -44,7 +48,39 @@ uint32_t _starpu_select_src_node(starpu_data_handle handle)
 	/* we should have found at least one copy ! */
 	STARPU_ASSERT(src_node_mask != 0);
 
-	/* find the node that will be the actual source */
+	/* Without knowing the size, we won't know the cost */
+	if (!size)
+		cost = 0;
+
+	/* Check whether we have transfer cost for all nodes, if so, take the minimum */
+	if (cost)
+		for (i = 0; i < nnodes; i++)
+		{
+			if (src_node_mask & (1<<i))
+			{
+				double time = _starpu_predict_transfer_time(i, destination, size);
+				unsigned handling_node;
+
+				/* Avoid indirect transfers */
+				if (!link_supports_direct_transfers(handle, i, destination, &handling_node))
+					continue;
+
+				if (time == 0.0) {
+					/* No estimation, will have to revert to dumb strategy */
+					cost = 0.0;
+					break;
+				} else if (time < cost) {
+					cost = time;
+					src_node = i;
+				}
+			}
+		}
+
+	if (cost)
+		/* Could estimate through cost, return that */
+		return src_node;
+
+	/* Revert to dumb strategy: take RAM unless only a GPU has it */
 	for (i = 0; i < nnodes; i++)
 	{
 		if (src_node_mask & (1<<i))
@@ -53,13 +89,15 @@ uint32_t _starpu_select_src_node(starpu_data_handle handle)
 			src_node = i;
 
 			/* however GPU are expensive sources, really !
-			 * 	other should be ok */
-		 
-			if (_starpu_get_node_kind(i) != STARPU_CUDA_RAM && _starpu_get_node_kind(i) != STARPU_OPENCL_RAM)	
+			 * 	Unless peer transfer is supported.
+			 * 	Other should be ok */
+
+			if (
+#ifndef HAVE_CUDA_MEMCPY_PEER
+					_starpu_get_node_kind(i) != STARPU_CUDA_RAM &&
+#endif
+					_starpu_get_node_kind(i) != STARPU_OPENCL_RAM)	
 				break ;
-		 
-			/* XXX do a better algorithm to distribute the memory copies */
-			/* TODO : use the "requesting_node" as an argument to do so */
 		}
 	}
 
@@ -125,6 +163,7 @@ static int worker_supports_direct_access(unsigned node, unsigned handling_node)
 			return (_starpu_get_node_kind(handling_node) != STARPU_OPENCL_RAM);
 #else
 			/* Direct GPU-GPU transfers are not allowed in general */
+#error erf
 			return 0;
 #endif
 		case STARPU_OPENCL_RAM:
@@ -325,7 +364,7 @@ starpu_data_request_t create_request_to_fetch_data(starpu_data_handle handle,
 	/* if the data is in write only mode, there is no need for a source */
 	if (mode & STARPU_R)
 	{
-		src_node = _starpu_select_src_node(handle);
+		src_node = _starpu_select_src_node(handle, requesting_node);
 		STARPU_ASSERT(src_node != requesting_node);
 	}
 

+ 2 - 2
src/datawizard/coherency.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2009, 2010  Université de Bordeaux 1
+ * Copyright (C) 2009-2011  Université de Bordeaux 1
  * Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -231,7 +231,7 @@ unsigned _starpu_is_data_present_or_requested(struct starpu_data_state_t *state,
 unsigned starpu_data_test_if_allocated_on_node(starpu_data_handle handle, uint32_t memory_node);
 
 
-uint32_t _starpu_select_src_node(struct starpu_data_state_t *state);
+uint32_t _starpu_select_src_node(struct starpu_data_state_t *state, unsigned destination);
 
 starpu_data_request_t create_request_to_fetch_data(starpu_data_handle handle,
 				struct starpu_data_replicate_s *dst_replicate,