@@ -22,7 +22,8 @@
 #include <core/dependencies/data_concurrency.h>
 #include <profiling/profiling.h>
 
-uint32_t _starpu_select_src_node(starpu_data_handle handle)
+static int link_supports_direct_transfers(starpu_data_handle handle, unsigned src_node, unsigned dst_node, unsigned *handling_node);
+uint32_t _starpu_select_src_node(starpu_data_handle handle, unsigned destination)
 {
 	unsigned src_node = 0;
 	unsigned i;
@@ -33,6 +34,9 @@ uint32_t _starpu_select_src_node(starpu_data_handle handle)
 	uint32_t node;
 
 	uint32_t src_node_mask = 0;
+	size_t size = _starpu_data_get_size(handle);
+	double cost = INFINITY;
+
 	for (node = 0; node < nnodes; node++)
 	{
 		if (handle->per_node[node].state != STARPU_INVALID) {
@@ -44,7 +48,39 @@ uint32_t _starpu_select_src_node(starpu_data_handle handle)
 	/* we should have found at least one copy ! */
 	STARPU_ASSERT(src_node_mask != 0);
 
-	/* find the node that will be the actual source */
+	/* Without knowing the size, we won't know the cost */
+	if (!size)
+		cost = 0;
+
+	/* Check whether we have a transfer cost for all nodes; if so, take the minimum */
+	if (cost)
+		for (i = 0; i < nnodes; i++)
+		{
+			if (src_node_mask & (1<<i))
+			{
+				double time = _starpu_predict_transfer_time(i, destination, size);
+				unsigned handling_node;
+
+				/* Avoid indirect transfers */
+				if (!link_supports_direct_transfers(handle, i, destination, &handling_node))
+					continue;
+
+				if (time == 0.0) {
+					/* No estimation, will have to revert to the dumb strategy */
+					cost = 0.0;
+					break;
+				} else if (time < cost) {
+					cost = time;
+					src_node = i;
+				}
+			}
+		}
+
+	if (cost)
+		/* We could estimate the cost; return the selected source */
+		return src_node;
+
+	/* Revert to dumb strategy: take RAM unless only a GPU has it */
 	for (i = 0; i < nnodes; i++)
 	{
 		if (src_node_mask & (1<<i))
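Summarizing the hunk above: among the nodes that hold a valid copy, the new code keeps the one with the lowest predicted transfer time toward the destination node, and abandons cost-based selection as soon as any candidate has no estimate (or the data size is unknown), at which point it falls through to the old "prefer RAM" heuristic. The sketch below is a minimal, self-contained illustration of that policy only; the predict callback and all names are hypothetical stand-ins for StarPU internals such as _starpu_predict_transfer_time, and the link_supports_direct_transfers() filter from the patch is left out for brevity.

#include <math.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical stand-in for the performance-model query: returns a
 * predicted transfer time, or 0.0 when no calibration data exists. */
typedef double (*predict_fn)(unsigned src, unsigned dst, size_t size);

/* Pick the cheapest source among the nodes set in valid_mask.
 * Returns -1 when the size is unknown or some candidate has no
 * estimate, in which case the caller should use the plain
 * "prefer RAM" fallback instead. */
static int pick_cheapest_source(uint32_t valid_mask, unsigned nnodes,
                                unsigned dst, size_t size, predict_fn predict)
{
	double best_cost = INFINITY;
	int best_node = -1;
	unsigned i;

	if (size == 0)
		return -1;	/* without the size, the cost is unknown */

	for (i = 0; i < nnodes; i++)
	{
		double time;

		if (!(valid_mask & (1U << i)))
			continue;	/* no valid copy on this node */

		time = predict(i, dst, size);
		if (time == 0.0)
			return -1;	/* no estimate: revert to the dumb strategy */

		if (time < best_cost) {
			best_cost = time;
			best_node = (int)i;
		}
	}
	return best_node;
}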
@@ -53,13 +89,15 @@ uint32_t _starpu_select_src_node(starpu_data_handle handle)
 			src_node = i;
 
 			/* however GPU are expensive sources, really !
-			 * other should be ok */
-
-			if (_starpu_get_node_kind(i) != STARPU_CUDA_RAM && _starpu_get_node_kind(i) != STARPU_OPENCL_RAM)
+			 * Unless peer transfer is supported.
+			 * Others should be ok */
+
+			if (
+#ifndef HAVE_CUDA_MEMCPY_PEER
+				_starpu_get_node_kind(i) != STARPU_CUDA_RAM &&
+#endif
+				_starpu_get_node_kind(i) != STARPU_OPENCL_RAM)
 				break ;
-
-			/* XXX do a better algorithm to distribute the memory copies */
-			/* TODO : use the "requesting_node" as an argument to do so */
 		}
 	}
 
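The #ifndef HAVE_CUDA_MEMCPY_PEER guard above narrows the old rule: a CUDA node now only counts as an expensive source when peer-to-peer copies are not compiled in, while OpenCL nodes are always treated as expensive. Reduced to a standalone predicate, as a sketch with an illustrative node-kind enum rather than StarPU's actual types (the real constants referenced by the patch are STARPU_CUDA_RAM and STARPU_OPENCL_RAM):

enum node_kind { NODE_CPU_RAM, NODE_CUDA_RAM, NODE_OPENCL_RAM };	/* illustrative only */

/* Would the fallback loop stop at a node of this kind, i.e. accept it
 * as a cheap enough source? With CUDA peer copies compiled in, CUDA
 * nodes are no longer penalized; OpenCL nodes always are. */
static int acceptable_fallback_source(enum node_kind kind)
{
#ifndef HAVE_CUDA_MEMCPY_PEER
	if (kind == NODE_CUDA_RAM)
		return 0;	/* reading back from a GPU is costly without peer copies */
#endif
	return kind != NODE_OPENCL_RAM;
}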
@@ -125,6 +163,6 @@ static int worker_supports_direct_access(unsigned node, unsigned handling_node)
 		return (_starpu_get_node_kind(handling_node) != STARPU_OPENCL_RAM);
 #else
 		/* Direct GPU-GPU transfers are not allowed in general */
 		return 0;
 #endif
 	case STARPU_OPENCL_RAM:
@@ -325,7 +364,7 @@ starpu_data_request_t create_request_to_fetch_data(starpu_data_handle handle,
 	/* if the data is in write only mode, there is no need for a source */
 	if (mode & STARPU_R)
 	{
-		src_node = _starpu_select_src_node(handle);
+		src_node = _starpu_select_src_node(handle, requesting_node);
 		STARPU_ASSERT(src_node != requesting_node);
 	}
 