Browse Source

make starpu_data_expected_transfer_time use _starpu_determine_request_path to completely count multi-hop transfers instead of using transitive closure of bandwidth_matrix

Samuel Thibault 6 years ago
parent
commit
aeadbc9262

+ 14 - 1
src/core/perfmodel/perfmodel.c

@@ -328,7 +328,20 @@ double starpu_data_expected_transfer_time(starpu_data_handle_t handle, unsigned
 		/* Will just create it in place. Ideally we should take the
 		 * time to create it into account */
 		return 0.0;
-	return starpu_transfer_predict(src_node, memory_node, size);
+
+#define MAX_REQUESTS 4
+	unsigned src_nodes[MAX_REQUESTS];
+	unsigned dst_nodes[MAX_REQUESTS];
+	unsigned handling_nodes[MAX_REQUESTS];
+	int nhops = _starpu_determine_request_path(handle, src_node, memory_node, mode,
+			MAX_REQUESTS,
+			src_nodes, dst_nodes, handling_nodes, 0);
+	int i;
+	double duration = 0.;
+
+	for (i = 0; i < nhops; i++)
+		duration += starpu_transfer_predict(src_nodes[i], dst_nodes[i], size);
+	return duration;
 }
 
 /* Data transfer performance modeling */

+ 4 - 0
src/core/perfmodel/perfmodel_bus.c

@@ -2997,6 +2997,9 @@ double starpu_transfer_predict(unsigned src_node, unsigned dst_node, size_t size
 	int direct = starpu_bus_get_direct(busid);
 #endif
 	float ngpus = topology->ncudagpus+topology->nopenclgpus;
+#ifdef STARPU_DEVEL
+#warning FIXME: ngpus shouldn't be used e.g. for slow disk transfers...
+#endif
 
 #if 0
 	/* Ideally we should take into account that some GPUs are directly
@@ -3011,6 +3014,7 @@ double starpu_transfer_predict(unsigned src_node, unsigned dst_node, size_t size
 	}
 #endif
 
+
 	return latency + (size/bandwidth)*2*ngpus;
 }
 

+ 2 - 2
src/datawizard/coherency.c

@@ -346,7 +346,7 @@ static unsigned chose_best_numa_between_src_and_dest(int src, int dst)
  * node that handles the hop. The returned value indicates the number of hops,
  * and max_len is the maximum number of hops (i.e. the size of the
  * src_nodes, dst_nodes and handling_nodes arrays). */
-static int determine_request_path(starpu_data_handle_t handle,
+int _starpu_determine_request_path(starpu_data_handle_t handle,
 				  int src_node, int dst_node,
 				  enum starpu_data_access_mode mode, int max_len,
 				  unsigned *src_nodes, unsigned *dst_nodes,
@@ -630,7 +630,7 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 	unsigned src_nodes[MAX_REQUESTS], dst_nodes[MAX_REQUESTS], handling_nodes[MAX_REQUESTS];
 	/* keep one slot for the last W request, if any */
 	int write_invalidation = (mode & STARPU_W) && nwait && !is_prefetch;
-	int nhops = determine_request_path(handle, src_node, requesting_node, mode, MAX_REQUESTS,
+	int nhops = _starpu_determine_request_path(handle, src_node, requesting_node, mode, MAX_REQUESTS,
 					   src_nodes, dst_nodes, handling_nodes, write_invalidation);
 
 	STARPU_ASSERT(nhops >= 0 && nhops <= MAX_REQUESTS-1);

+ 5 - 0
src/datawizard/coherency.h

@@ -323,6 +323,11 @@ void _starpu_fetch_task_input_tail(struct starpu_task *task, struct _starpu_job
 void _starpu_fetch_nowhere_task_input(struct _starpu_job *j);
 
 int _starpu_select_src_node(struct _starpu_data_state *state, unsigned destination);
+int _starpu_determine_request_path(starpu_data_handle_t handle,
+				  int src_node, int dst_node,
+				  enum starpu_data_access_mode mode, int max_len,
+				  unsigned *src_nodes, unsigned *dst_nodes,
+				  unsigned *handling_nodes, unsigned write_invalidation);
 
 /* is_prefetch is whether the DSM may drop the request (when there is not enough memory for instance
  * async is whether the caller wants a reference on the last request, to be