|
@@ -320,6 +320,29 @@ static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned
|
|
return 0;
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/* Select the NUMA node to use as the intermediate hop of a transfer going
 * from src to dst.
 *
 * Every NUMA node index in [0, _starpu_get_nb_numa_nodes()) is scored by its
 * "slowness": the inverse bandwidth of the src -> node link plus the inverse
 * bandwidth of the node -> dst link. The node with the lowest score is
 * returned; on equal scores the lowest index is kept.
 *
 * For now only bandwidth is taken into account; whether latency would be a
 * better criterion is still an open question.
 *
 * NOTE(review): the NUMA index is passed directly as a memory node to
 * starpu_transfer_bandwidth(), which presumes NUMA nodes are numbered like
 * memory nodes -- to be confirmed.
 *
 * Asserts that a node has been selected, i.e. that there is at least one NUMA
 * node. */
static unsigned chose_best_numa_between_src_and_dest(int src, int dst)
{
	const unsigned numa_count = _starpu_get_nb_numa_nodes();
	double lowest_slowness;
	int chosen = -1;
	unsigned candidate;

	for (candidate = 0; candidate < numa_count; candidate++)
	{
		double slowness = 1.0/starpu_transfer_bandwidth(src, candidate) + 1.0/starpu_transfer_bandwidth(candidate, dst);

		/* Keep the current choice unless this candidate is strictly
		 * faster. lowest_slowness is only read once a first candidate
		 * has been recorded. */
		if (chosen >= 0 && !(slowness < lowest_slowness))
			continue;

		chosen = candidate;
		lowest_slowness = slowness;
	}

	STARPU_ASSERT(chosen >= 0);

	return chosen;
}
|
|
|
|
+
|
|
/* Determines the path of a request : each hop is defined by (src,dst) and the
|
|
/* Determines the path of a request : each hop is defined by (src,dst) and the
|
|
* node that handles the hop. The returned value indicates the number of hops,
|
|
* node that handles the hop. The returned value indicates the number of hops,
|
|
* and the max_len is the maximum number of hops (ie. the size of the
|
|
* and the max_len is the maximum number of hops (ie. the size of the
|
|
@@ -351,7 +374,6 @@ static int determine_request_path(starpu_data_handle_t handle,
|
|
unsigned handling_node;
|
|
unsigned handling_node;
|
|
int link_is_valid = link_supports_direct_transfers(handle, src_node, dst_node, &handling_node);
|
|
int link_is_valid = link_supports_direct_transfers(handle, src_node, dst_node, &handling_node);
|
|
|
|
|
|
- /* TODO: NUMA nodes */
|
|
|
|
if (!link_is_valid)
|
|
if (!link_is_valid)
|
|
{
|
|
{
|
|
int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node) = handle->ops->copy_methods->can_copy;
|
|
int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node) = handle->ops->copy_methods->can_copy;
|
|
@@ -363,9 +385,11 @@ static int determine_request_path(starpu_data_handle_t handle,
|
|
STARPU_ASSERT(max_len >= 2);
|
|
STARPU_ASSERT(max_len >= 2);
|
|
STARPU_ASSERT(src_node >= 0);
|
|
STARPU_ASSERT(src_node >= 0);
|
|
|
|
|
|
|
|
+ unsigned numa = chose_best_numa_between_src_and_dest(src_node, dst_node);
|
|
|
|
+
|
|
/* GPU -> RAM */
|
|
/* GPU -> RAM */
|
|
src_nodes[0] = src_node;
|
|
src_nodes[0] = src_node;
|
|
- dst_nodes[0] = STARPU_MAIN_RAM;
|
|
|
|
|
|
+ dst_nodes[0] = numa;
|
|
|
|
|
|
if (starpu_node_get_kind(src_node) == STARPU_DISK_RAM)
|
|
if (starpu_node_get_kind(src_node) == STARPU_DISK_RAM)
|
|
/* Disks don't have their own driver thread */
|
|
/* Disks don't have their own driver thread */
|
|
@@ -381,7 +405,7 @@ static int determine_request_path(starpu_data_handle_t handle,
|
|
}
|
|
}
|
|
|
|
|
|
/* RAM -> GPU */
|
|
/* RAM -> GPU */
|
|
- src_nodes[1] = STARPU_MAIN_RAM;
|
|
|
|
|
|
+ src_nodes[1] = numa;
|
|
dst_nodes[1] = dst_node;
|
|
dst_nodes[1] = dst_node;
|
|
|
|
|
|
if (starpu_node_get_kind(dst_node) == STARPU_DISK_RAM)
|
|
if (starpu_node_get_kind(dst_node) == STARPU_DISK_RAM)
|