|
@@ -30,11 +30,15 @@
|
|
|
#include <starpu_mpi_cache.h>
|
|
|
#include <starpu_mpi_select_node.h>
|
|
|
|
|
|
-#define _SEND_DATA(data, mode, dest, data_tag, comm, callback, arg) \
|
|
|
+#include "starpu_mpi_task_insert.h"
|
|
|
+
|
|
|
+#define _SEND_DATA(data, mode, dest, data_tag, prio, comm, callback, arg) \
|
|
|
+ do { \
|
|
|
if (mode & STARPU_SSEND) \
|
|
|
- starpu_mpi_issend_detached(data, dest, data_tag, comm, callback, arg); \
|
|
|
+ starpu_mpi_issend_detached_prio(data, dest, data_tag, prio, comm, callback, arg); \
|
|
|
else \
|
|
|
- starpu_mpi_isend_detached(data, dest, data_tag, comm, callback, arg);
|
|
|
+ starpu_mpi_isend_detached_prio(data, dest, data_tag, prio, comm, callback, arg); \
|
|
|
+ } while (0)
|
|
|
|
|
|
static void (*pre_submit_hook)(struct starpu_task *task) = NULL;
|
|
|
|
|
@@ -54,7 +58,7 @@ int starpu_mpi_pre_submit_hook_unregister()
|
|
|
|
|
|
int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int *do_execute, int *inconsistent_execute, int *xrank)
|
|
|
{
|
|
|
- if (mode & STARPU_W)
|
|
|
+ if (mode & STARPU_W || mode & STARPU_REDUX)
|
|
|
{
|
|
|
if (!data)
|
|
|
{
|
|
@@ -80,11 +84,11 @@ int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_data_a
|
|
|
// No node has been selected yet
|
|
|
*xrank = mpi_rank;
|
|
|
_STARPU_MPI_DEBUG(100, "Codelet is going to be executed by node %d\n", *xrank);
|
|
|
- *do_execute = (mpi_rank == me);
|
|
|
+ *do_execute = mpi_rank == STARPU_MPI_PER_NODE || (mpi_rank == me);
|
|
|
}
|
|
|
else if (mpi_rank != *xrank)
|
|
|
{
|
|
|
- _STARPU_MPI_DEBUG(100, "Another node %d had already been selected to execute the codelet\n", *xrank);
|
|
|
+ _STARPU_MPI_DEBUG(100, "Another node %d had already been selected to execute the codelet, can't now set %d\n", *xrank, mpi_rank);
|
|
|
*inconsistent_execute = 1;
|
|
|
}
|
|
|
}
|
|
@@ -92,8 +96,12 @@ int _starpu_mpi_find_executee_node(starpu_data_handle_t data, enum starpu_data_a
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
-void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int xrank, int do_execute, MPI_Comm comm)
|
|
|
+void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int xrank, int do_execute, int prio, MPI_Comm comm)
|
|
|
{
|
|
|
+ if (data && xrank == STARPU_MPI_PER_NODE)
|
|
|
+ {
|
|
|
+ STARPU_ASSERT_MSG(starpu_mpi_data_get_rank(data) == STARPU_MPI_PER_NODE, "If task is replicated, it has to access only per-node data");
|
|
|
+ }
|
|
|
if (data && mode & STARPU_R)
|
|
|
{
|
|
|
int mpi_rank = starpu_mpi_data_get_rank(data);
|
|
@@ -103,7 +111,7 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
|
|
|
_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
|
|
|
}
|
|
|
|
|
|
- if (do_execute && mpi_rank != me)
|
|
|
+ if (do_execute && mpi_rank != STARPU_MPI_PER_NODE && mpi_rank != me)
|
|
|
{
|
|
|
/* The node is going to execute the codelet, but it does not own the data, it needs to receive the data from the owner node */
|
|
|
int already_received = _starpu_mpi_cache_received_data_set(data);
|
|
@@ -126,7 +134,7 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
|
|
|
if (data_tag == -1)
|
|
|
_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
|
|
|
_STARPU_MPI_DEBUG(1, "Sending data %p to %d\n", data, xrank);
|
|
|
- _SEND_DATA(data, mode, xrank, data_tag, comm, NULL, NULL);
|
|
|
+ _SEND_DATA(data, mode, xrank, data_tag, prio, comm, NULL, NULL);
|
|
|
}
|
|
|
// Else the data has already been sent
|
|
|
}
|
|
@@ -134,7 +142,7 @@ void _starpu_mpi_exchange_data_before_execution(starpu_data_handle_t data, enum
|
|
|
}
|
|
|
|
|
|
static
|
|
|
-void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int xrank, int do_execute, MPI_Comm comm)
|
|
|
+void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum starpu_data_access_mode mode, int me, int xrank, int do_execute, int prio, MPI_Comm comm)
|
|
|
{
|
|
|
if (mode & STARPU_W)
|
|
|
{
|
|
@@ -144,9 +152,13 @@ void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum s
|
|
|
{
|
|
|
_STARPU_ERROR("StarPU needs to be told the MPI rank of this data, using starpu_mpi_data_register\n");
|
|
|
}
|
|
|
+ if (mpi_rank == STARPU_MPI_PER_NODE)
|
|
|
+ {
|
|
|
+ mpi_rank = me;
|
|
|
+ }
|
|
|
if (mpi_rank == me)
|
|
|
{
|
|
|
- if (xrank != -1 && me != xrank)
|
|
|
+ if (xrank != -1 && (xrank != STARPU_MPI_PER_NODE && me != xrank))
|
|
|
{
|
|
|
_STARPU_MPI_DEBUG(1, "Receive data %p back from the task %d which executed the codelet ...\n", data, xrank);
|
|
|
if(data_tag == -1)
|
|
@@ -159,7 +171,7 @@ void _starpu_mpi_exchange_data_after_execution(starpu_data_handle_t data, enum s
|
|
|
if(data_tag == -1)
|
|
|
_STARPU_ERROR("StarPU needs to be told the MPI tag of this data, using starpu_mpi_data_register\n");
|
|
|
_STARPU_MPI_DEBUG(1, "Send data %p back to its owner %d...\n", data, mpi_rank);
|
|
|
- _SEND_DATA(data, mode, mpi_rank, data_tag, comm, NULL, NULL);
|
|
|
+ _SEND_DATA(data, mode, mpi_rank, data_tag, prio, comm, NULL, NULL);
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -182,6 +194,10 @@ void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum star
|
|
|
if ((mode & STARPU_R) && do_execute)
|
|
|
{
|
|
|
int mpi_rank = starpu_mpi_data_get_rank(data);
|
|
|
+ if (mpi_rank == STARPU_MPI_PER_NODE)
|
|
|
+ {
|
|
|
+ mpi_rank = me;
|
|
|
+ }
|
|
|
if (mpi_rank != me && mpi_rank != -1)
|
|
|
{
|
|
|
starpu_data_invalidate_submit(data);
|
|
@@ -191,8 +207,9 @@ void _starpu_mpi_clear_data_after_execution(starpu_data_handle_t data, enum star
|
|
|
}
|
|
|
|
|
|
static
|
|
|
-int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nodes, int *xrank, int *do_execute, struct starpu_data_descr **descrs_p, int *nb_data_p, va_list varg_list)
|
|
|
+int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nodes, int *xrank, int *do_execute, struct starpu_data_descr **descrs_p, int *nb_data_p, int *prio_p, va_list varg_list)
|
|
|
{
|
|
|
+ /* XXX: _fstarpu_mpi_task_decode_v needs to be updated at the same time */
|
|
|
va_list varg_list_copy;
|
|
|
int inconsistent_execute = 0;
|
|
|
int arg_type;
|
|
@@ -200,6 +217,7 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
|
|
|
int nb_allocated_data = 16;
|
|
|
struct starpu_data_descr *descrs;
|
|
|
int nb_data;
|
|
|
+ int prio = 0;
|
|
|
int select_node_policy = STARPU_MPI_NODE_SELECTION_CURRENT_POLICY;
|
|
|
|
|
|
_STARPU_TRACE_TASK_MPI_DECODE_START();
|
|
@@ -348,7 +366,7 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
|
|
|
}
|
|
|
else if (arg_type==STARPU_PRIORITY)
|
|
|
{
|
|
|
- (void)va_arg(varg_list_copy, int);
|
|
|
+ prio = va_arg(varg_list_copy, int);
|
|
|
}
|
|
|
/* STARPU_EXECUTE_ON_NODE handled above */
|
|
|
/* STARPU_EXECUTE_ON_DATA handled above */
|
|
@@ -386,6 +404,12 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
|
|
|
{
|
|
|
(void)va_arg(varg_list_copy, void *);
|
|
|
}
|
|
|
+ else if (arg_type==STARPU_EXECUTE_WHERE)
|
|
|
+ {
|
|
|
+ // the flag is decoded and set later when
|
|
|
+ // calling function _starpu_task_insert_create()
|
|
|
+ (void)va_arg(varg_list_copy, unsigned long long);
|
|
|
+ }
|
|
|
else if (arg_type==STARPU_EXECUTE_ON_WORKER)
|
|
|
{
|
|
|
// the flag is decoded and set later when
|
|
@@ -425,32 +449,34 @@ int _starpu_mpi_task_decode_v(struct starpu_codelet *codelet, int me, int nb_nod
|
|
|
if (inconsistent_execute == 1 || *xrank == -1)
|
|
|
{
|
|
|
// We need to find out which node is going to execute the codelet.
|
|
|
- _STARPU_MPI_DISP("Different nodes are owning W data. The node to execute the codelet is going to be selected with the current selection node policy. See starpu_mpi_node_selection_set_current_policy() to change the policy, or use STARPU_EXECUTE_ON_NODE or STARPU_EXECUTE_ON_DATA to specify the node\n");
|
|
|
+ _STARPU_MPI_DEBUG(100, "Different nodes are owning W data. The node to execute the codelet is going to be selected with the current selection node policy. See starpu_mpi_node_selection_set_current_policy() to change the policy, or use STARPU_EXECUTE_ON_NODE or STARPU_EXECUTE_ON_DATA to specify the node\n");
|
|
|
*xrank = _starpu_mpi_select_node(me, nb_nodes, descrs, nb_data, select_node_policy);
|
|
|
- *do_execute = (me == *xrank);
|
|
|
+ *do_execute = *xrank == STARPU_MPI_PER_NODE || (me == *xrank);
|
|
|
}
|
|
|
else
|
|
|
{
|
|
|
_STARPU_MPI_DEBUG(100, "Inconsistent=%d - xrank=%d\n", inconsistent_execute, *xrank);
|
|
|
- *do_execute = (me == *xrank);
|
|
|
+ *do_execute = *xrank == STARPU_MPI_PER_NODE || (me == *xrank);
|
|
|
}
|
|
|
_STARPU_MPI_DEBUG(100, "do_execute=%d\n", *do_execute);
|
|
|
|
|
|
*descrs_p = descrs;
|
|
|
*nb_data_p = nb_data;
|
|
|
+ *prio_p = prio;
|
|
|
|
|
|
_STARPU_TRACE_TASK_MPI_DECODE_END();
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
static
|
|
|
-int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, struct starpu_task **task, int *xrank_p, struct starpu_data_descr **descrs_p, int *nb_data_p, va_list varg_list)
|
|
|
+int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, struct starpu_task **task, int *xrank_p, struct starpu_data_descr **descrs_p, int *nb_data_p, int *prio_p, va_list varg_list)
|
|
|
{
|
|
|
int me, do_execute, xrank, nb_nodes;
|
|
|
int ret;
|
|
|
int i;
|
|
|
struct starpu_data_descr *descrs;
|
|
|
int nb_data;
|
|
|
+ int prio;
|
|
|
|
|
|
_STARPU_MPI_LOG_IN();
|
|
|
|
|
@@ -458,25 +484,36 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru
|
|
|
starpu_mpi_comm_size(comm, &nb_nodes);
|
|
|
|
|
|
/* Find out whether we are to execute the data because we own the data to be written to. */
|
|
|
- ret = _starpu_mpi_task_decode_v(codelet, me, nb_nodes, &xrank, &do_execute, &descrs, &nb_data, varg_list);
|
|
|
- if (ret < 0) return ret;
|
|
|
+ ret = _starpu_mpi_task_decode_v(codelet, me, nb_nodes, &xrank, &do_execute, &descrs, &nb_data, &prio, varg_list);
|
|
|
+ if (ret < 0)
|
|
|
+ return ret;
|
|
|
|
|
|
_STARPU_TRACE_TASK_MPI_PRE_START();
|
|
|
/* Send and receive data as requested */
|
|
|
for(i=0 ; i<nb_data ; i++)
|
|
|
{
|
|
|
- _starpu_mpi_exchange_data_before_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, comm);
|
|
|
+ _starpu_mpi_exchange_data_before_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm);
|
|
|
}
|
|
|
|
|
|
- if (xrank_p) *xrank_p = xrank;
|
|
|
- if (nb_data_p) *nb_data_p = nb_data;
|
|
|
+ if (xrank_p)
|
|
|
+ *xrank_p = xrank;
|
|
|
+ if (nb_data_p)
|
|
|
+ *nb_data_p = nb_data;
|
|
|
+ if (prio_p)
|
|
|
+ *prio_p = prio;
|
|
|
+
|
|
|
if (descrs_p)
|
|
|
*descrs_p = descrs;
|
|
|
else
|
|
|
free(descrs);
|
|
|
+
|
|
|
+
|
|
|
_STARPU_TRACE_TASK_MPI_PRE_END();
|
|
|
|
|
|
- if (do_execute == 0) return 1;
|
|
|
+ if (do_execute == 0)
|
|
|
+ {
|
|
|
+ return 1;
|
|
|
+ }
|
|
|
else
|
|
|
{
|
|
|
va_list varg_list_copy;
|
|
@@ -493,7 +530,7 @@ int _starpu_mpi_task_build_v(MPI_Comm comm, struct starpu_codelet *codelet, stru
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struct starpu_data_descr *descrs, int nb_data)
|
|
|
+int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struct starpu_data_descr *descrs, int nb_data, int prio)
|
|
|
{
|
|
|
int me, i;
|
|
|
|
|
@@ -502,7 +539,7 @@ int _starpu_mpi_task_postbuild_v(MPI_Comm comm, int xrank, int do_execute, struc
|
|
|
|
|
|
for(i=0 ; i<nb_data ; i++)
|
|
|
{
|
|
|
- _starpu_mpi_exchange_data_after_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, comm);
|
|
|
+ _starpu_mpi_exchange_data_after_execution(descrs[i].handle, descrs[i].mode, me, xrank, do_execute, prio, comm);
|
|
|
_starpu_mpi_clear_data_after_execution(descrs[i].handle, descrs[i].mode, me, do_execute);
|
|
|
}
|
|
|
|
|
@@ -522,9 +559,11 @@ int _starpu_mpi_task_insert_v(MPI_Comm comm, struct starpu_codelet *codelet, va_
|
|
|
int do_execute = 0;
|
|
|
struct starpu_data_descr *descrs;
|
|
|
int nb_data;
|
|
|
+ int prio;
|
|
|
|
|
|
- ret = _starpu_mpi_task_build_v(comm, codelet, &task, &xrank, &descrs, &nb_data, varg_list);
|
|
|
- if (ret < 0) return ret;
|
|
|
+ ret = _starpu_mpi_task_build_v(comm, codelet, &task, &xrank, &descrs, &nb_data, &prio, varg_list);
|
|
|
+ if (ret < 0)
|
|
|
+ return ret;
|
|
|
|
|
|
if (ret == 0)
|
|
|
{
|
|
@@ -544,7 +583,7 @@ int _starpu_mpi_task_insert_v(MPI_Comm comm, struct starpu_codelet *codelet, va_
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- int val = _starpu_mpi_task_postbuild_v(comm, xrank, do_execute, descrs, nb_data);
|
|
|
+ int val = _starpu_mpi_task_postbuild_v(comm, xrank, do_execute, descrs, nb_data, prio);
|
|
|
|
|
|
if (ret == 0 && pre_submit_hook)
|
|
|
pre_submit_hook(task);
|
|
@@ -581,10 +620,10 @@ struct starpu_task *starpu_mpi_task_build(MPI_Comm comm, struct starpu_codelet *
|
|
|
int ret;
|
|
|
|
|
|
va_start(varg_list, codelet);
|
|
|
- ret = _starpu_mpi_task_build_v(comm, codelet, &task, NULL, NULL, NULL, varg_list);
|
|
|
+ ret = _starpu_mpi_task_build_v(comm, codelet, &task, NULL, NULL, NULL, NULL, varg_list);
|
|
|
va_end(varg_list);
|
|
|
STARPU_ASSERT(ret >= 0);
|
|
|
- if (ret > 0) return NULL; else return task;
|
|
|
+ return (ret > 0) ? NULL : task;
|
|
|
}
|
|
|
|
|
|
int starpu_mpi_task_post_build(MPI_Comm comm, struct starpu_codelet *codelet, ...)
|
|
@@ -594,17 +633,19 @@ int starpu_mpi_task_post_build(MPI_Comm comm, struct starpu_codelet *codelet, ..
|
|
|
va_list varg_list;
|
|
|
struct starpu_data_descr *descrs;
|
|
|
int nb_data;
|
|
|
+ int prio;
|
|
|
|
|
|
starpu_mpi_comm_rank(comm, &me);
|
|
|
starpu_mpi_comm_size(comm, &nb_nodes);
|
|
|
|
|
|
va_start(varg_list, codelet);
|
|
|
/* Find out whether we are to execute the data because we own the data to be written to. */
|
|
|
- ret = _starpu_mpi_task_decode_v(codelet, me, nb_nodes, &xrank, &do_execute, &descrs, &nb_data, varg_list);
|
|
|
+ ret = _starpu_mpi_task_decode_v(codelet, me, nb_nodes, &xrank, &do_execute, &descrs, &nb_data, &prio, varg_list);
|
|
|
va_end(varg_list);
|
|
|
- if (ret < 0) return ret;
|
|
|
+ if (ret < 0)
|
|
|
+ return ret;
|
|
|
|
|
|
- return _starpu_mpi_task_postbuild_v(comm, xrank, do_execute, descrs, nb_data);
|
|
|
+ return _starpu_mpi_task_postbuild_v(comm, xrank, do_execute, descrs, nb_data, prio);
|
|
|
}
|
|
|
|
|
|
struct _starpu_mpi_redux_data_args
|
|
@@ -679,7 +720,7 @@ void _starpu_mpi_redux_data_recv_callback(void *callback_arg)
|
|
|
|
|
|
/* TODO: this should rather be implicitly called by starpu_mpi_task_insert when
|
|
|
* a data previously accessed in REDUX mode gets accessed in R mode. */
|
|
|
-void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
|
|
|
+void starpu_mpi_redux_data_prio(MPI_Comm comm, starpu_data_handle_t data_handle, int prio)
|
|
|
{
|
|
|
int me, rank, tag, nb_nodes;
|
|
|
|
|
@@ -741,7 +782,8 @@ void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
|
|
|
args->taskB->cl = args->data_handle->redux_cl;
|
|
|
args->taskB->sequential_consistency = 0;
|
|
|
STARPU_TASK_SET_HANDLE(args->taskB, args->data_handle, 0);
|
|
|
- taskBs[j] = args->taskB; j++;
|
|
|
+ taskBs[j] = args->taskB;
|
|
|
+ j++;
|
|
|
|
|
|
// Submit taskA
|
|
|
starpu_task_insert(&_starpu_mpi_redux_data_read_cl,
|
|
@@ -762,7 +804,7 @@ void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
|
|
|
else
|
|
|
{
|
|
|
_STARPU_MPI_DEBUG(1, "Sending redux handle to %d ...\n", rank);
|
|
|
- starpu_mpi_isend_detached(data_handle, rank, tag, comm, NULL, NULL);
|
|
|
+ starpu_mpi_isend_detached_prio(data_handle, rank, tag, prio, comm, NULL, NULL);
|
|
|
starpu_task_insert(data_handle->init_cl, STARPU_W, data_handle, 0);
|
|
|
}
|
|
|
/* FIXME: In order to prevent simultaneous receive submissions
|
|
@@ -773,3 +815,7 @@ void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
|
|
|
starpu_task_wait_for_all();
|
|
|
|
|
|
}
|
|
|
+void starpu_mpi_redux_data(MPI_Comm comm, starpu_data_handle_t data_handle)
|
|
|
+{
|
|
|
+ return starpu_mpi_redux_data_prio(comm, data_handle, 0);
|
|
|
+}
|