|
@@ -19,6 +19,7 @@
|
|
|
|
|
|
#include <mpi_failure_tolerance/starpu_mpi_checkpoint.h>
|
|
|
#include <mpi_failure_tolerance/starpu_mpi_checkpoint_template.h>
|
|
|
+#include <mpi_failure_tolerance/starpu_mpi_checkpoint_package.h>
|
|
|
#include <mpi_failure_tolerance/starpu_mpi_ft_service_comms.h>
|
|
|
#include <starpu_mpi_private.h>
|
|
|
#include <mpi/starpu_mpi_mpi_backend.h> // Should be deduced at preprocessing (Nmad vs MPI)
|
|
@@ -59,6 +60,13 @@ void _starpu_mpi_push_cp_ack_send_cb(void* _args)
|
|
|
|
|
|
}
|
|
|
|
|
|
+void _starpu_mpi_store_data_and_push_cp_ack_send_cb(void* _args) /* Callback: hands the copied data to checkpoint_package_data_add(), then forwards the same argument to _starpu_mpi_push_cp_ack_send_cb(). */
|
|
|
+{
|
|
|
+ struct _starpu_mpi_cp_ack_arg_cb* arg = (struct _starpu_mpi_cp_ack_arg_cb*) _args; /* _args is the opaque callback argument, interpreted as a struct _starpu_mpi_cp_ack_arg_cb */
|
|
|
+ checkpoint_package_data_add(arg->msg.checkpoint_id, arg->msg.checkpoint_instance, arg->rank, arg->tag, arg->type, arg->copy_handle, arg->count); /* passes checkpoint id/instance, rank, tag, type, the copy handle and count; presumably records the copy in the checkpoint package -- TODO confirm against starpu_mpi_checkpoint_package.h */
|
|
|
+ _starpu_mpi_push_cp_ack_send_cb(_args); /* must run after the add above: NOTE(review) whether this callee frees or reuses _args is not visible here */
|
|
|
+}
|
|
|
+
|
|
|
void _starpu_mpi_push_cp_ack_recv_cb(void* _args)
|
|
|
{
|
|
|
struct _starpu_mpi_cp_ack_arg_cb* arg = (struct _starpu_mpi_cp_ack_arg_cb*) _args;
|
|
@@ -71,7 +79,7 @@ void _starpu_checkpoint_cached_data_recv_copy_and_ack(void* _arg)
|
|
|
{ /* Cached-data variant: registers a copy handle, starts the copy with a completion callback, then releases the source handle. */
|
|
|
struct _starpu_mpi_cp_ack_arg_cb* arg = (struct _starpu_mpi_cp_ack_arg_cb*) _arg; /* _arg is the opaque callback argument, interpreted as a struct _starpu_mpi_cp_ack_arg_cb */
|
|
|
starpu_data_register_same(&arg->copy_handle, arg->handle); /* fills arg->copy_handle with a new handle registered from arg->handle */
|
|
|
- starpu_data_cpy(arg->copy_handle, arg->handle, 1, _starpu_mpi_push_cp_ack_send_cb, _arg);
|
|
|
+ starpu_data_cpy(arg->copy_handle, arg->handle, 1, _starpu_mpi_store_data_and_push_cp_ack_send_cb, _arg); /* copy handle -> copy_handle; third argument 1 is the asynchronous flag per the StarPU API; the callback receives _arg */
|
|
|
starpu_data_release(arg->handle); /* presumably pairs with an acquire performed before this callback ran -- TODO confirm against caller */
|
|
|
}
|
|
|
|
|
@@ -79,7 +87,7 @@ void _starpu_checkpoint_data_recv_copy_and_ack(void* _arg)
|
|
|
{ /* Non-cached variant: registers a copy handle and starts the copy with a completion callback; unlike the cached variant, no starpu_data_release() is issued here. */
|
|
|
struct _starpu_mpi_cp_ack_arg_cb* arg = (struct _starpu_mpi_cp_ack_arg_cb*) _arg; /* _arg is the opaque callback argument, interpreted as a struct _starpu_mpi_cp_ack_arg_cb */
|
|
|
starpu_data_register_same(&arg->copy_handle, arg->handle); /* fills arg->copy_handle with a new handle registered from arg->handle */
|
|
|
- starpu_data_cpy(arg->copy_handle, arg->handle, 1, _starpu_mpi_push_cp_ack_send_cb, _arg);
|
|
|
+ starpu_data_cpy(arg->copy_handle, arg->handle, 1, _starpu_mpi_store_data_and_push_cp_ack_send_cb, _arg); /* copy handle -> copy_handle; third argument 1 is the asynchronous flag per the StarPU API; the callback receives _arg */
|
|
|
}
|
|
|
|
|
|
int starpu_mpi_submit_checkpoint_template(starpu_mpi_checkpoint_template_t cp_template)
|
|
@@ -116,6 +124,9 @@ int starpu_mpi_submit_checkpoint_template(starpu_mpi_checkpoint_template_t cp_te
|
|
|
struct _starpu_mpi_cp_ack_arg_cb* arg = calloc(1, sizeof(struct _starpu_mpi_cp_ack_arg_cb));
|
|
|
arg->rank = item->backup_rank;
|
|
|
arg->handle = handle;
|
|
|
+ arg->tag = starpu_mpi_data_get_tag(handle);
|
|
|
+ arg->type = STARPU_R;
|
|
|
+ arg->count = item->count;
|
|
|
arg->msg.checkpoint_id = cp_template->cp_template_id;
|
|
|
arg->msg.checkpoint_instance = cp_template->cp_template_current_instance;
|
|
|
_starpu_mpi_isend_cache_aware(handle, item->backup_rank, starpu_mpi_data_get_tag(handle), MPI_COMM_WORLD, 1, 0, 0,
|
|
@@ -127,6 +138,9 @@ int starpu_mpi_submit_checkpoint_template(starpu_mpi_checkpoint_template_t cp_te
|
|
|
struct _starpu_mpi_cp_ack_arg_cb* arg = calloc(1, sizeof(struct _starpu_mpi_cp_ack_arg_cb));
|
|
|
arg->rank = starpu_mpi_data_get_rank(handle);
|
|
|
arg->handle = handle;
|
|
|
+ arg->tag = starpu_mpi_data_get_tag(handle);
|
|
|
+ arg->type = STARPU_R;
|
|
|
+ arg->count = item->count;
|
|
|
arg->msg.checkpoint_id = cp_template->cp_template_id;
|
|
|
arg->msg.checkpoint_instance = cp_template->cp_template_current_instance;
|
|
|
_starpu_mpi_irecv_cache_aware(handle, starpu_mpi_data_get_rank(handle), starpu_mpi_data_get_tag(handle), MPI_COMM_WORLD, 1, 0,
|