|
|
@@ -21,6 +21,7 @@
|
|
|
#include <mpi_failure_tolerance/starpu_mpi_checkpoint_template.h>
|
|
|
#include <mpi_failure_tolerance/starpu_mpi_checkpoint_package.h>
|
|
|
#include <mpi_failure_tolerance/starpu_mpi_ft_service_comms.h>
|
|
|
+#include <mpi_failure_tolerance/starpu_mpi_ft_stats.h>
|
|
|
#include <starpu_mpi_private.h>
|
|
|
#include <mpi/starpu_mpi_mpi_backend.h> // Should be deduced at preprocessing (Nmad vs MPI)
|
|
|
#include "starpu_mpi_cache.h"
|
|
|
@@ -36,6 +37,13 @@ extern struct _starpu_mpi_req* _starpu_mpi_irecv_cache_aware(starpu_data_handle_
|
|
|
|
|
|
|
|
|
|
|
|
+void _arg_free(void* _args) /* Callback taking a struct _starpu_mpi_cp_ack_arg_cb*: logs the checkpoint id/instance/destination of the ack, then frees the argument. It is the callback handed to _ft_service_msg_isend_cb when the ack is sent. */
|
|
|
+{
|
|
|
+ struct _starpu_mpi_cp_ack_arg_cb* arg = (struct _starpu_mpi_cp_ack_arg_cb*) _args;
|
|
|
+ _STARPU_MPI_DEBUG(3, "Ack send succeeded cpid:%d, cpinst:%d, dest:%d\n", arg->msg.checkpoint_id, arg->msg.checkpoint_instance, arg->rank);
|
|
|
+ free(arg); /* arg (including the embedded msg that was sent) must not be touched after this point */
|
|
|
+}
|
|
|
+
|
|
|
void _starpu_mpi_treat_ack_receipt_cb(void* _args)
|
|
|
{
|
|
|
struct _starpu_mpi_cp_ack_arg_cb* arg = (struct _starpu_mpi_cp_ack_arg_cb*) _args;
|
|
|
@@ -51,60 +59,76 @@ void _starpu_mpi_treat_ack_receipt_cb(void* _args)
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-void _arg_free(void* _args)
|
|
|
+void _starpu_mpi_store_data_and_send_ack_cb(struct _starpu_mpi_cp_ack_arg_cb* arg) /* Registers the received checkpoint data with checkpoint_package_data_add(), then sends the ack message stored in arg->msg to arg->rank. */
|
|
|
+{
|
|
|
+ checkpoint_package_data_add(arg->msg.checkpoint_id, arg->msg.checkpoint_instance, arg->rank, arg->tag, arg->type, arg->copy_handle, arg->count);
|
|
|
+ _STARPU_MPI_DEBUG(3,"Send ack msg to %d: id=%d inst=%d\n", arg->rank, arg->msg.checkpoint_id, arg->msg.checkpoint_instance);
|
|
|
+ _ft_service_msg_isend_cb((void*)&arg->msg, sizeof(struct _starpu_mpi_cp_ack_msg), arg->rank, _STARPU_MPI_TAG_CP_ACK, MPI_COMM_WORLD, _arg_free, arg); /* the send buffer lives inside arg; _arg_free is passed as the callback and frees arg */
|
|
|
+ _STARPU_MPI_FT_STATS_SEND_FT_SERVICE_MSG(sizeof(struct _starpu_mpi_cp_ack_msg)); /* stats: account for one ack-sized service message sent */
|
|
|
+}
|
|
|
+
|
|
|
+void _starpu_mpi_push_cp_ack_recv_cb(struct _starpu_mpi_cp_ack_arg_cb* arg) /* Posts the reception of the ack message expected from arg->rank; the message is received into arg->msg. */
|
|
|
+{
|
|
|
+ _STARPU_MPI_DEBUG(3, "Posting ack recv cb from %d\n", arg->rank);
|
|
|
+ _ft_service_msg_irecv_cb((void*)&arg->msg, sizeof(struct _starpu_mpi_cp_ack_msg), arg->rank, _STARPU_MPI_TAG_CP_ACK, MPI_COMM_WORLD, _starpu_mpi_treat_ack_receipt_cb, arg); /* _starpu_mpi_treat_ack_receipt_cb is passed as the callback, with arg as its argument */
|
|
|
+ _STARPU_MPI_FT_STATS_RECV_FT_SERVICE_MSG(sizeof(struct _starpu_mpi_cp_ack_msg)); /* stats: account for one ack-sized service message on the receive side */
|
|
|
+}
|
|
|
+
|
|
|
+void _recv_internal_dup_ro_cb(void* _args)
|
|
|
{
|
|
|
struct _starpu_mpi_cp_ack_arg_cb* arg = (struct _starpu_mpi_cp_ack_arg_cb*) _args;
|
|
|
- _STARPU_MPI_DEBUG(3, "Ack send succeeded cpid:%d, cpinst:%d, dest:%d\n", arg->msg.checkpoint_id, arg->msg.checkpoint_instance, arg->rank);
|
|
|
- free(arg);
|
|
|
+ starpu_data_release(arg->copy_handle);
|
|
|
+ _starpu_mpi_store_data_and_send_ack_cb(arg);
|
|
|
}
|
|
|
|
|
|
-void _starpu_mpi_store_data_and_send_ack_cb(void* _args)
|
|
|
+void _recv_cp_external_data_cb(void* _args)
|
|
|
{
|
|
|
struct _starpu_mpi_cp_ack_arg_cb* arg = (struct _starpu_mpi_cp_ack_arg_cb*) _args;
|
|
|
- if (STARPU_VALUE == arg->type) {
|
|
|
- // an handle has specifically been created, Let's get the value back, and unregister the handle
|
|
|
- arg->copy_handle = starpu_data_handle_to_pointer(arg->handle, STARPU_MAIN_RAM);
|
|
|
- starpu_data_unregister_submit(arg->handle);
|
|
|
- }
|
|
|
- checkpoint_package_data_add(arg->msg.checkpoint_id, arg->msg.checkpoint_instance, arg->rank, arg->tag, arg->type, arg->copy_handle, arg->count);
|
|
|
- _STARPU_MPI_DEBUG(3,"Send ack msg to %d: id=%d inst=%d\n", arg->rank, arg->msg.checkpoint_id, arg->msg.checkpoint_instance);
|
|
|
- _ft_service_msg_isend_cb((void*)&arg->msg, sizeof(struct _starpu_mpi_cp_ack_msg), arg->rank, _STARPU_MPI_TAG_CP_ACK, MPI_COMM_WORLD, _arg_free, _args);
|
|
|
+ _STARPU_MPI_FT_STATS_RECV_CP_DATA(arg->type==STARPU_VALUE?arg->count:arg->type==STARPU_R?starpu_data_get_size(arg->handle):-1);
|
|
|
+ // A handle has specifically been created for this receive: get the value back, then unregister the handle
|
|
|
+ arg->copy_handle = starpu_data_handle_to_pointer(arg->handle, STARPU_MAIN_RAM);
|
|
|
+ starpu_data_unregister_submit(arg->handle);
|
|
|
+ _starpu_mpi_store_data_and_send_ack_cb(arg);
|
|
|
+}
|
|
|
|
|
|
-}void _starpu_mpi_release_and_store_data_and_send_ack_cb(void* _args)
|
|
|
+void _recv_cp_internal_data_cb(void* _args)
|
|
|
{
|
|
|
struct _starpu_mpi_cp_ack_arg_cb* arg = (struct _starpu_mpi_cp_ack_arg_cb*) _args;
|
|
|
- starpu_data_release(arg->copy_handle);
|
|
|
- checkpoint_package_data_add(arg->msg.checkpoint_id, arg->msg.checkpoint_instance, arg->rank, arg->tag, arg->type, arg->copy_handle, arg->count);
|
|
|
- _STARPU_MPI_DEBUG(3,"Send ack msg to %d: id=%d inst=%d\n", arg->rank, arg->msg.checkpoint_id, arg->msg.checkpoint_instance);
|
|
|
- _ft_service_msg_isend_cb((void*)&arg->msg, sizeof(struct _starpu_mpi_cp_ack_msg), arg->rank, _STARPU_MPI_TAG_CP_ACK, MPI_COMM_WORLD, _arg_free, _args);
|
|
|
+ _STARPU_MPI_FT_STATS_RECV_CP_DATA(
|
|
|
+ arg->type == STARPU_VALUE ? arg->count : arg->type == STARPU_R ? starpu_data_get_size(arg->handle) : -1);
|
|
|
}
|
|
|
|
|
|
-void _starpu_mpi_push_cp_ack_recv_cb(void* _args)
|
|
|
+void _recv_cached_cp_internal_data_cb(void* _args)
|
|
|
{
|
|
|
struct _starpu_mpi_cp_ack_arg_cb* arg = (struct _starpu_mpi_cp_ack_arg_cb*) _args;
|
|
|
- if (STARPU_VALUE == arg->type)
|
|
|
- {
|
|
|
- free(starpu_data_handle_to_pointer(arg->handle, STARPU_MAIN_RAM));
|
|
|
- starpu_data_unregister_submit(arg->handle);
|
|
|
- }
|
|
|
- _STARPU_MPI_DEBUG(3, "Posting ack recv cb from %d\n", arg->rank);
|
|
|
- _ft_service_msg_irecv_cb((void*)&arg->msg, sizeof(struct _starpu_mpi_cp_ack_msg), arg->rank, _STARPU_MPI_TAG_CP_ACK, MPI_COMM_WORLD, _starpu_mpi_treat_ack_receipt_cb, _args);
|
|
|
+ _STARPU_MPI_FT_STATS_RECV_CACHED_CP_DATA(arg->type==STARPU_VALUE?arg->count:arg->type==STARPU_R?starpu_data_get_size(arg->handle):-1);
|
|
|
+ starpu_data_release(arg->handle);
|
|
|
}
|
|
|
|
|
|
-void _starpu_mpi_cached_push_cp_ack_recv_cb(void* _args)
|
|
|
+void _send_cp_external_data_cb(void* _args)
|
|
|
{
|
|
|
struct _starpu_mpi_cp_ack_arg_cb* arg = (struct _starpu_mpi_cp_ack_arg_cb*) _args;
|
|
|
- if (STARPU_R == arg->type)
|
|
|
- {
|
|
|
- starpu_data_release(arg->handle);
|
|
|
- }
|
|
|
- _STARPU_MPI_DEBUG(3, "Posting ack recv cb from %d\n", arg->rank);
|
|
|
- _ft_service_msg_irecv_cb((void*)&arg->msg, sizeof(struct _starpu_mpi_cp_ack_msg), arg->rank, _STARPU_MPI_TAG_CP_ACK, MPI_COMM_WORLD, _starpu_mpi_treat_ack_receipt_cb, _args);
|
|
|
+ _STARPU_MPI_FT_STATS_SEND_CP_DATA(arg->type==STARPU_VALUE?arg->count:arg->type==STARPU_R?starpu_data_get_size(arg->handle):-1);
|
|
|
+ free(starpu_data_handle_to_pointer(arg->handle, STARPU_MAIN_RAM));
|
|
|
+ starpu_data_unregister_submit(arg->handle);
|
|
|
+ _starpu_mpi_push_cp_ack_recv_cb(arg);
|
|
|
}
|
|
|
|
|
|
-void _starpu_data_release_cb(void* _arg)
|
|
|
+void _send_cp_internal_data_cb(void* _args)
|
|
|
{
|
|
|
- starpu_data_release(_arg);
|
|
|
+ struct _starpu_mpi_cp_ack_arg_cb* arg = (struct _starpu_mpi_cp_ack_arg_cb*) _args;
|
|
|
+ _STARPU_MPI_FT_STATS_SEND_CP_DATA(
|
|
|
+ arg->type == STARPU_VALUE ? arg->count : arg->type == STARPU_R ? starpu_data_get_size(arg->handle) : -1);
|
|
|
+ _starpu_mpi_push_cp_ack_recv_cb(arg);
|
|
|
+}
|
|
|
+
|
|
|
+void _send_cached_cp_internal_data_cb(void* _args) /* Callback taking a struct _starpu_mpi_cp_ack_arg_cb*; passed as the second callback of _starpu_mpi_isend_cache_aware() for internal checkpoint data. Records the stat, releases arg->handle, then posts the ack reception. */
|
|
|
+{
|
|
|
+ struct _starpu_mpi_cp_ack_arg_cb* arg = (struct _starpu_mpi_cp_ack_arg_cb*) _args;
|
|
|
+ _STARPU_MPI_FT_STATS_SEND_CACHED_CP_DATA(
|
|
|
+ arg->type == STARPU_VALUE ? arg->count : arg->type == STARPU_R ? starpu_data_get_size(arg->handle) : -1); /* size argument: arg->count for STARPU_VALUE, the handle's data size for STARPU_R, -1 for any other type */
|
|
|
+ starpu_data_release(arg->handle); /* NOTE(review): presumably pairs with an acquire done on the cached path inside _starpu_mpi_isend_cache_aware - confirm */
|
|
|
+ _starpu_mpi_push_cp_ack_recv_cb(arg);
|
|
|
}
|
|
|
|
|
|
|
|
|
@@ -142,7 +166,7 @@ int starpu_mpi_submit_checkpoint_template(starpu_mpi_checkpoint_template_t cp_te
|
|
|
arg->rank = item->backupped_by;
|
|
|
_STARPU_MPI_DEBUG(0, "Submit CP: sending external data:%d, tag:%ld, to :%d\n", (int)(*(int*)cpy_ptr), arg->tag, arg->rank);
|
|
|
starpu_mpi_isend_detached_prio(arg->handle, arg->rank, arg->tag, 0, MPI_COMM_WORLD,
|
|
|
- &_starpu_mpi_push_cp_ack_recv_cb, (void*)arg);
|
|
|
+ &_send_cp_external_data_cb, (void*)arg);
|
|
|
// The callback needs to free the handle specially created for the send, and post ack recv
|
|
|
}
|
|
|
else if (item->backup_of != -1)
|
|
|
@@ -152,7 +176,7 @@ int starpu_mpi_submit_checkpoint_template(starpu_mpi_checkpoint_template_t cp_te
|
|
|
arg->rank = item->backup_of;
|
|
|
_STARPU_MPI_DEBUG(0, "Submit CP: receiving external data tag:%ld, from :%d\n", arg->tag, arg->rank);
|
|
|
starpu_mpi_irecv_detached(arg->handle, arg->rank, arg->tag, MPI_COMM_WORLD,
|
|
|
- &_starpu_mpi_store_data_and_send_ack_cb, (void*)arg);
|
|
|
+ &_recv_cp_external_data_cb, (void*)arg);
|
|
|
// The callback needs to store the received data and post ack send
|
|
|
}
|
|
|
break;
|
|
|
@@ -170,7 +194,8 @@ int starpu_mpi_submit_checkpoint_template(starpu_mpi_checkpoint_template_t cp_te
|
|
|
arg->msg.checkpoint_id = cp_template->cp_id;
|
|
|
arg->msg.checkpoint_instance = current_instance;
|
|
|
_starpu_mpi_isend_cache_aware(*handle, item->backupped_by, starpu_mpi_data_get_tag(*handle), MPI_COMM_WORLD, 1, 0, 0,
|
|
|
- &_starpu_mpi_push_cp_ack_recv_cb, (void*)arg, &_starpu_mpi_cached_push_cp_ack_recv_cb, (void*)arg, 1);
|
|
|
+ &_send_cp_internal_data_cb, (void*)arg,
|
|
|
+ &_send_cached_cp_internal_data_cb, (void*)arg, 1);
|
|
|
// the callbacks need to post ack recv. The cache one needs to release the handle.
|
|
|
|
|
|
}
|
|
|
@@ -186,10 +211,11 @@ int starpu_mpi_submit_checkpoint_template(starpu_mpi_checkpoint_template_t cp_te
|
|
|
arg->msg.checkpoint_id = cp_template->cp_id;
|
|
|
arg->msg.checkpoint_instance = current_instance;
|
|
|
_starpu_mpi_irecv_cache_aware(*handle, starpu_mpi_data_get_rank(*handle), starpu_mpi_data_get_tag(*handle), MPI_COMM_WORLD, 1, 0,
|
|
|
- NULL, NULL, &_starpu_data_release_cb, (void*)arg->handle, 1, 0, 1);
|
|
|
+ &_recv_cp_internal_data_cb, (void*)arg,
|
|
|
+ &_recv_cached_cp_internal_data_cb, (void*)arg, 1, 0, 1);
|
|
|
// The callback needs to do nothing. The cached one must release the handle.
|
|
|
starpu_data_dup_ro(&arg->copy_handle, arg->handle, 1);
|
|
|
- starpu_data_acquire_cb(arg->copy_handle, STARPU_R, _starpu_mpi_release_and_store_data_and_send_ack_cb, arg);
|
|
|
+ starpu_data_acquire_cb(arg->copy_handle, STARPU_R, _recv_internal_dup_ro_cb, arg);
|
|
|
// The callback need to store the data and post ack send.
|
|
|
}
|
|
|
break;
|