|
@@ -22,6 +22,8 @@
|
|
|
#include <starpu_mpi_cache.h>
|
|
|
#include <mpi_failure_tolerance/starpu_mpi_checkpoint_template.h>
|
|
|
#include <mpi_failure_tolerance/starpu_mpi_checkpoint.h>
|
|
|
+#include <mpi_failure_tolerance/starpu_mpi_ft_service_comms.h>
|
|
|
+#include <mpi/starpu_mpi_mpi_backend.h>
|
|
|
|
|
|
|
|
|
#define MAX_CP_TEMPLATE_NUMBER 32 // Arbitrary limit
|
|
@@ -32,15 +34,20 @@ int my_rank;
|
|
|
int size;
|
|
|
int cp_template_number = 0;
|
|
|
struct _starpu_mpi_cp_ack_msg last_valid_checkpoint;
|
|
|
+starpu_mpi_checkpoint_template_t pending_checkpoint;
|
|
|
+starpu_pthread_mutex_t checkpoint_pending_mutex;
|
|
|
|
|
|
typedef int (*backup_of_fn)(int);
|
|
|
|
|
|
+
|
|
|
void checkpoint_template_lib_init(void) {
|
|
|
starpu_pthread_mutex_init(&cp_template_mutex, NULL);
|
|
|
+ starpu_pthread_mutex_init(&checkpoint_pending_mutex, NULL);
|
|
|
starpu_mpi_comm_rank(MPI_COMM_WORLD, &my_rank);
|
|
|
starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
|
|
|
last_valid_checkpoint.checkpoint_id = -1;
|
|
|
last_valid_checkpoint.checkpoint_instance = -1;
|
|
|
+ pending_checkpoint = NULL;
|
|
|
}
|
|
|
|
|
|
void checkpoint_template_lib_quit(void) {
|
|
@@ -55,6 +62,29 @@ void checkpoint_template_lib_quit(void) {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+int set_pending_checkpoint_template(starpu_mpi_checkpoint_template_t _pending_checkpoint)
|
|
|
+{
|
|
|
+ int ret=starpu_pthread_mutex_trylock(&checkpoint_pending_mutex);
|
|
|
+ if (EBUSY==ret)
|
|
|
+ {
|
|
|
+ fprintf(stderr, "The process is blocked, a checkpoint has been submitted while the previous one's "
|
|
|
+ "submission has not ended. The submission has to wait, try to submit checkpoint "
|
|
|
+ "less frequently.\n");
|
|
|
+ starpu_pthread_mutex_lock(&checkpoint_pending_mutex);
|
|
|
+ }
|
|
|
+ STARPU_ASSERT_MSG(pending_checkpoint==NULL, "There is already a checkpoint submission pending. This should not happen.\n");
|
|
|
+ pending_checkpoint = _pending_checkpoint;
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+
|
|
|
+int valid_pending_checkpoint_template(starpu_mpi_checkpoint_template_t _pending_checkpoint)
|
|
|
+{
|
|
|
+ STARPU_ASSERT_MSG(pending_checkpoint==_pending_checkpoint, "This checkpoint is not the one marked as pending. This should not happen.\n");
|
|
|
+ pending_checkpoint = NULL;
|
|
|
+ starpu_pthread_mutex_unlock(&checkpoint_pending_mutex);
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+
|
|
|
int starpu_mpi_checkpoint_template_create(starpu_mpi_checkpoint_template_t* cp_template, int cp_id)
|
|
|
{
|
|
|
*cp_template = _starpu_mpi_checkpoint_template_new(cp_id);
|
|
@@ -64,26 +94,30 @@ int starpu_mpi_checkpoint_template_create(starpu_mpi_checkpoint_template_t* cp_t
|
|
|
int _starpu_mpi_checkpoint_template_add_entry(starpu_mpi_checkpoint_template_t cp_template, int arg_type, va_list varg_list)
|
|
|
{
|
|
|
void* ptr;
|
|
|
- int count;
|
|
|
- int my_backup;
|
|
|
- int backup_of;
|
|
|
- int data_rank;
|
|
|
+ int count;
|
|
|
+ int backupped_by;
|
|
|
+ int backup_of;
|
|
|
+ int data_rank;
|
|
|
starpu_mpi_tag_t tag;
|
|
|
- backup_of_fn _backup_of;
|
|
|
+ backup_of_fn _backup_of;
|
|
|
|
|
|
STARPU_ASSERT_MSG(!(arg_type & STARPU_COMMUTE), "Unable to checkpoint non sequential task flow.\n");
|
|
|
|
|
|
switch(arg_type)
|
|
|
{
|
|
|
case STARPU_R:
|
|
|
- ptr = va_arg(varg_list, void*);
|
|
|
- count = 1;
|
|
|
- my_backup = va_arg(varg_list, int);
|
|
|
- backup_of = -1;
|
|
|
- data_rank = starpu_mpi_data_get_rank(*(starpu_data_handle_t*)ptr);
|
|
|
- if (my_rank==data_rank || my_rank==my_backup)
|
|
|
+ ptr = va_arg(varg_list, void*);
|
|
|
+ count = 1;
|
|
|
+ backupped_by = va_arg(varg_list, int);
|
|
|
+ backup_of = -1;
|
|
|
+ data_rank = starpu_mpi_data_get_rank(*(starpu_data_handle_t*)ptr);
|
|
|
+ if (my_rank==data_rank)
|
|
|
+ {
|
|
|
+ return _starpu_mpi_checkpoint_template_add_data(cp_template, arg_type, ptr, count, backupped_by, -1, -1);
|
|
|
+ }
|
|
|
+ else if(my_rank == backupped_by)
|
|
|
{
|
|
|
- return _starpu_mpi_checkpoint_template_add_data(cp_template, arg_type, ptr, count, my_backup, backup_of, -1);
|
|
|
+ return _starpu_mpi_checkpoint_template_add_data(cp_template, arg_type, ptr, count, -1, data_rank, -1);
|
|
|
}
|
|
|
else
|
|
|
{
|
|
@@ -118,7 +152,7 @@ int _starpu_mpi_checkpoint_template_add_entry(starpu_mpi_checkpoint_template_t c
|
|
|
// case STARPU_DATA_ARRAY:
|
|
|
// ptr = va_arg(varg_list, void*);
|
|
|
// count = va_arg(varg_list, int);
|
|
|
-// my_backup = va_arg(varg_list, int);
|
|
|
+// backupped_by = va_arg(varg_list, int);
|
|
|
// backup_of = -1;
|
|
|
// break;
|
|
|
default:
|
|
@@ -127,6 +161,136 @@ int _starpu_mpi_checkpoint_template_add_entry(starpu_mpi_checkpoint_template_t c
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+static starpu_mpi_checkpoint_template_t _starpu_mpi_get_checkpoint_template_by_id(int checkpoint_id)
|
|
|
+{
|
|
|
+ starpu_pthread_mutex_lock(&cp_template_mutex);
|
|
|
+ for (int i=0 ; i<cp_template_number ; i++)
|
|
|
+ {
|
|
|
+ starpu_pthread_mutex_lock(&cp_template_array[i]->mutex);
|
|
|
+ if (cp_template_array[i]->cp_template_id == checkpoint_id)
|
|
|
+ {
|
|
|
+ starpu_pthread_mutex_unlock(&cp_template_array[i]->mutex);
|
|
|
+ starpu_pthread_mutex_unlock(&cp_template_mutex);
|
|
|
+ return cp_template_array[i];
|
|
|
+ }
|
|
|
+ starpu_pthread_mutex_unlock(&cp_template_array[i]->mutex);
|
|
|
+ }
|
|
|
+ starpu_pthread_mutex_unlock(&cp_template_mutex);
|
|
|
+ return NULL;
|
|
|
+}
|
|
|
+
|
|
|
+void checkpoint_discard(void* _args)
|
|
|
+{
|
|
|
+ // TODO: flag data as "CP ready", since the CP has succeeded
|
|
|
+ struct _starpu_mpi_cp_ack_msg* message = (struct _starpu_mpi_cp_ack_msg*) _args;
|
|
|
+ fprintf(stderr, "DISCARDING OLD CHECKPOINT DATA - new one is CPID:%d - CPINST:%d\n", message->checkpoint_id, message->checkpoint_instance);
|
|
|
+}
|
|
|
+
|
|
|
+int _starpu_mpi_checkpoint_post_cp_discard_recv(starpu_mpi_checkpoint_template_t cp_template)
|
|
|
+{
|
|
|
+ /* A new CP is submitted. We must post matching recv for the message warning the future checkpoint integrity (so
|
|
|
+ * I can tag the data as CP validated, and discard old data from deprecated checkpoint).
|
|
|
+ * I will receive a msg if I have old CP data, or if I am the back up for a node into the upcoming Checkpoint.
|
|
|
+ * * Here the union of the different list is processed to post message reception only once.
|
|
|
+ * TODO: For the message logging discard, I will receive message from the people I exchanged with since the last checkpoint.
|
|
|
+ * */
|
|
|
+ struct _starpu_mpi_cp_discard_arg_cb* arg;
|
|
|
+ int i, j, flag;
|
|
|
+ starpu_mpi_checkpoint_template_t old_template;
|
|
|
+ for (i=0 ; i<cp_template->backup_of_array_used_size ; i++)
|
|
|
+ {
|
|
|
+ starpu_malloc((void**)&arg, sizeof(struct _starpu_mpi_cp_discard_arg_cb));
|
|
|
+ arg->rank = cp_template->backup_of_array[i];
|
|
|
+ fprintf(stderr, "Posting DISCARD msg reception from %d\n", arg->rank);
|
|
|
+ _ft_service_msg_irecv_cb(&arg->msg, sizeof(struct _starpu_mpi_cp_ack_msg), arg->rank, _STARPU_MPI_TAG_CP_DISCARD, MPI_COMM_WORLD, checkpoint_discard, (void*)arg);
|
|
|
+ }
|
|
|
+ if (last_valid_checkpoint.checkpoint_id == -1)
|
|
|
+ {
|
|
|
+ return -1;
|
|
|
+ }
|
|
|
+ else if (last_valid_checkpoint.checkpoint_id!=cp_template->cp_template_id)
|
|
|
+ {
|
|
|
+ old_template = _starpu_mpi_get_checkpoint_template_by_id(last_valid_checkpoint.checkpoint_id);
|
|
|
+ for (i=0 ; i<old_template->backup_of_array_used_size ; i++)
|
|
|
+ {
|
|
|
+ flag=0;
|
|
|
+ for(j=0 ; j<cp_template->backup_of_array_used_size ; j++)
|
|
|
+ {
|
|
|
+ if (cp_template->backup_of_array[j] == old_template->backup_of_array[i])
|
|
|
+ {
|
|
|
+ flag = 1;
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (flag==0)
|
|
|
+ {
|
|
|
+ starpu_malloc((void**)&arg, sizeof(struct _starpu_mpi_cp_discard_arg_cb));
|
|
|
+ arg->rank = old_template->backup_of_array[i];
|
|
|
+ fprintf(stderr, "Posting DISCARD msg reception from %d - LAST VALIDATED CP\n", arg->rank);
|
|
|
+ _ft_service_msg_irecv_cb(&arg->msg, sizeof(struct _starpu_mpi_cp_ack_msg), arg->rank, _STARPU_MPI_TAG_CP_DISCARD, MPI_COMM_WORLD, checkpoint_discard, (void*)arg);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+
|
|
|
/*
 * Callback that releases its argument with starpu_free().
 * Registered in this file as the completion callback of the DISCARD
 * service message sends, whose argument is allocated with starpu_malloc().
 */
void free_arg(void* _args)
{
	starpu_free(_args);
}
|
|
|
+
|
|
|
+int _starpu_mpi_checkpoint_post_cp_discard_send(starpu_mpi_checkpoint_template_t cp_template, int cp_id, int cp_instance)
|
|
|
+{
|
|
|
+ /* The CP data replication has succeeded. I must send the message warning the future checkpoint integrity (so
|
|
|
+ * they can tag the data as CP validated, and discard old data from deprecated checkpoint).
|
|
|
+ * I will send to one if it has old CP data from me, or if it is my backup for a data into the just succeeded Checkpoint.
|
|
|
+ * * Here the union of the different list is processed to send message only once.
|
|
|
+ * TODO: For the message logging discard, I will send message to the people I exchanged with since the last checkpoint.
|
|
|
+ * */
|
|
|
+ struct _starpu_mpi_cp_discard_arg_cb* arg;
|
|
|
+ int i, j, flag;
|
|
|
+ starpu_mpi_checkpoint_template_t old_template;
|
|
|
+ fprintf(stderr, "backupped_by_array_used_size: %d\n", cp_template->backupped_by_array_used_size);
|
|
|
+ for (i=0 ; i<cp_template->backupped_by_array_used_size ; i++)
|
|
|
+ {
|
|
|
+ starpu_malloc((void**)&arg, sizeof(struct _starpu_mpi_cp_discard_arg_cb));
|
|
|
+ arg->rank = cp_template->backupped_by_array[i];
|
|
|
+ fprintf(stderr, "Sending DISCARD msg reception to %d\n", arg->rank);
|
|
|
+ arg->msg.checkpoint_id = cp_id;
|
|
|
+ arg->msg.checkpoint_instance = cp_instance;
|
|
|
+ _ft_service_msg_isend_cb(&arg->msg, sizeof(struct _starpu_mpi_cp_ack_msg), arg->rank, _STARPU_MPI_TAG_CP_DISCARD, MPI_COMM_WORLD, free_arg, (void*)arg);
|
|
|
+ }
|
|
|
+ if (last_valid_checkpoint.checkpoint_id == -1)
|
|
|
+ {
|
|
|
+ return -1;
|
|
|
+ }
|
|
|
+ else if (last_valid_checkpoint.checkpoint_id!=cp_template->cp_template_id)
|
|
|
+ {
|
|
|
+ old_template = _starpu_mpi_get_checkpoint_template_by_id(last_valid_checkpoint.checkpoint_id);
|
|
|
+ for (i=0 ; i<old_template->backupped_by_array_used_size ; i++)
|
|
|
+ {
|
|
|
+ flag=0;
|
|
|
+ for(j=0 ; j<cp_template->backupped_by_array_used_size ; j++)
|
|
|
+ {
|
|
|
+ if (cp_template->backupped_by_array[j] == old_template->backupped_by_array[i])
|
|
|
+ {
|
|
|
+ flag = 1;
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (flag==0)
|
|
|
+ {
|
|
|
+ starpu_malloc((void**)&arg, sizeof(struct _starpu_mpi_cp_discard_arg_cb));
|
|
|
+ arg->rank = old_template->backupped_by_array[i];
|
|
|
+ fprintf(stderr, "Sending DISCARD msg to %d - OLD CP\n", arg->rank);
|
|
|
+ arg->msg.checkpoint_id = cp_id;
|
|
|
+ arg->msg.checkpoint_instance = cp_instance;
|
|
|
+ _ft_service_msg_isend_cb(&arg->msg, sizeof(struct _starpu_mpi_cp_ack_msg), arg->rank, _STARPU_MPI_TAG_CP_DISCARD, MPI_COMM_WORLD, free_arg, (void*)arg);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return 0;
|
|
|
+}
|
|
|
|
|
|
int _starpu_mpi_checkpoint_template_freeze(starpu_mpi_checkpoint_template_t cp_template)
|
|
|
{
|
|
@@ -221,28 +385,35 @@ int starpu_mpi_checkpoint_template_add_entry(starpu_mpi_checkpoint_template_t* c
|
|
|
return ret;
|
|
|
}
|
|
|
|
|
|
+
|
|
|
+
|
|
|
int _checkpoint_template_digest_ack_reception(int checkpoint_id, int checkpoint_instance) {
|
|
|
+ int old_cp_id;
|
|
|
+ starpu_mpi_checkpoint_template_t old_cp_template;
|
|
|
+ starpu_mpi_checkpoint_template_t cp_template = _starpu_mpi_get_checkpoint_template_by_id(checkpoint_id);
|
|
|
starpu_pthread_mutex_lock(&cp_template_mutex);
|
|
|
fprintf(stderr, "Digesting ack recv: id=%d, inst=%d\n", checkpoint_id, checkpoint_instance);
|
|
|
- for (int i=0 ; i<cp_template_number ; i++)
|
|
|
+
|
|
|
+ starpu_pthread_mutex_lock(&cp_template->mutex);
|
|
|
+ if (cp_template->cp_template_current_instance == checkpoint_instance)
|
|
|
{
|
|
|
- starpu_pthread_mutex_lock(&cp_template_array[i]->mutex);
|
|
|
- if (cp_template_array[i]->cp_template_id == checkpoint_id && cp_template_array[i]->cp_template_current_instance == checkpoint_instance)
|
|
|
+ fprintf(stderr, "Inst found, remaining ack msg awaited:%d\n", cp_template->remaining_ack_awaited);
|
|
|
+ cp_template->remaining_ack_awaited--;
|
|
|
+ if (cp_template->remaining_ack_awaited == 0)
|
|
|
{
|
|
|
- fprintf(stderr, "Inst found, remaining ack msg awaited:%d\n", cp_template_array[i]->remaining_ack_awaited);
|
|
|
- cp_template_array[i]->remaining_ack_awaited--;
|
|
|
- if (cp_template_array[i]->remaining_ack_awaited == 0)
|
|
|
- {
|
|
|
- // TODO: share info about cp integrity
|
|
|
- fprintf(stderr, "All cp material for cpid:%d, cpinst:%d - have been sent and acknowledged.\n", checkpoint_id, checkpoint_instance);
|
|
|
- cp_template_array[i]->pending=0;
|
|
|
- }
|
|
|
- starpu_pthread_mutex_unlock(&cp_template_array[i]->mutex);
|
|
|
- starpu_pthread_mutex_unlock(&cp_template_mutex);
|
|
|
- return 0;
|
|
|
+ fprintf(stderr, "All cp material for cpid:%d, cpinst:%d - have been sent and acknowledged.\n", checkpoint_id, checkpoint_instance);
|
|
|
+ _starpu_mpi_checkpoint_post_cp_discard_send(cp_template, checkpoint_id, checkpoint_instance);
|
|
|
+ valid_pending_checkpoint_template(cp_template);
|
|
|
+ cp_template->pending=0;
|
|
|
+ last_valid_checkpoint.checkpoint_id = checkpoint_id;
|
|
|
+ last_valid_checkpoint.checkpoint_instance = checkpoint_instance;
|
|
|
+ fprintf(stderr, "Digested\n");
|
|
|
}
|
|
|
- starpu_pthread_mutex_unlock(&cp_template_array[i]->mutex);
|
|
|
+ starpu_pthread_mutex_unlock(&cp_template->mutex);
|
|
|
+ starpu_pthread_mutex_unlock(&cp_template_mutex);
|
|
|
+ return 0;
|
|
|
}
|
|
|
+ starpu_pthread_mutex_unlock(&cp_template->mutex);
|
|
|
starpu_pthread_mutex_unlock(&cp_template_mutex);
|
|
|
return -1;
|
|
|
}
|
|
@@ -259,12 +430,12 @@ int _starpu_mpi_checkpoint_template_print(starpu_mpi_checkpoint_template_t cp_te
|
|
|
fprintf(stderr,"Item %2d: ", i);
|
|
|
if (item->type == STARPU_VALUE)
|
|
|
{
|
|
|
- fprintf(stderr, "STARPU_VALUE - Value=%d - backupof:%d - backupedby:%d\n", (*(int *)(item->ptr)), item->backup_of, item->backup_rank);
|
|
|
+ fprintf(stderr, "STARPU_VALUE - Value=%d - backupof:%d - backupedby:%d\n", (*(int *)(item->ptr)), item->backup_of, item->backupped_by);
|
|
|
}
|
|
|
else if (item->type == STARPU_R)
|
|
|
{
|
|
|
val = *(int*)starpu_data_handle_to_pointer(*(starpu_data_handle_t*)(item->ptr), 0);
|
|
|
- fprintf(stderr, "STARPU_R - Value=%d - backupof:%d - backupedby:%d\n", val, item->backup_of, item->backup_rank);
|
|
|
+ fprintf(stderr, "STARPU_R - Value=%d - backupof:%d - backupedby:%d\n", val, item->backup_of, item->backupped_by);
|
|
|
}
|
|
|
else if (item->type == STARPU_DATA_ARRAY)
|
|
|
{
|