|
@@ -21,6 +21,7 @@
|
|
|
#include <common/list.h>
|
|
#include <common/list.h>
|
|
|
#include <starpu_mpi_private.h>
|
|
#include <starpu_mpi_private.h>
|
|
|
#include <mpi_failure_tolerance/starpu_mpi_checkpoint.h>
|
|
#include <mpi_failure_tolerance/starpu_mpi_checkpoint.h>
|
|
|
|
|
+#include <mpi_failure_tolerance/starpu_mpi_checkpoint_tracker.h>
|
|
|
|
|
|
|
|
#ifdef __cplusplus
|
|
#ifdef __cplusplus
|
|
|
extern "C"
|
|
extern "C"
|
|
@@ -34,7 +35,7 @@ extern "C"
|
|
|
extern starpu_pthread_mutex_t cp_template_mutex;
|
|
extern starpu_pthread_mutex_t cp_template_mutex;
|
|
|
extern int cp_template_array_size;
|
|
extern int cp_template_array_size;
|
|
|
extern starpu_mpi_checkpoint_template_t cp_template_array[MAX_CP_TEMPLATE_NUMBER];
|
|
extern starpu_mpi_checkpoint_template_t cp_template_array[MAX_CP_TEMPLATE_NUMBER];
|
|
|
-struct _starpu_mpi_checkpoint_template_tracking_inst* last_valid_tracking_inst;
|
|
|
|
|
|
|
+
|
|
|
extern struct _starpu_mpi_checkpoint_template_tracking_inst_list future_tracking_list;
|
|
extern struct _starpu_mpi_checkpoint_template_tracking_inst_list future_tracking_list;
|
|
|
extern struct _starpu_mpi_checkpoint_template_tracking_inst_list pending_tracking_list;
|
|
extern struct _starpu_mpi_checkpoint_template_tracking_inst_list pending_tracking_list;
|
|
|
|
|
|
|
@@ -49,7 +50,7 @@ int _checkpoint_template_digest_ack_reception(int checkpoint_id, int checkpoint_
|
|
|
|
|
|
|
|
int _starpu_mpi_checkpoint_post_cp_discard_recv(starpu_mpi_checkpoint_template_t cp_template);
|
|
int _starpu_mpi_checkpoint_post_cp_discard_recv(starpu_mpi_checkpoint_template_t cp_template);
|
|
|
|
|
|
|
|
-int _starpu_mpi_checkpoint_template_register(starpu_mpi_checkpoint_template_t *cp_template, int cp_id, va_list varg_list);
|
|
|
|
|
|
|
+int _starpu_mpi_checkpoint_template_register(starpu_mpi_checkpoint_template_t *cp_template, int cp_id, int cp_domain, va_list varg_list);
|
|
|
|
|
|
|
|
|
|
|
|
|
LIST_TYPE(_starpu_mpi_checkpoint_template_tracking_inst,
|
|
LIST_TYPE(_starpu_mpi_checkpoint_template_tracking_inst,
|
|
@@ -88,128 +89,6 @@ struct _starpu_mpi_checkpoint_template
|
|
|
|
|
|
|
|
};
|
|
};
|
|
|
|
|
|
|
|
-static inline void _starpu_mpi_checkpoint_template_tracking_inst_init(struct _starpu_mpi_checkpoint_template_tracking_inst *tracking_inst)
|
|
|
|
|
-{
|
|
|
|
|
- tracking_inst->cp_template = NULL;
|
|
|
|
|
- tracking_inst->cp_id = -1;
|
|
|
|
|
- tracking_inst->cp_inst = -1;
|
|
|
|
|
- tracking_inst->ack_msg_count = 0;
|
|
|
|
|
- tracking_inst->valid = 0;
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-static struct _starpu_mpi_checkpoint_template_tracking_inst* _starpu_mpi_checkpoint_template_get_tracking_inst_by_id_inst(struct _starpu_mpi_checkpoint_template_tracking_inst_list tracking_list, int cp_id, int cp_inst)
|
|
|
|
|
-{
|
|
|
|
|
- struct _starpu_mpi_checkpoint_template_tracking_inst* item;
|
|
|
|
|
-
|
|
|
|
|
- for (item =_starpu_mpi_checkpoint_template_tracking_inst_list_begin(&tracking_list) ;
|
|
|
|
|
- item!=_starpu_mpi_checkpoint_template_tracking_inst_list_end(&tracking_list) ;
|
|
|
|
|
- item =_starpu_mpi_checkpoint_template_tracking_inst_list_next(item))
|
|
|
|
|
- {
|
|
|
|
|
- if (item->cp_id==cp_id && item->cp_inst==cp_inst)
|
|
|
|
|
- {
|
|
|
|
|
- return item;
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
- return NULL;
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-
|
|
|
|
|
-static struct _starpu_mpi_checkpoint_template_tracking_inst* _starpu_mpi_checkpoint_template_check_validation_coherency(int cp_id, int cp_inst)
|
|
|
|
|
-{
|
|
|
|
|
- struct _starpu_mpi_checkpoint_template_tracking_inst* item;
|
|
|
|
|
-
|
|
|
|
|
- for (item =_starpu_mpi_checkpoint_template_tracking_inst_list_begin(&pending_tracking_list) ;
|
|
|
|
|
- item!=_starpu_mpi_checkpoint_template_tracking_inst_list_end(&pending_tracking_list) ;
|
|
|
|
|
- item =_starpu_mpi_checkpoint_template_tracking_inst_list_next(item))
|
|
|
|
|
- {
|
|
|
|
|
- if (last_valid_tracking_inst->cp_inst > cp_inst)
|
|
|
|
|
- {
|
|
|
|
|
- return last_valid_tracking_inst;
|
|
|
|
|
- }
|
|
|
|
|
- else if (item->cp_id==cp_id && item->cp_inst==cp_inst)
|
|
|
|
|
- {
|
|
|
|
|
- if (item->valid)
|
|
|
|
|
- {
|
|
|
|
|
- STARPU_ABORT_MSG("The checkpoint (id:%d - inst:%d) is already validated. This should not happen.\n",
|
|
|
|
|
- cp_id, cp_inst);
|
|
|
|
|
- }
|
|
|
|
|
- return item;
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
- return NULL;
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-static int _starpu_mpi_checkpoint_template_create_instance_tracker(starpu_mpi_checkpoint_template_t cp_template, int cp_id, int cp_inst)
|
|
|
|
|
-{
|
|
|
|
|
- struct _starpu_mpi_checkpoint_template_tracking_inst* item;
|
|
|
|
|
- int ret=0;
|
|
|
|
|
- item = _starpu_mpi_checkpoint_template_get_tracking_inst_by_id_inst(future_tracking_list, cp_id, cp_inst);
|
|
|
|
|
- if (NULL != item)
|
|
|
|
|
- {
|
|
|
|
|
- _starpu_mpi_checkpoint_template_tracking_inst_list_erase(&future_tracking_list, item);
|
|
|
|
|
- _starpu_mpi_checkpoint_template_tracking_inst_list_push_front(&pending_tracking_list, item);
|
|
|
|
|
- ret = item->ack_msg_count;
|
|
|
|
|
- item->ack_msg_count = item->cp_template->message_to_send_number-item->ack_msg_count;
|
|
|
|
|
- if (item->ack_msg_count==0)
|
|
|
|
|
- {
|
|
|
|
|
- //TODO:Process discard send
|
|
|
|
|
- STARPU_ABORT_MSG("Not yet implemented.\n");
|
|
|
|
|
- }
|
|
|
|
|
- else if (item->ack_msg_count<0)
|
|
|
|
|
- {
|
|
|
|
|
- STARPU_ABORT_MSG("Already received to many ack msgs(n:%d) for cp(id:%d - inst:%d). This should never happen.\n", ret, cp_id, cp_inst);
|
|
|
|
|
- }
|
|
|
|
|
- }
|
|
|
|
|
- else
|
|
|
|
|
- {
|
|
|
|
|
- item = _starpu_mpi_checkpoint_template_tracking_inst_new();
|
|
|
|
|
- _starpu_mpi_checkpoint_template_tracking_inst_init(item);
|
|
|
|
|
- item->cp_id = cp_id;
|
|
|
|
|
- item->cp_inst = cp_inst;
|
|
|
|
|
- item->cp_template = cp_template;
|
|
|
|
|
- item->ack_msg_count = cp_template->message_to_send_number;
|
|
|
|
|
- _starpu_mpi_checkpoint_template_tracking_inst_list_push_front(&pending_tracking_list, item);
|
|
|
|
|
- }
|
|
|
|
|
- return ret;
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-static int _starpu_mpi_checkpoint_template_add_future_inst(starpu_mpi_checkpoint_template_t cp_template, int cp_id, int cp_inst)
|
|
|
|
|
-{
|
|
|
|
|
- struct _starpu_mpi_checkpoint_template_tracking_inst* item;
|
|
|
|
|
- int current_instance = get_current_instance();
|
|
|
|
|
- item = _starpu_mpi_checkpoint_template_get_tracking_inst_by_id_inst(future_tracking_list, cp_id, cp_inst);
|
|
|
|
|
- _STARPU_MPI_DEBUG(10, "I received an ack msg for a checkpoint(id:%d) instance I did not initiated yet(received:%d - last:%d). Let's remember it's already acknowledged.\n", cp_id, cp_inst, current_instance);
|
|
|
|
|
- if (item != NULL)
|
|
|
|
|
- {
|
|
|
|
|
- item->ack_msg_count++;
|
|
|
|
|
- return item->ack_msg_count;
|
|
|
|
|
- }
|
|
|
|
|
- _STARPU_MPI_DEBUG(10, "This instance is not yet registered, let's create it.\n");
|
|
|
|
|
- item = _starpu_mpi_checkpoint_template_tracking_inst_new();
|
|
|
|
|
- _starpu_mpi_checkpoint_template_tracking_inst_init(item);
|
|
|
|
|
- item->cp_id = cp_id;
|
|
|
|
|
- item->cp_inst = cp_inst;
|
|
|
|
|
- item->cp_template = cp_template;
|
|
|
|
|
- item->ack_msg_count = 1;
|
|
|
|
|
- _starpu_mpi_checkpoint_template_tracking_inst_list_push_front(&future_tracking_list, item);
|
|
|
|
|
- return item->ack_msg_count;
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-static int _starpu_mpi_checkpoint_template_track_inst_treat_ack(starpu_mpi_checkpoint_template_t cp_template, int cp_id, int cp_inst)
|
|
|
|
|
-{
|
|
|
|
|
- int ret;
|
|
|
|
|
- struct _starpu_mpi_checkpoint_template_tracking_inst* item;
|
|
|
|
|
- item = _starpu_mpi_checkpoint_template_get_tracking_inst_by_id_inst(pending_tracking_list, cp_id, cp_inst);
|
|
|
|
|
- if (item != NULL)
|
|
|
|
|
- {
|
|
|
|
|
- item->ack_msg_count--;
|
|
|
|
|
- return item->ack_msg_count;
|
|
|
|
|
- }
|
|
|
|
|
- _STARPU_MPI_DEBUG(10, "The instance (id:%d - inst:%d) is not pending, let's ask the future instance instead.\n", cp_id, cp_inst);
|
|
|
|
|
- ret = _starpu_mpi_checkpoint_template_add_future_inst(cp_template, cp_id, cp_inst);
|
|
|
|
|
- return -ret;
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
static starpu_mpi_checkpoint_template_t _starpu_mpi_get_checkpoint_template_by_id(int checkpoint_id)
|
|
static starpu_mpi_checkpoint_template_t _starpu_mpi_get_checkpoint_template_by_id(int checkpoint_id)
|
|
|
{
|
|
{
|
|
|
starpu_pthread_mutex_lock(&cp_template_mutex);
|
|
starpu_pthread_mutex_lock(&cp_template_mutex);
|
|
@@ -228,25 +107,8 @@ static starpu_mpi_checkpoint_template_t _starpu_mpi_get_checkpoint_template_by_i
|
|
|
return NULL;
|
|
return NULL;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-//static int checkpoint_template_count_ack_inst(int cp_id, int cp_inst)
|
|
|
|
|
-//{
|
|
|
|
|
-// int ret;
|
|
|
|
|
-// struct _starpu_mpi_checkpoint_template_instance* item;
|
|
|
|
|
-// starpu_mpi_checkpoint_template_t cp_template = _starpu_mpi_get_checkpoint_template_by_id(cp_id);
|
|
|
|
|
-// for (item=_starpu_mpi_checkpoint_template_instance_list_begin(&cp_template->pending_inst_list) ;
|
|
|
|
|
-// item!=_starpu_mpi_checkpoint_template_instance_list_end(&cp_template->pending_inst_list) ;
|
|
|
|
|
-// item=_starpu_mpi_checkpoint_template_instance_list_next(item))
|
|
|
|
|
-// {
|
|
|
|
|
-// if (item->instance == cp_inst)
|
|
|
|
|
-// {
|
|
|
|
|
-// item->count--;
|
|
|
|
|
-// return item->count;
|
|
|
|
|
-// }
|
|
|
|
|
-// }
|
|
|
|
|
-// _STARPU_MPI_DEBUG(10, "This instance is not pending, let's see with the future instance instead.\n");
|
|
|
|
|
-// ret = checkpoint_template_add_future_inst(cp_template, cp_inst);
|
|
|
|
|
-// return -ret;
|
|
|
|
|
-//}
|
|
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
|
|
|
|
|
|
|
|
static inline int checkpoint_template_array_realloc(int** array, int* max_size, int growth_factor)
|
|
static inline int checkpoint_template_array_realloc(int** array, int* max_size, int growth_factor)
|
|
@@ -284,11 +146,12 @@ static inline struct _starpu_mpi_checkpoint_template_item* _starpu_mpi_checkpoin
|
|
|
return item;
|
|
return item;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-static inline starpu_mpi_checkpoint_template_t _starpu_mpi_checkpoint_template_new(int cp_id)
|
|
|
|
|
|
|
+static inline starpu_mpi_checkpoint_template_t _starpu_mpi_checkpoint_template_new(int cp_id, int cp_domain)
|
|
|
{
|
|
{
|
|
|
starpu_mpi_checkpoint_template_t _cp_template;
|
|
starpu_mpi_checkpoint_template_t _cp_template;
|
|
|
_STARPU_MPI_CALLOC(_cp_template, 1, sizeof(struct _starpu_mpi_checkpoint_template));
|
|
_STARPU_MPI_CALLOC(_cp_template, 1, sizeof(struct _starpu_mpi_checkpoint_template));
|
|
|
_cp_template->cp_id = cp_id;
|
|
_cp_template->cp_id = cp_id;
|
|
|
|
|
+ _cp_template->checkpoint_domain = cp_domain;
|
|
|
_cp_template->backup_of_array_max_size = _CHECKPOINT_TEMPLATE_BACKUPED_RANK_ARRAY_DEFAULT_SIZE;
|
|
_cp_template->backup_of_array_max_size = _CHECKPOINT_TEMPLATE_BACKUPED_RANK_ARRAY_DEFAULT_SIZE;
|
|
|
starpu_malloc((void**)&_cp_template->backup_of_array, _CHECKPOINT_TEMPLATE_BACKUPED_RANK_ARRAY_DEFAULT_SIZE);
|
|
starpu_malloc((void**)&_cp_template->backup_of_array, _CHECKPOINT_TEMPLATE_BACKUPED_RANK_ARRAY_DEFAULT_SIZE);
|
|
|
_cp_template->backup_of_array[0] = -1;
|
|
_cp_template->backup_of_array[0] = -1;
|