|
@@ -20,10 +20,11 @@
|
|
|
#include <sys/param.h>
|
|
|
#include <starpu_mpi_private.h>
|
|
|
#include <starpu_mpi_cache.h>
|
|
|
+#include <mpi/starpu_mpi_mpi_backend.h>
|
|
|
#include <mpi_failure_tolerance/starpu_mpi_checkpoint_template.h>
|
|
|
#include <mpi_failure_tolerance/starpu_mpi_checkpoint.h>
|
|
|
#include <mpi_failure_tolerance/starpu_mpi_ft_service_comms.h>
|
|
|
-#include <mpi/starpu_mpi_mpi_backend.h>
|
|
|
+#include <mpi_failure_tolerance/starpu_mpi_checkpoint_package.h>
|
|
|
|
|
|
|
|
|
#define MAX_CP_TEMPLATE_NUMBER 32 // Arbitrary limit
|
|
@@ -96,7 +97,6 @@ int _starpu_mpi_checkpoint_template_add_entry(starpu_mpi_checkpoint_template_t c
|
|
|
void* ptr;
|
|
|
int count;
|
|
|
int backupped_by;
|
|
|
- int backup_of;
|
|
|
int data_rank;
|
|
|
starpu_mpi_tag_t tag;
|
|
|
backup_of_fn _backup_of;
|
|
@@ -109,7 +109,6 @@ int _starpu_mpi_checkpoint_template_add_entry(starpu_mpi_checkpoint_template_t c
|
|
|
ptr = va_arg(varg_list, void*);
|
|
|
count = 1;
|
|
|
backupped_by = va_arg(varg_list, int);
|
|
|
- backup_of = -1;
|
|
|
data_rank = starpu_mpi_data_get_rank(*(starpu_data_handle_t*)ptr);
|
|
|
if (my_rank==data_rank)
|
|
|
{
|
|
@@ -184,6 +183,7 @@ void checkpoint_discard(void* _args)
|
|
|
// TODO: flag data as "CP ready", since the CP has succeeded
|
|
|
struct _starpu_mpi_cp_discard_arg_cb* arg = (struct _starpu_mpi_cp_discard_arg_cb*) _args;
|
|
|
fprintf(stderr, "DISCARDING OLD CHECKPOINT DATA - new one is CPID:%d - CPINST:%d\n", arg->msg.checkpoint_id, arg->msg.checkpoint_instance);
|
|
|
+ checkpoint_package_data_del(arg->msg.checkpoint_id, arg->msg.checkpoint_instance, arg->rank);
|
|
|
}
|
|
|
|
|
|
int _starpu_mpi_checkpoint_post_cp_discard_recv(starpu_mpi_checkpoint_template_t cp_template)
|
|
@@ -388,8 +388,6 @@ int starpu_mpi_checkpoint_template_add_entry(starpu_mpi_checkpoint_template_t* c
|
|
|
|
|
|
|
|
|
int _checkpoint_template_digest_ack_reception(int checkpoint_id, int checkpoint_instance) {
|
|
|
- int old_cp_id;
|
|
|
- starpu_mpi_checkpoint_template_t old_cp_template;
|
|
|
starpu_mpi_checkpoint_template_t cp_template = _starpu_mpi_get_checkpoint_template_by_id(checkpoint_id);
|
|
|
starpu_pthread_mutex_lock(&cp_template_mutex);
|
|
|
fprintf(stderr, "Digesting ack recv: id=%d, inst=%d\n", checkpoint_id, checkpoint_instance);
|