|
|
@@ -110,7 +110,7 @@ int _starpu_mpi_checkpoint_template_add_entry(starpu_mpi_checkpoint_template_t c
|
|
|
starpu_mpi_tag_t tag;
|
|
|
backup_of_fn _backup_of;
|
|
|
|
|
|
- STARPU_ASSERT_MSG(!(arg_type & STARPU_COMMUTE), "Unable to checkpoint non sequential task flow.\n");
|
|
|
+ arg_type = arg_type & ~STARPU_COMMUTE;
|
|
|
|
|
|
switch(arg_type)
|
|
|
{
|
|
|
@@ -168,10 +168,11 @@ int _starpu_mpi_checkpoint_template_add_entry(starpu_mpi_checkpoint_template_t c
|
|
|
}
|
|
|
}
|
|
|
|
|
|
// This hunk renames checkpoint_discard() to _cp_discard_message_recv_cb() and adds a statistics call.
-void checkpoint_discard(void* _args)
|
|
|
// Callback passed to _ft_service_msg_irecv_cb() when a checkpoint DISCARD message reception is posted.
// _args is the struct _starpu_mpi_cp_discard_arg_cb supplied at posting time (sender rank + message buffer).
+void _cp_discard_message_recv_cb(void* _args)
|
|
|
{
|
|
|
// TODO: store the information of the new CP, for restart purpose
|
|
|
struct _starpu_mpi_cp_discard_arg_cb* arg = (struct _starpu_mpi_cp_discard_arg_cb*) _args;
|
|
|
// Account for one received fault-tolerance service message of ack-message size.
+ _STARPU_MPI_FT_STATS_RECV_FT_SERVICE_MSG(sizeof(struct _starpu_mpi_cp_ack_msg));
|
|
|
_STARPU_MPI_DEBUG(0, "DISCARDING OLD CHECKPOINT DATA of rank %d - new one is CPID:%d - CPINST:%d\n", arg->rank, arg->msg.checkpoint_id, arg->msg.checkpoint_instance);
|
|
|
// Delete package data using the checkpoint id/instance carried by the message and the rank stored in arg.
// NOTE(review): arg is not released in this callback, whereas the send-side callback calls starpu_free() on its argument - confirm who owns it.
checkpoint_package_data_del(arg->msg.checkpoint_id, arg->msg.checkpoint_instance, arg->rank);
|
|
|
}
|
|
|
@@ -192,13 +193,15 @@ int _starpu_mpi_checkpoint_post_cp_discard_recv(starpu_mpi_checkpoint_template_t
|
|
|
starpu_malloc((void**)&arg, sizeof(struct _starpu_mpi_cp_discard_arg_cb));
|
|
|
arg->rank = cp_template->backup_of_array[i];
|
|
|
_STARPU_MPI_DEBUG(10, "Post DISCARD msg reception from %d\n", arg->rank);
|
|
|
- _ft_service_msg_irecv_cb(&arg->msg, sizeof(struct _starpu_mpi_cp_ack_msg), arg->rank, _STARPU_MPI_TAG_CP_INFO, MPI_COMM_WORLD, checkpoint_discard, (void*)arg);
|
|
|
+ _ft_service_msg_irecv_cb(&arg->msg, sizeof(struct _starpu_mpi_cp_ack_msg), arg->rank, _STARPU_MPI_TAG_CP_INFO,
|
|
|
+ MPI_COMM_WORLD, _cp_discard_message_recv_cb, (void *) arg);
|
|
|
}
|
|
|
return i;
|
|
|
}
|
|
|
|
|
|
// This hunk renames free_arg() to _cp_discard_message_send_cb() and adds a statistics call.
-void free_arg(void* _args)
|
|
|
// Callback passed to _ft_service_msg_isend_cb() when a checkpoint DISCARD message is sent.
// _args is the callback argument given at send time; the sent message buffer (arg->msg) lives inside it.
+void _cp_discard_message_send_cb(void* _args)
|
|
|
{
|
|
|
// Account for one sent fault-tolerance service message of ack-message size.
+ _STARPU_MPI_FT_STATS_SEND_FT_SERVICE_MSG(sizeof(struct _starpu_mpi_cp_ack_msg));
|
|
|
// Release the argument structure (and with it the embedded message buffer).
starpu_free(_args);
|
|
|
}
|
|
|
|
|
|
@@ -221,12 +224,31 @@ int _starpu_mpi_checkpoint_post_cp_discard_send(starpu_mpi_checkpoint_template_t
|
|
|
arg->msg.validation=0;
|
|
|
arg->msg.checkpoint_id = cp_id;
|
|
|
arg->msg.checkpoint_instance = cp_instance;
|
|
|
- _ft_service_msg_isend_cb(&arg->msg, sizeof(struct _starpu_mpi_cp_ack_msg), arg->rank, _STARPU_MPI_TAG_CP_INFO, MPI_COMM_WORLD, free_arg, (void*)arg);
|
|
|
+ _ft_service_msg_isend_cb(&arg->msg, sizeof(struct _starpu_mpi_cp_ack_msg), arg->rank, _STARPU_MPI_TAG_CP_INFO,
|
|
|
+ MPI_COMM_WORLD, _cp_discard_message_send_cb, (void *) arg);
|
|
|
}
|
|
|
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
+/**
+ * Find a registered checkpoint template by its checkpoint id.
+ *
+ * Scans cp_template_array (cp_template_array_size entries) while holding
+ * cp_template_mutex and returns the first entry whose cp_id equals
+ * checkpoint_id.
+ *
+ * @param checkpoint_id  checkpoint id to look for.
+ * @return the matching template handle, or NULL when no entry matches.
+ */
+starpu_mpi_checkpoint_template_t _starpu_mpi_get_checkpoint_template_by_id(int checkpoint_id)
+{
+	starpu_mpi_checkpoint_template_t found = NULL;
+
+	starpu_pthread_mutex_lock(&cp_template_mutex);
+	for (int i = 0; i < cp_template_array_size; i++)
+	{
+		/* NOTE(review): cp_id is read without taking the template's own
+		 * mutex (an earlier draft had that lock commented out) - confirm
+		 * that cp_id cannot change once a template is registered. */
+		if (cp_template_array[i]->cp_id == checkpoint_id)
+		{
+			/* Capture the handle while cp_template_mutex is still held, so
+			 * the shared array is never read after the lock is released. */
+			found = cp_template_array[i];
+			break;
+		}
+	}
+	starpu_pthread_mutex_unlock(&cp_template_mutex);
+
+	return found;
+}
|
|
|
+
|
|
|
|
|
|
//int _starpu_mpi_checkpoint_post_cp_discard_recv(starpu_mpi_checkpoint_template_t cp_template)
|
|
|
//{
|
|
|
@@ -244,7 +266,7 @@ int _starpu_mpi_checkpoint_post_cp_discard_send(starpu_mpi_checkpoint_template_t
|
|
|
// starpu_malloc((void**)&arg, sizeof(struct _starpu_mpi_cp_discard_arg_cb));
|
|
|
// arg->rank = cp_template->backup_of_array[i];
|
|
|
// _STARPU_MPI_DEBUG(10, "Posting DISCARD msg reception from %d\n", arg->rank);
|
|
|
-// _ft_service_msg_irecv_cb(&arg->msg, sizeof(struct _starpu_mpi_cp_ack_msg), arg->rank, _STARPU_MPI_TAG_CP_INFO, MPI_COMM_WORLD, checkpoint_discard, (void*)arg);
|
|
|
+// _ft_service_msg_irecv_cb(&arg->msg, sizeof(struct _starpu_mpi_cp_ack_msg), arg->rank, _STARPU_MPI_TAG_CP_INFO, MPI_COMM_WORLD, _cp_discard_message_recv_cb, (void*)arg);
|
|
|
// }
|
|
|
// if (last_valid_checkpoint.checkpoint_id == -1)
|
|
|
// {
|
|
|
@@ -269,7 +291,7 @@ int _starpu_mpi_checkpoint_post_cp_discard_send(starpu_mpi_checkpoint_template_t
|
|
|
// starpu_malloc((void**)&arg, sizeof(struct _starpu_mpi_cp_discard_arg_cb));
|
|
|
// arg->rank = old_template->backup_of_array[i];
|
|
|
// _STARPU_MPI_DEBUG(10, "Posting DISCARD msg reception from %d - LAST VALIDATED CP\n", arg->rank);
|
|
|
-// _ft_service_msg_irecv_cb(&arg->msg, sizeof(struct _starpu_mpi_cp_ack_msg), arg->rank, _STARPU_MPI_TAG_CP_INFO, MPI_COMM_WORLD, checkpoint_discard, (void*)arg);
|
|
|
+// _ft_service_msg_irecv_cb(&arg->msg, sizeof(struct _starpu_mpi_cp_ack_msg), arg->rank, _STARPU_MPI_TAG_CP_INFO, MPI_COMM_WORLD, _cp_discard_message_recv_cb, (void*)arg);
|
|
|
// }
|
|
|
// }
|
|
|
// }
|
|
|
@@ -294,7 +316,7 @@ int _starpu_mpi_checkpoint_post_cp_discard_send(starpu_mpi_checkpoint_template_t
|
|
|
// _STARPU_MPI_DEBUG(10, "Sending DISCARD msg reception to %d\n", arg->rank);
|
|
|
// arg->msg.checkpoint_id = cp_id;
|
|
|
// arg->msg.checkpoint_instance = cp_instance;
|
|
|
-// _ft_service_msg_isend_cb(&arg->msg, sizeof(struct _starpu_mpi_cp_ack_msg), arg->rank, _STARPU_MPI_TAG_CP_INFO, MPI_COMM_WORLD, free_arg, (void*)arg);
|
|
|
+// _ft_service_msg_isend_cb(&arg->msg, sizeof(struct _starpu_mpi_cp_ack_msg), arg->rank, _STARPU_MPI_TAG_CP_INFO, MPI_COMM_WORLD, _cp_discard_message_send_cb, (void*)arg);
|
|
|
// }
|
|
|
// if (last_valid_checkpoint.checkpoint_id == -1)
|
|
|
// {
|
|
|
@@ -321,7 +343,7 @@ int _starpu_mpi_checkpoint_post_cp_discard_send(starpu_mpi_checkpoint_template_t
|
|
|
// _STARPU_MPI_DEBUG(10, "Sending DISCARD msg to %d - OLD CP\n", arg->rank);
|
|
|
// arg->msg.checkpoint_id = cp_id;
|
|
|
// arg->msg.checkpoint_instance = cp_instance;
|
|
|
-// _ft_service_msg_isend_cb(&arg->msg, sizeof(struct _starpu_mpi_cp_ack_msg), arg->rank, _STARPU_MPI_TAG_CP_INFO, MPI_COMM_WORLD, free_arg, (void*)arg);
|
|
|
+// _ft_service_msg_isend_cb(&arg->msg, sizeof(struct _starpu_mpi_cp_ack_msg), arg->rank, _STARPU_MPI_TAG_CP_INFO, MPI_COMM_WORLD, _cp_discard_message_send_cb, (void*)arg);
|
|
|
// }
|
|
|
// }
|
|
|
// }
|