|
@@ -37,6 +37,55 @@ int checkpoint_package_shutdown()
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
+#ifdef STARPU_USE_MPI_FT_STATS
|
|
|
+void _stats_store_checkpoint_data(struct _starpu_mpi_checkpoint_data* new_checkpoint_data)
|
|
|
+{
|
|
|
+ struct _starpu_mpi_checkpoint_data* next_checkpoint_data;
|
|
|
+ struct _starpu_mpi_checkpoint_data* checkpoint_data = _starpu_mpi_checkpoint_data_list_begin(checkpoint_data_list);
|
|
|
+ while (checkpoint_data != _starpu_mpi_checkpoint_data_list_end(checkpoint_data_list))
|
|
|
+ {
|
|
|
+ next_checkpoint_data = _starpu_mpi_checkpoint_data_list_next(checkpoint_data);
|
|
|
+ if (checkpoint_data->tag == new_checkpoint_data->tag && checkpoint_data->ptr == new_checkpoint_data->ptr)
|
|
|
+ {
|
|
|
+ // The data is already in the CP data list,don't count it as a new data
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ checkpoint_data = next_checkpoint_data;
|
|
|
+ }
|
|
|
+ _STARPU_MPI_FT_STATS_STORE_CP_DATA(new_checkpoint_data->type==STARPU_VALUE?new_checkpoint_data->count:new_checkpoint_data->type==STARPU_R?starpu_data_get_size((starpu_data_handle_t) new_checkpoint_data->ptr):-1);
|
|
|
+}
|
|
|
+#else
|
|
|
+void _stats_store_checkpoint_data(STARPU_ATTRIBUTE_UNUSED struct _starpu_mpi_checkpoint_data* new_checkpoint_data)
|
|
|
+{
|
|
|
+ return;
|
|
|
+}
|
|
|
+#endif
|
|
|
+
|
|
|
+#ifdef STARPU_USE_MPI_FT_STATS
|
|
|
+void _stats_discard_checkpoint_data(struct _starpu_mpi_checkpoint_data* new_checkpoint_data)
|
|
|
+{
|
|
|
+ struct _starpu_mpi_checkpoint_data* next_checkpoint_data;
|
|
|
+ struct _starpu_mpi_checkpoint_data* checkpoint_data = _starpu_mpi_checkpoint_data_list_begin(checkpoint_data_list);
|
|
|
+ while (checkpoint_data != _starpu_mpi_checkpoint_data_list_end(checkpoint_data_list))
|
|
|
+ {
|
|
|
+ next_checkpoint_data = _starpu_mpi_checkpoint_data_list_next(checkpoint_data);
|
|
|
+ if (checkpoint_data->tag == new_checkpoint_data->tag && checkpoint_data->ptr == new_checkpoint_data->ptr)
|
|
|
+ {
|
|
|
+ // The data is still in the CP data list, don't count it as a discard
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ checkpoint_data = next_checkpoint_data;
|
|
|
+ }
|
|
|
+ _STARPU_MPI_FT_STATS_DISCARD_CP_DATA(new_checkpoint_data->type==STARPU_VALUE?new_checkpoint_data->count:new_checkpoint_data->type==STARPU_R?starpu_data_get_size((starpu_data_handle_t) new_checkpoint_data->ptr):-1);
|
|
|
+}
|
|
|
+#else
|
|
|
+void _stats_discard_checkpoint_data(STARPU_ATTRIBUTE_UNUSED struct _starpu_mpi_checkpoint_data* new_checkpoint_data)
|
|
|
+{
|
|
|
+ return;
|
|
|
+}
|
|
|
+#endif
|
|
|
+
|
|
|
+
|
|
|
int checkpoint_package_data_add(int cp_id, int cp_inst, int rank, starpu_mpi_tag_t tag, int type, void* ptr, int count)
|
|
|
{
|
|
|
struct _starpu_mpi_checkpoint_data* checkpoint_data = _starpu_mpi_checkpoint_data_new();
|
|
@@ -48,9 +97,9 @@ int checkpoint_package_data_add(int cp_id, int cp_inst, int rank, starpu_mpi_tag
|
|
|
checkpoint_data->ptr = ptr;
|
|
|
checkpoint_data->count = count;
|
|
|
starpu_pthread_mutex_lock(&package_package_mutex);
|
|
|
+ _stats_store_checkpoint_data(checkpoint_data);
|
|
|
_starpu_mpi_checkpoint_data_list_push_back(checkpoint_data_list, checkpoint_data);
|
|
|
starpu_pthread_mutex_unlock(&package_package_mutex);
|
|
|
- _STARPU_MPI_FT_STATS_STORE_CP_DATA(type==STARPU_VALUE?count:type==STARPU_R?starpu_data_get_size((starpu_data_handle_t) ptr):-1);
|
|
|
_STARPU_MPI_DEBUG(8, "CP data (%p) added - cpid:%d - cpinst:%d - rank:%d - tag:%ld\n", checkpoint_data->ptr, checkpoint_data->cp_id, checkpoint_data->cp_inst, checkpoint_data->rank, checkpoint_data->tag);
|
|
|
return 0;
|
|
|
}
|
|
@@ -58,6 +107,8 @@ int checkpoint_package_data_add(int cp_id, int cp_inst, int rank, starpu_mpi_tag
|
|
|
int _checkpoint_package_data_delete(struct _starpu_mpi_checkpoint_data* checkpoint_data)
|
|
|
{
|
|
|
size_t size;
|
|
|
+ _starpu_mpi_checkpoint_data_list_erase(checkpoint_data_list, checkpoint_data);
|
|
|
+ _stats_discard_checkpoint_data(checkpoint_data);
|
|
|
if (checkpoint_data->type==STARPU_R)
|
|
|
{
|
|
|
starpu_data_handle_t handle = checkpoint_data->ptr;
|
|
@@ -75,7 +126,6 @@ int _checkpoint_package_data_delete(struct _starpu_mpi_checkpoint_data* checkpoi
|
|
|
{
|
|
|
STARPU_ABORT_MSG("Unrecognized data type: %d\n", checkpoint_data->type);
|
|
|
}
|
|
|
- _starpu_mpi_checkpoint_data_list_erase(checkpoint_data_list, checkpoint_data);
|
|
|
free(checkpoint_data);
|
|
|
return size;
|
|
|
}
|