Просмотр исходного кода

Correctly count data storage/discard stat.

Romain LION лет назад: 5
Родитель
Commit
fc95de730a
1 измененных файлов с 52 добавлено и 2 удалено
  1. 52 2
      mpi/src/mpi_failure_tolerance/starpu_mpi_checkpoint_package.c

+ 52 - 2
mpi/src/mpi_failure_tolerance/starpu_mpi_checkpoint_package.c

@@ -37,6 +37,55 @@ int checkpoint_package_shutdown()
 	return 0;
 }
 
+#ifdef STARPU_USE_MPI_FT_STATS
+void _stats_store_checkpoint_data(struct _starpu_mpi_checkpoint_data* new_checkpoint_data)
+{
+	struct _starpu_mpi_checkpoint_data* next_checkpoint_data;
+	struct _starpu_mpi_checkpoint_data* checkpoint_data = _starpu_mpi_checkpoint_data_list_begin(checkpoint_data_list);
+	while (checkpoint_data != _starpu_mpi_checkpoint_data_list_end(checkpoint_data_list))
+	{
+		next_checkpoint_data = _starpu_mpi_checkpoint_data_list_next(checkpoint_data);
+		if (checkpoint_data->tag == new_checkpoint_data->tag && checkpoint_data->ptr == new_checkpoint_data->ptr)
+		{
+			// The data is already in the CP data list,don't count it as a new data
+			return;
+		}
+		checkpoint_data = next_checkpoint_data;
+	}
+	_STARPU_MPI_FT_STATS_STORE_CP_DATA(new_checkpoint_data->type==STARPU_VALUE?new_checkpoint_data->count:new_checkpoint_data->type==STARPU_R?starpu_data_get_size((starpu_data_handle_t) new_checkpoint_data->ptr):-1);
+}
+#else
+void _stats_store_checkpoint_data(STARPU_ATTRIBUTE_UNUSED struct _starpu_mpi_checkpoint_data* new_checkpoint_data)
+{
+	return;
+}
+#endif
+
+#ifdef STARPU_USE_MPI_FT_STATS
+void _stats_discard_checkpoint_data(struct _starpu_mpi_checkpoint_data* new_checkpoint_data)
+{
+	struct _starpu_mpi_checkpoint_data* next_checkpoint_data;
+	struct _starpu_mpi_checkpoint_data* checkpoint_data = _starpu_mpi_checkpoint_data_list_begin(checkpoint_data_list);
+	while (checkpoint_data != _starpu_mpi_checkpoint_data_list_end(checkpoint_data_list))
+	{
+		next_checkpoint_data = _starpu_mpi_checkpoint_data_list_next(checkpoint_data);
+		if (checkpoint_data->tag == new_checkpoint_data->tag && checkpoint_data->ptr == new_checkpoint_data->ptr)
+		{
+			// The data is still in the CP data list, don't count it as a discard
+			return;
+		}
+		checkpoint_data = next_checkpoint_data;
+	}
+	_STARPU_MPI_FT_STATS_DISCARD_CP_DATA(new_checkpoint_data->type==STARPU_VALUE?new_checkpoint_data->count:new_checkpoint_data->type==STARPU_R?starpu_data_get_size((starpu_data_handle_t) new_checkpoint_data->ptr):-1);
+}
+#else
+void _stats_discard_checkpoint_data(STARPU_ATTRIBUTE_UNUSED struct _starpu_mpi_checkpoint_data* new_checkpoint_data)
+{
+	return;
+}
+#endif
+
+
 int checkpoint_package_data_add(int cp_id, int cp_inst, int rank, starpu_mpi_tag_t tag, int type, void* ptr, int count)
 {
 	struct _starpu_mpi_checkpoint_data* checkpoint_data = _starpu_mpi_checkpoint_data_new();
@@ -48,9 +97,9 @@ int checkpoint_package_data_add(int cp_id, int cp_inst, int rank, starpu_mpi_tag
 	checkpoint_data->ptr = ptr;
 	checkpoint_data->count = count;
 	starpu_pthread_mutex_lock(&package_package_mutex);
+	_stats_store_checkpoint_data(checkpoint_data);
 	_starpu_mpi_checkpoint_data_list_push_back(checkpoint_data_list, checkpoint_data);
 	starpu_pthread_mutex_unlock(&package_package_mutex);
-	_STARPU_MPI_FT_STATS_STORE_CP_DATA(type==STARPU_VALUE?count:type==STARPU_R?starpu_data_get_size((starpu_data_handle_t) ptr):-1);
 	_STARPU_MPI_DEBUG(8, "CP data (%p) added - cpid:%d - cpinst:%d - rank:%d - tag:%ld\n", checkpoint_data->ptr, checkpoint_data->cp_id, checkpoint_data->cp_inst, checkpoint_data->rank, checkpoint_data->tag);
 	return 0;
 }
@@ -58,6 +107,8 @@ int checkpoint_package_data_add(int cp_id, int cp_inst, int rank, starpu_mpi_tag
 int _checkpoint_package_data_delete(struct _starpu_mpi_checkpoint_data* checkpoint_data)
 {
 	size_t size;
+	_starpu_mpi_checkpoint_data_list_erase(checkpoint_data_list, checkpoint_data);
+	_stats_discard_checkpoint_data(checkpoint_data);
 	if (checkpoint_data->type==STARPU_R)
 	{
 		starpu_data_handle_t handle = checkpoint_data->ptr;
@@ -75,7 +126,6 @@ int _checkpoint_package_data_delete(struct _starpu_mpi_checkpoint_data* checkpoi
 	{
 		STARPU_ABORT_MSG("Unrecognized data type: %d\n", checkpoint_data->type);
 	}
-	_starpu_mpi_checkpoint_data_list_erase(checkpoint_data_list, checkpoint_data);
 	free(checkpoint_data);
 	return size;
 }