|
@@ -28,10 +28,15 @@
|
|
|
starpu_pthread_mutex_t cp_template_mutex;
|
|
|
starpu_mpi_checkpoint_template_t cp_template_array[MAX_CP_TEMPLATE_NUMBER];
|
|
|
int my_rank;
|
|
|
+int size; /* filled by starpu_mpi_comm_size(MPI_COMM_WORLD, &size) in checkpoint_template_lib_init(); used in this file as the exclusive upper bound when iterating over ranks */
|
|
|
int cp_template_number = 0;
|
|
|
|
|
|
+typedef int (*backup_of_fn)(int); /* callback type: takes an int (called with a rank in this file) and returns an int that is compared against my_rank or passed on as the backup rank */
|
|
|
+
|
|
|
void checkpoint_template_lib_init(void) {
|
|
|
starpu_pthread_mutex_init(&cp_template_mutex, NULL);
|
|
|
+ starpu_mpi_comm_rank(MPI_COMM_WORLD, &my_rank); /* besides initialising cp_template_mutex, init now also fills the file-scope my_rank from MPI_COMM_WORLD */
|
|
|
+ starpu_mpi_comm_size(MPI_COMM_WORLD, &size); /* ... and the file-scope size; the return values of both calls are not checked */
|
|
|
}
|
|
|
|
|
|
void checkpoint_template_lib_quit(void) {
|
|
@@ -46,51 +51,125 @@ void checkpoint_template_lib_quit(void) {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-int _starpu_mpi_checkpoint_template_register(starpu_mpi_checkpoint_template_t* cp_template, int cp_id, va_list varg_list)
|
|
|
+int starpu_mpi_checkpoint_template_create(starpu_mpi_checkpoint_template_t* cp_template, int cp_id) /* Stores the template returned by _starpu_mpi_checkpoint_template_new(cp_id) in *cp_template. Always returns 0. */
|
|
|
{
|
|
|
- int arg_type;
|
|
|
- //void* useless;
|
|
|
- void* ptr;
|
|
|
- int count;
|
|
|
- int backup_rank;
|
|
|
- int backup_of;
|
|
|
-// int (*_backup_of)(int);
|
|
|
-// int (*_backuped_by)(int);
|
|
|
+ *cp_template = _starpu_mpi_checkpoint_template_new(cp_id); /* cp_template is dereferenced without a NULL check */
|
|
|
+ return 0;
|
|
|
+}
|
|
|
|
|
|
- starpu_mpi_checkpoint_template_t _cp_template = _starpu_mpi_checkpoint_template_new(cp_id);
|
|
|
+int _starpu_mpi_checkpoint_template_add_entry(starpu_mpi_checkpoint_template_t cp_template, int arg_type, va_list varg_list) /* Reads the arguments of ONE entry of kind arg_type from varg_list and records it in cp_template. Returns the result of _starpu_mpi_checkpoint_template_add_data() for a registered STARPU_R entry, 0 for a skipped STARPU_R entry and for STARPU_VALUE. */
|
|
|
+{ /* NOTE(review): varg_list is received by value and read with va_arg below; C11 7.16 then leaves the caller's va_list indeterminate, yet the loop in _starpu_mpi_checkpoint_template_register keeps calling va_arg on the same list after each call - consider taking a va_list* instead; TODO confirm behaviour on ABIs where va_list is not an array type */
|
|
|
+ void* ptr;
|
|
|
+ int count;
|
|
|
+ int my_backup;
|
|
|
+ int backup_of;
|
|
|
+ int data_rank;
|
|
|
+ starpu_mpi_tag_t tag;
|
|
|
+ backup_of_fn _backup_of;
|
|
|
|
|
|
- va_list varg_list_copy;
|
|
|
- va_copy(varg_list_copy, varg_list);
|
|
|
+ STARPU_ASSERT_MSG(!(arg_type & STARPU_COMMUTE), "Unable to checkpoint non sequential task flow.\n"); /* entries carrying the STARPU_COMMUTE bit are refused before dispatch */
|
|
|
|
|
|
- while ((arg_type = va_arg(varg_list_copy, int)) != 0)
|
|
|
+ switch(arg_type)
|
|
|
{
|
|
|
- STARPU_ASSERT_MSG(!(arg_type & STARPU_COMMUTE), "Unable to checkpoint non sequential task flow.\n");
|
|
|
+ case STARPU_R: /* varargs consumed: a void* (dereferenced below as starpu_data_handle_t*), then an int backup rank */
|
|
|
+ ptr = va_arg(varg_list, void*);
|
|
|
+ count = 1; /* STARPU_R entries always use a count of 1 */
|
|
|
+ my_backup = va_arg(varg_list, int);
|
|
|
+ backup_of = -1; /* presumably -1 means "not applicable" - confirm against _starpu_mpi_checkpoint_template_add_data */
|
|
|
+ data_rank = starpu_mpi_data_get_rank(*(starpu_data_handle_t*)ptr);
|
|
|
+ if (my_rank==data_rank || my_rank==my_backup) /* register only when this rank matches the data's rank or is the designated backup */
|
|
|
+ {
|
|
|
+ return _starpu_mpi_checkpoint_template_add_data(cp_template, arg_type, ptr, count, my_backup, backup_of, -1); /* the last slot, which receives the tag for STARPU_VALUE entries, gets -1 here */
|
|
|
+ }
|
|
|
+ else
|
|
|
+ {
|
|
|
+ /* This data does not concern me (i.e. it is neither my data nor data for which I am the backup),
|
|
|
+ * so it is considered unnecessary to register it in the CP */
|
|
|
+ return 0;
|
|
|
+ }
|
|
|
+ case STARPU_VALUE: /* varargs consumed, in order: void* ptr, int count, starpu_mpi_tag_t tag, backup_of_fn callback */
|
|
|
+ ptr = va_arg(varg_list, void*);
|
|
|
+ count = va_arg(varg_list, int);
|
|
|
+ tag = va_arg(varg_list, starpu_mpi_tag_t);
|
|
|
+ _backup_of = va_arg(varg_list, backup_of_fn);
|
|
|
+ /* I register the backup that will save this data */
|
|
|
+ _starpu_mpi_checkpoint_template_add_data(cp_template, arg_type, ptr, count, _backup_of(my_rank), -1, tag); /* return value is ignored */
|
|
|
+ for (int i=0 ; i<my_rank ; i++) /* first pass: ranks below my_rank */
|
|
|
+ {
|
|
|
+ if (_backup_of(i) == my_rank)
|
|
|
+ {
|
|
|
+ /* I'm the back up of someone else for this data, I have to remember it */
|
|
|
+ _starpu_mpi_checkpoint_template_add_data(cp_template, arg_type, ptr, count, -1, i, tag);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ for (int i=my_rank+1 ; i<size ; i++) /* second pass: ranks above my_rank; together the two loops visit every rank in [0, size) except my_rank */
|
|
|
+ {
|
|
|
+ if (_backup_of(i) == my_rank)
|
|
|
+ {
|
|
|
+ /* I'm the back up of someone else for this data, I have to remember it */
|
|
|
+ _starpu_mpi_checkpoint_template_add_data(cp_template, arg_type, ptr, count, -1, i, tag);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return 0; /* always 0 for STARPU_VALUE, whatever the add_data calls above returned */
|
|
|
+// case STARPU_DATA_ARRAY:
|
|
|
+// ptr = va_arg(varg_list, void*);
|
|
|
+// count = va_arg(varg_list, int);
|
|
|
+// my_backup = va_arg(varg_list, int);
|
|
|
+// backup_of = -1;
|
|
|
+// break;
|
|
|
+ default:
|
|
|
+ STARPU_ABORT_MSG("Unrecognized argument %d, did you perhaps forget to end arguments with 0?\n", arg_type);
|
|
|
+ break; /* NOTE(review): after this break control reaches the end of a non-void function without a return; safe only if STARPU_ABORT_MSG never returns - presumably so, confirm */
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
|
|
|
- switch(arg_type)
|
|
|
+int _starpu_mpi_checkpoint_template_freeze(starpu_mpi_checkpoint_template_t cp_template) /* Marks cp_template as frozen, records its item count in cp_template->size and recomputes cp_template->message_number while holding cp_template->mutex. Returns cp_template->size. */
|
|
|
+{
|
|
|
+ starpu_pthread_mutex_lock(&cp_template->mutex);
|
|
|
+
|
|
|
+ cp_template->frozen = 1;
|
|
|
+ cp_template->message_number = 0; /* recounted from scratch by the loop below */
|
|
|
+ cp_template->size = _starpu_mpi_checkpoint_template_item_list_size(&cp_template->list);
|
|
|
+
|
|
|
+ struct _starpu_mpi_checkpoint_template_item* item = _starpu_mpi_checkpoint_template_get_first_data(cp_template);
|
|
|
+
|
|
|
+ while (item != _starpu_mpi_checkpoint_template_end(cp_template))
|
|
|
+ {
|
|
|
+ switch (item->type) /* no default case: item types other than the three listed below are skipped without counting */
|
|
|
{
|
|
|
- case STARPU_R:
|
|
|
- ptr = va_arg(varg_list_copy, void*);
|
|
|
- count = 1;
|
|
|
- backup_rank = va_arg(varg_list_copy, int);
|
|
|
- backup_of = -1;
|
|
|
- break;
|
|
|
case STARPU_VALUE:
|
|
|
- ptr = va_arg(varg_list_copy, void*);
|
|
|
- count = va_arg(varg_list_copy, int);
|
|
|
- backup_rank = va_arg(varg_list_copy, int);
|
|
|
- backup_of = va_arg(varg_list_copy, int);
|
|
|
+ cp_template->message_number++; /* every STARPU_VALUE item counts as one message */
|
|
|
break;
|
|
|
-// case STARPU_DATA_ARRAY:
|
|
|
-// ptr = va_arg(varg_list_copy, void*);
|
|
|
-// count = va_arg(varg_list_copy, int);
|
|
|
-// backup_rank = va_arg(varg_list_copy, int);
|
|
|
-// backup_of = -1;
|
|
|
-// break;
|
|
|
- default:
|
|
|
- STARPU_ABORT_MSG("Unrecognized argument %d, did you perhaps forget to end arguments with 0?\n", arg_type);
|
|
|
+ case STARPU_R:
|
|
|
+ if (starpu_mpi_data_get_rank((starpu_data_handle_t) item->ptr)) /* NOTE(review): counts the item whenever the returned rank is non-zero, not when it differs from my_rank - confirm intent; also item->ptr is cast straight to a handle here, whereas _starpu_mpi_checkpoint_template_add_entry dereferences its pointer as starpu_data_handle_t* - confirm against what _starpu_mpi_checkpoint_template_add_data stores */
|
|
|
+ {
|
|
|
+ cp_template->message_number++;
|
|
|
+ }
|
|
|
+ break;
|
|
|
+ case STARPU_DATA_ARRAY: /* array items add nothing to message_number */
|
|
|
break;
|
|
|
}
|
|
|
- _starpu_mpi_checkpoint_template_add_data(_cp_template, arg_type, ptr, count, backup_rank, backup_of);
|
|
|
+ item = _starpu_mpi_checkpoint_template_get_next_data(cp_template, item);
|
|
|
+ }
|
|
|
+
|
|
|
+ starpu_pthread_mutex_unlock(&cp_template->mutex);
|
|
|
+
|
|
|
+ return cp_template->size; /* note: size is read after the mutex has been released */
|
|
|
+}
|
|
|
+
|
|
|
+int _starpu_mpi_checkpoint_template_register(starpu_mpi_checkpoint_template_t* cp_template, int cp_id, va_list varg_list)
|
|
|
+{
|
|
|
+ int arg_type;
|
|
|
+
|
|
|
+ starpu_mpi_checkpoint_template_t _cp_template = _starpu_mpi_checkpoint_template_new(cp_id);
|
|
|
+
|
|
|
+ va_list varg_list_copy;
|
|
|
+ va_copy(varg_list_copy, varg_list);
|
|
|
+
|
|
|
+ while ((arg_type = va_arg(varg_list_copy, int)) != 0)
|
|
|
+ {
|
|
|
+ _starpu_mpi_checkpoint_template_add_entry(_cp_template, arg_type, varg_list_copy);
|
|
|
};
|
|
|
va_end(varg_list_copy);
|
|
|
|
|
@@ -111,6 +190,11 @@ int _starpu_mpi_checkpoint_template_register(starpu_mpi_checkpoint_template_t* c
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
+int starpu_mpi_checkpoint_template_freeze(starpu_mpi_checkpoint_template_t* cp_template) /* Public entry point: dereferences cp_template and forwards to _starpu_mpi_checkpoint_template_freeze(), returning its result. */
|
|
|
+{
|
|
|
+ return _starpu_mpi_checkpoint_template_freeze(*cp_template); /* cp_template is dereferenced without a NULL check */
|
|
|
+}
|
|
|
+
|
|
|
int starpu_mpi_checkpoint_template_register(starpu_mpi_checkpoint_template_t* cp_template, int cp_id, ...)
|
|
|
{
|
|
|
va_list varg_list;
|
|
@@ -120,6 +204,19 @@ int starpu_mpi_checkpoint_template_register(starpu_mpi_checkpoint_template_t* cp
|
|
|
return ret;
|
|
|
}
|
|
|
|
|
|
+int starpu_mpi_checkpoint_template_add_entry(starpu_mpi_checkpoint_template_t* cp_template, ...) /* Public variadic entry point: the first variadic argument is the entry kind (read as int); the remaining ones are consumed by _starpu_mpi_checkpoint_template_add_entry(), whose result is returned. */
|
|
|
+{
|
|
|
+ va_list varg_list;
|
|
|
+ int arg_type;
|
|
|
+ int ret;
|
|
|
+ va_start(varg_list, cp_template);
|
|
|
+ arg_type = va_arg(varg_list, int);
|
|
|
+ STARPU_ASSERT_MSG(arg_type!=STARPU_NONE, "Unhandled arg_type: STARPU_NONE(0).\n"); /* STARPU_NONE is refused here rather than being treated as an end-of-list marker */
|
|
|
+ ret = _starpu_mpi_checkpoint_template_add_entry(*cp_template, arg_type, varg_list); /* the list is not read again in this function after the call, only released */
|
|
|
+ va_end(varg_list);
|
|
|
+ return ret;
|
|
|
+}
|
|
|
+
|
|
|
int _checkpoint_template_digest_ack_reception(int checkpoint_id, int checkpoint_instance) {
|
|
|
starpu_pthread_mutex_lock(&cp_template_mutex);
|
|
|
fprintf(stderr, "Digesting ack recv: id=%d, inst=%d\n", checkpoint_id, checkpoint_instance);
|