|
@@ -21,21 +21,30 @@
|
|
|
|
|
|
#include <starpu_mpi_checkpoint.h>
|
|
|
#include <sys/param.h>
|
|
|
+#include <starpu_mpi_private.h>
|
|
|
|
|
|
#define MAX_CP_TEMPLATE_NUMBER 32 // Arbitrary limit
|
|
|
|
|
|
-starpu_pthread_mutex_t cp_template_mutex;
|
|
|
-starpu_mpi_checkpoint_template cp_template_array[MAX_CP_TEMPLATE_NUMBER];
|
|
|
// Mutex initialised by starpu_mpi_checkpoint_turn_on().
// NOTE(review): presumably protects cp_template_array / cp_template_number -- confirm against the registration code.
+starpu_pthread_mutex_t cp_template_mutex;
|
|
|
// Storage for at most MAX_CP_TEMPLATE_NUMBER checkpoint templates.
+starpu_mpi_checkpoint_template_t cp_template_array[MAX_CP_TEMPLATE_NUMBER];
|
|
|
// Rank of this process in MPI_COMM_WORLD, filled in by starpu_mpi_checkpoint_turn_on().
+int my_rank;
|
|
|
// NOTE(review): presumably the number of entries of cp_template_array currently in use -- confirm.
int cp_template_number = 0;
|
|
|
|
|
|
-int _starpu_mpi_checkpoint_template_register(starpu_mpi_checkpoint_template* cp_template, int cp_id, va_list varg_list)
|
|
|
// Local declarations of the StarPU-MPI receive/send primitives called by
// _starpu_mpi_checkpoint_template_submit().
// NOTE(review): these presumably duplicate prototypes owned by a private header -- confirm and include it instead.
+extern struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count);
|
|
|
+extern struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t data_handle, int dest, starpu_mpi_tag_t data_tag, MPI_Comm comm, unsigned detached, unsigned sync, int prio, void (*callback)(void *), void *arg, int sequential_consistency);
|
|
|
+
|
|
|
+
|
|
|
+int _starpu_mpi_checkpoint_template_register(starpu_mpi_checkpoint_template_t* cp_template, int cp_id, va_list varg_list)
|
|
|
{
|
|
|
int arg_type;
|
|
|
void* ptr;
|
|
|
int count;
|
|
|
int backup_rank;
|
|
|
+ int backup_of;
|
|
|
+// int (*_backup_of)(int);
|
|
|
+// int (*_backuped_by)(int);
|
|
|
|
|
|
- starpu_mpi_checkpoint_template _cp_template = _starpu_mpi_checkpoint_template_new(cp_id);
|
|
|
+ starpu_mpi_checkpoint_template_t _cp_template = _starpu_mpi_checkpoint_template_new(cp_id);
|
|
|
|
|
|
va_list varg_list_copy;
|
|
|
va_copy(varg_list_copy, varg_list);
|
|
@@ -50,18 +59,25 @@ int _starpu_mpi_checkpoint_template_register(starpu_mpi_checkpoint_template* cp_
|
|
|
ptr = va_arg(varg_list_copy, void*);
|
|
|
count = 1;
|
|
|
backup_rank = va_arg(varg_list_copy, int);
|
|
|
+ backup_of = -1;
|
|
|
break;
|
|
|
case STARPU_VALUE:
|
|
|
+ ptr = va_arg(varg_list_copy, void*);
|
|
|
+ count = va_arg(varg_list_copy, int);
|
|
|
+ backup_rank = va_arg(varg_list_copy, int);
|
|
|
+ backup_of = va_arg(varg_list_copy, int);
|
|
|
+ break;
|
|
|
case STARPU_DATA_ARRAY:
|
|
|
ptr = va_arg(varg_list_copy, void*);
|
|
|
count = va_arg(varg_list_copy, int);
|
|
|
backup_rank = va_arg(varg_list_copy, int);
|
|
|
+ backup_of = -1;
|
|
|
break;
|
|
|
default:
|
|
|
STARPU_ABORT_MSG("Unrecognized argument %d, did you perhaps forget to end arguments with 0?\n", arg_type);
|
|
|
break;
|
|
|
}
|
|
|
- _starpu_mpi_checkpoint_template_add_data(_cp_template, arg_type, ptr, count, backup_rank);
|
|
|
+ _starpu_mpi_checkpoint_template_add_data(_cp_template, arg_type, ptr, count, backup_rank, backup_of);
|
|
|
};
|
|
|
va_end(varg_list_copy);
|
|
|
|
|
@@ -82,7 +98,7 @@ int _starpu_mpi_checkpoint_template_register(starpu_mpi_checkpoint_template* cp_
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
-int starpu_mpi_checkpoint_template_register(starpu_mpi_checkpoint_template* cp_template, int cp_id, ...)
|
|
|
+int starpu_mpi_checkpoint_template_register(starpu_mpi_checkpoint_template_t* cp_template, int cp_id, ...)
|
|
|
{
|
|
|
va_list varg_list;
|
|
|
va_start(varg_list, cp_id);
|
|
@@ -91,8 +107,60 @@ int starpu_mpi_checkpoint_template_register(starpu_mpi_checkpoint_template* cp_t
|
|
|
return ret;
|
|
|
}
|
|
|
|
|
|
+void print_received_value(void* handle)
|
|
|
+{
|
|
|
+ fprintf(stderr, "Node %d - I received backup value:%d\n", my_rank, *(int*)starpu_data_handle_to_pointer(*(starpu_data_handle_t*)handle, STARPU_MAIN_RAM));
|
|
|
+}
|
|
|
+
|
|
|
+int _starpu_mpi_checkpoint_template_submit(starpu_mpi_checkpoint_template_t cp_template)
|
|
|
+{
|
|
|
+ starpu_data_handle_t* handle;
|
|
|
+ struct _starpu_mpi_checkpoint_template_item* item;
|
|
|
+ //MPI_Comm comm;
|
|
|
+
|
|
|
+ starpu_pthread_mutex_lock(&cp_template->mutex);
|
|
|
+ STARPU_ASSERT_MSG(cp_template->pending==0, "Can not submit a checkpoint while previous instance has not succeeded.\n");
|
|
|
+
|
|
|
+ cp_template->pending = 1;
|
|
|
+
|
|
|
+ item = _starpu_mpi_checkpoint_template_get_first_data(cp_template);
|
|
|
+ fprintf(stderr, "begin iter\n");
|
|
|
+
|
|
|
+ while (item != _starpu_mpi_checkpoint_template_end(cp_template))
|
|
|
+ {
|
|
|
+ switch (item->type)
|
|
|
+ {
|
|
|
+ case STARPU_VALUE:
|
|
|
+// starpu_data_handle_t send_handle;
|
|
|
+// starpu_variable_data_register(&send_handle, STARPU_MAIN_RAM, (uintptr_t)item->ptr, item->count);
|
|
|
+// starpu_mpi_data_register(send_handle, )
|
|
|
+// starpu_mpi_send
|
|
|
+ break;
|
|
|
+ case STARPU_R:
|
|
|
+ handle = (starpu_data_handle_t*)item->ptr;
|
|
|
+ if (starpu_mpi_data_get_rank(*handle)==my_rank)
|
|
|
+ {
|
|
|
+ fprintf(stderr,"sending to %d (tag %d)\n", item->backup_rank, (int)starpu_mpi_data_get_tag(*handle));
|
|
|
+ _starpu_mpi_isend_common(*handle, item->backup_rank, starpu_mpi_data_get_tag(*handle), MPI_COMM_WORLD, 1, 0, 0, NULL, NULL, 1);
|
|
|
+ }
|
|
|
+ else if (item->backup_rank==my_rank)
|
|
|
+ {
|
|
|
+ fprintf(stderr,"recving from %d (tag %d)\n", starpu_mpi_data_get_rank(*handle), (int)starpu_mpi_data_get_tag(*handle));
|
|
|
+ _starpu_mpi_irecv_common(*handle, starpu_mpi_data_get_rank(*handle), starpu_mpi_data_get_tag(*handle), MPI_COMM_WORLD, 1, 0, &print_received_value, (void*)handle, 1, 1, 1);
|
|
|
+ }
|
|
|
+ break;
|
|
|
+ }
|
|
|
+
|
|
|
+ item = _starpu_mpi_checkpoint_template_get_next_data(cp_template, item);
|
|
|
+ };
|
|
|
+
|
|
|
+ starpu_pthread_mutex_unlock(&cp_template->mutex);
|
|
|
+
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+
|
|
|
// For test purpose
|
|
|
-int _starpu_mpi_checkpoint_template_print(starpu_mpi_checkpoint_template cp_template)
|
|
|
+int _starpu_mpi_checkpoint_template_print(starpu_mpi_checkpoint_template_t cp_template)
|
|
|
{
|
|
|
int val;
|
|
|
int i = 0;
|
|
@@ -133,19 +201,15 @@ int _starpu_mpi_checkpoint_template_print(starpu_mpi_checkpoint_template cp_temp
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
-int _starpu_mpi_checkpoint_template_submit(starpu_mpi_checkpoint_template cp_template) {
|
|
|
- STARPU_ASSERT_MSG(cp_template->pending==0, "A checkpoint submission has been requested while the previous "
|
|
|
- "one has not ended.\n");
|
|
|
-
|
|
|
- for (int i = 0; i < cp_template->size; ++i)
|
|
|
- {
|
|
|
- break;
|
|
|
- }
|
|
|
+int starpu_mpi_checkpoint_template_submit(starpu_mpi_checkpoint_template_t cp_template)
|
|
|
+{
|
|
|
+ _starpu_mpi_checkpoint_template_submit(cp_template);
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
-int _starpu_mpi_checkpoint_turn_on(void)
|
|
|
+int starpu_mpi_checkpoint_turn_on(void)
|
|
|
{
|
|
|
starpu_pthread_mutex_init(&cp_template_mutex, NULL);
|
|
|
+ starpu_mpi_comm_rank(MPI_COMM_WORLD, &my_rank); //TODO: check compatibility with several Comms behaviour
|
|
|
return 0;
|
|
|
}
|