|
@@ -111,7 +111,6 @@ void _send_cp_internal_data_cb(void* _args) {
|
|
|
|
|
|
//TODO: check cp_domain!
|
|
|
struct _starpu_mpi_checkpoint_tracker* tracker = _starpu_mpi_checkpoint_template_get_tracking_inst_by_id_inst(0, arg->msg.checkpoint_instance);
|
|
|
- fprintf(stderr, "inst: %d tracker:%p\n", arg->msg.checkpoint_instance, tracker);
|
|
|
if(!tracker->first_msg_sent_flag)
|
|
|
{
|
|
|
tracker->first_msg_sent_flag = 1;
|
|
@@ -150,9 +149,10 @@ void _recv_internal_data_stats(STARPU_ATTRIBUTE_UNUSED struct _starpu_mpi_cp_ack
|
|
|
}
|
|
|
#endif
|
|
|
|
|
|
-int starpu_mpi_submit_checkpoint_template(starpu_mpi_checkpoint_template_t cp_template)
|
|
|
+int starpu_mpi_submit_checkpoint_template(starpu_mpi_checkpoint_template_t cp_template, int prio)
|
|
|
{
|
|
|
starpu_data_handle_t handle;
|
|
|
+ struct _starpu_mpi_data* mpi_data;
|
|
|
struct _starpu_mpi_cp_ack_arg_cb* arg;
|
|
|
void* cpy_ptr;
|
|
|
struct _starpu_mpi_checkpoint_template_item* item;
|
|
@@ -183,7 +183,7 @@ int starpu_mpi_submit_checkpoint_template(starpu_mpi_checkpoint_template_t cp_te
|
|
|
starpu_variable_data_register(&arg->handle, STARPU_MAIN_RAM, (uintptr_t)cpy_ptr, item->count);
|
|
|
arg->rank = item->backupped_by;
|
|
|
_STARPU_MPI_DEBUG(0, "Submit CP: sending external data:%d, tag:%ld, to :%d\n", (int)(*(int*)cpy_ptr), arg->tag, arg->rank);
|
|
|
- starpu_mpi_isend_detached_prio(arg->handle, arg->rank, arg->tag, 0, MPI_COMM_WORLD,
|
|
|
+ starpu_mpi_isend_detached_prio(arg->handle, arg->rank, arg->tag, prio, MPI_COMM_WORLD,
|
|
|
&_send_cp_external_data_cb, (void*)arg);
|
|
|
// The callback needs to free the handle specially created for the send, and post ack recv
|
|
|
}
|
|
@@ -200,8 +200,15 @@ int starpu_mpi_submit_checkpoint_template(starpu_mpi_checkpoint_template_t cp_te
|
|
|
break;
|
|
|
case STARPU_R:
|
|
|
handle = (starpu_data_handle_t)item->ptr;
|
|
|
+ mpi_data = _starpu_mpi_data_get(handle);
|
|
|
if (starpu_mpi_data_get_rank(handle)==my_rank)
|
|
|
{
|
|
|
+ if (!mpi_data->modified)
|
|
|
+ {
|
|
|
+ _starpu_mpi_checkpoint_tracker_update(cp_template, cp_template->cp_id, cp_template->checkpoint_domain, current_instance);
|
|
|
+ //TODO: check if the data are all acknowledged
|
|
|
+ break; // We don't want to CP a data that is still at initial state.
|
|
|
+ }
|
|
|
_STARPU_MPI_DEBUG(0, "Submit CP: sending starPU data to %d (tag %d)\n", item->backupped_by, (int)starpu_mpi_data_get_tag(handle));
|
|
|
arg = calloc(1, sizeof(struct _starpu_mpi_cp_ack_arg_cb));
|
|
|
arg->rank = item->backupped_by;
|
|
@@ -211,13 +218,17 @@ int starpu_mpi_submit_checkpoint_template(starpu_mpi_checkpoint_template_t cp_te
|
|
|
arg->count = item->count;
|
|
|
arg->msg.checkpoint_id = cp_template->cp_id;
|
|
|
arg->msg.checkpoint_instance = current_instance;
|
|
|
- _starpu_mpi_isend_cache_aware(handle, item->backupped_by, starpu_mpi_data_get_tag(handle), MPI_COMM_WORLD, 1, 0, 0,
|
|
|
+ _starpu_mpi_isend_cache_aware(handle, item->backupped_by, starpu_mpi_data_get_tag(handle), MPI_COMM_WORLD, 1, 0, prio,
|
|
|
&_send_cp_internal_data_cb, (void*)arg, 1, &arg->cache_flag);
|
|
|
// The callbacks need to post ack recv. The cached one needs to release the handle.
|
|
|
_send_internal_data_stats(arg);
|
|
|
}
|
|
|
else if (item->backup_of == starpu_mpi_data_get_rank(handle))
|
|
|
{
|
|
|
+ if (!mpi_data->modified)
|
|
|
+ {
|
|
|
+ break; // We don't want to CP a data that is still at initial state.
|
|
|
+ }
|
|
|
_STARPU_MPI_DEBUG(0, "Submit CP: receiving starPU data from %d (tag %d)\n", starpu_mpi_data_get_rank(handle), (int)starpu_mpi_data_get_tag(handle));
|
|
|
arg = calloc(1, sizeof(struct _starpu_mpi_cp_ack_arg_cb));
|
|
|
arg->rank = item->backup_of;
|
|
@@ -232,7 +243,7 @@ int starpu_mpi_submit_checkpoint_template(starpu_mpi_checkpoint_template_t cp_te
|
|
|
// The callback needs to do nothing. The cached one must release the handle.
|
|
|
_recv_internal_data_stats(arg);
|
|
|
starpu_data_dup_ro(&arg->copy_handle, arg->handle, 1);
|
|
|
- //starpu_data_acquire_cb(arg->copy_handle, STARPU_R, _recv_internal_dup_ro_cb, arg);
|
|
|
+ starpu_data_acquire_cb(arg->copy_handle, STARPU_R, _recv_internal_dup_ro_cb, arg);
|
|
|
// The callback needs to store the data and post ack send.
|
|
|
}
|
|
|
break;
|