|
@@ -109,11 +109,11 @@ void _send_cp_internal_data_cb(void* _args) {
|
|
|
{
|
|
|
|
|
|
//TODO: check cp_domain!
|
|
|
- struct _starpu_mpi_checkpoint_tracker* tracker = _starpu_mpi_checkpoint_template_get_tracking_inst_by_id_inst(0, arg->msg.checkpoint_instance);
|
|
|
+ struct _starpu_mpi_checkpoint_tracker* tracker = _starpu_mpi_checkpoint_template_get_tracking_inst_by_id_inst(0, arg->checkpoint_instance_hint);
|
|
|
if(!tracker->first_msg_sent_flag)
|
|
|
{
|
|
|
tracker->first_msg_sent_flag = 1;
|
|
|
- _STARPU_MPI_TRACE_CHECKPOINT_BEGIN(arg->msg.checkpoint_instance,0);
|
|
|
+ _STARPU_MPI_TRACE_CHECKPOINT_BEGIN(arg->checkpoint_instance_hint,0);
|
|
|
}
|
|
|
}
|
|
|
}
|
|
@@ -158,9 +158,6 @@ int starpu_mpi_submit_checkpoint_template(starpu_mpi_checkpoint_template_t cp_te
|
|
|
int current_instance;
|
|
|
|
|
|
current_instance = increment_current_instance();
|
|
|
-// if (current_instance>3)
|
|
|
-// starpu_task_wait_for_all();
|
|
|
- fprintf(stderr, "Node %d submitting CP inst %d\n", _my_rank, current_instance);
|
|
|
_starpu_mpi_checkpoint_post_cp_discard_recv(cp_template);
|
|
|
_starpu_mpi_checkpoint_template_create_instance_tracker(cp_template, cp_template->cp_id, cp_template->checkpoint_domain, current_instance);
|
|
|
//TODO check what happens when all the ack msgs have already been received by the time we arrive here.
|
|
@@ -176,10 +173,6 @@ int starpu_mpi_submit_checkpoint_template(starpu_mpi_checkpoint_template_t cp_te
|
|
|
arg->type = STARPU_VALUE;
|
|
|
arg->count = item->count;
|
|
|
arg->cache_flag = 0;
|
|
|
- arg->msg.checkpoint_id = cp_template->cp_id;
|
|
|
- arg->msg.checkpoint_instance = current_instance;
|
|
|
- if (arg->msg.checkpoint_instance>3)
|
|
|
- fprintf(stderr, "arg->msg.checkpoint_instance:%d\n", arg->msg.checkpoint_instance);
|
|
|
if (item->backupped_by != -1)
|
|
|
{
|
|
|
cpy_ptr = malloc(item->count);
|
|
@@ -193,6 +186,8 @@ int starpu_mpi_submit_checkpoint_template(starpu_mpi_checkpoint_template_t cp_te
|
|
|
}
|
|
|
else if (item->backup_of != -1)
|
|
|
{
|
|
|
+ arg->msg.checkpoint_id = cp_template->cp_id;
|
|
|
+ arg->msg.checkpoint_instance = current_instance;
|
|
|
cpy_ptr = malloc(item->count);
|
|
|
starpu_variable_data_register(&arg->handle, STARPU_MAIN_RAM, (uintptr_t)cpy_ptr, item->count);
|
|
|
arg->rank = item->backup_of;
|
|
@@ -220,9 +215,7 @@ int starpu_mpi_submit_checkpoint_template(starpu_mpi_checkpoint_template_t cp_te
|
|
|
arg->tag = starpu_mpi_data_get_tag(handle);
|
|
|
arg->type = STARPU_R;
|
|
|
arg->count = item->count;
|
|
|
- arg->msg.checkpoint_id = cp_template->cp_id;
|
|
|
- arg->msg.klm=42;
|
|
|
- arg->msg.checkpoint_instance = current_instance;
|
|
|
+ arg->checkpoint_instance_hint = current_instance;
|
|
|
if (arg->msg.checkpoint_instance>3)
|
|
|
fprintf(stderr, "arg->msg.checkpoint_instance:%d\n", arg->msg.checkpoint_instance);
|
|
|
_starpu_mpi_isend_cache_aware(handle, item->backupped_by, starpu_mpi_data_get_tag(handle), MPI_COMM_WORLD, 1, 0, prio,
|
|
@@ -244,7 +237,6 @@ int starpu_mpi_submit_checkpoint_template(starpu_mpi_checkpoint_template_t cp_te
|
|
|
arg->type = STARPU_R;
|
|
|
arg->count = item->count;
|
|
|
arg->msg.checkpoint_id = cp_template->cp_id;
|
|
|
- arg->msg.klm=42;
|
|
|
arg->msg.checkpoint_instance = current_instance;
|
|
|
if (arg->msg.checkpoint_instance>3)
|
|
|
fprintf(stderr, "arg->msg.checkpoint_instance:%d\n", arg->msg.checkpoint_instance);
|