浏览代码

Correct and clean

Romain LION 5 年之前
父节点
当前提交
ddd6138f54

+ 5 - 13
mpi/src/mpi_failure_tolerance/starpu_mpi_checkpoint.c

@@ -109,11 +109,11 @@ void _send_cp_internal_data_cb(void* _args) {
 	{
 
 		//TODO: check cp_domain!
-		struct _starpu_mpi_checkpoint_tracker* tracker = _starpu_mpi_checkpoint_template_get_tracking_inst_by_id_inst(0, arg->msg.checkpoint_instance);
+		struct _starpu_mpi_checkpoint_tracker* tracker = _starpu_mpi_checkpoint_template_get_tracking_inst_by_id_inst(0, arg->checkpoint_instance_hint);
 		if(!tracker->first_msg_sent_flag)
 		{
 			tracker->first_msg_sent_flag = 1;
-			_STARPU_MPI_TRACE_CHECKPOINT_BEGIN(arg->msg.checkpoint_instance,0);
+			_STARPU_MPI_TRACE_CHECKPOINT_BEGIN(arg->checkpoint_instance_hint,0);
 		}
 	}
 }
@@ -158,9 +158,6 @@ int starpu_mpi_submit_checkpoint_template(starpu_mpi_checkpoint_template_t cp_te
 	int current_instance;
 
 	current_instance = increment_current_instance();
-//	if (current_instance>3)
-//		starpu_task_wait_for_all();
-	fprintf(stderr, "Node %d submitting CP inst %d\n", _my_rank, current_instance);
 	_starpu_mpi_checkpoint_post_cp_discard_recv(cp_template);
 	_starpu_mpi_checkpoint_template_create_instance_tracker(cp_template, cp_template->cp_id, cp_template->checkpoint_domain, current_instance);
 	//TODO check what happens when all the ack msg are received when we arrive here.
@@ -176,10 +173,6 @@ int starpu_mpi_submit_checkpoint_template(starpu_mpi_checkpoint_template_t cp_te
 				arg->type = STARPU_VALUE;
 				arg->count = item->count;
 				arg->cache_flag = 0;
-				arg->msg.checkpoint_id = cp_template->cp_id;
-				arg->msg.checkpoint_instance = current_instance;
-				if (arg->msg.checkpoint_instance>3)
-					fprintf(stderr, "arg->msg.checkpoint_instance:%d\n", arg->msg.checkpoint_instance);
 				if (item->backupped_by != -1)
 				{
 					cpy_ptr = malloc(item->count);
@@ -193,6 +186,8 @@ int starpu_mpi_submit_checkpoint_template(starpu_mpi_checkpoint_template_t cp_te
 				}
 				else if (item->backup_of != -1)
 				{
+					arg->msg.checkpoint_id = cp_template->cp_id;
+					arg->msg.checkpoint_instance = current_instance;
 					cpy_ptr = malloc(item->count);
 					starpu_variable_data_register(&arg->handle, STARPU_MAIN_RAM, (uintptr_t)cpy_ptr, item->count);
 					arg->rank = item->backup_of;
@@ -220,9 +215,7 @@ int starpu_mpi_submit_checkpoint_template(starpu_mpi_checkpoint_template_t cp_te
 					arg->tag = starpu_mpi_data_get_tag(handle);
 					arg->type = STARPU_R;
 					arg->count = item->count;
-					arg->msg.checkpoint_id = cp_template->cp_id;
-					arg->msg.klm=42;
-					arg->msg.checkpoint_instance = current_instance;
+					arg->checkpoint_instance_hint = current_instance;
 					if (arg->msg.checkpoint_instance>3)
 						fprintf(stderr, "arg->msg.checkpoint_instance:%d\n", arg->msg.checkpoint_instance);
 					_starpu_mpi_isend_cache_aware(handle, item->backupped_by, starpu_mpi_data_get_tag(handle), MPI_COMM_WORLD, 1, 0, prio,
@@ -244,7 +237,6 @@ int starpu_mpi_submit_checkpoint_template(starpu_mpi_checkpoint_template_t cp_te
 					arg->type = STARPU_R;
 					arg->count = item->count;
 					arg->msg.checkpoint_id = cp_template->cp_id;
-					arg->msg.klm=42;
 					arg->msg.checkpoint_instance = current_instance;
 					if (arg->msg.checkpoint_instance>3)
 						fprintf(stderr, "arg->msg.checkpoint_instance:%d\n", arg->msg.checkpoint_instance);

+ 1 - 1
mpi/src/mpi_failure_tolerance/starpu_mpi_checkpoint.h

@@ -31,7 +31,6 @@ extern int _my_rank;
 struct _starpu_mpi_cp_ack_msg
 {
 	int checkpoint_id;
-	int klm;
 	int checkpoint_instance;
 };
 
@@ -52,6 +51,7 @@ struct _starpu_mpi_cp_ack_arg_cb
 	int count;
 	starpu_mpi_tag_t              tag;
 	struct _starpu_mpi_cp_ack_msg msg;
+	int checkpoint_instance_hint;
 	int cache_flag;
 };