Browse Source

Add external data support for CP.
Begin replacing debug fprintf calls with _STARPU_MPI_DEBUG calls.

Romain LION 5 years ago
parent
commit
b7b45dbea5

+ 62 - 28
mpi/src/mpi_failure_tolerance/starpu_mpi_checkpoint.c

@@ -39,7 +39,7 @@ extern struct _starpu_mpi_req* _starpu_mpi_irecv_cache_aware(starpu_data_handle_
 void _starpu_mpi_treat_ack_receipt_cb(void* _args)
 {
 	struct _starpu_mpi_cp_ack_arg_cb* arg = (struct _starpu_mpi_cp_ack_arg_cb*) _args;
-	fprintf(stderr, "ack msg recved id:%d inst:%d\n", arg->msg.checkpoint_id, arg->msg.checkpoint_instance);
+	_STARPU_DEBUG(3, "ack msg recved id:%d inst:%d\n", arg->msg.checkpoint_id, arg->msg.checkpoint_instance);
 	if (_checkpoint_template_digest_ack_reception(arg->msg.checkpoint_id, arg->msg.checkpoint_instance) == 0) {
 		free(arg);
 	}
@@ -55,9 +55,8 @@ void _print_ack_sent_cb(void* _args)
 void _starpu_mpi_push_cp_ack_send_cb(void* _args)
 {
 	struct _starpu_mpi_cp_ack_arg_cb* arg = (struct _starpu_mpi_cp_ack_arg_cb*) _args;
-	fprintf(stderr, "Send ack msg to %d: id=%d inst=%d\n", arg->rank, arg->msg.checkpoint_id, arg->msg.checkpoint_instance);
+	_STARPU_DEBUG(3,"Send ack msg to %d: id=%d inst=%d\n", arg->rank, arg->msg.checkpoint_id, arg->msg.checkpoint_instance);
 	_ft_service_msg_isend_cb((void*)&arg->msg, sizeof(struct _starpu_mpi_cp_ack_msg), arg->rank, _STARPU_MPI_TAG_CP_ACK, MPI_COMM_WORLD, _print_ack_sent_cb, _args);
-
 }
 
 void _starpu_mpi_store_data_and_push_cp_ack_send_cb(void* _args)
@@ -70,7 +69,12 @@ void _starpu_mpi_store_data_and_push_cp_ack_send_cb(void* _args)
 void _starpu_mpi_push_cp_ack_recv_cb(void* _args)
 {
 	struct _starpu_mpi_cp_ack_arg_cb* arg = (struct _starpu_mpi_cp_ack_arg_cb*) _args;
-	fprintf(stderr, "Posting ack recv cb from %d\n", arg->rank);
+	if (STARPU_VALUE == arg->type)
+	{
+		free(starpu_data_handle_to_pointer(arg->handle, STARPU_MAIN_RAM));
+		starpu_data_unregister(arg->handle);
+	}
+	_STARPU_DEBUG(3, "Posting ack recv cb from %d\n", arg->rank);
 	_ft_service_msg_irecv_cb((void*)&arg->msg, sizeof(struct _starpu_mpi_cp_ack_msg), arg->rank, _STARPU_MPI_TAG_CP_ACK, MPI_COMM_WORLD, _starpu_mpi_treat_ack_receipt_cb, _args);
 }
 
@@ -86,6 +90,14 @@ void _starpu_checkpoint_cached_data_recv_copy_and_ack(void* _arg)
 void _starpu_checkpoint_data_recv_copy_and_ack(void* _arg)
 {
 	struct _starpu_mpi_cp_ack_arg_cb* arg = (struct _starpu_mpi_cp_ack_arg_cb*) _arg;
+
+	if (STARPU_VALUE == arg->type)
+	{
+		// A handle has specifically been created, so there is no need to copy the data. Call the callback directly.
+		arg->copy_handle = arg->handle;
+		return _starpu_mpi_store_data_and_push_cp_ack_send_cb(_arg);
+	}
+
 	starpu_data_register_same(&arg->copy_handle, arg->handle);
 	starpu_data_cpy(arg->copy_handle, arg->handle, 1, _starpu_mpi_store_data_and_push_cp_ack_send_cb, _arg);
 }
@@ -93,18 +105,19 @@ void _starpu_checkpoint_data_recv_copy_and_ack(void* _arg)
 int starpu_mpi_submit_checkpoint_template(starpu_mpi_checkpoint_template_t cp_template)
 {
 	// TODO: For now, checkpoints are not taken asynchronously. They will be later, and we will then have to acquire READ permissions from StarPU so that the data cannot be corrupted.
-	starpu_data_handle_t handle;
+	starpu_data_handle_t* handle;
+	struct _starpu_mpi_cp_ack_arg_cb* arg;
+	void* cpy_ptr;
 	struct _starpu_mpi_checkpoint_template_item* item;
 	//MPI_Comm comm;
 	starpu_pthread_mutex_lock(&cp_template->mutex);
-	fprintf(stderr, "Mutex taken\n");
 	set_pending_checkpoint_template(cp_template);
-	fprintf(stderr, "Checkpoint now pending\n");
 	STARPU_ASSERT_MSG(cp_template->pending==0, "Can not submit a checkpoint while previous instance has not succeeded.\n");
 
 	cp_template->pending               = 1;
 	cp_template->cp_template_current_instance++;
-	cp_template->remaining_ack_awaited = cp_template->message_number;
+	cp_template->remaining_ack_awaited = cp_template->sent_message_number;
+	starpu_pthread_mutex_unlock(&cp_template->mutex);
 
 	item = _starpu_mpi_checkpoint_template_get_first_data(cp_template);
 
@@ -115,40 +128,63 @@ int starpu_mpi_submit_checkpoint_template(starpu_mpi_checkpoint_template_t cp_te
 		switch (item->type)
 		{
 			case STARPU_VALUE:
-//				starpu_data_handle_t send_handle;
-//				starpu_variable_data_register(&send_handle, STARPU_MAIN_RAM, (uintptr_t)item->ptr, item->count);
-//				starpu_mpi_data_register(send_handle, )
-//				starpu_mpi_send
+				// TODO: Maybe do not go through StarPU handles for external data; this would require reimplementing the MPI comm layer for such data.
+				arg = calloc(1, sizeof(struct _starpu_mpi_cp_ack_arg_cb));
+				handle = &arg->handle;
+				arg->tag = item->tag;
+				arg->type = STARPU_VALUE;
+				arg->count = item->count;
+				arg->msg.checkpoint_id = cp_template->cp_template_id;
+				arg->msg.checkpoint_instance = cp_template->cp_template_current_instance;
+				if (item->backupped_by != -1)
+				{
+					cpy_ptr = malloc(item->count);
+					memcpy(cpy_ptr, item->ptr, item->count);
+					starpu_variable_data_register(handle, STARPU_MAIN_RAM, (uintptr_t)cpy_ptr, item->count);
+					arg->rank = item->backupped_by;
+					_STARPU_MPI_DEBUG(0, "Submit CP: sending external data:%d, tag:%ld, to :%d\n", (int)(*(int*)cpy_ptr), arg->tag, arg->rank);
+					starpu_mpi_isend_detached_prio(*handle, arg->rank, arg->tag, 0, MPI_COMM_WORLD,
+												   &_starpu_mpi_push_cp_ack_recv_cb, (void*)arg);
+				}
+				else if (item->backup_of != -1)
+				{
+					cpy_ptr = malloc(item->count);
+					starpu_variable_data_register(handle, STARPU_MAIN_RAM, (uintptr_t)cpy_ptr, item->count);
+					arg->rank = item->backup_of;
+					_STARPU_MPI_DEBUG(0, "Submit CP: receiving external data tag:%ld, from :%d\n", arg->tag, arg->rank);
+					starpu_mpi_irecv_detached(*handle, arg->rank, arg->tag, MPI_COMM_WORLD,
+											  &_starpu_checkpoint_data_recv_copy_and_ack, (void*)arg);
+				}
 				break;
 			case STARPU_R:
-				handle = *(starpu_data_handle_t*)item->ptr;
-				if (starpu_mpi_data_get_rank(handle)==my_rank)
+				handle = (starpu_data_handle_t*)item->ptr;
+				if (starpu_mpi_data_get_rank(*handle)==my_rank)
 				{
-					fprintf(stderr, "sending to %d (tag %d)\n", item->backupped_by, (int)starpu_mpi_data_get_tag(handle));
-					struct _starpu_mpi_cp_ack_arg_cb* arg = calloc(1, sizeof(struct _starpu_mpi_cp_ack_arg_cb));
+					_STARPU_MPI_DEBUG(0, "Submit CP: sending starPU data to %d (tag %d)\n", item->backupped_by, (int)starpu_mpi_data_get_tag(*handle));
+					arg = calloc(1, sizeof(struct _starpu_mpi_cp_ack_arg_cb));
 					arg->rank = item->backupped_by;
-					arg->handle = handle;
-					arg->tag = starpu_mpi_data_get_tag(handle);
+					arg->handle = *handle;
+					arg->tag = starpu_mpi_data_get_tag(*handle);
 					arg->type = STARPU_R;
 					arg->count = item->count;
 					arg->msg.checkpoint_id = cp_template->cp_template_id;
 					arg->msg.checkpoint_instance = cp_template->cp_template_current_instance;
-					_starpu_mpi_isend_cache_aware(handle, item->backupped_by, starpu_mpi_data_get_tag(handle), MPI_COMM_WORLD, 1, 0, 0,
+					_starpu_mpi_isend_cache_aware(*handle, item->backupped_by, starpu_mpi_data_get_tag(*handle), MPI_COMM_WORLD, 1, 0, 0,
 					                              &_starpu_mpi_push_cp_ack_recv_cb, (void*)arg, &_starpu_mpi_push_cp_ack_recv_cb, (void*)arg, 1);
 				}
-				else if (item->backup_of == starpu_mpi_data_get_rank(handle))
+				else if (item->backup_of == starpu_mpi_data_get_rank(*handle))
 				{
-					fprintf(stderr,"recving from %d (tag %d)\n", starpu_mpi_data_get_rank(handle), (int)starpu_mpi_data_get_tag(handle));
-					struct _starpu_mpi_cp_ack_arg_cb* arg = calloc(1, sizeof(struct _starpu_mpi_cp_ack_arg_cb));
+					_STARPU_MPI_DEBUG(0, "Submit CP: receiving starPU data from %d (tag %d)\n", starpu_mpi_data_get_rank(*handle), (int)starpu_mpi_data_get_tag(*handle));
+					arg = calloc(1, sizeof(struct _starpu_mpi_cp_ack_arg_cb));
 					arg->rank = item->backup_of;
-					arg->handle = handle;
-					arg->tag = starpu_mpi_data_get_tag(handle);
+					arg->handle = *handle;
+					arg->tag = starpu_mpi_data_get_tag(*handle);
 					arg->type = STARPU_R;
 					arg->count = item->count;
 					arg->msg.checkpoint_id = cp_template->cp_template_id;
 					arg->msg.checkpoint_instance = cp_template->cp_template_current_instance;
-					_starpu_mpi_irecv_cache_aware(handle, starpu_mpi_data_get_rank(handle), starpu_mpi_data_get_tag(handle), MPI_COMM_WORLD, 1, 0,
-					                              &_starpu_checkpoint_data_recv_copy_and_ack, (void*)arg, &_starpu_checkpoint_cached_data_recv_copy_and_ack, (void*)arg, 1, 1, 1);
+					_starpu_mpi_irecv_cache_aware(*handle, starpu_mpi_data_get_rank(*handle), starpu_mpi_data_get_tag(*handle), MPI_COMM_WORLD, 1, 0,
+					                              &_starpu_checkpoint_data_recv_copy_and_ack, (void*)arg, &_starpu_checkpoint_cached_data_recv_copy_and_ack, (void*)arg, 1, 0, 1);
 				}
 				break;
 		}
@@ -156,8 +192,6 @@ int starpu_mpi_submit_checkpoint_template(starpu_mpi_checkpoint_template_t cp_te
 		item = _starpu_mpi_checkpoint_template_get_next_data(cp_template, item);
 	};
 
-	starpu_pthread_mutex_unlock(&cp_template->mutex);
-
 	return 0;
 }
 

+ 5 - 3
mpi/src/mpi_failure_tolerance/starpu_mpi_checkpoint_package.c

@@ -38,7 +38,7 @@ int checkpoint_package_data_add(int cp_id, int cp_inst, int rank, starpu_mpi_tag
 	checkpoint_data->ptr = ptr;
 	checkpoint_data->count = count;
 	_starpu_mpi_checkpoint_data_list_push_back(checkpoint_data_list, checkpoint_data);
-	fprintf(stderr, "CP data added - cpid:%d - cpinst:%d - rank:%d - tag:%ld\n", checkpoint_data->cp_id, checkpoint_data->cp_inst, checkpoint_data->rank, checkpoint_data->tag);
+	_STARPU_DEBUG(8, "CP data added - cpid:%d - cpinst:%d - rank:%d - tag:%ld\n", checkpoint_data->cp_id, checkpoint_data->cp_inst, checkpoint_data->rank, checkpoint_data->tag);
 	return 0;
 }
 
@@ -56,10 +56,12 @@ int checkpoint_package_data_del(int cp_id, int cp_inst, int rank)
 			if (checkpoint_data->type==STARPU_R)
 			{
 				starpu_data_handle_t handle = checkpoint_data->ptr;
-//				void* ptr = starpu_data_handle_to_pointer(handle, STARPU_MAIN_RAM);
-//				free(ptr);
 				starpu_data_unregister(handle);
 			}
+			if (checkpoint_data->type==STARPU_VALUE)
+			{
+				free(checkpoint_data->ptr);
+			}
 			_starpu_mpi_checkpoint_data_list_erase(checkpoint_data_list, checkpoint_data);
 			done++;
 		}

+ 14 - 25
mpi/src/mpi_failure_tolerance/starpu_mpi_checkpoint_template.c

@@ -182,7 +182,7 @@ void checkpoint_discard(void* _args)
 {
 	// TODO: flag data as "CP ready", since the CP has succeeded
 	struct _starpu_mpi_cp_discard_arg_cb* arg = (struct _starpu_mpi_cp_discard_arg_cb*) _args;
-	fprintf(stderr, "DISCARDING OLD CHECKPOINT DATA - new one is CPID:%d - CPINST:%d\n", arg->msg.checkpoint_id, arg->msg.checkpoint_instance);
+	_STARPU_MPI_DEBUG(0, "DISCARDING OLD CHECKPOINT DATA - new one is CPID:%d - CPINST:%d\n", arg->msg.checkpoint_id, arg->msg.checkpoint_instance);
 	checkpoint_package_data_del(arg->msg.checkpoint_id, arg->msg.checkpoint_instance, arg->rank);
 }
 
@@ -201,7 +201,7 @@ int _starpu_mpi_checkpoint_post_cp_discard_recv(starpu_mpi_checkpoint_template_t
 	{
 		starpu_malloc((void**)&arg, sizeof(struct _starpu_mpi_cp_discard_arg_cb));
 		arg->rank = cp_template->backup_of_array[i];
-		fprintf(stderr, "Posting DISCARD msg reception from %d\n", arg->rank);
+		_STARPU_MPI_DEBUG(10, "Posting DISCARD msg reception from %d\n", arg->rank);
 		_ft_service_msg_irecv_cb(&arg->msg, sizeof(struct _starpu_mpi_cp_ack_msg), arg->rank, _STARPU_MPI_TAG_CP_DISCARD, MPI_COMM_WORLD, checkpoint_discard, (void*)arg);
 	}
 	if (last_valid_checkpoint.checkpoint_id == -1)
@@ -226,7 +226,7 @@ int _starpu_mpi_checkpoint_post_cp_discard_recv(starpu_mpi_checkpoint_template_t
 			{
 				starpu_malloc((void**)&arg, sizeof(struct _starpu_mpi_cp_discard_arg_cb));
 				arg->rank = old_template->backup_of_array[i];
-				fprintf(stderr, "Posting DISCARD msg reception from %d - LAST VALIDATED CP\n", arg->rank);
+				_STARPU_MPI_DEBUG(10, "Posting DISCARD msg reception from %d - LAST VALIDATED CP\n", arg->rank);
 				_ft_service_msg_irecv_cb(&arg->msg, sizeof(struct _starpu_mpi_cp_ack_msg), arg->rank, _STARPU_MPI_TAG_CP_DISCARD, MPI_COMM_WORLD, checkpoint_discard, (void*)arg);
 			}
 		}
@@ -250,12 +250,11 @@ int _starpu_mpi_checkpoint_post_cp_discard_send(starpu_mpi_checkpoint_template_t
 	struct _starpu_mpi_cp_discard_arg_cb* arg;
 	int i, j, flag;
 	starpu_mpi_checkpoint_template_t old_template;
-	fprintf(stderr, "backupped_by_array_used_size: %d\n", cp_template->backupped_by_array_used_size);
 	for (i=0 ; i<cp_template->backupped_by_array_used_size ; i++)
 	{
 		starpu_malloc((void**)&arg, sizeof(struct _starpu_mpi_cp_discard_arg_cb));
 		arg->rank = cp_template->backupped_by_array[i];
-		fprintf(stderr, "Sending DISCARD msg reception to %d\n", arg->rank);
+		_STARPU_MPI_DEBUG(10, "Sending DISCARD msg reception to %d\n", arg->rank);
 		arg->msg.checkpoint_id = cp_id;
 		arg->msg.checkpoint_instance = cp_instance;
 		_ft_service_msg_isend_cb(&arg->msg, sizeof(struct _starpu_mpi_cp_ack_msg), arg->rank, _STARPU_MPI_TAG_CP_DISCARD, MPI_COMM_WORLD, free_arg, (void*)arg);
@@ -282,7 +281,7 @@ int _starpu_mpi_checkpoint_post_cp_discard_send(starpu_mpi_checkpoint_template_t
 			{
 				starpu_malloc((void**)&arg, sizeof(struct _starpu_mpi_cp_discard_arg_cb));
 				arg->rank = old_template->backupped_by_array[i];
-				fprintf(stderr, "Sending DISCARD msg to %d - OLD CP\n", arg->rank);
+				_STARPU_MPI_DEBUG(10, "Sending DISCARD msg to %d - OLD CP\n", arg->rank);
 				arg->msg.checkpoint_id = cp_id;
 				arg->msg.checkpoint_instance = cp_instance;
 				_ft_service_msg_isend_cb(&arg->msg, sizeof(struct _starpu_mpi_cp_ack_msg), arg->rank, _STARPU_MPI_TAG_CP_DISCARD, MPI_COMM_WORLD, free_arg, (void*)arg);
@@ -296,27 +295,17 @@ int _starpu_mpi_checkpoint_template_freeze(starpu_mpi_checkpoint_template_t cp_t
 {
 	starpu_pthread_mutex_lock(&cp_template->mutex);
 
-	cp_template->frozen         = 1;
-	cp_template->message_number = 0;
-	cp_template->size           = _starpu_mpi_checkpoint_template_item_list_size(&cp_template->list);
+	cp_template->frozen              = 1;
+	cp_template->sent_message_number = 0;
+	cp_template->size                = _starpu_mpi_checkpoint_template_item_list_size(&cp_template->list);
 
 	struct _starpu_mpi_checkpoint_template_item* item = _starpu_mpi_checkpoint_template_get_first_data(cp_template);
 
 	while (item != _starpu_mpi_checkpoint_template_end(cp_template))
 	{
-		switch (item->type)
+		if (item->backup_of==-1 && item->backupped_by!=-1)
 		{
-			case STARPU_VALUE:
-				cp_template->message_number++;
-				break;
-			case STARPU_R:
-				if (starpu_mpi_data_get_rank(*(starpu_data_handle_t*) item->ptr))
-				{
-					cp_template->message_number++;
-				}
-				break;
-			case STARPU_DATA_ARRAY:
-				break;
+			cp_template->sent_message_number++;
 		}
 		item = _starpu_mpi_checkpoint_template_get_next_data(cp_template, item);
 	}
@@ -390,22 +379,22 @@ int starpu_mpi_checkpoint_template_add_entry(starpu_mpi_checkpoint_template_t* c
 int _checkpoint_template_digest_ack_reception(int checkpoint_id, int checkpoint_instance) {
 	starpu_mpi_checkpoint_template_t cp_template = _starpu_mpi_get_checkpoint_template_by_id(checkpoint_id);
 	starpu_pthread_mutex_lock(&cp_template_mutex);
-	fprintf(stderr, "Digesting ack recv: id=%d, inst=%d\n", checkpoint_id, checkpoint_instance);
+	_STARPU_MPI_DEBUG(20, "Digesting ack recv: id=%d, inst=%d\n", checkpoint_id, checkpoint_instance);
 
 	starpu_pthread_mutex_lock(&cp_template->mutex);
 	if (cp_template->cp_template_current_instance == checkpoint_instance)
 	{
-		fprintf(stderr, "Inst found, remaining ack msg awaited:%d\n", cp_template->remaining_ack_awaited);
 		cp_template->remaining_ack_awaited--;
+		_STARPU_MPI_DEBUG(20, "Inst found, remaining ack msg awaited:%d\n", cp_template->remaining_ack_awaited);
 		if (cp_template->remaining_ack_awaited == 0)
 		{
-			fprintf(stderr, "All cp material for cpid:%d, cpinst:%d - have been sent and acknowledged.\n", checkpoint_id, checkpoint_instance);
+			_STARPU_MPI_DEBUG(20, "All cp material for cpid:%d, cpinst:%d - have been sent and acknowledged.\n", checkpoint_id, checkpoint_instance);
 			_starpu_mpi_checkpoint_post_cp_discard_send(cp_template, checkpoint_id, checkpoint_instance);
 			valid_pending_checkpoint_template(cp_template);
 			cp_template->pending=0;
 			last_valid_checkpoint.checkpoint_id = checkpoint_id;
 			last_valid_checkpoint.checkpoint_instance = checkpoint_instance;
-			fprintf(stderr, "Digested\n");
+			_STARPU_MPI_DEBUG(20, "Digested\n");
 		}
 		starpu_pthread_mutex_unlock(&cp_template->mutex);
 		starpu_pthread_mutex_unlock(&cp_template_mutex);

+ 1 - 1
mpi/src/mpi_failure_tolerance/starpu_mpi_checkpoint_template.h

@@ -51,7 +51,7 @@ struct _starpu_mpi_checkpoint_template{
 	int                                              size;
 	int                                              cp_template_id;
 	int                                              cp_template_current_instance;
-	int                                              message_number;
+	int                                              sent_message_number;
 	int                                              remaining_ack_awaited;
 	int                                              pending;
 	int                                              frozen;

+ 1 - 5
mpi/src/mpi_failure_tolerance/starpu_mpi_ft_service_comms.c

@@ -65,7 +65,6 @@ int _ft_service_msg_recv_send_common(void* ptr, int count, int rank, int tag, in
 
 	STARPU_PTHREAD_MUTEX_LOCK(&detached_ft_service_requests_mutex);
 	if (req_type==SEND_REQ) {
-		fprintf(stderr, "data:%d/%d\n", *(int*)(req->ptr), *(int*)(req->ptr+4));
 		MPI_Isend(req->ptr, req->count, req->datatype, req->node_tag.node.rank, req->node_tag.data_tag,
 		          req->node_tag.node.comm, &req->backend->data_request);
 	}
@@ -77,7 +76,7 @@ int _ft_service_msg_recv_send_common(void* ptr, int count, int rank, int tag, in
 		STARPU_ASSERT_MSG(1, "Unrecognized req type: Only RECV_REQ and SEND_REQ are accepeted\n");
 	}
 	_starpu_mpi_req_list_push_back(&detached_ft_service_requests, req);
-	fprintf(stderr, "pushed service req: %p in list %p - prev: %p - next: %p - dest:%d - tag:%d - type:%s\n", req, &detached_ft_service_requests, _starpu_mpi_req_list_prev(req), _starpu_mpi_req_list_next(req), req->node_tag.node.rank, (int)req->node_tag.data_tag, req_type ? "recv" : "send");
+	_STARPU_MPI_DEBUG(2, "pushed service req: %p in list %p - prev: %p - next: %p - dest:%d - tag:%d - type:%s\n", req, &detached_ft_service_requests, _starpu_mpi_req_list_prev(req), _starpu_mpi_req_list_next(req), req->node_tag.node.rank, (int)req->node_tag.data_tag, req_type ? "recv" : "send");
 	if (req_type==SEND_REQ) {
 		detached_send_n_ft_service_requests++;
 	}
@@ -103,7 +102,6 @@ int _ft_service_msg_recv_send_common(void* ptr, int count, int rank, int tag, in
 static void _starpu_mpi_handle_ft_request_termination(struct _starpu_mpi_req *req)
 {
 	_STARPU_MPI_LOG_IN();
-	fprintf(stderr, "Handle termination begin \n");
 	_STARPU_MPI_DEBUG(2, "complete MPI request %p type %s tag %"PRIi64" src %d data %p ptr %p datatype '%s' count %d registered_datatype %d internal_req %p\n",
 			req, _starpu_mpi_request_type(req->request_type), req->node_tag.data_tag, req->node_tag.node.rank, req->data_handle, req->ptr,
 			req->datatype_name, (int)req->count, req->registered_datatype, req->backend->internal_req);
@@ -161,7 +159,6 @@ static void _starpu_mpi_handle_ft_request_termination(struct _starpu_mpi_req *re
 	STARPU_PTHREAD_COND_BROADCAST(&req->backend->req_cond);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&req->backend->req_mutex);
 	_STARPU_MPI_LOG_OUT();
-	fprintf(stderr, "Handle termination end\n");
 }
 
 void starpu_mpi_test_ft_detached_service_requests(void)
@@ -203,7 +200,6 @@ void starpu_mpi_test_ft_detached_service_requests(void)
 		}
 		else
 		{
-			fprintf(stderr, "req success: %p\n", req);
 			_STARPU_MPI_TRACE_POLLING_END();
 			struct _starpu_mpi_req *next_req;
 			next_req = _starpu_mpi_req_list_next(req);

+ 1 - 0
mpi/src/starpu_mpi_req.c

@@ -95,6 +95,7 @@ struct _starpu_mpi_req *_starpu_mpi_request_fill(starpu_data_handle_t data_handl
 	req->sequential_consistency = sequential_consistency;
 	req->count = count;
 
+	fprintf(stderr, "Filling req %p with internal:%d\n", req, is_internal_req);
 	_mpi_backend._starpu_mpi_backend_request_fill(req, comm, is_internal_req);
 
 	return req;

+ 9 - 5
mpi/tests/checkpoints.c

@@ -118,11 +118,14 @@ int test_checkpoint_submit(int argc, char* argv[])
 	starpu_mpi_checkpoint_template_t cp_template;
 	int val0 = 0;
 	int val1 = 0;
+	int stage = 10;
 
 	FPRINTF(stderr, "Go\n");
 
 	STARPU_MPI_INIT();
 
+	stage+=me;
+
 	FPRINTF_MPI(stderr, "Init ok - my rnk %d - size %d\n", me, nb_nodes);
 
 	starpu_variable_data_register(&handle0, STARPU_MAIN_RAM, (uintptr_t)&val0, sizeof(int));
@@ -135,6 +138,7 @@ int test_checkpoint_submit(int argc, char* argv[])
 	starpu_mpi_checkpoint_template_register(&cp_template, 321,
 			STARPU_R, &handle0, 1,
 			STARPU_R, &handle1, 0,
+			STARPU_VALUE, &stage, sizeof(int), 300, &backup_of,
 			0);
 	FPRINTF_MPI(stderr, "Registered\n");
 
@@ -157,9 +161,10 @@ int test_checkpoint_submit(int argc, char* argv[])
 
 	FPRINTF_MPI(stderr, "Submitted\n");
 
-	sleep(1);
+	usleep(150000);
+	stage++;
 	fprintf(stderr, "\n\n");
-	sleep(1);
+	usleep(150000);
 
 	if (me==0)
 	{
@@ -180,10 +185,9 @@ int test_checkpoint_submit(int argc, char* argv[])
 
 	FPRINTF_MPI(stderr, "Submitted\n");
 
-	sleep(1);
+	usleep(150000);
 	fprintf(stderr, "\n\n");
-	sleep(1);
-
+	usleep(150000);
 	FPRINTF_MPI(stderr, "Bye!\n");
 	starpu_shutdown();