Browse Source

mpi/src: use MPI_Error_string() to get a meaningful message for an MPI error code

Nathalie Furmento 9 years ago
parent
commit
b6ff75a8ce
3 changed files with 20 additions and 200 deletions
  1. 13 13
      mpi/src/starpu_mpi.c
  2. 6 186
      mpi/src/starpu_mpi_private.c
  3. 1 1
      mpi/src/starpu_mpi_private.h

+ 13 - 13
mpi/src/starpu_mpi.c

@@ -355,13 +355,13 @@ static void _starpu_mpi_isend_data_func(struct _starpu_mpi_req *req)
 	{
 		_STARPU_MPI_COMM_TO_DEBUG(req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_DATA, req->node_tag.data_tag, req->node_tag.comm);
 		req->ret = MPI_Isend(req->ptr, req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_DATA, req->node_tag.comm, &req->data_request);
-		STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Isend returning %s", _starpu_mpi_get_mpi_code(req->ret));
+		STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Isend returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
 	}
 	else
 	{
 		_STARPU_MPI_COMM_TO_DEBUG(req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_SYNC_DATA, req->node_tag.data_tag, req->node_tag.comm);
 		req->ret = MPI_Issend(req->ptr, req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_SYNC_DATA, req->node_tag.comm, &req->data_request);
-		STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Issend returning %s", _starpu_mpi_get_mpi_code(req->ret));
+		STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Issend returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
 	}
 
 	_STARPU_MPI_TRACE_ISEND_SUBMIT_END(req->node_tag.rank, req->node_tag.data_tag, 0);
@@ -412,7 +412,7 @@ static void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
 			req->count = req->envelope->size;
 			_STARPU_MPI_COMM_TO_DEBUG(sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.comm);
 			ret = MPI_Isend(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.comm, &req->size_req);
-			STARPU_MPI_ASSERT_MSG(ret == MPI_SUCCESS, "when sending size, MPI_Isend returning %s", _starpu_mpi_get_mpi_code(ret));
+			STARPU_MPI_ASSERT_MSG(ret == MPI_SUCCESS, "when sending size, MPI_Isend returning %s", _starpu_mpi_get_mpi_error_code(ret));
  		}
 
  		// Pack the data
@@ -423,7 +423,7 @@ static void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req)
 			_STARPU_MPI_DEBUG(20, "Sending size %ld (%ld %s) to node %d (second call to pack)\n", req->envelope->size, sizeof(req->count), "MPI_BYTE", req->node_tag.rank);
 			_STARPU_MPI_COMM_TO_DEBUG(sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.comm);
 			ret = MPI_Isend(req->envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.comm, &req->size_req);
-			STARPU_MPI_ASSERT_MSG(ret == MPI_SUCCESS, "when sending size, MPI_Isend returning %s", _starpu_mpi_get_mpi_code(ret));
+			STARPU_MPI_ASSERT_MSG(ret == MPI_SUCCESS, "when sending size, MPI_Isend returning %s", _starpu_mpi_get_mpi_error_code(ret));
  		}
  		else
  		{
@@ -541,7 +541,7 @@ static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 		_STARPU_MPI_DEBUG(20, "Telling node %d it can send the data and waiting for the data back ...\n", req->node_tag.rank);
 		_STARPU_MPI_COMM_TO_DEBUG(sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.comm);
 		req->ret = MPI_Send(_envelope, sizeof(struct _starpu_mpi_envelope), MPI_BYTE, req->node_tag.rank, _STARPU_MPI_TAG_ENVELOPE, req->node_tag.comm);
-		STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Send returning %s", _starpu_mpi_get_mpi_code(req->ret));
+		STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Send returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
 		free(_envelope);
 		_envelope = NULL;
 	}
@@ -556,7 +556,7 @@ static void _starpu_mpi_irecv_data_func(struct _starpu_mpi_req *req)
 		_STARPU_MPI_COMM_FROM_DEBUG(req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_DATA, req->node_tag.data_tag, req->node_tag.comm);
 		req->ret = MPI_Irecv(req->ptr, req->count, req->datatype, req->node_tag.rank, _STARPU_MPI_TAG_DATA, req->node_tag.comm, &req->data_request);
 	}
-	STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_IRecv returning %s", _starpu_mpi_get_mpi_code(req->ret));
+	STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_IRecv returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
 
 	_STARPU_MPI_TRACE_IRECV_SUBMIT_END(req->node_tag.rank, req->node_tag.data_tag);
 
@@ -666,7 +666,7 @@ static void _starpu_mpi_wait_func(struct _starpu_mpi_req *waiting_req)
 	_STARPU_MPI_TRACE_UWAIT_BEGIN(req->node_tag.rank, req->node_tag.data_tag);
 
 	req->ret = MPI_Wait(&req->data_request, waiting_req->status);
-	STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Wait returning %s", _starpu_mpi_get_mpi_code(req->ret));
+	STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Wait returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
 
 	_STARPU_MPI_TRACE_UWAIT_END(req->node_tag.rank, req->node_tag.data_tag);
 
@@ -738,7 +738,7 @@ static void _starpu_mpi_test_func(struct _starpu_mpi_req *testing_req)
 	_STARPU_MPI_TRACE_UTESTING_BEGIN(req->node_tag.rank, req->node_tag.data_tag);
 
 	req->ret = MPI_Test(&req->data_request, testing_req->flag, testing_req->status);
-	STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Test returning %s", _starpu_mpi_get_mpi_code(req->ret));
+	STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Test returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
 
 	_STARPU_MPI_TRACE_UTESTING_END(req->node_tag.rank, req->node_tag.data_tag);
 
@@ -831,7 +831,7 @@ static void _starpu_mpi_barrier_func(struct _starpu_mpi_req *barrier_req)
 	_STARPU_MPI_LOG_IN();
 
 	barrier_req->ret = MPI_Barrier(barrier_req->node_tag.comm);
-	STARPU_MPI_ASSERT_MSG(barrier_req->ret == MPI_SUCCESS, "MPI_Barrier returning %s", _starpu_mpi_get_mpi_code(barrier_req->ret));
+	STARPU_MPI_ASSERT_MSG(barrier_req->ret == MPI_SUCCESS, "MPI_Barrier returning %s", _starpu_mpi_get_mpi_error_code(barrier_req->ret));
 
 	_starpu_mpi_handle_request_termination(barrier_req);
 	_STARPU_MPI_LOG_OUT();
@@ -949,7 +949,7 @@ static void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req)
 					// MPI_Wait to make sure data have been sent
 					int ret;
 					ret = MPI_Wait(&req->size_req, MPI_STATUS_IGNORE);
-					STARPU_MPI_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Wait returning %s", _starpu_mpi_get_mpi_code(ret));
+					STARPU_MPI_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Wait returning %s", _starpu_mpi_get_mpi_error_code(ret));
 					free(req->ptr);
 					req->ptr = NULL;
 				}
@@ -1089,7 +1089,7 @@ static void _starpu_mpi_test_detached_requests(void)
 		//_STARPU_MPI_DEBUG(3, "Test detached request %p - mpitag %d - TYPE %s %d\n", &req->data_request, req->node_tag.data_tag, _starpu_mpi_request_type(req->request_type), req->node_tag.rank);
 		req->ret = MPI_Test(&req->data_request, &flag, &status);
 
-		STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Test returning %s", _starpu_mpi_get_mpi_code(req->ret));
+		STARPU_MPI_ASSERT_MSG(req->ret == MPI_SUCCESS, "MPI_Test returning %s", _starpu_mpi_get_mpi_error_code(req->ret));
 
 		if (!flag)
 		{
@@ -1558,7 +1558,7 @@ static void _starpu_mpi_add_sync_point_in_fxt(void)
 	starpu_mpi_comm_size(MPI_COMM_WORLD, &worldsize);
 
 	ret = MPI_Barrier(MPI_COMM_WORLD);
-	STARPU_MPI_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Barrier returning %s", _starpu_mpi_get_mpi_code(ret));
+	STARPU_MPI_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Barrier returning %s", _starpu_mpi_get_mpi_error_code(ret));
 
 	/* We generate a "unique" key so that we can make sure that different
 	 * FxT traces come from the same MPI run. */
@@ -1573,7 +1573,7 @@ static void _starpu_mpi_add_sync_point_in_fxt(void)
 	}
 
 	ret = MPI_Bcast(&random_number, 1, MPI_INT, 0, MPI_COMM_WORLD);
-	STARPU_MPI_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Bcast returning %s", _starpu_mpi_get_mpi_code(ret));
+	STARPU_MPI_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Bcast returning %s", _starpu_mpi_get_mpi_error_code(ret));
 
 	_STARPU_MPI_TRACE_BARRIER(rank, worldsize, random_number);
 

+ 6 - 186
mpi/src/starpu_mpi_private.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2010, 2012, 2014-2016  Université de Bordeaux
- * Copyright (C) 2010, 2011, 2012, 2013, 2015  CNRS
+ * Copyright (C) 2010, 2011, 2012, 2013, 2015, 2016  CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -43,190 +43,10 @@ void starpu_mpi_set_communication_tag(int tag)
 	_starpu_mpi_tag = tag;
 }
 
-char *_starpu_mpi_get_mpi_code(int code)
+char *_starpu_mpi_get_mpi_error_code(int code)
 {
-	switch (code)
-	{
-	case MPI_SUCCESS: return "MPI_SUCCESS";
-#ifdef MPI_ERR_BUFFER
-	case MPI_ERR_BUFFER: return "MPI_ERR_BUFFER";
-#endif
-#ifdef MPI_ERR_COUNT
-	case MPI_ERR_COUNT: return "MPI_ERR_COUNT";
-#endif
-#ifdef MPI_ERR_TYPE
-	case MPI_ERR_TYPE: return "MPI_ERR_TYPE";
-#endif
-#ifdef MPI_ERR_TAG
-	case MPI_ERR_TAG: return "MPI_ERR_TAG";
-#endif
-#ifdef MPI_ERR_COMM
-	case MPI_ERR_COMM: return "MPI_ERR_COMM";
-#endif
-#ifdef MPI_ERR_RANK
-	case MPI_ERR_RANK: return "MPI_ERR_RANK";
-#endif
-#ifdef MPI_ERR_REQUEST
-	case MPI_ERR_REQUEST: return "MPI_ERR_REQUEST";
-#endif
-#ifdef MPI_ERR_ROOT
-	case MPI_ERR_ROOT: return "MPI_ERR_ROOT";
-#endif
-#ifdef MPI_ERR_GROUP
-	case MPI_ERR_GROUP: return "MPI_ERR_GROUP";
-#endif
-#ifdef MPI_ERR_OP
-	case MPI_ERR_OP: return "MPI_ERR_OP";
-#endif
-#ifdef MPI_ERR_TOPOLOGY
-	case MPI_ERR_TOPOLOGY: return "MPI_ERR_TOPOLOGY";
-#endif
-#ifdef MPI_ERR_DIMS
-	case MPI_ERR_DIMS: return "MPI_ERR_DIMS";
-#endif
-#ifdef MPI_ERR_ARG
-	case MPI_ERR_ARG: return "MPI_ERR_ARG";
-#endif
-#ifdef MPI_ERR_UNKNOWN
-	case MPI_ERR_UNKNOWN: return "MPI_ERR_UNKNOWN";
-#endif
-#ifdef MPI_ERR_TRUNCATE
-	case MPI_ERR_TRUNCATE: return "MPI_ERR_TRUNCATE";
-#endif
-#ifdef MPI_ERR_OTHER
-	case MPI_ERR_OTHER: return "MPI_ERR_OTHER";
-#endif
-#ifdef MPI_ERR_INTERN
-	case MPI_ERR_INTERN: return "MPI_ERR_INTERN";
-#endif
-#ifdef MPI_ERR_IN_STATUS
-	case MPI_ERR_IN_STATUS: return "MPI_ERR_IN_STATUS";
-#endif
-#ifdef MPI_ERR_PENDING
-	case MPI_ERR_PENDING: return "MPI_ERR_PENDING";
-#endif
-#ifdef MPI_ERR_ACCESS
-	case MPI_ERR_ACCESS: return "MPI_ERR_ACCESS";
-#endif
-#ifdef MPI_ERR_AMODE
-	case MPI_ERR_AMODE: return "MPI_ERR_AMODE";
-#endif
-#ifdef MPI_ERR_ASSERT
-	case MPI_ERR_ASSERT: return "MPI_ERR_ASSERT";
-#endif
-#ifdef MPI_ERR_BAD_FILE
-	case MPI_ERR_BAD_FILE: return "MPI_ERR_BAD_FILE";
-#endif
-#ifdef MPI_ERR_BASE
-	case MPI_ERR_BASE: return "MPI_ERR_BASE";
-#endif
-#ifdef MPI_ERR_CONVERSION
-	case MPI_ERR_CONVERSION: return "MPI_ERR_CONVERSION";
-#endif
-#ifdef MPI_ERR_DISP
-	case MPI_ERR_DISP: return "MPI_ERR_DISP";
-#endif
-#ifdef MPI_ERR_DUP_DATAREP
-	case MPI_ERR_DUP_DATAREP: return "MPI_ERR_DUP_DATAREP";
-#endif
-#ifdef MPI_ERR_FILE_EXISTS
-	case MPI_ERR_FILE_EXISTS: return "MPI_ERR_FILE_EXISTS";
-#endif
-#ifdef MPI_ERR_FILE_IN_USE
-	case MPI_ERR_FILE_IN_USE: return "MPI_ERR_FILE_IN_USE";
-#endif
-#ifdef MPI_ERR_FILE
-	case MPI_ERR_FILE: return "MPI_ERR_FILE";
-#endif
-#ifdef MPI_ERR_INFO_KEY
-	case MPI_ERR_INFO_KEY: return "MPI_ERR_INFO_KEY";
-#endif
-#ifdef MPI_ERR_INFO_NOKEY
-	case MPI_ERR_INFO_NOKEY: return "MPI_ERR_INFO_NOKEY";
-#endif
-#ifdef MPI_ERR_INFO_VALUE
-	case MPI_ERR_INFO_VALUE: return "MPI_ERR_INFO_VALUE";
-#endif
-#ifdef MPI_ERR_INFO
-	case MPI_ERR_INFO: return "MPI_ERR_INFO";
-#endif
-#ifdef MPI_ERR_IO
-	case MPI_ERR_IO: return "MPI_ERR_IO";
-#endif
-#ifdef MPI_ERR_KEYVAL
-	case MPI_ERR_KEYVAL: return "MPI_ERR_KEYVAL";
-#endif
-#ifdef MPI_ERR_LOCKTYPE
-	case MPI_ERR_LOCKTYPE: return "MPI_ERR_LOCKTYPE";
-#endif
-#ifdef MPI_ERR_NAME
-	case MPI_ERR_NAME: return "MPI_ERR_NAME";
-#endif
-#ifdef MPI_ERR_NO_MEM
-	case MPI_ERR_NO_MEM: return "MPI_ERR_NO_MEM";
-#endif
-#ifdef MPI_ERR_NOT_SAME
-	case MPI_ERR_NOT_SAME: return "MPI_ERR_NOT_SAME";
-#endif
-#ifdef MPI_ERR_NO_SPACE
-	case MPI_ERR_NO_SPACE: return "MPI_ERR_NO_SPACE";
-#endif
-#ifdef MPI_ERR_NO_SUCH_FILE
-	case MPI_ERR_NO_SUCH_FILE: return "MPI_ERR_NO_SUCH_FILE";
-#endif
-#ifdef MPI_ERR_PORT
-	case MPI_ERR_PORT: return "MPI_ERR_PORT";
-#endif
-#ifdef MPI_ERR_QUOTA
-	case MPI_ERR_QUOTA: return "MPI_ERR_QUOTA";
-#endif
-#ifdef MPI_ERR_READ_ONLY
-	case MPI_ERR_READ_ONLY: return "MPI_ERR_READ_ONLY";
-#endif
-#ifdef MPI_ERR_RMA_CONFLICT
-	case MPI_ERR_RMA_CONFLICT: return "MPI_ERR_RMA_CONFLICT";
-#endif
-#ifdef MPI_ERR_RMA_SYNC
-	case MPI_ERR_RMA_SYNC: return "MPI_ERR_RMA_SYNC";
-#endif
-#ifdef MPI_ERR_SERVICE
-	case MPI_ERR_SERVICE: return "MPI_ERR_SERVICE";
-#endif
-#ifdef MPI_ERR_SIZE
-	case MPI_ERR_SIZE: return "MPI_ERR_SIZE";
-#endif
-#ifdef MPI_ERR_SPAWN
-	case MPI_ERR_SPAWN: return "MPI_ERR_SPAWN";
-#endif
-#ifdef MPI_ERR_UNSUPPORTED_DATAREP
-	case MPI_ERR_UNSUPPORTED_DATAREP: return "MPI_ERR_UNSUPPORTED_DATAREP";
-#endif
-#ifdef MPI_ERR_UNSUPPORTED_OPERATION
-	case MPI_ERR_UNSUPPORTED_OPERATION: return "MPI_ERR_UNSUPPORTED_OPERATION";
-#endif
-#ifdef MPI_ERR_WIN
-	case MPI_ERR_WIN: return "MPI_ERR_WIN";
-#endif
-#ifdef MPI_ERR_EXITED
-        case MPI_ERR_EXITED: return "MPI_ERR_EXITED";
-#endif
-#ifdef MPI_ERR_CONNECT
-        case MPI_ERR_CONNECT: return "MPI_ERR_CONNECT";
-#endif
-#ifdef MPI_ERR_PROC_FAILED
-        case MPI_ERR_PROC_FAILED: return "MPI_ERR_PROC_FAILED";
-#endif
-#ifdef MPI_ERR_REVOKED
-        case MPI_ERR_REVOKED: return "MPI_ERR_REVOKED";
-#endif
-#if defined(MPI_ERR_LASTCODE) && MPI_ERR_LASTCODE != MPI_SUCCESS
-	case MPI_ERR_LASTCODE: return "MPI_ERR_LASTCODE";
-#endif
-	default:
-		{
-			static char str[22];
-			snprintf(str, sizeof(str), "MPI error %d\n", code);
-			return str;
-		}
-	}
+	static char str[MPI_MAX_OBJECT_NAME];
+	int len;
+	MPI_Error_string(code, str, &len);
+	return str;
 }

+ 1 - 1
mpi/src/starpu_mpi_private.h

@@ -30,7 +30,7 @@ extern "C" {
 #endif
 
 extern int _starpu_debug_rank;
-char *_starpu_mpi_get_mpi_code(int code);
+char *_starpu_mpi_get_mpi_error_code(int code);
 extern int _starpu_mpi_comm;
 
 #ifdef STARPU_VERBOSE