浏览代码

Add various functions to debug the MPI LU code.

Cédric Augonnet 15 年之前
父节点
当前提交
9484d29449
共有 4 个文件被更改,包括 365 次插入和 27 次删除
  1. 88 11
      mpi/examples/mpi_lu/plu_example.c
  2. 263 4
      mpi/examples/mpi_lu/plu_solve.c
  3. 2 0
      mpi/examples/mpi_lu/pxlu.h
  4. 12 12
      mpi/examples/mpi_lu/pxlu_kernels.c

+ 88 - 11
mpi/examples/mpi_lu/plu_example.c

@@ -42,6 +42,21 @@ static TYPE **tmp_12_block;
 static starpu_data_handle *tmp_21_block_handles;
 static TYPE **tmp_21_block;
 
+
+TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks);
+static void display_block_content(unsigned bi, unsigned bj, unsigned blocksize);
+
+/* Dump every block of the nblocks x nblocks grid to stderr, framed by a
+ * "DISPLAY ALL" header line and a line of stars. The outer loop walks the bj
+ * index and the inner loop walks bi; each block is printed by
+ * display_block_content(). */
+static void display_all_blocks(unsigned nblocks, unsigned blocksize)
+{
+	unsigned blk_i, blk_j;
+
+	fputs("DISPLAY ALL\n", stderr);
+
+	for (blk_j = 0; blk_j < nblocks; blk_j++)
+	{
+		for (blk_i = 0; blk_i < nblocks; blk_i++)
+		{
+			display_block_content(blk_i, blk_j, blocksize);
+		}
+	}
+
+	fputs("*****************\n", stderr);
+}
+
 static void parse_args(int argc, char **argv)
 {
 	int i;
@@ -80,8 +95,8 @@ static void fill_block_with_random(TYPE *blockptr, unsigned size, unsigned nbloc
 	for (j = 0; j < block_size; j++)
 	for (i = 0; i < block_size; i++)
 	{
-		blockptr[i+j*block_size] = (TYPE)drand48();
-//		blockptr[i+j*block_size] = (i == j)?2.0:0.0;
+	//	blockptr[i+j*block_size] = (TYPE)drand48();
+		blockptr[i+j*block_size] = (i == j)?2.0:(TYPE)j;
 	}
 }
 
@@ -183,6 +198,9 @@ static void init_matrix(int rank)
 			(uintptr_t)tmp_21_block[k],
 			size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
 	}
+
+
+	display_all_blocks(nblocks, size/nblocks);
 }
 
 int get_block_rank(unsigned i, unsigned j)
@@ -210,6 +228,25 @@ static void display_grid(int rank, unsigned nblocks)
 	}
 }
 
+/* Print the content of one block to stderr.
+ *
+ * The block is fetched with STARPU_PLU(get_block)(bj, bi) and read as a
+ * blocksize x blocksize array: output line j lists data[j + i*blocksize] for
+ * i = 0..blocksize-1, followed by a "****" separator line.
+ *
+ * NOTE(review): the pointer returned by get_block is dereferenced without a
+ * check; confirm it is valid on every rank for every (bj, bi). */
+static void display_block_content(unsigned bi, unsigned bj, unsigned blocksize)
+{
+	TYPE *data = STARPU_PLU(get_block)(bj, bi);
+
+	/* bi and bj are unsigned: print them with %u, not %d */
+	fprintf(stderr, "BLOCK i = %u j = %u\n", bi, bj);
+
+	unsigned i, j;
+	for (j = 0; j < blocksize; j++)
+	{
+		for (i = 0; i < blocksize; i++)
+		{
+			fprintf(stderr, "%f ", data[j+i*blocksize]);
+		}
+		fprintf(stderr, "\n");
+	}
+
+	fprintf(stderr, "****\n");
+}
+
 int main(int argc, char **argv)
 {
 	int rank;
@@ -243,12 +280,15 @@ int main(int argc, char **argv)
 
 	init_matrix(rank);
 
+	TYPE *a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
+	STARPU_PLU(display_data_content)(a_r, size);
+	
+
 	TYPE *x, *y;
 
 	if (check)
 	{
-		if (rank == 0)
-			fprintf(stderr, "Compute AX = B\n");
+		unsigned ind;
 
 		x = calloc(size, sizeof(TYPE));
 		STARPU_ASSERT(x);
@@ -256,22 +296,25 @@ int main(int argc, char **argv)
 		y = calloc(size, sizeof(TYPE));
 		STARPU_ASSERT(y);
 		
-		unsigned ind;
-		for (ind = 0; ind < size; ind++)
+		if (rank == 0)
 		{
-			//x[ind] = (TYPE)1.0;
-			x[ind] = (TYPE)drand48();
-			y[ind] = (TYPE)0.0;
+			fprintf(stderr, "Compute AX = B\n");
+
+			for (ind = 0; ind < size; ind++)
+			{
+				x[ind] = (TYPE)ind;
+//				x[ind] = (TYPE)drand48();
+				y[ind] = (TYPE)0.0;
+			}
 		}
 
 		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
 
 		if (rank == 0)
-		for (ind = 0; ind < 10; ind++)
+		for (ind = 0; ind < STARPU_MIN(10, size); ind++)
 		{
 			fprintf(stderr, "y[%d] = %f\n", ind, (float)y[ind]);
 		}
-		
 	}
 
 	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
@@ -313,6 +356,40 @@ int main(int argc, char **argv)
 	}
 
 	/*
+	 *	Test Result Correctness
+	 */
+
+	STARPU_PLU(compute_lu_matrix)(size, nblocks);
+
+	TYPE *y2;
+
+	if (check)
+	{
+		unsigned ind;
+
+		y2 = calloc(size, sizeof(TYPE));
+		STARPU_ASSERT(y);
+		
+		if (rank == 0)
+		{
+			fprintf(stderr, "Compute LUX = B2\n");
+
+			for (ind = 0; ind < size; ind++)
+			{
+				y2[ind] = (TYPE)0.0;
+			}
+		}
+
+		STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank);
+
+		if (rank == 0)
+		for (ind = 0; ind < STARPU_MIN(10, size); ind++)
+		{
+			fprintf(stderr, "y[%d] = %f\n", ind, (float)y2[ind]);
+		}
+	}
+
+	/*
 	 * 	Termination
 	 */
 

+ 263 - 4
mpi/examples/mpi_lu/plu_solve.c

@@ -17,15 +17,272 @@
 #include <starpu.h>
 #include "pxlu.h"
 
-static STARPU_PLU(compute_ax_block)(unsigned size, unsigned nblocks,
+/* Print a blocksize x blocksize array of TYPE to stderr between a
+ * "DISPLAY BLOCK" header and a "****" footer. Output line r lists
+ * data[r + c*blocksize] for c = 0..blocksize-1, each followed by a space. */
+void STARPU_PLU(display_data_content)(TYPE *data, unsigned blocksize)
+{
+	unsigned r, c;
+
+	fputs("DISPLAY BLOCK\n", stderr);
+
+	for (r = 0; r < blocksize; r++)
+	{
+		for (c = 0; c < blocksize; c++)
+			fprintf(stderr, "%f ", data[r + c*blocksize]);
+
+		fputc('\n', stderr);
+	}
+
+	fputs("****\n", stderr);
+}
+
+/* Apply one square block to a sub-vector with a single CPU_GEMV("N", ...)
+ * call using alpha = 1.0 and beta = 1.0, leading dimension block_size and
+ * unit strides. CPU_GEMV is defined elsewhere; under the usual BLAS GEMV
+ * semantics this accumulates sub_y += block_data * sub_x.
+ *
+ * The function returns nothing, so it is declared void explicitly (a
+ * definition without a return type is not valid C99 and later). */
+static void STARPU_PLU(compute_ax_block)(unsigned block_size, TYPE *block_data, TYPE *sub_x, TYPE *sub_y)
+{
+	CPU_GEMV("N", block_size, block_size, 1.0, block_data, block_size, sub_x, 1, 1.0, sub_y, 1);
+}
+
+/* Copy the strictly upper part of inblock into outblock and put ones on the
+ * diagonal. Entries are addressed as [row + col*block_size]; only positions
+ * with col >= row are written, every other entry of outblock is left as the
+ * caller provided it. */
+void STARPU_PLU(extract_upper)(unsigned block_size, TYPE *inblock, TYPE *outblock)
+{
+	unsigned row;
+
+	for (row = 0; row < block_size; row++)
+	{
+		unsigned col;
+
+		/* The upper factor has a unit diagonal */
+		outblock[row + row*block_size] = (TYPE)1.0;
+
+		/* Entries to the right of the diagonal are copied as they are */
+		for (col = row + 1; col < block_size; col++)
+		{
+			unsigned pos = row + col*block_size;
+
+			outblock[pos] = inblock[pos];
+		}
+	}
+}
+
+/* Apply only the upper part (with a unit diagonal) of a diagonal block to
+ * sub_x and accumulate into sub_y.
+ *
+ * The block is size/nblocks wide. A zero-initialised scratch copy receives
+ * the upper part through extract_upper, both the input block and the copy
+ * are dumped to stderr, and the copy is then passed to compute_ax_block.
+ * The scratch copy is released before returning. */
+static void STARPU_PLU(compute_ax_block_upper)(unsigned size, unsigned nblocks,
 				 TYPE *block_data, TYPE *sub_x, TYPE *sub_y)
 {
-	CPU_GEMV("N", size/nblocks, size/nblocks, 1.0, block_data, size/nblocks, sub_x, 1, 1.0, sub_y, 1);
+	unsigned block_size = size/nblocks;
+
+	fprintf(stderr, "KEEP UPPER\n");
+	STARPU_PLU(display_data_content)(block_data, block_size);
+
+	/* Take a copy of the upper part of the diagonal block; calloc leaves
+	 * the entries below the diagonal at zero */
+	TYPE *upper_block_copy = calloc((size_t)block_size*block_size, sizeof(TYPE));
+	STARPU_ASSERT(upper_block_copy);
+	STARPU_PLU(extract_upper)(block_size, block_data, upper_block_copy);
+
+	STARPU_PLU(display_data_content)(upper_block_copy, block_size);
+
+	STARPU_PLU(compute_ax_block)(block_size, upper_block_copy, sub_x, sub_y);
+
+	free(upper_block_copy);
+}
+
+
+/* Copy the lower part of inblock, diagonal included, into outblock. Entries
+ * are addressed as [row + col*block_size]; only positions with col <= row
+ * are written, every other entry of outblock is left as the caller provided
+ * it. */
+void STARPU_PLU(extract_lower)(unsigned block_size, TYPE *inblock, TYPE *outblock)
+{
+	unsigned row;
+
+	for (row = 0; row < block_size; row++)
+	{
+		unsigned col;
+
+		/* Everything up to and including the diagonal entry of this row */
+		for (col = 0; col <= row; col++)
+		{
+			unsigned pos = row + col*block_size;
+
+			outblock[pos] = inblock[pos];
+		}
+	}
 }
 
-/* y is only valid on node 0 */
+
+/* Assemble the nblocks x nblocks grid of blocks into one contiguous
+ * size x size array.
+ *
+ * Each block is obtained with STARPU_PLU(get_block)(bj, bi) and is
+ * size/nblocks wide; its entry [j + i*block_size] is stored at
+ * [(j + bj*block_size) + (i + bi*block_size)*size] in the result.
+ *
+ * Returns a freshly allocated buffer that the caller must free().
+ *
+ * NOTE(review): every block is dereferenced on the calling process; confirm
+ * that get_block returns valid data for blocks owned by other ranks. */
+TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks)
+{
+	/* The element count is computed in size_t so that size*size does not
+	 * wrap around in unsigned arithmetic */
+	TYPE *bigmatrix = calloc((size_t)size*size, sizeof(TYPE));
+	STARPU_ASSERT(bigmatrix);
+
+	unsigned block_size = size/nblocks;
+
+	unsigned bi, bj;
+	for (bj = 0; bj < nblocks; bj++)
+	for (bi = 0; bi < nblocks; bi++)
+	{
+		TYPE *block = STARPU_PLU(get_block)(bj, bi);
+
+		unsigned j, i;
+		for (j = 0; j < block_size; j++)
+		for (i = 0; i < block_size; i++)
+		{
+			/* Destination index, also computed in size_t */
+			size_t dst = ((size_t)j + (size_t)bj*block_size)
+					+ ((size_t)i + (size_t)bi*block_size)*size;
+
+			bigmatrix[dst] = block[j+i*block_size];
+		}
+	}
+
+	return bigmatrix;
+}
+
+/* Build a size x size array holding only the lower part (diagonal included)
+ * of the reconstructed matrix; all other entries are zero.
+ *
+ * Returns a freshly allocated buffer that the caller must free(). */
+static TYPE *reconstruct_lower(unsigned size, unsigned nblocks)
+{
+	TYPE *lower = calloc((size_t)size*size, sizeof(TYPE));
+	STARPU_ASSERT(lower);
+
+	TYPE *bigmatrix = STARPU_PLU(reconstruct_matrix)(size, nblocks);
+
+	STARPU_PLU(extract_lower)(size, bigmatrix, lower);
+
+	/* The full matrix was only needed as the source of the extraction */
+	free(bigmatrix);
+
+	return lower;
+}
+
+/* Build a size x size array holding the strictly upper part of the
+ * reconstructed matrix with ones on the diagonal; all other entries are
+ * zero.
+ *
+ * Returns a freshly allocated buffer that the caller must free(). */
+static TYPE *reconstruct_upper(unsigned size, unsigned nblocks)
+{
+	TYPE *upper = calloc((size_t)size*size, sizeof(TYPE));
+	STARPU_ASSERT(upper);
+
+	TYPE *bigmatrix = STARPU_PLU(reconstruct_matrix)(size, nblocks);
+
+	STARPU_PLU(extract_upper)(size, bigmatrix, upper);
+
+	/* The full matrix was only needed as the source of the extraction */
+	free(bigmatrix);
+
+	return upper;
+}
+
+
+/* Debugging helper: dump the reconstructed matrix, its lower part, its
+ * unit-diagonal upper part and finally their product to stderr.
+ *
+ * All intermediate matrices are allocated here and released before
+ * returning. */
+void STARPU_PLU(compute_lu_matrix)(unsigned size, unsigned nblocks)
+{
+	fprintf(stderr, "ALL\n\n");
+	TYPE *all_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
+	STARPU_PLU(display_data_content)(all_r, size);
+
+	fprintf(stderr, "\nLOWER\n");
+	TYPE *lower_r = reconstruct_lower(size, nblocks);
+	STARPU_PLU(display_data_content)(lower_r, size);
+
+	fprintf(stderr, "\nUPPER\n");
+	TYPE *upper_r = reconstruct_upper(size, nblocks);
+	STARPU_PLU(display_data_content)(upper_r, size);
+
+	/* TRMM with side "R" computes B := alpha * B * A, where A is the
+	 * triangular operand (here upper, unit diagonal) and B is overwritten
+	 * with the result. upper_r is therefore passed as A and lower_r as B,
+	 * so that lower_r holds lower * upper afterwards. */
+	CPU_TRMM("R", "U", "N", "U", size, size, 1.0f, upper_r, size, lower_r, size);
+
+	fprintf(stderr, "\nLU\n");
+	STARPU_PLU(display_data_content)(lower_r, size);
+
+	free(upper_r);
+	free(lower_r);
+	free(all_r);
+}
+
+/* Apply only the lower part (diagonal included) of a diagonal block to
+ * sub_x and accumulate into sub_y.
+ *
+ * The block is size/nblocks wide. A zero-initialised scratch copy receives
+ * the lower part through extract_lower, both the input block and the copy
+ * are dumped to stderr, and the copy is then passed to compute_ax_block.
+ * The scratch copy is released before returning. */
+static void STARPU_PLU(compute_ax_block_lower)(unsigned size, unsigned nblocks,
+				 TYPE *block_data, TYPE *sub_x, TYPE *sub_y)
+{
+	unsigned block_size = size/nblocks;
+
+	fprintf(stderr, "KEEP LOWER\n");
+	STARPU_PLU(display_data_content)(block_data, block_size);
+
+	/* Take a copy of the lower part of the diagonal block; calloc leaves
+	 * the entries above the diagonal at zero */
+	TYPE *lower_block_copy = calloc((size_t)block_size*block_size, sizeof(TYPE));
+	STARPU_ASSERT(lower_block_copy);
+	STARPU_PLU(extract_lower)(block_size, block_data, lower_block_copy);
+
+	STARPU_PLU(display_data_content)(lower_block_copy, block_size);
+
+	STARPU_PLU(compute_ax_block)(block_size, lower_block_copy, sub_x, sub_y);
+
+	free(lower_block_copy);
+}
+
+/* Compute y = L (U x) from the blocks held by the MPI processes.
+ *
+ * Phase 1: every process accumulates the contribution of its own blocks to
+ * U x (diagonal blocks through compute_ax_block_upper, blocks with i > j as
+ * they are); the contributions are summed onto rank 0 into x, and x is then
+ * broadcast to every process.
+ * Phase 2: the same scheme with the blocks i < j and the lower part of the
+ * diagonal blocks; the contributions are summed onto rank 0 into y.
+ *
+ * x is read on every process and is OVERWRITTEN with U x.
+ * y receives the result on rank 0 only (root of the final reduction).
+ * size is the vector length, nblocks the number of blocks per dimension and
+ * rank the MPI rank of the calling process. */
+void STARPU_PLU(compute_lux)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank)
+{
+	/* Temporary buffer in which this MPI process accumulates its partial
+	 * product yi = Ui x, where Ui is the matrix containing the blocks of U
+	 * assigned to this process and 0 everywhere else. The full product is
+	 * the sum of all yi. */
+	TYPE *yi = calloc(size, sizeof(TYPE));
+	STARPU_ASSERT(yi);
+
+	unsigned block_size = size/nblocks;
+	int mpi_ret;
+
+	/* Compute UiX = Yi */
+	unsigned long i,j;
+	for (j = 0; j < nblocks; j++)
+	{
+		if (get_block_rank(j, j) == rank)
+		{
+			TYPE *block_data = STARPU_PLU(get_block)(j, j);
+			TYPE *sub_x = &x[j*(block_size)];
+			TYPE *sub_yi = &yi[j*(block_size)];
+
+			STARPU_PLU(compute_ax_block_upper)(size, nblocks, block_data, sub_x, sub_yi);
+		}
+
+		for (i = j + 1; i < nblocks; i++)
+		{
+			if (get_block_rank(i, j) == rank)
+			{
+				/* That block belongs to the current MPI process */
+				TYPE *block_data = STARPU_PLU(get_block)(j, i);
+				TYPE *sub_x = &x[i*(block_size)];
+				TYPE *sub_yi = &yi[j*(block_size)];
+
+				STARPU_PLU(compute_ax_block)(block_size, block_data, sub_x, sub_yi);
+			}
+		}
+	}
+
+	/* Grab Sum Yi in X (valid on rank 0 only at this point) */
+	mpi_ret = MPI_Reduce(yi, x, size, MPI_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(mpi_ret == MPI_SUCCESS);
+
+	/* yi is reused as the accumulator of the second phase */
+	memset(yi, 0, size*sizeof(TYPE));
+
+	unsigned ind;
+	if (rank == 0)
+	{
+		fprintf(stderr, "INTERMEDIATE\n");
+		for (ind = 0; ind < STARPU_MIN(10, size); ind++)
+		{
+			fprintf(stderr, "x[%u] = %f\n", ind, (float)x[ind]);
+		}
+		fprintf(stderr, "****\n");
+	}
+
+	/* Everyone needs x: broadcast the vector itself. Passing &x would
+	 * send the bytes stored at the pointer variable, not the data. */
+	mpi_ret = MPI_Bcast(x, size, MPI_TYPE, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(mpi_ret == MPI_SUCCESS);
+
+	/* Compute LiX = Yi (with X = UX) */
+	for (j = 0; j < nblocks; j++)
+	{
+		if (j > 0)
+		for (i = 0; i < j; i++)
+		{
+			if (get_block_rank(i, j) == rank)
+			{
+				/* That block belongs to the current MPI process */
+				TYPE *block_data = STARPU_PLU(get_block)(j, i);
+				TYPE *sub_x = &x[i*(block_size)];
+				TYPE *sub_yi = &yi[j*(block_size)];
+
+				STARPU_PLU(compute_ax_block)(block_size, block_data, sub_x, sub_yi);
+			}
+		}
+
+		if (get_block_rank(j, j) == rank)
+		{
+			TYPE *block_data = STARPU_PLU(get_block)(j, j);
+			TYPE *sub_x = &x[j*(block_size)];
+			TYPE *sub_yi = &yi[j*(block_size)];
+
+			STARPU_PLU(compute_ax_block_lower)(size, nblocks, block_data, sub_x, sub_yi);
+		}
+	}
+
+	/* Grab Sum Yi in Y */
+	mpi_ret = MPI_Reduce(yi, y, size, MPI_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(mpi_ret == MPI_SUCCESS);
+
+	free(yi);
+}
+
+
+/* x and y must be valid (at least) on 0 */
 void STARPU_PLU(compute_ax)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank)
 {
+	/* Send x to everyone */
+	int bcst_ret;
+	bcst_ret = MPI_Bcast(&x, size, MPI_TYPE, 0, MPI_COMM_WORLD);
+	STARPU_ASSERT(bcst_ret == MPI_SUCCESS);
+
+	if (rank == 0)
+	{
+		unsigned ind;
+		for (ind = 0; ind < STARPU_MIN(10, size); ind++)
+			fprintf(stderr, "x[%d] = %f\n", ind, (float)x[ind]);
+
+		fprintf(stderr, "Compute AX = B\n");
+	}
+
 	/* Create temporary buffers where all MPI processes are going to
 	 * compute Ai x = yi where Ai is the matrix containing the blocks of A
 	 * affected to process i, and 0 everywhere else. We then have y as the
@@ -45,11 +302,13 @@ void STARPU_PLU(compute_ax)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, i
 				TYPE *sub_x = &x[i*(size/nblocks)];
 				TYPE *sub_yi = &yi[j*(size/nblocks)];
 
-				STARPU_PLU(compute_ax_block)(size, nblocks, block_data, sub_x, sub_yi);
+				STARPU_PLU(compute_ax_block)(size/nblocks, block_data, sub_x, sub_yi);
 			}
 		}
 	}
 
 	/* Compute the Sum of all yi = y */
 	MPI_Reduce(yi, y, size, MPI_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
+
+	free(yi);
 }

+ 2 - 0
mpi/examples/mpi_lu/pxlu.h

@@ -36,4 +36,6 @@ starpu_data_handle STARPU_PLU(get_tmp_11_block_handle)(void);
 starpu_data_handle STARPU_PLU(get_tmp_12_block_handle)(unsigned j);
 starpu_data_handle STARPU_PLU(get_tmp_21_block_handle)(unsigned i);
 
+void STARPU_PLU(display_data_content)(TYPE *data, unsigned blocksize);
+
 #endif // __PXLU_H__

+ 12 - 12
mpi/examples/mpi_lu/pxlu_kernels.c

@@ -37,9 +37,9 @@ static inline void STARPU_PLU(common_u22)(void *descr[],
 	unsigned ld21 = GET_BLAS_LD(descr[1]);
 	unsigned ld22 = GET_BLAS_LD(descr[2]);
 
-//	int rank;
-//	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-//	fprintf(stderr, "KERNEL 22 %d\n", rank);
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	fprintf(stderr, "KERNEL 22 %d\n", rank);
 
 #ifdef USE_CUDA
 	cublasStatus status;
@@ -127,9 +127,9 @@ static inline void STARPU_PLU(common_u12)(void *descr[],
 	unsigned nx12 = GET_BLAS_NX(descr[1]);
 	unsigned ny12 = GET_BLAS_NY(descr[1]);
 
-//	int rank;
-//	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-//	fprintf(stderr, "KERNEL 12 %d\n", rank);
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	fprintf(stderr, "KERNEL 12 %d\n", rank);
 
 #ifdef USE_CUDA
 	cublasStatus status;
@@ -215,9 +215,9 @@ static inline void STARPU_PLU(common_u21)(void *descr[],
 	unsigned nx21 = GET_BLAS_NX(descr[1]);
 	unsigned ny21 = GET_BLAS_NY(descr[1]);
 	
-//	int rank;
-//	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-//	fprintf(stderr, "KERNEL 21 %d\n", rank);
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	fprintf(stderr, "KERNEL 21 %d\n", rank);
 
 
 #ifdef USE_CUDA
@@ -299,9 +299,9 @@ static inline void STARPU_PLU(common_u11)(void *descr[],
 
 	unsigned long z;
 
-//	int rank;
-//	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-//	fprintf(stderr, "KERNEL 11 %d\n", rank);
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	fprintf(stderr, "KERNEL 11 %d\n", rank);
 
 	switch (s) {
 		case 0: