Browse Source

Something got messed up with fortran vs. C ordering, so we temporarility
exchange kernels and the orders of the arguments.

Cédric Augonnet 15 years ago
parent
commit
0a02a72fd6
3 changed files with 162 additions and 179 deletions
  1. 10 10
      mpi/examples/mpi_lu/plu_example.c
  2. 142 163
      mpi/examples/mpi_lu/plu_solve.c
  3. 10 6
      mpi/examples/mpi_lu/pxlu.c

+ 10 - 10
mpi/examples/mpi_lu/plu_example.c

@@ -84,11 +84,6 @@ static void fill_block_with_random(TYPE *blockptr, unsigned size, unsigned nbloc
 	}
 }
 
-starpu_data_handle STARPU_PLU(get_block_handle)(unsigned i, unsigned j)
-{
-	return dataA_handles[j+i*nblocks];
-}
-
 starpu_data_handle STARPU_PLU(get_tmp_11_block_handle)(void)
 {
 	return tmp_11_block_handle;
@@ -104,11 +99,6 @@ starpu_data_handle STARPU_PLU(get_tmp_21_block_handle)(unsigned i)
 	return tmp_21_block_handles[i];
 }
 
-TYPE *STARPU_PLU(get_block)(unsigned i, unsigned j)
-{
-	return dataA[j+i*nblocks];
-}
-
 static void init_matrix(int rank)
 {
 	fprintf(stderr, "INIT MATRIX on node %d\n", rank);
@@ -193,6 +183,11 @@ static void init_matrix(int rank)
 	//display_all_blocks(nblocks, size/nblocks);
 }
 
+TYPE *STARPU_PLU(get_block)(unsigned i, unsigned j)
+{
+	return dataA[j+i*nblocks];
+}
+
 int get_block_rank(unsigned i, unsigned j)
 {
 	/* Take a 2D block cyclic distribution */
@@ -200,6 +195,11 @@ int get_block_rank(unsigned i, unsigned j)
 	return (j % q) * p + (i % p);
 }
 
+starpu_data_handle STARPU_PLU(get_block_handle)(unsigned i, unsigned j)
+{
+	return dataA_handles[j+i*nblocks];
+}
+
 static void display_grid(int rank, unsigned nblocks)
 {
 	//if (rank == 0)

+ 142 - 163
mpi/examples/mpi_lu/plu_solve.c

@@ -18,6 +18,10 @@
 #include <math.h>
 #include "pxlu.h"
 
+/*
+ *	Various useful functions
+ */
+
 static double frobenius_norm(TYPE *v, unsigned n)
 {
         double sum2 = 0.0;
@@ -52,12 +56,6 @@ void STARPU_PLU(display_data_content)(TYPE *data, unsigned blocksize)
 	fprintf(stderr, "****\n");
 }
 
-static STARPU_PLU(compute_ax_block)(unsigned block_size, TYPE *block_data, TYPE *sub_x, TYPE *sub_y)
-{
-	fprintf(stderr, "block data %p sub x %p sub y %p\n", block_data, sub_x, sub_y);
-	CPU_GEMV("N", block_size, block_size, 1.0, block_data, block_size, sub_x, 1, 1.0, sub_y, 1);
-}
-
 void STARPU_PLU(extract_upper)(unsigned block_size, TYPE *inblock, TYPE *outblock)
 {
 	unsigned li, lj;
@@ -85,186 +83,39 @@ void STARPU_PLU(extract_lower)(unsigned block_size, TYPE *inblock, TYPE *outbloc
 	}
 }
 
+/*
+ *	Compute Ax = y
+ */
+
+static STARPU_PLU(compute_ax_block)(unsigned block_size, TYPE *block_data, TYPE *sub_x, TYPE *sub_y)
+{
+	fprintf(stderr, "block data %p sub x %p sub y %p\n", block_data, sub_x, sub_y);
+	CPU_GEMV("N", block_size, block_size, 1.0, block_data, block_size, sub_x, 1, 1.0, sub_y, 1);
+}
+
 static STARPU_PLU(compute_ax_block_upper)(unsigned size, unsigned nblocks,
 				 TYPE *block_data, TYPE *sub_x, TYPE *sub_y)
 {
 	unsigned block_size = size/nblocks;
 
-//	fprintf(stderr, "KEEP UPPER\n");
-//	STARPU_PLU(display_data_content)(block_data, block_size);
-
 	/* Take a copy of the upper part of the diagonal block */
 	TYPE *upper_block_copy = calloc((block_size)*(block_size), sizeof(TYPE));
 	STARPU_PLU(extract_upper)(block_size, block_data, upper_block_copy);
 		
-//	STARPU_PLU(display_data_content)(upper_block_copy, block_size);
-
 	STARPU_PLU(compute_ax_block)(block_size, upper_block_copy, sub_x, sub_y);
 	
 	free(upper_block_copy);
 }
 
-TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks)
-{
-//	fprintf(stderr, "RECONSTRUCT MATRIX size %d nblocks %d\n", size, nblocks);
-
-	TYPE *bigmatrix = calloc(size*size, sizeof(TYPE));
-
-	unsigned block_size = size/nblocks;
-
-	int rank;
-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-
-	unsigned bi, bj;
-	for (bj = 0; bj < nblocks; bj++)
-	for (bi = 0; bi < nblocks; bi++)
-	{
-		TYPE *block;
-		
-		if (get_block_rank(bi, bj) == 0)
-		{
-	//		if (rank == 0)
-	//			fprintf(stderr, "RECONSTRUCT bi %d bj %d\n", bi, bj);
-
-			block = STARPU_PLU(get_block)(bi, bj);
-		}
-		else {
-			MPI_Status status;
-
-			if (rank == 0)
-			{
-				block = calloc(block_size*block_size, sizeof(TYPE));
-
-	//			fprintf(stderr, "RECONSTRUCT bi %d bj %d - receive in %p from node %d\n", bi, bj, block, get_block_rank(bi, bj));
-				int ret = MPI_Recv(block, block_size*block_size, MPI_TYPE, get_block_rank(bi, bj), 0, MPI_COMM_WORLD, &status);
-				STARPU_ASSERT(ret == MPI_SUCCESS);
-			}
-			else {
-				block = STARPU_PLU(get_block)(bi, bj);
-	//			fprintf(stderr, "RECONSTRUCT bi %d bj %d - send %p from node %d\n", bi, bj, block, get_block_rank(bi, bj));
-				int ret = MPI_Send(block, block_size*block_size, MPI_TYPE, 0, 0, MPI_COMM_WORLD);
-				STARPU_ASSERT(ret == MPI_SUCCESS);
-			}
-		}
-
-		if (rank == 0)
-		{
-			unsigned j, i;
-			for (j = 0; j < block_size; j++)
-			for (i = 0; i < block_size; i++)
-			{
-				bigmatrix[(j + bj*block_size)+(i+bi*block_size)*size] =
-									block[j+i*block_size];
-			}
-
-			if (get_block_rank(bi, bj) != 0)
-				free(block);
-		}
-	}
-
-	return bigmatrix;
-}
-
-static TYPE *reconstruct_lower(unsigned size, unsigned nblocks)
-{
-	TYPE *lower = calloc(size*size, sizeof(TYPE));
-
-	TYPE *bigmatrix = STARPU_PLU(reconstruct_matrix)(size, nblocks);
-
-	STARPU_PLU(extract_lower)(size, bigmatrix, lower); 
-
-	return lower;
-}
-
-static TYPE *reconstruct_upper(unsigned size, unsigned nblocks)
-{
-	TYPE *upper = calloc(size*size, sizeof(TYPE));
-
-	TYPE *bigmatrix = STARPU_PLU(reconstruct_matrix)(size, nblocks);
-
-	STARPU_PLU(extract_upper)(size, bigmatrix, upper); 
-
-	return upper;
-}
-
-
-void STARPU_PLU(compute_lu_matrix)(unsigned size, unsigned nblocks, TYPE *Asaved)
-{
-	TYPE *all_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
-
-	int rank;
-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-
-	if (rank == 0)
-	{
-	        TYPE *L = malloc((size_t)size*size*sizeof(TYPE));
-	        TYPE *U = malloc((size_t)size*size*sizeof(TYPE));
-	
-	        memset(L, 0, size*size*sizeof(TYPE));
-	        memset(U, 0, size*size*sizeof(TYPE));
-	
-	        /* only keep the lower part */
-		unsigned i, j;
-	        for (j = 0; j < size; j++)
-	        {
-	                for (i = 0; i < j; i++)
-	                {
-	                        L[j+i*size] = all_r[j+i*size];
-	                }
-	
-	                /* diag i = j */
-	                L[j+j*size] = all_r[j+j*size];
-	                U[j+j*size] = 1.0;
-	
-	                for (i = j+1; i < size; i++)
-	                {
-	                        U[j+i*size] = all_r[j+i*size];
-	                }
-	        }
-	
-		STARPU_PLU(display_data_content)(L, size);
-		STARPU_PLU(display_data_content)(U, size);
-	
-	        /* now A_err = L, compute L*U */
-	        CPU_TRMM("R", "U", "N", "U", size, size, 1.0f, U, size, L, size);
-	
-		fprintf(stderr, "\nLU\n");
-		STARPU_PLU(display_data_content)(L, size);
-	
-	        /* compute "LU - A" in L*/
-	        CPU_AXPY(size*size, -1.0, Asaved, 1, L, 1);
-	
-	        TYPE err = CPU_ASUM(size*size, L, 1);
-	        int max = CPU_IAMAX(size*size, L, 1);
-	
-		fprintf(stderr, "DISPLAY ERROR\n");
-
-		STARPU_PLU(display_data_content)(L, size);
-	
-	        fprintf(stderr, "(A - LU) Avg error : %e\n", err/(size*size));
-	        fprintf(stderr, "(A - LU) Max error : %e\n", L[max]);
-	
-		double residual = frobenius_norm(L, size);
-		double matnorm = frobenius_norm(Asaved, size);
-	
-		fprintf(stderr, "||A-LU|| / (||A||*N) : %e\n", residual/(matnorm*size));
-	}
-}
-
 static STARPU_PLU(compute_ax_block_lower)(unsigned size, unsigned nblocks,
 				 TYPE *block_data, TYPE *sub_x, TYPE *sub_y)
 {
 	unsigned block_size = size/nblocks;
 
-//	fprintf(stderr, "KEEP LOWER\n");
-//	STARPU_PLU(display_data_content)(block_data, block_size);
-
 	/* Take a copy of the upper part of the diagonal block */
 	TYPE *lower_block_copy = calloc((block_size)*(block_size), sizeof(TYPE));
 	STARPU_PLU(extract_lower)(block_size, block_data, lower_block_copy);
 
-//	STARPU_PLU(display_data_content)(lower_block_copy, block_size);
-
 	STARPU_PLU(compute_ax_block)(size/nblocks, lower_block_copy, sub_x, sub_y);
 	
 	free(lower_block_copy);
@@ -363,6 +214,70 @@ void STARPU_PLU(compute_lux)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks,
 }
 
 
+
+/*
+ *	Allocate a contiguous matrix on node 0 and fill it with the whole
+ *	content of the matrix distributed accross all nodes.
+ */
+
+TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks)
+{
+//	fprintf(stderr, "RECONSTRUCT MATRIX size %d nblocks %d\n", size, nblocks);
+
+	TYPE *bigmatrix = calloc(size*size, sizeof(TYPE));
+
+	unsigned block_size = size/nblocks;
+
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+	unsigned bi, bj;
+	for (bj = 0; bj < nblocks; bj++)
+	for (bi = 0; bi < nblocks; bi++)
+	{
+		TYPE *block;
+
+		int block_rank = get_block_rank(bi, bj);
+		
+		if (block_rank == 0)
+		{
+			block = STARPU_PLU(get_block)(bi, bj);
+		}
+		else {
+			MPI_Status status;
+
+			if (rank == 0)
+			{
+				block = calloc(block_size*block_size, sizeof(TYPE));
+
+				int ret = MPI_Recv(block, block_size*block_size, MPI_TYPE, block_rank, 0, MPI_COMM_WORLD, &status);
+				STARPU_ASSERT(ret == MPI_SUCCESS);
+			}
+			else {
+				block = STARPU_PLU(get_block)(bi, bj);
+				int ret = MPI_Send(block, block_size*block_size, MPI_TYPE, 0, 0, MPI_COMM_WORLD);
+				STARPU_ASSERT(ret == MPI_SUCCESS);
+			}
+		}
+
+		if (rank == 0)
+		{
+			unsigned j, i;
+			for (j = 0; j < block_size; j++)
+			for (i = 0; i < block_size; i++)
+			{
+				bigmatrix[(j + bj*block_size)+(i+bi*block_size)*size] =
+									block[j+i*block_size];
+			}
+
+			if (get_block_rank(bi, bj) != 0)
+				free(block);
+		}
+	}
+
+	return bigmatrix;
+}
+
 /* x and y must be valid (at least) on 0 */
 void STARPU_PLU(compute_ax)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank)
 {
@@ -404,3 +319,67 @@ void STARPU_PLU(compute_ax)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, i
 
 	free(yi);
 }
+
+void STARPU_PLU(compute_lu_matrix)(unsigned size, unsigned nblocks, TYPE *Asaved)
+{
+	TYPE *all_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
+
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
+	if (rank == 0)
+	{
+	        TYPE *L = malloc((size_t)size*size*sizeof(TYPE));
+	        TYPE *U = malloc((size_t)size*size*sizeof(TYPE));
+	
+	        memset(L, 0, size*size*sizeof(TYPE));
+	        memset(U, 0, size*size*sizeof(TYPE));
+	
+	        /* only keep the lower part */
+		unsigned i, j;
+	        for (j = 0; j < size; j++)
+	        {
+	                for (i = 0; i < j; i++)
+	                {
+	                        L[j+i*size] = all_r[j+i*size];
+	                }
+	
+	                /* diag i = j */
+	                L[j+j*size] = all_r[j+j*size];
+	                U[j+j*size] = 1.0;
+	
+	                for (i = j+1; i < size; i++)
+	                {
+	                        U[j+i*size] = all_r[j+i*size];
+	                }
+	        }
+	
+		STARPU_PLU(display_data_content)(L, size);
+		STARPU_PLU(display_data_content)(U, size);
+	
+	        /* now A_err = L, compute L*U */
+	        CPU_TRMM("R", "U", "N", "U", size, size, 1.0f, U, size, L, size);
+	
+		fprintf(stderr, "\nLU\n");
+		STARPU_PLU(display_data_content)(L, size);
+	
+	        /* compute "LU - A" in L*/
+	        CPU_AXPY(size*size, -1.0, Asaved, 1, L, 1);
+	
+	        TYPE err = CPU_ASUM(size*size, L, 1);
+	        int max = CPU_IAMAX(size*size, L, 1);
+	
+		fprintf(stderr, "DISPLAY ERROR\n");
+
+		STARPU_PLU(display_data_content)(L, size);
+	
+	        fprintf(stderr, "(A - LU) Avg error : %e\n", err/(size*size));
+	        fprintf(stderr, "(A - LU) Max error : %e\n", L[max]);
+	
+		double residual = frobenius_norm(L, size);
+		double matnorm = frobenius_norm(Asaved, size);
+	
+		fprintf(stderr, "||A-LU|| / (||A||*N) : %e\n", residual/(matnorm*size));
+	}
+}
+

+ 10 - 6
mpi/examples/mpi_lu/pxlu.c

@@ -361,8 +361,9 @@ static void create_task_12_real(unsigned k, unsigned j)
 {
 	struct starpu_task *task = create_task(TAG12(k, j));
 	
-	task->cl = &STARPU_PLU(cl12);
-//	task->cl = &STARPU_PLU(cl21);
+#warning temporary fix :/
+//	task->cl = &STARPU_PLU(cl12);
+	task->cl = &STARPU_PLU(cl21);
 
 	int myrank;
 	MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
@@ -504,8 +505,9 @@ static void create_task_21_real(unsigned k, unsigned i)
 {
 	struct starpu_task *task = create_task(TAG21(k, i));
 
-	task->cl = &STARPU_PLU(cl21);
-//	task->cl = &STARPU_PLU(cl12);
+#warning temporary fix 
+//	task->cl = &STARPU_PLU(cl21);
+	task->cl = &STARPU_PLU(cl12);
 	
 	/* which sub-data is manipulated ? */
 	starpu_data_handle diag_block;
@@ -605,10 +607,12 @@ static void create_task_22_real(unsigned k, unsigned i, unsigned j)
 
 
 
-	task->buffers[0].handle = block21;
+	//task->buffers[0].handle = block21;
+	task->buffers[0].handle = block12;
 	task->buffers[0].mode = STARPU_R;
 
-	task->buffers[1].handle = block12;
+	//task->buffers[1].handle = block12;
+	task->buffers[1].handle = block21;
 	task->buffers[1].mode = STARPU_R;
 
 	/* produced by TAG22(k-1, i, j) */