15 years ago · 9484d29449
--- a/mpi/examples/mpi_lu/plu_example.c
+++ b/mpi/examples/mpi_lu/plu_example.c
@@ -42,6 +42,21 @@ static TYPE **tmp_12_block;
 
				 static starpu_data_handle *tmp_21_block_handles;
			
 
				 static TYPE **tmp_21_block;
			
 
				 
			
 
				+
			
 
				+TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks);
			
 
				+static void display_block_content(unsigned bi, unsigned bj, unsigned blocksize);
			
 
				+
			
 
				+/* Print every block of the nblocks x nblocks grid on stderr, between a
				+ * "DISPLAY ALL" banner and a line of stars.  The outer loop drives the second
				+ * argument given to display_block_content, the inner loop the first one. */
				+static void display_all_blocks(unsigned nblocks, unsigned blocksize)
				+{
				+	unsigned inner, outer;
				+
				+	fprintf(stderr, "DISPLAY ALL\n");
				+
				+	for (outer = 0; outer < nblocks; outer++)
				+	{
				+		for (inner = 0; inner < nblocks; inner++)
				+		{
				+			display_block_content(inner, outer, blocksize);
				+		}
				+	}
				+
				+	fprintf(stderr, "*****************\n");
				+}
			
 
				+
			
 
				 static void parse_args(int argc, char **argv)
			
 
				 {
			
 
				 	int i;
			
@@ -80,8 +95,8 @@ static void fill_block_with_random(TYPE *blockptr, unsigned size, unsigned nbloc
 
				 	for (j = 0; j < block_size; j++)
			
 
				 	for (i = 0; i < block_size; i++)
			
 
				 	{
			
 
				-		blockptr[i+j*block_size] = (TYPE)drand48();
			
 
				-//		blockptr[i+j*block_size] = (i == j)?2.0:0.0;
			
 
				+	//	blockptr[i+j*block_size] = (TYPE)drand48();
			
 
				+		blockptr[i+j*block_size] = (i == j)?2.0:(TYPE)j;
			
 
				 	}
			
 
				 }
			
 
				 
			
@@ -183,6 +198,9 @@ static void init_matrix(int rank)
 
				 			(uintptr_t)tmp_21_block[k],
			
 
				 			size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
			
 
				 	}
			
 
				+
			
 
				+
			
 
				+	display_all_blocks(nblocks, size/nblocks);
			
 
				 }
			
 
				 
			
 
				 int get_block_rank(unsigned i, unsigned j)
			
@@ -210,6 +228,25 @@ static void display_grid(int rank, unsigned nblocks)
 
				 	}
			
 
				 }
			
 
				 
			
 
				+/* Print on stderr the content of the block returned by
				+ * STARPU_PLU(get_block)(bj, bi): a header naming the block, then blocksize
				+ * lines of blocksize values each, then a "****" separator.
				+ * Values printed on the same line are blocksize entries apart in memory. */
				+static void display_block_content(unsigned bi, unsigned bj, unsigned blocksize)
				+{
				+	TYPE *data = STARPU_PLU(get_block)(bj, bi);
				+
				+	/* bi and bj are unsigned, so they are printed with %u rather than %d */
				+	fprintf(stderr, "BLOCK i = %u j = %u\n", bi, bj);
				+
				+	unsigned i, j;
				+	for (j = 0; j < blocksize; j++)
				+	{
				+		for (i = 0; i < blocksize; i++)
				+		{
				+			fprintf(stderr, "%f ", data[j+i*blocksize]);
				+		}
				+		fprintf(stderr, "\n");
				+	}
				+
				+	fprintf(stderr, "****\n");
				+}
			
 
				+
			
 
				 int main(int argc, char **argv)
			
 
				 {
			
 
				 	int rank;
			
@@ -243,12 +280,15 @@ int main(int argc, char **argv)
 
				 
			
 
				 	init_matrix(rank);
			
 
				 
			
 
				+	TYPE *a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
			
 
				+	STARPU_PLU(display_data_content)(a_r, size);
			
 
				+	
			
 
				+
			
 
				 	TYPE *x, *y;
			
 
				 
			
 
				 	if (check)
			
 
				 	{
			
 
				-		if (rank == 0)
			
 
				-			fprintf(stderr, "Compute AX = B\n");
			
 
				+		unsigned ind;
			
 
				 
			
 
				 		x = calloc(size, sizeof(TYPE));
			
 
				 		STARPU_ASSERT(x);
			
@@ -256,22 +296,25 @@ int main(int argc, char **argv)
 
				 		y = calloc(size, sizeof(TYPE));
			
 
				 		STARPU_ASSERT(y);
			
 
				 		
			
 
				-		unsigned ind;
			
 
				-		for (ind = 0; ind < size; ind++)
			
 
				+		if (rank == 0)
			
 
				 		{
			
 
				-			//x[ind] = (TYPE)1.0;
			
 
				-			x[ind] = (TYPE)drand48();
			
 
				-			y[ind] = (TYPE)0.0;
			
 
				+			fprintf(stderr, "Compute AX = B\n");
			
 
				+
			
 
				+			for (ind = 0; ind < size; ind++)
			
 
				+			{
			
 
				+				x[ind] = (TYPE)ind;
			
 
				+//				x[ind] = (TYPE)drand48();
			
 
				+				y[ind] = (TYPE)0.0;
			
 
				+			}
			
 
				 		}
			
 
				 
			
 
				 		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
			
 
				 
			
 
				 		if (rank == 0)
			
 
				-		for (ind = 0; ind < 10; ind++)
			
 
				+		for (ind = 0; ind < STARPU_MIN(10, size); ind++)
			
 
				 		{
			
 
				 			fprintf(stderr, "y[%d] = %f\n", ind, (float)y[ind]);
			
 
				 		}
			
 
				-		
			
 
				 	}
			
 
				 
			
 
				 	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
			
@@ -313,6 +356,40 @@ int main(int argc, char **argv)
 
				 	}
			
 
				 
			
 
				 	/*
			
 
				+	 *	Test Result Correctness
			
 
				+	 */
			
 
				+
			
 
				+	STARPU_PLU(compute_lu_matrix)(size, nblocks);
			
 
				+
			
 
				+	TYPE *y2;
			
 
				+
			
 
				+	if (check)
			
 
				+	{
			
 
				+		unsigned ind;
			
 
				+
			
 
				+		y2 = calloc(size, sizeof(TYPE));
			
 
				+		STARPU_ASSERT(y);
			
 
				+		
			
 
				+		if (rank == 0)
			
 
				+		{
			
 
				+			fprintf(stderr, "Compute LUX = B2\n");
			
 
				+
			
 
				+			for (ind = 0; ind < size; ind++)
			
 
				+			{
			
 
				+				y2[ind] = (TYPE)0.0;
			
 
				+			}
			
 
				+		}
			
 
				+
			
 
				+		STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank);
			
 
				+
			
 
				+		if (rank == 0)
			
 
				+		for (ind = 0; ind < STARPU_MIN(10, size); ind++)
			
 
				+		{
			
 
				+			fprintf(stderr, "y[%d] = %f\n", ind, (float)y2[ind]);
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	/*
			
 
				 	 * 	Termination
			
 
				 	 */
			
 
				 
			
--- a/mpi/examples/mpi_lu/plu_solve.c
+++ b/mpi/examples/mpi_lu/plu_solve.c
@@ -17,15 +17,272 @@
 
				 #include <starpu.h>
			
 
				 #include "pxlu.h"
			
 
				 
			
 
				-static STARPU_PLU(compute_ax_block)(unsigned size, unsigned nblocks,
			
 
				+/* Dump a blocksize x blocksize array on stderr: a "DISPLAY BLOCK" banner,
				+ * one text line per value of the fast index, and a "****" separator.
				+ * Consecutive values of a text line are blocksize entries apart in data. */
				+void STARPU_PLU(display_data_content)(TYPE *data, unsigned blocksize)
				+{
				+	unsigned line, pos;
				+
				+	fprintf(stderr, "DISPLAY BLOCK\n");
				+
				+	for (line = 0; line < blocksize; line++)
				+	{
				+		pos = 0;
				+		while (pos < blocksize)
				+		{
				+			unsigned offset = line + pos*blocksize;
				+			fprintf(stderr, "%f ", data[offset]);
				+			pos++;
				+		}
				+		fprintf(stderr, "\n");
				+	}
				+
				+	fprintf(stderr, "****\n");
				+}
			
 
				+
			
 
				+/* Apply one square block to a slice of x and accumulate into a slice of y.
				+ * NOTE(review): assuming CPU_GEMV follows the BLAS xGEMV argument order, this
				+ * computes sub_y <- 1.0 * block_data * sub_x + 1.0 * sub_y, with block_data a
				+ * block_size x block_size matrix of leading dimension block_size.
				+ * The function returns nothing: an explicit void replaces the implicit int
				+ * return type, which is not valid since C99. */
				+static void STARPU_PLU(compute_ax_block)(unsigned block_size, TYPE *block_data, TYPE *sub_x, TYPE *sub_y)
				+{
				+	CPU_GEMV("N", block_size, block_size, 1.0, block_data, block_size, sub_x, 1, 1.0, sub_y, 1);
				+}
			
 
				+
			
 
				+/* Fill outblock from inblock (both block_size x block_size): every diagonal
				+ * entry of outblock is set to 1, and the entries at index d + k*block_size
				+ * with k > d are copied from inblock.  All other entries of outblock are left
				+ * untouched, so the caller is expected to provide them already initialized. */
				+void STARPU_PLU(extract_upper)(unsigned block_size, TYPE *inblock, TYPE *outblock)
				+{
				+	unsigned d, k;
				+
				+	for (d = 0; d < block_size; d++)
				+	{
				+		/* Unit diagonal */
				+		outblock[d*(block_size + 1)] = (TYPE)1.0;
				+
				+		k = d + 1;
				+		while (k < block_size)
				+		{
				+			unsigned pos = d + k*block_size;
				+			outblock[pos] = inblock[pos];
				+			k++;
				+		}
				+	}
				+}
			
 
				+
			
 
				+/* Accumulate into sub_y the product of sub_x with the part of a diagonal
				+ * block selected by STARPU_PLU(extract_upper) (unit diagonal included).
				+ * The block is dumped on stderr before and after the extraction.
				+ * The function returns nothing, hence the explicit void return type. */
				+static void STARPU_PLU(compute_ax_block_upper)(unsigned size, unsigned nblocks,
				 				 TYPE *block_data, TYPE *sub_x, TYPE *sub_y)
				 {
				-	CPU_GEMV("N", size/nblocks, size/nblocks, 1.0, block_data, size/nblocks, sub_x, 1, 1.0, sub_y, 1);
				+	unsigned block_size = size/nblocks;
				+
				+	fprintf(stderr, "KEEP UPPER\n");
				+	STARPU_PLU(display_data_content)(block_data, block_size);
				+
				+	/* Take a zero-initialized copy of the upper part of the diagonal block,
				+	 * so that the entries extract_upper does not write remain 0 */
				+	TYPE *upper_block_copy = calloc((size_t)block_size*block_size, sizeof(TYPE));
				+	STARPU_ASSERT(upper_block_copy);
				+	STARPU_PLU(extract_upper)(block_size, block_data, upper_block_copy);
				+
				+	STARPU_PLU(display_data_content)(upper_block_copy, block_size);
				+
				+	STARPU_PLU(compute_ax_block)(block_size, upper_block_copy, sub_x, sub_y);
				+
				+	free(upper_block_copy);
				+}
			
 
				+
			
 
				+
			
 
				+/* Copy from inblock to outblock (both block_size x block_size) the entries at
				+ * index fast + slow*block_size with slow <= fast, diagonal included.  The
				+ * other entries of outblock are left untouched. */
				+void STARPU_PLU(extract_lower)(unsigned block_size, TYPE *inblock, TYPE *outblock)
				+{
				+	unsigned fast, slow;
				+
				+	for (fast = 0; fast < block_size; fast++)
				+	{
				+		slow = 0;
				+		while (slow <= fast)
				+		{
				+			unsigned pos = fast + slow*block_size;
				+			outblock[pos] = inblock[pos];
				+			slow++;
				+		}
				+	}
				 }
			
 
				 
			
 
				-/* y is only valid on node 0 */
			
 
				+
			
 
				+/* Gather all the blocks into a single freshly allocated size x size array
				+ * (leading dimension size) and return it.  The block obtained with
				+ * STARPU_PLU(get_block)(bj, bi) is copied at offset bj*block_size along the
				+ * fast index and bi*block_size along the slow one.
				+ * The caller owns the returned buffer and must free() it. */
				+TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks)
				+{
				+	/* Sizes and indices are computed as size_t so that size*size cannot
				+	 * wrap around in unsigned arithmetic */
				+	TYPE *bigmatrix = calloc((size_t)size*size, sizeof(TYPE));
				+	STARPU_ASSERT(bigmatrix);
				+
				+	unsigned block_size = size/nblocks;
				+
				+	unsigned bi, bj;
				+	for (bj = 0; bj < nblocks; bj++)
				+	for (bi = 0; bi < nblocks; bi++)
				+	{
				+		TYPE *block = STARPU_PLU(get_block)(bj, bi);
				+
				+		unsigned j, i;
				+		for (j = 0; j < block_size; j++)
				+		for (i = 0; i < block_size; i++)
				+		{
				+			size_t fast = j + (size_t)bj*block_size;
				+			size_t slow = i + (size_t)bi*block_size;
				+
				+			bigmatrix[fast + slow*size] = block[j + (size_t)i*block_size];
				+		}
				+	}
				+
				+	return bigmatrix;
				+}
			
 
				+
			
 
				+/* Return a freshly allocated, zero-initialized size x size array holding the
				+ * part of the reconstructed matrix selected by STARPU_PLU(extract_lower).
				+ * The caller owns the returned buffer and must free() it. */
				+static TYPE *reconstruct_lower(unsigned size, unsigned nblocks)
				+{
				+	TYPE *lower = calloc((size_t)size*size, sizeof(TYPE));
				+	STARPU_ASSERT(lower);
				+
				+	TYPE *bigmatrix = STARPU_PLU(reconstruct_matrix)(size, nblocks);
				+
				+	STARPU_PLU(extract_lower)(size, bigmatrix, lower);
				+
				+	/* The full matrix was only needed as a source for the extraction */
				+	free(bigmatrix);
				+
				+	return lower;
				+}
			
 
				+
			
 
				+/* Return a freshly allocated, zero-initialized size x size array holding the
				+ * part of the reconstructed matrix selected by STARPU_PLU(extract_upper)
				+ * (which also sets a unit diagonal).
				+ * The caller owns the returned buffer and must free() it. */
				+static TYPE *reconstruct_upper(unsigned size, unsigned nblocks)
				+{
				+	TYPE *upper = calloc((size_t)size*size, sizeof(TYPE));
				+	STARPU_ASSERT(upper);
				+
				+	TYPE *bigmatrix = STARPU_PLU(reconstruct_matrix)(size, nblocks);
				+
				+	STARPU_PLU(extract_upper)(size, bigmatrix, upper);
				+
				+	/* The full matrix was only needed as a source for the extraction */
				+	free(bigmatrix);
				+
				+	return upper;
				+}
			
 
				+
			
 
				+
			
 
				+/* Debugging helper: rebuild the whole matrix from its blocks, print it, print
				+ * its lower and upper extractions, multiply them with CPU_TRMM and print the
				+ * result.  Everything goes to stderr; all temporary buffers are released
				+ * before returning. */
				+void STARPU_PLU(compute_lu_matrix)(unsigned size, unsigned nblocks)
				+{
				+	fprintf(stderr, "ALL\n\n");
				+	TYPE *all_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
				+	STARPU_PLU(display_data_content)(all_r, size);
				+
				+	fprintf(stderr, "\nLOWER\n");
				+	TYPE *lower_r = reconstruct_lower(size, nblocks);
				+	STARPU_PLU(display_data_content)(lower_r, size);
				+
				+	fprintf(stderr, "\nUPPER\n");
				+	TYPE *upper_r = reconstruct_upper(size, nblocks);
				+	STARPU_PLU(display_data_content)(upper_r, size);
				+
				+	/* NOTE(review): assuming CPU_TRMM follows BLAS xTRMM, the product is
				+	 * written in place into the last matrix argument (upper_r) and the
				+	 * triangular operand (lower_r) is left unchanged.  The "U"/"U" flags
				+	 * describe lower_r as upper triangular with a unit diagonal, which
				+	 * does not match how it was extracted -- TODO confirm the flags. */
				+	CPU_TRMM("R", "U", "N", "U", size, size, 1.0f, lower_r, size, upper_r, size);
				+
				+	/* Show the buffer that received the product */
				+	fprintf(stderr, "\nLU\n");
				+	STARPU_PLU(display_data_content)(upper_r, size);
				+
				+	free(all_r);
				+	free(lower_r);
				+	free(upper_r);
				+}
			
 
				+
			
 
				+/* Accumulate into sub_y the product of sub_x with the part of a diagonal
				+ * block selected by STARPU_PLU(extract_lower) (diagonal included).
				+ * The block is dumped on stderr before and after the extraction.
				+ * The function returns nothing, hence the explicit void return type. */
				+static void STARPU_PLU(compute_ax_block_lower)(unsigned size, unsigned nblocks,
				+				 TYPE *block_data, TYPE *sub_x, TYPE *sub_y)
				+{
				+	unsigned block_size = size/nblocks;
				+
				+	fprintf(stderr, "KEEP LOWER\n");
				+	STARPU_PLU(display_data_content)(block_data, block_size);
				+
				+	/* Take a zero-initialized copy of the lower part of the diagonal block,
				+	 * so that the entries extract_lower does not write remain 0 */
				+	TYPE *lower_block_copy = calloc((size_t)block_size*block_size, sizeof(TYPE));
				+	STARPU_ASSERT(lower_block_copy);
				+	STARPU_PLU(extract_lower)(block_size, block_data, lower_block_copy);
				+
				+	STARPU_PLU(display_data_content)(lower_block_copy, block_size);
				+
				+	STARPU_PLU(compute_ax_block)(block_size, lower_block_copy, sub_x, sub_y);
				+
				+	free(lower_block_copy);
				+}
			
 
				+
			
 
				+/* Multiply x by the factorized matrix in two passes, each one ending with an
				+ * MPI_Reduce (sum) on rank 0:
				+ *   1. x receives the sum of the per-process contributions built from the
				+ *      diagonal blocks (through compute_ax_block_upper) and from the blocks
				+ *      fetched with get_block(j, i) for i > j; it is then broadcast to all;
				+ *   2. y receives the sum of the contributions built from the blocks fetched
				+ *      with get_block(j, i) for i < j and from the diagonal blocks (through
				+ *      compute_ax_block_lower).
				+ * Each process only handles the blocks for which get_block_rank() returns its
				+ * own rank.  x is read on every process during the first pass, so it must be
				+ * valid everywhere on entry, and it is overwritten by the intermediate
				+ * vector.  y is only written on rank 0. */
				+void STARPU_PLU(compute_lux)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank)
				+{
				+	/* Temporary buffer where each MPI process accumulates its own partial
				+	 * result yi: the contribution of the blocks assigned to that process,
				+	 * and 0 everywhere else.  The final vector is the sum of all yi. */
				+	TYPE *yi = calloc(size, sizeof(TYPE));
				+	STARPU_ASSERT(yi);
				+
				+	unsigned block_size = size/nblocks;
				+	int mpi_ret;
				+
				+	/* Compute UiX = Yi */
				+	unsigned long i,j;
				+	for (j = 0; j < nblocks; j++)
				+	{
				+		if (get_block_rank(j, j) == rank)
				+		{
				+			TYPE *block_data = STARPU_PLU(get_block)(j, j);
				+			TYPE *sub_x = &x[j*(block_size)];
				+			TYPE *sub_yi = &yi[j*(block_size)];
				+
				+			STARPU_PLU(compute_ax_block_upper)(size, nblocks, block_data, sub_x, sub_yi);
				+		}
				+
				+		for (i = j + 1; i < nblocks; i++)
				+		{
				+			if (get_block_rank(i, j) == rank)
				+			{
				+				/* That block belongs to the current MPI process */
				+				TYPE *block_data = STARPU_PLU(get_block)(j, i);
				+				TYPE *sub_x = &x[i*(block_size)];
				+				TYPE *sub_yi = &yi[j*(block_size)];
				+
				+				STARPU_PLU(compute_ax_block)(block_size, block_data, sub_x, sub_yi);
				+			}
				+		}
				+	}
				+
				+	/* Grab Sum Yi in X (only rank 0 receives the sum) */
				+	mpi_ret = MPI_Reduce(yi, x, size, MPI_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
				+	STARPU_ASSERT(mpi_ret == MPI_SUCCESS);
				+
				+	/* Reuse yi as the accumulator of the second pass */
				+	memset(yi, 0, size*sizeof(TYPE));
				+
				+	unsigned ind;
				+	if (rank == 0)
				+	{
				+		fprintf(stderr, "INTERMEDIATE\n");
				+		for (ind = 0; ind < STARPU_MIN(10, size); ind++)
				+		{
				+			fprintf(stderr, "x[%u] = %f\n", ind, (float)x[ind]);
				+		}
				+		fprintf(stderr, "****\n");
				+	}
				+
				+	/* Everyone needs x: broadcast the vector itself.  Passing &x would
				+	 * send the bytes stored at the address of the pointer variable
				+	 * instead of the size elements x points to. */
				+	mpi_ret = MPI_Bcast(x, size, MPI_TYPE, 0, MPI_COMM_WORLD);
				+	STARPU_ASSERT(mpi_ret == MPI_SUCCESS);
				+
				+	/* Compute LiX = Yi (with X = UX) */
				+	for (j = 0; j < nblocks; j++)
				+	{
				+		/* Runs zero times for j == 0 */
				+		for (i = 0; i < j; i++)
				+		{
				+			if (get_block_rank(i, j) == rank)
				+			{
				+				/* That block belongs to the current MPI process */
				+				TYPE *block_data = STARPU_PLU(get_block)(j, i);
				+				TYPE *sub_x = &x[i*(block_size)];
				+				TYPE *sub_yi = &yi[j*(block_size)];
				+
				+				STARPU_PLU(compute_ax_block)(block_size, block_data, sub_x, sub_yi);
				+			}
				+		}
				+
				+		if (get_block_rank(j, j) == rank)
				+		{
				+			TYPE *block_data = STARPU_PLU(get_block)(j, j);
				+			TYPE *sub_x = &x[j*(block_size)];
				+			TYPE *sub_yi = &yi[j*(block_size)];
				+
				+			STARPU_PLU(compute_ax_block_lower)(size, nblocks, block_data, sub_x, sub_yi);
				+		}
				+	}
				+
				+	/* Grab Sum Yi in Y (only rank 0 receives the sum) */
				+	mpi_ret = MPI_Reduce(yi, y, size, MPI_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
				+	STARPU_ASSERT(mpi_ret == MPI_SUCCESS);
				+
				+	free(yi);
				+}
			
 
				+
			
 
				+
			
 
				+/* x and y must be valid (at least) on 0 */
			
 
				 void STARPU_PLU(compute_ax)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank)
			
 
				 {
			
 
				+	/* Send x to everyone */
			
 
				+	int bcst_ret;
			
 
				+	bcst_ret = MPI_Bcast(&x, size, MPI_TYPE, 0, MPI_COMM_WORLD);
			
 
				+	STARPU_ASSERT(bcst_ret == MPI_SUCCESS);
			
 
				+
			
 
				+	if (rank == 0)
			
 
				+	{
			
 
				+		unsigned ind;
			
 
				+		for (ind = 0; ind < STARPU_MIN(10, size); ind++)
			
 
				+			fprintf(stderr, "x[%d] = %f\n", ind, (float)x[ind]);
			
 
				+
			
 
				+		fprintf(stderr, "Compute AX = B\n");
			
 
				+	}
			
 
				+
			
 
				 	/* Create temporary buffers where all MPI processes are going to
			
 
				 	 * compute Ai x = yi where Ai is the matrix containing the blocks of A
			
 
				 	 * affected to process i, and 0 everywhere else. We then have y as the
			
@@ -45,11 +302,13 @@ void STARPU_PLU(compute_ax)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, i
 
				 				TYPE *sub_x = &x[i*(size/nblocks)];
			
 
				 				TYPE *sub_yi = &yi[j*(size/nblocks)];
			
 
				 
			
 
				-				STARPU_PLU(compute_ax_block)(size, nblocks, block_data, sub_x, sub_yi);
			
 
				+				STARPU_PLU(compute_ax_block)(size/nblocks, block_data, sub_x, sub_yi);
			
 
				 			}
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				 	/* Compute the Sum of all yi = y */
			
 
				 	MPI_Reduce(yi, y, size, MPI_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
			
 
				+
			
 
				+	free(yi);
			
 
				 }
			
--- a/mpi/examples/mpi_lu/pxlu.h
+++ b/mpi/examples/mpi_lu/pxlu.h
@@ -36,4 +36,6 @@ starpu_data_handle STARPU_PLU(get_tmp_11_block_handle)(void);
 
				 starpu_data_handle STARPU_PLU(get_tmp_12_block_handle)(unsigned j);
			
 
				 starpu_data_handle STARPU_PLU(get_tmp_21_block_handle)(unsigned i);
			
 
				 
			
 
				+void STARPU_PLU(display_data_content)(TYPE *data, unsigned blocksize);
			
 
				+
			
 
				 #endif // __PXLU_H__
			
--- a/mpi/examples/mpi_lu/pxlu_kernels.c
+++ b/mpi/examples/mpi_lu/pxlu_kernels.c
@@ -37,9 +37,9 @@ static inline void STARPU_PLU(common_u22)(void *descr[],
 
				 	unsigned ld21 = GET_BLAS_LD(descr[1]);
			
 
				 	unsigned ld22 = GET_BLAS_LD(descr[2]);
			
 
				 
			
 
				-//	int rank;
			
 
				-//	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-//	fprintf(stderr, "KERNEL 22 %d\n", rank);
			
 
				+	int rank;
			
 
				+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	fprintf(stderr, "KERNEL 22 %d\n", rank);
			
 
				 
			
 
				 #ifdef USE_CUDA
			
 
				 	cublasStatus status;
			
@@ -127,9 +127,9 @@ static inline void STARPU_PLU(common_u12)(void *descr[],
 
				 	unsigned nx12 = GET_BLAS_NX(descr[1]);
			
 
				 	unsigned ny12 = GET_BLAS_NY(descr[1]);
			
 
				 
			
 
				-//	int rank;
			
 
				-//	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-//	fprintf(stderr, "KERNEL 12 %d\n", rank);
			
 
				+	int rank;
			
 
				+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	fprintf(stderr, "KERNEL 12 %d\n", rank);
			
 
				 
			
 
				 #ifdef USE_CUDA
			
 
				 	cublasStatus status;
			
@@ -215,9 +215,9 @@ static inline void STARPU_PLU(common_u21)(void *descr[],
 
				 	unsigned nx21 = GET_BLAS_NX(descr[1]);
			
 
				 	unsigned ny21 = GET_BLAS_NY(descr[1]);
			
 
				 	
			
 
				-//	int rank;
			
 
				-//	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-//	fprintf(stderr, "KERNEL 21 %d\n", rank);
			
 
				+	int rank;
			
 
				+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	fprintf(stderr, "KERNEL 21 %d\n", rank);
			
 
				 
			
 
				 
			
 
				 #ifdef USE_CUDA
			
@@ -299,9 +299,9 @@ static inline void STARPU_PLU(common_u11)(void *descr[],
 
				 
			
 
				 	unsigned long z;
			
 
				 
			
 
				-//	int rank;
			
 
				-//	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-//	fprintf(stderr, "KERNEL 11 %d\n", rank);
			
 
				+	int rank;
			
 
				+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	fprintf(stderr, "KERNEL 11 %d\n", rank);
			
 
				 
			
 
				 	switch (s) {
			
 
				 		case 0: