浏览代码

- yet more debugging
- measure the error

Cédric Augonnet 15 年之前
父节点
当前提交
bbb0e033a8
共有 5 个文件被更改,包括 231 次插入182 次删除
  1. 43 65
      mpi/examples/mpi_lu/plu_example.c
  2. 130 87
      mpi/examples/mpi_lu/plu_solve.c
  3. 35 9
      mpi/examples/mpi_lu/pxlu.c
  4. 7 2
      mpi/examples/mpi_lu/pxlu.h
  5. 16 19
      mpi/examples/mpi_lu/pxlu_kernels.c

+ 43 - 65
mpi/examples/mpi_lu/plu_example.c

@@ -42,21 +42,6 @@ static TYPE **tmp_12_block;
 static starpu_data_handle *tmp_21_block_handles;
 static TYPE **tmp_21_block;
 
-
-TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks);
-static void display_block_content(unsigned bi, unsigned bj, unsigned blocksize);
-
-static void display_all_blocks(unsigned nblocks, unsigned blocksize)
-{
-	fprintf(stderr, "DISPLAY ALL\n");
-	unsigned bi, bj;
-	for (bj = 0; bj < nblocks; bj++)
-	for (bi = 0; bi < nblocks; bi++)
-		display_block_content(bi, bj, blocksize);
-
-	fprintf(stderr, "*****************\n");
-}
-
 static void parse_args(int argc, char **argv)
 {
 	int i;
@@ -92,16 +77,16 @@ static void fill_block_with_random(TYPE *blockptr, unsigned size, unsigned nbloc
 	const unsigned block_size = (size/nblocks);
 
 	unsigned i, j;
-	for (j = 0; j < block_size; j++)
 	for (i = 0; i < block_size; i++)
+	for (j = 0; j < block_size; j++)
 	{
-		blockptr[i+j*block_size] = (TYPE)drand48();
+		blockptr[j+i*block_size] = (TYPE)drand48();
 	}
 }
 
-starpu_data_handle STARPU_PLU(get_block_handle)(unsigned j, unsigned i)
+starpu_data_handle STARPU_PLU(get_block_handle)(unsigned i, unsigned j)
 {
-	return dataA_handles[i+j*nblocks];
+	return dataA_handles[j+i*nblocks];
 }
 
 starpu_data_handle STARPU_PLU(get_tmp_11_block_handle)(void)
@@ -119,13 +104,15 @@ starpu_data_handle STARPU_PLU(get_tmp_21_block_handle)(unsigned i)
 	return tmp_21_block_handles[i];
 }
 
-TYPE *STARPU_PLU(get_block)(unsigned j, unsigned i)
+TYPE *STARPU_PLU(get_block)(unsigned i, unsigned j)
 {
-	return dataA[i+j*nblocks];
+	return dataA[j+i*nblocks];
 }
 
 static void init_matrix(int rank)
 {
+	fprintf(stderr, "INIT MATRIX on node %d\n", rank);
+
 	/* Allocate a grid of data handles, not all of them have to be allocated later on */
 	dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle));
 	dataA = calloc(nblocks*nblocks, sizeof(TYPE *));
@@ -138,31 +125,36 @@ static void init_matrix(int rank)
 	{
 		for (i = 0; i < nblocks; i++)
 		{
+			TYPE **blockptr = &dataA[j+i*nblocks];
+//			starpu_data_handle *handleptr = &dataA_handles[j+nblocks*i];
+			starpu_data_handle *handleptr = &dataA_handles[j+nblocks*i];
+
 			if (get_block_rank(i, j) == rank)
 			{
 				/* This blocks should be treated by the current MPI process */
 				/* Allocate and fill it */
-				starpu_malloc_pinned_if_possible((void **)&dataA[i+j*nblocks], blocksize);
+				starpu_malloc_pinned_if_possible((void **)blockptr, blocksize);
 
-				fill_block_with_random(STARPU_PLU(get_block)(j, i), size, nblocks);
+				//fprintf(stderr, "Rank %d : fill block (i = %d, j = %d)\n", rank, i, j);
+				fill_block_with_random(*blockptr, size, nblocks);
+				//fprintf(stderr, "Rank %d : fill block (i = %d, j = %d)\n", rank, i, j);
 				if (i == j)
 				{
-					TYPE *b = STARPU_PLU(get_block)(j, i);
 					unsigned tmp;
 					for (tmp = 0; tmp < size/nblocks; tmp++)
 					{
-						b[tmp*((size/nblocks)+1)] += (TYPE)10*nblocks;
+						(*blockptr)[tmp*((size/nblocks)+1)] += (TYPE)10*nblocks;
 					}
 				}
 
 				/* Register it to StarPU */
-				starpu_register_blas_data(&dataA_handles[i+nblocks*j], 0,
-					(uintptr_t)dataA[i+nblocks*j], size/nblocks,
+				starpu_register_blas_data(handleptr, 0,
+					(uintptr_t)*blockptr, size/nblocks,
 					size/nblocks, size/nblocks, sizeof(TYPE));
 			}
 			else {
-				dataA[i+j*nblocks] = STARPU_POISON_PTR;
-				dataA_handles[i+j*nblocks] = STARPU_POISON_PTR;
+				*blockptr = STARPU_POISON_PTR;
+				*handleptr = STARPU_POISON_PTR;
 			}
 		}
 	}
@@ -198,7 +190,7 @@ static void init_matrix(int rank)
 			size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
 	}
 
-//	display_all_blocks(nblocks, size/nblocks);
+	//display_all_blocks(nblocks, size/nblocks);
 }
 
 int get_block_rank(unsigned i, unsigned j)
@@ -210,41 +202,25 @@ int get_block_rank(unsigned i, unsigned j)
 
 static void display_grid(int rank, unsigned nblocks)
 {
-	if (rank == 0)
+	//if (rank == 0)
 	{
-		fprintf(stderr, "2D grid layout: \n");
+		fprintf(stderr, "2D grid layout (Rank %d): \n", rank);
 		
 		unsigned i, j;
 		for (j = 0; j < nblocks; j++)
 		{
 			for (i = 0; i < nblocks; i++)
 			{
-				fprintf(stderr, "%d ", get_block_rank(i, j));
+				TYPE *blockptr = STARPU_PLU(get_block)(i, j);
+				starpu_data_handle handle = STARPU_PLU(get_block_handle)(i, j);
+
+				fprintf(stderr, "%d (data %p handle %p)", get_block_rank(i, j), blockptr, handle);
 			}
 			fprintf(stderr, "\n");
 		}
 	}
 }
 
-static void display_block_content(unsigned bi, unsigned bj, unsigned blocksize)
-{
-	TYPE *data = STARPU_PLU(get_block)(bj, bi);
-
-	fprintf(stderr, "BLOCK i = %d j = %d\n", bi, bj);
-
-	unsigned i, j;
-	for (j = 0; j < blocksize; j++)
-	{
-		for (i = 0; i < blocksize; i++)
-		{
-			fprintf(stderr, "%f ", data[j+i*blocksize]);
-		}
-		fprintf(stderr, "\n");
-	}
-
-	fprintf(stderr, "****\n");
-}
-
 int main(int argc, char **argv)
 {
 	int rank;
@@ -263,8 +239,6 @@ int main(int argc, char **argv)
 
 	STARPU_ASSERT(p*q == world_size);
 
-	//display_grid(rank, nblocks);
-
 	starpu_init(NULL);
 	starpu_mpi_initialize();
 	starpu_helper_init_cublas();
@@ -278,6 +252,8 @@ int main(int argc, char **argv)
 
 	init_matrix(rank);
 
+	display_grid(rank, nblocks);
+
 	TYPE *a_r;
 //	STARPU_PLU(display_data_content)(a_r, size);
 
@@ -285,31 +261,34 @@ int main(int argc, char **argv)
 
 	if (check)
 	{
-		unsigned ind;
-
 		x = calloc(size, sizeof(TYPE));
 		STARPU_ASSERT(x);
 
 		y = calloc(size, sizeof(TYPE));
 		STARPU_ASSERT(y);
 
-		a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
-	
 		if (rank == 0)
 		{
+			unsigned ind;
 			for (ind = 0; ind < size; ind++)
-			{
 				x[ind] = (TYPE)drand48();
-				y[ind] = (TYPE)0.0;
-			}
 		}
 
-		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
+		a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
+
+		if (rank == 0)
+			STARPU_PLU(display_data_content)(a_r, size);
+
+		fprintf(stderr, "COMPUTE AX on node %d \n", rank);
+//		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
+
+		fprintf(stderr, "COMPUTE AX on node %d AFTER\n", rank);
 	}
 
 	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
 
+	fprintf(stderr, "GO for main on node %d\n", rank);
 	double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size);
 
 	/*
@@ -321,9 +300,6 @@ int main(int argc, char **argv)
 	double max_timing = timing;
 	double sum_timing = timing;
 
-	barrier_ret = MPI_Barrier(MPI_COMM_WORLD);
-	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
-	
 	reduce_ret = MPI_Reduce(&timing, &min_timing, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
 	STARPU_ASSERT(reduce_ret == MPI_SUCCESS);
 
@@ -359,6 +335,7 @@ int main(int argc, char **argv)
 
 		STARPU_PLU(compute_lu_matrix)(size, nblocks, a_r);
 
+#if 0
 		/*
 		 *	Compute || Ax - LUx ||
 		 */
@@ -386,6 +363,7 @@ int main(int argc, char **argv)
 	
 	        fprintf(stderr, "(A - LU)X Avg error : %e\n", err/(size*size));
 	        fprintf(stderr, "(A - LU)X Max error : %e\n", y2[max]);
+#endif
 	}
 
 	/*

+ 130 - 87
mpi/examples/mpi_lu/plu_solve.c

@@ -15,8 +15,26 @@
  */
 
 #include <starpu.h>
+#include <math.h>
 #include "pxlu.h"
 
+static double frobenius_norm(TYPE *v, unsigned n)
+{
+        double sum2 = 0.0;
+
+        /* compute sqrt(Sum(|x|^2)) */
+
+        unsigned i,j;
+        for (j = 0; j < n; j++)
+        for (i = 0; i < n; i++)
+        {
+                double a = fabsl((double)v[i+n*j]);
+                sum2 += a*a;
+        }
+
+        return sqrt(sum2);
+}
+
 void STARPU_PLU(display_data_content)(TYPE *data, unsigned blocksize)
 {
 	fprintf(stderr, "DISPLAY BLOCK\n");
@@ -36,6 +54,7 @@ void STARPU_PLU(display_data_content)(TYPE *data, unsigned blocksize)
 
 static STARPU_PLU(compute_ax_block)(unsigned block_size, TYPE *block_data, TYPE *sub_x, TYPE *sub_y)
 {
+	fprintf(stderr, "block data %p sub x %p sub y %p\n", block_data, sub_x, sub_y);
 	CPU_GEMV("N", block_size, block_size, 1.0, block_data, block_size, sub_x, 1, 1.0, sub_y, 1);
 }
 
@@ -66,34 +85,6 @@ void STARPU_PLU(extract_lower)(unsigned block_size, TYPE *inblock, TYPE *outbloc
 	}
 }
 
-#if 0
-void STARPU_PLU(extract_upper)(unsigned block_size, TYPE *inblock, TYPE *outblock)
-{
-	unsigned li, lj;
-	for (lj = 0; lj < block_size; lj++)
-	{
-		for (li = lj; li < block_size; li++)
-		{
-			outblock[lj + li*block_size] = inblock[lj + li*block_size];
-		}
-	}
-}
-
-void STARPU_PLU(extract_lower)(unsigned block_size, TYPE *inblock, TYPE *outblock)
-{
-	unsigned li, lj;
-	for (lj = 0; lj < block_size; lj++)
-	{
-		for (li = 0; li < lj; li++)
-		{
-			outblock[lj + li*block_size] = inblock[lj + li*block_size];
-		}
-
-		outblock[lj*(block_size + 1)] = (TYPE)1.0;
-	}
-}
-#endif
-
 static STARPU_PLU(compute_ax_block_upper)(unsigned size, unsigned nblocks,
 				 TYPE *block_data, TYPE *sub_x, TYPE *sub_y)
 {
@@ -108,30 +99,66 @@ static STARPU_PLU(compute_ax_block_upper)(unsigned size, unsigned nblocks,
 		
 //	STARPU_PLU(display_data_content)(upper_block_copy, block_size);
 
-	STARPU_PLU(compute_ax_block)(size/nblocks, upper_block_copy, sub_x, sub_y);
+	STARPU_PLU(compute_ax_block)(block_size, upper_block_copy, sub_x, sub_y);
 	
 	free(upper_block_copy);
 }
 
 TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks)
 {
+//	fprintf(stderr, "RECONSTRUCT MATRIX size %d nblocks %d\n", size, nblocks);
+
 	TYPE *bigmatrix = calloc(size*size, sizeof(TYPE));
 
 	unsigned block_size = size/nblocks;
 
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+
 	unsigned bi, bj;
 	for (bj = 0; bj < nblocks; bj++)
 	for (bi = 0; bi < nblocks; bi++)
 	{
-		TYPE *block = STARPU_PLU(get_block)(bj, bi);
-		//TYPE *block = STARPU_PLU(get_block)(bj, bi);
+		TYPE *block;
+		
+		if (get_block_rank(bi, bj) == 0)
+		{
+	//		if (rank == 0)
+	//			fprintf(stderr, "RECONSTRUCT bi %d bj %d\n", bi, bj);
+
+			block = STARPU_PLU(get_block)(bi, bj);
+		}
+		else {
+			MPI_Status status;
+
+			if (rank == 0)
+			{
+				block = calloc(block_size*block_size, sizeof(TYPE));
+
+	//			fprintf(stderr, "RECONSTRUCT bi %d bj %d - receive in %p from node %d\n", bi, bj, block, get_block_rank(bi, bj));
+				int ret = MPI_Recv(block, block_size*block_size, MPI_TYPE, get_block_rank(bi, bj), 0, MPI_COMM_WORLD, &status);
+				STARPU_ASSERT(ret == MPI_SUCCESS);
+			}
+			else {
+				block = STARPU_PLU(get_block)(bi, bj);
+	//			fprintf(stderr, "RECONSTRUCT bi %d bj %d - send %p from node %d\n", bi, bj, block, get_block_rank(bi, bj));
+				int ret = MPI_Send(block, block_size*block_size, MPI_TYPE, 0, 0, MPI_COMM_WORLD);
+				STARPU_ASSERT(ret == MPI_SUCCESS);
+			}
+		}
 
-		unsigned j, i;
-		for (j = 0; j < block_size; j++)
-		for (i = 0; i < block_size; i++)
+		if (rank == 0)
 		{
-			bigmatrix[(j + bj*block_size)+(i+bi*block_size)*size] =
-								block[j+i*block_size];
+			unsigned j, i;
+			for (j = 0; j < block_size; j++)
+			for (i = 0; i < block_size; i++)
+			{
+				bigmatrix[(j + bj*block_size)+(i+bi*block_size)*size] =
+									block[j+i*block_size];
+			}
+
+			if (get_block_rank(bi, bj) != 0)
+				free(block);
 		}
 	}
 
@@ -163,55 +190,65 @@ static TYPE *reconstruct_upper(unsigned size, unsigned nblocks)
 
 void STARPU_PLU(compute_lu_matrix)(unsigned size, unsigned nblocks, TYPE *Asaved)
 {
-//	fprintf(stderr, "ALL\n\n");
 	TYPE *all_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
-//	STARPU_PLU(display_data_content)(all_r, size);
-
-        TYPE *L = malloc((size_t)size*size*sizeof(TYPE));
-        TYPE *U = malloc((size_t)size*size*sizeof(TYPE));
-
-        memset(L, 0, size*size*sizeof(TYPE));
-        memset(U, 0, size*size*sizeof(TYPE));
 
-        /* only keep the lower part */
-	unsigned i, j;
-        for (j = 0; j < size; j++)
-        {
-                for (i = 0; i < j; i++)
-                {
-                        L[j+i*size] = all_r[j+i*size];
-                }
-
-                /* diag i = j */
-                L[j+j*size] = all_r[j+j*size];
-                U[j+j*size] = 1.0;
-
-                for (i = j+1; i < size; i++)
-                {
-                        U[j+i*size] = all_r[j+i*size];
-                }
-        }
-
-//	STARPU_PLU(display_data_content)(L, size);
-//	STARPU_PLU(display_data_content)(U, size);
-
-        /* now A_err = L, compute L*U */
-        CPU_TRMM("R", "U", "N", "U", size, size, 1.0f, U, size, L, size);
+	int rank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 
-//	fprintf(stderr, "\nLU\n");
-//	STARPU_PLU(display_data_content)(L, size);
-
-        /* compute "LU - A" in L*/
-        CPU_AXPY(size*size, -1.0, Asaved, 1, L, 1);
-
-        TYPE err = CPU_ASUM(size*size, L, 1);
-        int max = CPU_IAMAX(size*size, L, 1);
-
-//	STARPU_PLU(display_data_content)(L, size);
-
-        fprintf(stderr, "(A - LU) Avg error : %e\n", err/(size*size));
-        fprintf(stderr, "(A - LU) Max error : %e\n", L[max]);
+	if (rank == 0)
+	{
+	        TYPE *L = malloc((size_t)size*size*sizeof(TYPE));
+	        TYPE *U = malloc((size_t)size*size*sizeof(TYPE));
+	
+	        memset(L, 0, size*size*sizeof(TYPE));
+	        memset(U, 0, size*size*sizeof(TYPE));
+	
+	        /* only keep the lower part */
+		unsigned i, j;
+	        for (j = 0; j < size; j++)
+	        {
+	                for (i = 0; i < j; i++)
+	                {
+	                        L[j+i*size] = all_r[j+i*size];
+	                }
+	
+	                /* diag i = j */
+	                L[j+j*size] = all_r[j+j*size];
+	                U[j+j*size] = 1.0;
+	
+	                for (i = j+1; i < size; i++)
+	                {
+	                        U[j+i*size] = all_r[j+i*size];
+	                }
+	        }
+	
+		STARPU_PLU(display_data_content)(L, size);
+		STARPU_PLU(display_data_content)(U, size);
+	
+	        /* now A_err = L, compute L*U */
+	        CPU_TRMM("R", "U", "N", "U", size, size, 1.0f, U, size, L, size);
+	
+		fprintf(stderr, "\nLU\n");
+		STARPU_PLU(display_data_content)(L, size);
+	
+	        /* compute "LU - A" in L*/
+	        CPU_AXPY(size*size, -1.0, Asaved, 1, L, 1);
+	
+	        TYPE err = CPU_ASUM(size*size, L, 1);
+	        int max = CPU_IAMAX(size*size, L, 1);
+	
+		fprintf(stderr, "DISPLAY ERROR\n");
 
+		STARPU_PLU(display_data_content)(L, size);
+	
+	        fprintf(stderr, "(A - LU) Avg error : %e\n", err/(size*size));
+	        fprintf(stderr, "(A - LU) Max error : %e\n", L[max]);
+	
+		double residual = frobenius_norm(L, size);
+		double matnorm = frobenius_norm(Asaved, size);
+	
+		fprintf(stderr, "||A-LU|| / (||A||*N) : %e\n", residual/(matnorm*size));
+	}
 }
 
 static STARPU_PLU(compute_ax_block_lower)(unsigned size, unsigned nblocks,
@@ -241,6 +278,8 @@ void STARPU_PLU(compute_lux)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks,
 	 * sum of all yi. */
 	TYPE *yi = calloc(size, sizeof(TYPE));
 
+	fprintf(stderr, "Compute LU\n");
+
 	unsigned block_size = size/nblocks;
 
 	/* Compute UiX = Yi */
@@ -261,7 +300,7 @@ void STARPU_PLU(compute_lux)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks,
 			if (get_block_rank(i, j) == rank)
 			{
 				/* That block belongs to the current MPI process */
-				TYPE *block_data = STARPU_PLU(get_block)(j, i);
+				TYPE *block_data = STARPU_PLU(get_block)(i, j);
 				TYPE *sub_x = &x[i*(block_size)];
 				TYPE *sub_yi = &yi[j*(block_size)];
 
@@ -299,7 +338,7 @@ void STARPU_PLU(compute_lux)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks,
 			if (get_block_rank(i, j) == rank)
 			{
 				/* That block belongs to the current MPI process */
-				TYPE *block_data = STARPU_PLU(get_block)(j, i);
+				TYPE *block_data = STARPU_PLU(get_block)(i, j);
 				TYPE *sub_x = &x[i*(block_size)];
 				TYPE *sub_yi = &yi[j*(block_size)];
 
@@ -327,6 +366,8 @@ void STARPU_PLU(compute_lux)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks,
 /* x and y must be valid (at least) on 0 */
 void STARPU_PLU(compute_ax)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank)
 {
+	unsigned block_size = size/nblocks;
+
 	/* Send x to everyone */
 	int bcst_ret;
 	bcst_ret = MPI_Bcast(&x, size, MPI_TYPE, 0, MPI_COMM_WORLD);
@@ -347,11 +388,11 @@ void STARPU_PLU(compute_ax)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, i
 			if (get_block_rank(i, j) == rank)
 			{
 				/* That block belongs to the current MPI process */
-				TYPE *block_data = STARPU_PLU(get_block)(j, i);
-				TYPE *sub_x = &x[i*(size/nblocks)];
-				TYPE *sub_yi = &yi[j*(size/nblocks)];
+				TYPE *block_data = STARPU_PLU(get_block)(i, j);
+				TYPE *sub_x = &x[i*block_size];
+				TYPE *sub_yi = &yi[j*block_size];
 
-				STARPU_PLU(compute_ax_block)(size/nblocks, block_data, sub_x, sub_yi);
+				STARPU_PLU(compute_ax_block)(block_size, block_data, sub_x, sub_yi);
 			}
 		}
 	}
@@ -359,5 +400,7 @@ void STARPU_PLU(compute_ax)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, i
 	/* Compute the Sum of all yi = y */
 	MPI_Reduce(yi, y, size, MPI_TYPE, MPI_SUM, 0, MPI_COMM_WORLD);
 
+	fprintf(stderr, "RANK %d - FOO 1 y[0] %f\n", rank, y[0]);
+
 	free(yi);
 }

+ 35 - 9
mpi/examples/mpi_lu/pxlu.c

@@ -131,6 +131,8 @@ static void receive_when_deps_are_done(unsigned ndeps, starpu_tag_t *deps_tags,
 				starpu_tag_t partial_tag,
 				starpu_tag_t unlocked_tag)
 {
+	STARPU_ASSERT(handle != STARPU_POISON_PTR);
+
 	struct recv_when_done_callback_arg *arg =
 		malloc(sizeof(struct recv_when_done_callback_arg));
 	
@@ -360,6 +362,12 @@ static void create_task_12_real(unsigned k, unsigned j)
 	struct starpu_task *task = create_task(TAG12(k, j));
 	
 	task->cl = &STARPU_PLU(cl12);
+//	task->cl = &STARPU_PLU(cl21);
+
+	int myrank;
+	MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+
+	STARPU_ASSERT(myrank == rank);
 
 	/* which sub-data is manipulated ? */
 	starpu_data_handle diag_block;
@@ -370,10 +378,15 @@ static void create_task_12_real(unsigned k, unsigned j)
 
 	task->buffers[0].handle = diag_block; 
 	task->buffers[0].mode = STARPU_R;
-//	task->buffers[1].handle = STARPU_PLU(get_block_handle)(j, k); 
 	task->buffers[1].handle = STARPU_PLU(get_block_handle)(k, j); 
 	task->buffers[1].mode = STARPU_RW;
 
+	STARPU_ASSERT(get_block_rank(k, j) == rank);
+	STARPU_ASSERT(STARPU_PLU(get_tmp_11_block_handle)() != STARPU_POISON_PTR);
+
+	STARPU_ASSERT(task->buffers[0].handle != STARPU_POISON_PTR);
+	STARPU_ASSERT(task->buffers[1].handle != STARPU_POISON_PTR);
+
 	struct callback_arg *arg = malloc(sizeof(struct callback_arg));
 		arg->j = j;
 		arg->k = k;
@@ -479,7 +492,6 @@ static void callback_task_21_real(void *_arg)
 	rank_mask[rank] = 0;
 
 	/* Send the block to those nodes */
-//	starpu_data_handle block_handle = STARPU_PLU(get_block_handle)(k, i);
 	starpu_data_handle block_handle = STARPU_PLU(get_block_handle)(i, k);
 	starpu_tag_t tag = TAG21_SAVE(k, i);
 	int mpi_tag = MPI_TAG21(k, i);
@@ -493,6 +505,7 @@ static void create_task_21_real(unsigned k, unsigned i)
 	struct starpu_task *task = create_task(TAG21(k, i));
 
 	task->cl = &STARPU_PLU(cl21);
+//	task->cl = &STARPU_PLU(cl12);
 	
 	/* which sub-data is manipulated ? */
 	starpu_data_handle diag_block;
@@ -503,10 +516,15 @@ static void create_task_21_real(unsigned k, unsigned i)
 
 	task->buffers[0].handle = diag_block; 
 	task->buffers[0].mode = STARPU_R;
-//	task->buffers[1].handle = STARPU_PLU(get_block_handle)(k, i);
 	task->buffers[1].handle = STARPU_PLU(get_block_handle)(i, k);
 	task->buffers[1].mode = STARPU_RW;
 
+	STARPU_ASSERT(STARPU_PLU(get_tmp_11_block_handle)() != STARPU_POISON_PTR);
+
+	STARPU_ASSERT(task->buffers[0].handle != STARPU_POISON_PTR);
+	STARPU_ASSERT(task->buffers[1].handle != STARPU_POISON_PTR);
+
+
 	struct callback_arg *arg = malloc(sizeof(struct callback_arg));
 		arg->i = i;
 		arg->k = k;
@@ -570,15 +588,11 @@ static void create_task_22_real(unsigned k, unsigned i, unsigned j)
 	starpu_data_handle block21;
 	if (get_block_rank(i, k) == rank)
 	{
-	//	block21 = STARPU_PLU(get_block_handle)(k, i);
 		block21 = STARPU_PLU(get_block_handle)(i, k);
 	}
 	else 
 		block21 = STARPU_PLU(get_tmp_21_block_handle)(i);
 
-	task->buffers[0].handle = block21;
-	task->buffers[0].mode = STARPU_R;
-
 	/* produced by TAG12_SAVE(k, j) */
 	starpu_data_handle block12;
 	if (get_block_rank(k, j) == rank)
@@ -589,14 +603,22 @@ static void create_task_22_real(unsigned k, unsigned i, unsigned j)
 	else 
 		block12 = STARPU_PLU(get_tmp_12_block_handle)(j);
 
+
+
+	task->buffers[0].handle = block21;
+	task->buffers[0].mode = STARPU_R;
+
 	task->buffers[1].handle = block12;
 	task->buffers[1].mode = STARPU_R;
 
 	/* produced by TAG22(k-1, i, j) */
-//	task->buffers[2].handle = STARPU_PLU(get_block_handle)(j, i);
 	task->buffers[2].handle = STARPU_PLU(get_block_handle)(i, j);
 	task->buffers[2].mode = STARPU_RW;
 
+	STARPU_ASSERT(task->buffers[0].handle != STARPU_POISON_PTR);
+	STARPU_ASSERT(task->buffers[1].handle != STARPU_POISON_PTR);
+	STARPU_ASSERT(task->buffers[2].handle != STARPU_POISON_PTR);
+
 	if (!no_prio &&  (i == k + 1) && (j == k +1) ) {
 		task->priority = MAX_PRIO;
 	}
@@ -638,6 +660,8 @@ static void wait_tag_and_fetch_handle(starpu_tag_t tag, starpu_data_handle handl
 
 static void wait_termination(void)
 {
+	starpu_wait_all_tasks();
+
 	unsigned k, i, j;
 	for (k = 0; k < nblocks; k++)
 	{
@@ -654,8 +678,9 @@ static void wait_termination(void)
 			/* Wait task 21ki is needed */
 			if (get_block_rank(i, k) == rank)
 			{
-				//starpu_data_handle block21 = STARPU_PLU(get_block_handle)(k, i);
 				starpu_data_handle block21 = STARPU_PLU(get_block_handle)(i, k);
+				//starpu_data_handle block21 = STARPU_PLU(get_block_handle)(k, i);
+				fprintf(stderr, "BLOCK21 i %d k %d -> handle %p\n", i, k, block21);
 				wait_tag_and_fetch_handle(TAG21_SAVE(k, i), block21);
 			}
 		}
@@ -667,6 +692,7 @@ static void wait_termination(void)
 			{
 				//starpu_data_handle block12 = STARPU_PLU(get_block_handle)(j, k);
 				starpu_data_handle block12 = STARPU_PLU(get_block_handle)(k, j);
+				fprintf(stderr, "BLOCK12 j %d k %d -> handle %p\n", j, k, block12);
 				wait_tag_and_fetch_handle(TAG12_SAVE(k, j), block12);
 			}
 		}

+ 7 - 2
mpi/examples/mpi_lu/pxlu.h

@@ -30,8 +30,13 @@
 
 double STARPU_PLU(plu_main)(unsigned nblocks, int rank, int world_size);
 
-starpu_data_handle STARPU_PLU(get_block_handle)(unsigned j, unsigned i);
-TYPE *STARPU_PLU(get_block)(unsigned j, unsigned i);
+TYPE *STARPU_PLU(reconstruct_matrix)(unsigned size, unsigned nblocks);
+void STARPU_PLU(compute_lu_matrix)(unsigned size, unsigned nblocks, TYPE *Asaved);
+
+void STARPU_PLU(compute_ax)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank);
+void STARPU_PLU(compute_lux)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank);
+starpu_data_handle STARPU_PLU(get_block_handle)(unsigned i, unsigned j);
+TYPE *STARPU_PLU(get_block)(unsigned i, unsigned j);
 starpu_data_handle STARPU_PLU(get_tmp_11_block_handle)(void);
 starpu_data_handle STARPU_PLU(get_tmp_12_block_handle)(unsigned j);
 starpu_data_handle STARPU_PLU(get_tmp_21_block_handle)(unsigned i);

+ 16 - 19
mpi/examples/mpi_lu/pxlu_kernels.c

@@ -129,19 +129,17 @@ static inline void STARPU_PLU(common_u12)(void *descr[],
 
 	int rank;
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-//	fprintf(stderr, "KERNEL 12 %d\n", rank);
+	fprintf(stderr, "KERNEL 12 %d\n", rank);
 
 #ifdef USE_CUDA
 	cublasStatus status;
 	cudaError_t cures;
 #endif
 
-//	fprintf(stderr, "INPUT 12 U11\n");
-//	STARPU_PLU(display_data_content)(sub11, nx12);
-//	fprintf(stderr, "INPUT 12 U12\n");
-//	STARPU_PLU(display_data_content)(sub12, nx12);
-
-
+	fprintf(stderr, "INPUT 12 U11\n");
+	STARPU_PLU(display_data_content)(sub11, nx12);
+	fprintf(stderr, "INPUT 12 U12\n");
+	STARPU_PLU(display_data_content)(sub12, nx12);
 
 	/* solve L11 U12 = A12 (find U12) */
 	switch (s) {
@@ -168,8 +166,8 @@ static inline void STARPU_PLU(common_u12)(void *descr[],
 			break;
 	}
 
-//	fprintf(stderr, "OUTPUT 12 U12\n");
-//	STARPU_PLU(display_data_content)(sub12, nx12);
+	fprintf(stderr, "OUTPUT 12 U12\n");
+	STARPU_PLU(display_data_content)(sub12, nx12);
 }
 
 static void STARPU_PLU(cpu_u12)(void *descr[], void *_args)
@@ -227,14 +225,12 @@ static inline void STARPU_PLU(common_u21)(void *descr[],
 	
 	int rank;
 	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-//	fprintf(stderr, "KERNEL 21 %d \n", rank);
-
-	//fprintf(stderr, "INPUT 21 U11\n");
-	//STARPU_PLU(display_data_content)(sub11, nx21);
-	//fprintf(stderr, "INPUT 21 U12\n");
-	//STARPU_PLU(display_data_content)(sub21, nx21);
-
+	fprintf(stderr, "KERNEL 21 %d \n", rank);
 
+	fprintf(stderr, "INPUT 21 U11\n");
+	STARPU_PLU(display_data_content)(sub11, nx21);
+	fprintf(stderr, "INPUT 21 U21\n");
+	STARPU_PLU(display_data_content)(sub21, nx21);
 
 #ifdef USE_CUDA
 	cublasStatus status;
@@ -265,9 +261,10 @@ static inline void STARPU_PLU(common_u21)(void *descr[],
 			break;
 	}
 
-//	fprintf(stderr, "INPUT 21 U21\n");
-//	STARPU_PLU(display_data_content)(sub21, nx21);
-
+	fprintf(stderr, "OUTPUT 21 U11\n");
+	STARPU_PLU(display_data_content)(sub11, nx21);
+	fprintf(stderr, "OUTPUT 21 U21\n");
+	STARPU_PLU(display_data_content)(sub21, nx21);
 
 }