Browse Source

Use an array of temporary buffers for "task 11" instead of a single buffer.

Cédric Augonnet 15 years ago
parent
commit
f838d1aae0
3 changed files with 57 additions and 4 deletions
  1. + 29 - 1
      mpi/examples/mpi_lu/plu_example.c
  2. + 22 - 3
      mpi/examples/mpi_lu/pxlu.c
  3. + 6 - 0
      mpi/examples/mpi_lu/pxlu.h

+ 29 - 1
mpi/examples/mpi_lu/plu_example.c

@@ -36,8 +36,13 @@ static TYPE **dataA;
 
 /* In order to implement the distributed LU decomposition, we allocate
  * temporary buffers */
+#ifdef SINGLE_TMP11
 static starpu_data_handle tmp_11_block_handle;
 static TYPE *tmp_11_block;
+#else
+static starpu_data_handle *tmp_11_block_handles;
+static TYPE **tmp_11_block;
+#endif
 static starpu_data_handle *tmp_12_block_handles;
 static TYPE **tmp_12_block;
 static starpu_data_handle *tmp_21_block_handles;
@@ -94,10 +99,17 @@ static void fill_block_with_random(TYPE *blockptr, unsigned size, unsigned nbloc
 	}
 }
 
+#ifdef SINGLE_TMP11
 starpu_data_handle STARPU_PLU(get_tmp_11_block_handle)(void)
 {
 	return tmp_11_block_handle;
 }
+#else
+starpu_data_handle STARPU_PLU(get_tmp_11_block_handle)(unsigned k)
+{
+	return tmp_11_block_handles[k];
+}
+#endif
 
 starpu_data_handle STARPU_PLU(get_tmp_12_block_handle)(unsigned j)
 {
@@ -159,10 +171,27 @@ static void init_matrix(int rank)
 
 	/* Allocate the temporary buffers required for the distributed algorithm */
 
+	unsigned k;
+
 	/* tmp buffer 11 */
+#ifdef SINGLE_TMP11
 	starpu_malloc_pinned_if_possible((void **)&tmp_11_block, blocksize);
 	starpu_register_blas_data(&tmp_11_block_handle, 0, (uintptr_t)tmp_11_block,
 			size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+#else
+	tmp_11_block_handles = calloc(nblocks, sizeof(starpu_data_handle));
+	tmp_11_block = calloc(nblocks, sizeof(TYPE *));
+
+	for (k = 0; k < nblocks; k++)
+	{
+		starpu_malloc_pinned_if_possible((void **)&tmp_11_block[k], blocksize);
+		STARPU_ASSERT(tmp_11_block[k]);
+
+		starpu_register_blas_data(&tmp_11_block_handles[k], 0,
+			(uintptr_t)tmp_11_block[k],
+			size/nblocks, size/nblocks, size/nblocks, sizeof(TYPE));
+	}
+#endif
 
 	/* tmp buffers 12 and 21 */
 	tmp_12_block_handles = calloc(nblocks, sizeof(starpu_data_handle));
@@ -170,7 +199,6 @@ static void init_matrix(int rank)
 	tmp_12_block = calloc(nblocks, sizeof(TYPE *));
 	tmp_21_block = calloc(nblocks, sizeof(TYPE *));
 	
-	unsigned k;
 	for (k = 0; k < nblocks; k++)
 	{
 		starpu_malloc_pinned_if_possible((void **)&tmp_12_block[k], blocksize);

+ 22 - 3
mpi/examples/mpi_lu/pxlu.c

@@ -187,6 +187,7 @@ static void create_task_11_recv(unsigned k)
 	unsigned ndeps = 0;
 	starpu_tag_t tag_array[2*nblocks];
 	
+#ifdef SINGLE_TMP11
 	if (k > 0)
 	for (i = (k-1)+1; i < nblocks; i++)
 	{
@@ -200,9 +201,14 @@ static void create_task_11_recv(unsigned k)
 		if (rank == get_block_rank(k-1, j))
 			tag_array[ndeps++] = TAG12(k-1, j);
 	}
+#endif
 	
 	int source = get_block_rank(k, k);
+#ifdef SINGLE_TMP11
 	starpu_data_handle block_handle = STARPU_PLU(get_tmp_11_block_handle)();
+#else
+	starpu_data_handle block_handle = STARPU_PLU(get_tmp_11_block_handle)(k);
+#endif
 	int mpi_tag = MPI_TAG11(k);
 	starpu_tag_t partial_tag = TAG11_SAVE_PARTIAL(k);
 	starpu_tag_t unlocked_tag = TAG11_SAVE(k);
@@ -400,9 +406,17 @@ static void create_task_12_real(unsigned k, unsigned j)
 	/* which sub-data is manipulated ? */
 	starpu_data_handle diag_block;
 	if (get_block_rank(k, k) == rank)
+	{
 		diag_block = STARPU_PLU(get_block_handle)(k, k);
+	}
 	else 
+	{
+#ifdef SINGLE_TMP11
 		diag_block = STARPU_PLU(get_tmp_11_block_handle)();
+#else
+		diag_block = STARPU_PLU(get_tmp_11_block_handle)(k);
+#endif
+	}
 
 	task->buffers[0].handle = diag_block; 
 	task->buffers[0].mode = STARPU_R;
@@ -410,7 +424,6 @@ static void create_task_12_real(unsigned k, unsigned j)
 	task->buffers[1].mode = STARPU_RW;
 
 	STARPU_ASSERT(get_block_rank(k, j) == rank);
-	STARPU_ASSERT(STARPU_PLU(get_tmp_11_block_handle)() != STARPU_POISON_PTR);
 
 	STARPU_ASSERT(task->buffers[0].handle != STARPU_POISON_PTR);
 	STARPU_ASSERT(task->buffers[1].handle != STARPU_POISON_PTR);
@@ -547,17 +560,23 @@ static void create_task_21_real(unsigned k, unsigned i)
 	/* which sub-data is manipulated ? */
 	starpu_data_handle diag_block;
 	if (get_block_rank(k, k) == rank)
+	{
 		diag_block = STARPU_PLU(get_block_handle)(k, k);
+	}
 	else 
+	{
+#ifdef SINGLE_TMP11
 		diag_block = STARPU_PLU(get_tmp_11_block_handle)();
+#else
+		diag_block = STARPU_PLU(get_tmp_11_block_handle)(k);
+#endif
+	}
 
 	task->buffers[0].handle = diag_block; 
 	task->buffers[0].mode = STARPU_R;
 	task->buffers[1].handle = STARPU_PLU(get_block_handle)(i, k);
 	task->buffers[1].mode = STARPU_RW;
 
-	STARPU_ASSERT(STARPU_PLU(get_tmp_11_block_handle)() != STARPU_POISON_PTR);
-
 	STARPU_ASSERT(task->buffers[0].handle != STARPU_POISON_PTR);
 	STARPU_ASSERT(task->buffers[1].handle != STARPU_POISON_PTR);
 

+ 6 - 0
mpi/examples/mpi_lu/pxlu.h

@@ -28,6 +28,8 @@
 #define BLAS3_FLOP(n1,n2,n3)    \
         (2*((uint64_t)n1)*((uint64_t)n2)*((uint64_t)n3))
 
+//#define SINGLE_TMP11	1
+
 struct debug_info {
 	unsigned i;
 	unsigned j;
@@ -45,7 +47,11 @@ void STARPU_PLU(compute_ax)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, i
 void STARPU_PLU(compute_lux)(unsigned size, TYPE *x, TYPE *y, unsigned nblocks, int rank);
 starpu_data_handle STARPU_PLU(get_block_handle)(unsigned i, unsigned j);
 TYPE *STARPU_PLU(get_block)(unsigned i, unsigned j);
+#ifdef SINGLE_TMP11
 starpu_data_handle STARPU_PLU(get_tmp_11_block_handle)(void);
+#else
+starpu_data_handle STARPU_PLU(get_tmp_11_block_handle)(unsigned k);
+#endif
 starpu_data_handle STARPU_PLU(get_tmp_12_block_handle)(unsigned j);
 starpu_data_handle STARPU_PLU(get_tmp_21_block_handle)(unsigned i);