Просмотр исходного кода

mpi-cholesky: Add submission order variants: by columns or by antidiagonals

Samuel Thibault лет назад: 5
Родитель
Сommit
b6940c9325

+ 164 - 1
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -115,6 +115,164 @@ static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int node
 	}
 }
 
+/* TODO: generated from compiler polyhedral analysis of classical algorithm */
+static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, int nodes)
+{
+	unsigned k, m, n;
+	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+
+	/* Column */
+	for (n = 0; n<nblocks; n++)
+	{
+		starpu_iteration_push(n);
+
+		/* Row */
+		for (m = n; m<nblocks; m++)
+		{
+			for (k = 0; k < n; k++)
+			{
+				/* Accumulate updates from TRSMs */
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[n][k],
+						       STARPU_R, data_handles[m][k],
+						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+						       0);
+			}
+			k = n;
+			if (m > n)
+			{
+				/* non-diagonal block, solve */
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[k][k],
+						       STARPU_RW, data_handles[m][k],
+						       0);
+			}
+			else
+			{
+				/* diagonal block, factorize */
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
+						       STARPU_RW, data_handles[k][k],
+						       0);
+			}
+		}
+
+		starpu_iteration_pop();
+	}
+
+	/* Submit flushes, StarPU will fit them according to the progress */
+	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
+	for (m = 0; m < nblocks; m++)
+		for (n = 0; n < nblocks ; n++)
+			starpu_data_wont_use(data_handles[m][n]);
+}
+
+/* TODO: generated from compiler polyhedral analysis of classical algorithm */
+static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int rank, int nodes)
+{
+	unsigned a, b, c;
+	unsigned k, m, n;
+	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+
+	/* double-antidiagonal number:
+	 * - a=0 contains (0,0) plus (1,0)
+	 * - a=1 contains (2,0), (1,1) plus (3,0), (2, 1)
+	 * - etc.
+	 */
+	for (a = 0; a < nblocks; a++)
+	{
+		starpu_iteration_push(a);
+
+		unsigned bfirst;
+		if (2*a < nblocks)
+			bfirst = 0;
+		else
+			bfirst = 2*a - (nblocks-1);
+
+		/* column within first antidiagonal for a */
+		for (b = bfirst; b <= a; b++)
+		{
+			/* column */
+			n = b;
+			/* row */
+			m = 2*a-b;
+
+			/* Accumulate updates from TRSMs */
+			for (c = 0; c < n; c++)
+			{
+				k = c;
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[n][k],
+						       STARPU_R, data_handles[m][k],
+						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+						       0);
+			}
+
+			if (b < a) {
+				/* non-diagonal block, solve */
+				k = n;
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[k][k],
+						       STARPU_RW, data_handles[m][k],
+						       0);
+			}
+			else
+			{
+				/* diagonal block, factorize */
+				k = a;
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
+						       STARPU_RW, data_handles[k][k],
+						       0);
+			}
+		}
+
+		/* column within second antidiagonal for a */
+		for (b = bfirst; b <= a; b++)
+		{
+			/* column */
+			n = b;
+			/* row */
+			m = 2*a-b + 1;
+
+			if (m >= nblocks)
+				/* Skip first item when even number of tiles */
+				continue;
+
+			/* Accumulate updates from TRSMs */
+			for (c = 0; c < n; c++)
+			{
+				k = c;
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[n][k],
+						       STARPU_R, data_handles[m][k],
+						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+						       0);
+			}
+			/* non-diagonal block, solve */
+			k = n;
+			starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
+					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+					       STARPU_R, data_handles[k][k],
+					       STARPU_RW, data_handles[m][k],
+					       0);
+		}
+
+		starpu_iteration_pop();
+	}
+
+	/* Submit flushes, StarPU will fit them according to the progress */
+	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
+	for (m = 0; m < nblocks; m++)
+		for (n = 0; n < nblocks ; n++)
+			starpu_data_wont_use(data_handles[m][n]);
+}
+
 /*
  *	code to bootstrap the factorization
  *	and construct the DAG
@@ -164,7 +322,12 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 	start = starpu_timing_now();
 
-	run_cholesky(data_handles, rank, nodes);
+	switch (submission) {
+		case TRIANGLES:		run_cholesky(data_handles, rank, nodes); break;
+		case COLUMNS:		run_cholesky_column(data_handles, rank, nodes); break;
+		case ANTIDIAGONALS:	run_cholesky_antidiagonal(data_handles, rank, nodes); break;
+		default: STARPU_ABORT();
+	}
 
 	starpu_mpi_wait_for_all(MPI_COMM_WORLD);
 	starpu_mpi_barrier(MPI_COMM_WORLD);

+ 12 - 1
mpi/examples/matrix_decomposition/mpi_decomposition_params.c

@@ -43,6 +43,7 @@ unsigned check = 0;
 unsigned display = 0;
 int dblockx = -1;
 int dblocky = -1;
+enum submission submission = TRIANGLES;
 
 void parse_args(int argc, char **argv, int nodes)
 {
@@ -79,6 +80,16 @@ void parse_args(int argc, char **argv, int nodes)
                         nbigblocks = strtol(argv[++i], &argptr, 10);
                 }
 
+                if (strcmp(argv[i], "-columns") == 0)
+                {
+                        submission = COLUMNS;
+                }
+
+                if (strcmp(argv[i], "-antidiagonals") == 0)
+                {
+                        submission = ANTIDIAGONALS;
+                }
+
                 if (strcmp(argv[i], "-no-prio") == 0)
                 {
                         noprio = 1;
@@ -96,7 +107,7 @@ void parse_args(int argc, char **argv, int nodes)
 
                 if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
                 {
-			printf("usage : %s [-size size] [-nblocks nblocks] [-no-prio] [-display] [-check]\n", argv[0]);
+                        printf("usage : %s [-size size] [-nblocks nblocks] [-columns] [-antidiagonals] [-no-prio] [-display] [-check]\n", argv[0]);
                 }
         }
 

+ 7 - 0
mpi/examples/matrix_decomposition/mpi_decomposition_params.h

@@ -28,6 +28,13 @@ extern unsigned display;
 extern int dblockx;
 extern int dblocky;
 
+enum submission {
+	TRIANGLES,
+	COLUMNS,
+	ANTIDIAGONALS,
+};
+extern enum submission submission;
+
 void parse_args(int argc, char **argv, int nodes);
 
 #endif // __MPI_CHOLESKY_PARAMS_H__