|
@@ -198,7 +198,7 @@ static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int node
|
|
/* TODO: generate from compiler polyhedral analysis of classical algorithm */
|
|
/* TODO: generate from compiler polyhedral analysis of classical algorithm */
|
|
static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, int nodes)
|
|
static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, int nodes)
|
|
{
|
|
{
|
|
- unsigned k, m, n;
|
|
|
|
|
|
+ unsigned k, m, n, i;
|
|
unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
|
|
unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
|
|
unsigned nn = size/nblocks;
|
|
unsigned nn = size/nblocks;
|
|
|
|
|
|
@@ -222,6 +222,13 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
|
|
STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
|
|
STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
|
|
STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
|
|
STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
|
|
0);
|
|
0);
|
|
|
|
+
|
|
|
|
+ if (m == n)
|
|
|
|
+ {
|
|
|
|
+ /* Nobody else will need it */
|
|
|
|
+ starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
|
|
|
|
+ starpu_data_wont_use(data_handles[m][k]);
|
|
|
|
+ }
|
|
}
|
|
}
|
|
k = n;
|
|
k = n;
|
|
if (m > n)
|
|
if (m > n)
|
|
@@ -243,27 +250,26 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
|
|
STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
|
|
STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
|
|
0);
|
|
0);
|
|
}
|
|
}
|
|
|
|
+
|
|
}
|
|
}
|
|
|
|
+
|
|
|
|
+ /* We won't need it any more */
|
|
|
|
+ starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
|
|
|
|
+ starpu_data_wont_use(data_handles[n][n]);
|
|
|
|
+
|
|
if (n%checkpoint_period==checkpoint_period-1)
|
|
if (n%checkpoint_period==checkpoint_period-1)
|
|
starpu_mpi_submit_checkpoint_template(*checkpoint_p, (int)(nblocks - 2*n));
|
|
starpu_mpi_submit_checkpoint_template(*checkpoint_p, (int)(nblocks - 2*n));
|
|
|
|
+
|
|
starpu_iteration_pop();
|
|
starpu_iteration_pop();
|
|
}
|
|
}
|
|
|
|
|
|
- /* Submit flushes, StarPU will fit them according to the progress */
|
|
|
|
- starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
|
|
|
|
- for (m = 0; m < nblocks; m++)
|
|
|
|
- for (n = 0; n < nblocks ; n++)
|
|
|
|
- {
|
|
|
|
- if (starpu_data_get_home_node(data_handles[m][n])>=0)
|
|
|
|
- starpu_data_wont_use(data_handles[m][n]);
|
|
|
|
- }
|
|
|
|
}
|
|
}
|
|
|
|
|
|
/* TODO: generate from compiler polyhedral analysis of classical algorithm */
|
|
/* TODO: generate from compiler polyhedral analysis of classical algorithm */
|
|
static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int rank, int nodes)
|
|
static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int rank, int nodes)
|
|
{
|
|
{
|
|
unsigned a, c;
|
|
unsigned a, c;
|
|
- unsigned k, m, n;
|
|
|
|
|
|
+ unsigned k, m, n, i;
|
|
unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
|
|
unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
|
|
unsigned nn = size/nblocks;
|
|
unsigned nn = size/nblocks;
|
|
|
|
|
|
@@ -301,6 +307,13 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
|
|
STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
|
|
STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
|
|
STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
|
|
STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
|
|
0);
|
|
0);
|
|
|
|
+
|
|
|
|
+ if (m == nblocks-1)
|
|
|
|
+ {
|
|
|
|
+ /* Nobody else will need it */
|
|
|
|
+ starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
|
|
|
|
+ starpu_data_wont_use(data_handles[n][k]);
|
|
|
|
+ }
|
|
}
|
|
}
|
|
|
|
|
|
/* k = n */
|
|
/* k = n */
|
|
@@ -323,6 +336,13 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
|
|
STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
|
|
STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
|
|
0);
|
|
0);
|
|
}
|
|
}
|
|
|
|
+
|
|
|
|
+ if (m == nblocks - 1)
|
|
|
|
+ {
|
|
|
|
+ /* We do not need the potrf result any more */
|
|
|
|
+ starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
|
|
|
|
+ starpu_data_wont_use(data_handles[n][n]);
|
|
|
|
+ }
|
|
}
|
|
}
|
|
|
|
|
|
/* column within second antidiagonal for a */
|
|
/* column within second antidiagonal for a */
|
|
@@ -345,6 +365,13 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
|
|
STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
|
|
STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
|
|
STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
|
|
STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
|
|
0);
|
|
0);
|
|
|
|
+
|
|
|
|
+ if (m == nblocks-1)
|
|
|
|
+ {
|
|
|
|
+ /* Nobody else will need it */
|
|
|
|
+ starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
|
|
|
|
+ starpu_data_wont_use(data_handles[n][k]);
|
|
|
|
+ }
|
|
}
|
|
}
|
|
/* non-diagonal block, solve */
|
|
/* non-diagonal block, solve */
|
|
k = n;
|
|
k = n;
|
|
@@ -354,21 +381,19 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
|
|
STARPU_RW, data_handles[m][k],
|
|
STARPU_RW, data_handles[m][k],
|
|
STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
|
|
STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
|
|
0);
|
|
0);
|
|
|
|
+
|
|
|
|
+ if (m == nblocks - 1)
|
|
|
|
+ {
|
|
|
|
+ /* We do not need the potrf result any more */
|
|
|
|
+ starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
|
|
|
|
+ starpu_data_wont_use(data_handles[n][n]);
|
|
|
|
+ }
|
|
}
|
|
}
|
|
|
|
|
|
if (a%checkpoint_period==checkpoint_period-1)
|
|
if (a%checkpoint_period==checkpoint_period-1)
|
|
starpu_mpi_submit_checkpoint_template(*checkpoint_p, (int)(2*nblocks -4*a));
|
|
starpu_mpi_submit_checkpoint_template(*checkpoint_p, (int)(2*nblocks -4*a));
|
|
starpu_iteration_pop();
|
|
starpu_iteration_pop();
|
|
}
|
|
}
|
|
-
|
|
|
|
- /* Submit flushes, StarPU will fit them according to the progress */
|
|
|
|
- starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
|
|
|
|
- for (m = 0; m < nblocks; m++)
|
|
|
|
- for (n = 0; n < nblocks ; n++)
|
|
|
|
- {
|
|
|
|
- if (starpu_data_get_home_node(data_handles[m][n])>=0)
|
|
|
|
- starpu_data_wont_use(data_handles[m][n]);
|
|
|
|
- }
|
|
|
|
}
|
|
}
|
|
|
|
|
|
/* TODO: generate from compiler polyhedral analysis of classical algorithm */
|
|
/* TODO: generate from compiler polyhedral analysis of classical algorithm */
|
|
@@ -380,7 +405,7 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
|
|
unsigned nn = size/nblocks;
|
|
unsigned nn = size/nblocks;
|
|
|
|
|
|
/*
|
|
/*
|
|
- * This is basically similar to above, except that we shift k according to the priorities set in the algorithm, so that prio ~ 2*a or 2*a+1
|
|
|
|
|
|
+ * This is basically similar to above, except that we shift k according to the priorities set in the algorithm, so that gemm prio ~= 2*nblocks - a
|
|
* double-antidiagonal number:
|
|
* double-antidiagonal number:
|
|
* - a=0 contains (0,0) plus (1,0)
|
|
* - a=0 contains (0,0) plus (1,0)
|
|
* - a=1 contains (2,0), (1,1) plus (3,0), (2, 1)
|
|
* - a=1 contains (2,0), (1,1) plus (3,0), (2, 1)
|
|
@@ -394,16 +419,13 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
|
|
{
|
|
{
|
|
starpu_iteration_push(a);
|
|
starpu_iteration_push(a);
|
|
|
|
|
|
- for (k = 0; k < nblocks; k++)
|
|
|
|
|
|
+ for (k = 0; k < (int) nblocks; k++)
|
|
{
|
|
{
|
|
n = k;
|
|
n = k;
|
|
/* Should be m = a-k-n; for potrf and trsm to respect
|
|
/* Should be m = a-k-n; for potrf and trsm to respect
|
|
priorities, but needs to be this for dependencies */
|
|
priorities, but needs to be this for dependencies */
|
|
m = a-2*k-n;
|
|
m = a-2*k-n;
|
|
|
|
|
|
- if (m < 0 || m >= nblocks)
|
|
|
|
- continue;
|
|
|
|
-
|
|
|
|
if (m == n)
|
|
if (m == n)
|
|
{
|
|
{
|
|
/* diagonal block, factorize */
|
|
/* diagonal block, factorize */
|
|
@@ -413,7 +435,7 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
|
|
STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
|
|
STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
|
|
0);
|
|
0);
|
|
}
|
|
}
|
|
- else
|
|
|
|
|
|
+ else if (m >= n && m < (int) nblocks)
|
|
{
|
|
{
|
|
/* non-diagonal block, solve */
|
|
/* non-diagonal block, solve */
|
|
starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
|
|
starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
|
|
@@ -424,13 +446,20 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
|
|
0);
|
|
0);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+ if (m == (int) nblocks - 1)
|
|
|
|
+ {
|
|
|
|
+ /* We do not need the potrf result any more */
|
|
|
|
+ starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
|
|
|
|
+ starpu_data_wont_use(data_handles[n][n]);
|
|
|
|
+ }
|
|
|
|
+
|
|
/* column within antidiagonal for a */
|
|
/* column within antidiagonal for a */
|
|
- for (n = k + 1; n < nblocks; n++)
|
|
|
|
|
|
+ for (n = k + 1; n < (int) nblocks; n++)
|
|
{
|
|
{
|
|
/* row */
|
|
/* row */
|
|
m = a-2*k-n;
|
|
m = a-2*k-n;
|
|
|
|
|
|
- if (m >= n && m < nblocks)
|
|
|
|
|
|
+ if (m >= n && m < (int) nblocks)
|
|
{
|
|
{
|
|
/* Update */
|
|
/* Update */
|
|
starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
|
|
starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
|
|
@@ -440,6 +469,12 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
|
|
STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
|
|
STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
|
|
STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
|
|
STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
|
|
0);
|
|
0);
|
|
|
|
+ if (m == (int) nblocks - 1)
|
|
|
|
+ {
|
|
|
|
+ /* Nobody else will need it */
|
|
|
|
+ starpu_data_wont_use(data_handles[n][k]);
|
|
|
|
+ starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
|
|
|
|
+ }
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
@@ -449,15 +484,6 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
|
|
starpu_mpi_submit_checkpoint_template(*checkpoint_p, (int)(2*nblocks - a));
|
|
starpu_mpi_submit_checkpoint_template(*checkpoint_p, (int)(2*nblocks - a));
|
|
starpu_iteration_pop();
|
|
starpu_iteration_pop();
|
|
}
|
|
}
|
|
-
|
|
|
|
- /* Submit flushes, StarPU will fit them according to the progress */
|
|
|
|
- starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
|
|
|
|
- for (m = 0; m < nblocks; m++)
|
|
|
|
- for (n = 0; n < nblocks ; n++)
|
|
|
|
- {
|
|
|
|
- if (starpu_data_get_home_node(data_handles[m][n])>=0)
|
|
|
|
- starpu_data_wont_use(data_handles[m][n]);
|
|
|
|
- }
|
|
|
|
}
|
|
}
|
|
|
|
|
|
/*
|
|
/*
|
|
@@ -546,7 +572,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
|
|
if (rank == 0)
|
|
if (rank == 0)
|
|
{
|
|
{
|
|
*timing = end - start;
|
|
*timing = end - start;
|
|
- *flops = (1.0f*size*size*size)/3.0f;
|
|
|
|
|
|
+ *flops = FLOPS_SPOTRF(size);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|