|
@@ -20,6 +20,64 @@
|
|
|
#include <limits.h>
|
|
|
#include <math.h>
|
|
|
|
|
|
+/* This is from magma
|
|
|
+
|
|
|
+ -- Innovative Computing Laboratory
|
|
|
+ -- Electrical Engineering and Computer Science Department
|
|
|
+ -- University of Tennessee
|
|
|
+ -- (C) Copyright 2009
|
|
|
+
|
|
|
+ Redistribution and use in source and binary forms, with or without
|
|
|
+ modification, are permitted provided that the following conditions
|
|
|
+ are met:
|
|
|
+
|
|
|
+ * Redistributions of source code must retain the above copyright
|
|
|
+ notice, this list of conditions and the following disclaimer.
|
|
|
+ * Redistributions in binary form must reproduce the above copyright
|
|
|
+ notice, this list of conditions and the following disclaimer in the
|
|
|
+ documentation and/or other materials provided with the distribution.
|
|
|
+ * Neither the name of the University of Tennessee, Knoxville nor the
|
|
|
+ names of its contributors may be used to endorse or promote products
|
|
|
+ derived from this software without specific prior written permission.
|
|
|
+
|
|
|
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
|
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
|
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
|
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
|
+ HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
|
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
|
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
|
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
|
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
|
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
|
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
+
|
|
|
+ */
|
|
|
+
|
|
|
+#define FMULS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n) + 0.5) * (double)(__n) + (1. / 3.)))
|
|
|
+#define FADDS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n) ) * (double)(__n) - (1. / 6.)))
|
|
|
+
|
|
|
+#define FLOPS_SPOTRF(__n) ( FMULS_POTRF((__n)) + FADDS_POTRF((__n)) )
|
|
|
+
|
|
|
+#define FMULS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)+1.))
|
|
|
+#define FADDS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)-1.))
|
|
|
+
|
|
|
+#define FMULS_TRMM(__m, __n) ( /*( (__side) == PlasmaLeft ) ? FMULS_TRMM_2((__m), (__n)) :*/ FMULS_TRMM_2((__n), (__m)) )
|
|
|
+#define FADDS_TRMM(__m, __n) ( /*( (__side) == PlasmaLeft ) ? FADDS_TRMM_2((__m), (__n)) :*/ FADDS_TRMM_2((__n), (__m)) )
|
|
|
+
|
|
|
+#define FMULS_TRSM FMULS_TRMM
|
|
|
+#define FADDS_TRSM FMULS_TRMM
|
|
|
+
|
|
|
+#define FLOPS_STRSM(__m, __n) ( FMULS_TRSM((__m), (__n)) + FADDS_TRSM((__m), (__n)) )
|
|
|
+
|
|
|
+
|
|
|
+#define FMULS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
|
|
|
+#define FADDS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k))
|
|
|
+
|
|
|
+#define FLOPS_SGEMM(__m, __n, __k) ( FMULS_GEMM((__m), (__n), (__k)) + FADDS_GEMM((__m), (__n), (__k)) )
|
|
|
+
|
|
|
+/* End of magma code */
|
|
|
+
|
|
|
/*
|
|
|
* Create the codelets
|
|
|
*/
|
|
@@ -72,6 +130,7 @@ static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int node
|
|
|
{
|
|
|
unsigned k, m, n;
|
|
|
unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
|
|
|
+ unsigned nn = size/nblocks;
|
|
|
|
|
|
for (k = 0; k < nblocks; k++)
|
|
|
{
|
|
@@ -80,6 +139,7 @@ static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int node
|
|
|
starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
|
|
|
STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
|
|
|
STARPU_RW, data_handles[k][k],
|
|
|
+ STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
|
|
|
0);
|
|
|
|
|
|
for (m = k+1; m<nblocks; m++)
|
|
@@ -88,28 +148,30 @@ static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int node
|
|
|
STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
|
|
|
STARPU_R, data_handles[k][k],
|
|
|
STARPU_RW, data_handles[m][k],
|
|
|
+ STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
|
|
|
0);
|
|
|
|
|
|
starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
|
|
|
if (my_distrib(k, k, nodes) == rank)
|
|
|
starpu_data_wont_use(data_handles[k][k]);
|
|
|
+ }
|
|
|
|
|
|
- for (n = k+1; n<nblocks; n++)
|
|
|
+ for (n = k+1; n<nblocks; n++)
|
|
|
+ {
|
|
|
+ for (m = n; m<nblocks; m++)
|
|
|
{
|
|
|
- if (n <= m)
|
|
|
- {
|
|
|
- starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
|
|
|
- STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
|
|
|
- STARPU_R, data_handles[n][k],
|
|
|
- STARPU_R, data_handles[m][k],
|
|
|
- STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
|
|
|
- 0);
|
|
|
- }
|
|
|
+ starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
|
|
|
+ STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
|
|
|
+ STARPU_R, data_handles[n][k],
|
|
|
+ STARPU_R, data_handles[m][k],
|
|
|
+ STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
|
|
|
+ STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
|
|
|
+ 0);
|
|
|
}
|
|
|
|
|
|
- starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
|
|
|
- if (my_distrib(m, k, nodes) == rank)
|
|
|
- starpu_data_wont_use(data_handles[m][k]);
|
|
|
+ starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
|
|
|
+ if (my_distrib(n, k, nodes) == rank)
|
|
|
+ starpu_data_wont_use(data_handles[n][k]);
|
|
|
}
|
|
|
starpu_iteration_pop();
|
|
|
}
|
|
@@ -120,6 +182,7 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
|
|
|
{
|
|
|
unsigned k, m, n;
|
|
|
unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
|
|
|
+ unsigned nn = size/nblocks;
|
|
|
|
|
|
/* Column */
|
|
|
for (n = 0; n<nblocks; n++)
|
|
@@ -137,7 +200,15 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
|
|
|
STARPU_R, data_handles[n][k],
|
|
|
STARPU_R, data_handles[m][k],
|
|
|
STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
|
|
|
+ STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
|
|
|
0);
|
|
|
+
|
|
|
+ if (m == n)
|
|
|
+ {
|
|
|
+ /* Nobody else will need it */
|
|
|
+ starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
|
|
|
+ starpu_data_wont_use(data_handles[m][k]);
|
|
|
+ }
|
|
|
}
|
|
|
k = n;
|
|
|
if (m > n)
|
|
@@ -147,6 +218,7 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
|
|
|
STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
|
|
|
STARPU_R, data_handles[k][k],
|
|
|
STARPU_RW, data_handles[m][k],
|
|
|
+ STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
|
|
|
0);
|
|
|
}
|
|
|
else
|
|
@@ -155,26 +227,27 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
|
|
|
starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
|
|
|
STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
|
|
|
STARPU_RW, data_handles[k][k],
|
|
|
+ STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
|
|
|
0);
|
|
|
}
|
|
|
+
|
|
|
}
|
|
|
|
|
|
+ /* We won't need it any more */
|
|
|
+ starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
|
|
|
+ starpu_data_wont_use(data_handles[n][n]);
|
|
|
+
|
|
|
starpu_iteration_pop();
|
|
|
}
|
|
|
-
|
|
|
- /* Submit flushes, StarPU will fit them according to the progress */
|
|
|
- starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
|
|
|
- for (m = 0; m < nblocks; m++)
|
|
|
- for (n = 0; n < nblocks ; n++)
|
|
|
- starpu_data_wont_use(data_handles[m][n]);
|
|
|
}
|
|
|
|
|
|
/* TODO: generate from compiler polyhedral analysis of classical algorithm */
|
|
|
static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int rank, int nodes)
|
|
|
{
|
|
|
- unsigned a, c;
|
|
|
+ unsigned a;
|
|
|
unsigned k, m, n;
|
|
|
unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
|
|
|
+ unsigned nn = size/nblocks;
|
|
|
|
|
|
/* double-antidiagonal number:
|
|
|
* - a=0 contains (0,0) plus (1,0)
|
|
@@ -205,7 +278,15 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
|
|
|
STARPU_R, data_handles[n][k],
|
|
|
STARPU_R, data_handles[m][k],
|
|
|
STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
|
|
|
+ STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
|
|
|
0);
|
|
|
+
|
|
|
+ if (m == nblocks-1)
|
|
|
+ {
|
|
|
+ /* Nobody else will need it */
|
|
|
+ starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
|
|
|
+ starpu_data_wont_use(data_handles[n][k]);
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
/* k = n */
|
|
@@ -216,6 +297,7 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
|
|
|
STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
|
|
|
STARPU_R, data_handles[k][k],
|
|
|
STARPU_RW, data_handles[m][k],
|
|
|
+ STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
|
|
|
0);
|
|
|
}
|
|
|
else
|
|
@@ -224,8 +306,16 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
|
|
|
starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
|
|
|
STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
|
|
|
STARPU_RW, data_handles[k][k],
|
|
|
+ STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
|
|
|
0);
|
|
|
}
|
|
|
+
|
|
|
+ if (m == nblocks - 1)
|
|
|
+ {
|
|
|
+ /* We do not need the potrf result any more */
|
|
|
+ starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
|
|
|
+ starpu_data_wont_use(data_handles[n][n]);
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
/* column within second antidiagonal for a */
|
|
@@ -246,7 +336,15 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
|
|
|
STARPU_R, data_handles[n][k],
|
|
|
STARPU_R, data_handles[m][k],
|
|
|
STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
|
|
|
+ STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
|
|
|
0);
|
|
|
+
|
|
|
+ if (m == nblocks-1)
|
|
|
+ {
|
|
|
+ /* Nobody else will need it */
|
|
|
+ starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
|
|
|
+ starpu_data_wont_use(data_handles[n][k]);
|
|
|
+ }
|
|
|
}
|
|
|
/* non-diagonal block, solve */
|
|
|
k = n;
|
|
@@ -254,17 +352,19 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
|
|
|
STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
|
|
|
STARPU_R, data_handles[k][k],
|
|
|
STARPU_RW, data_handles[m][k],
|
|
|
+ STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
|
|
|
0);
|
|
|
+
|
|
|
+ if (m == nblocks - 1)
|
|
|
+ {
|
|
|
+ /* We do not need the potrf result any more */
|
|
|
+ starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
|
|
|
+ starpu_data_wont_use(data_handles[n][n]);
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
starpu_iteration_pop();
|
|
|
}
|
|
|
-
|
|
|
- /* Submit flushes, StarPU will fit them according to the progress */
|
|
|
- starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
|
|
|
- for (m = 0; m < nblocks; m++)
|
|
|
- for (n = 0; n < nblocks ; n++)
|
|
|
- starpu_data_wont_use(data_handles[m][n]);
|
|
|
}
|
|
|
|
|
|
/* TODO: generate from compiler polyhedral analysis of classical algorithm */
|
|
@@ -273,9 +373,10 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
|
|
|
unsigned a;
|
|
|
int k, m, n;
|
|
|
unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
|
|
|
+ unsigned nn = size/nblocks;
|
|
|
|
|
|
/*
|
|
|
- * This is basically similar to above, except that we shift k according to the priorities set in the algorithm, so that prio ~ 2*a or 2*a+1
|
|
|
+ * This is basically similar to above, except that we shift k according to the priorities set in the algorithm, so that gemm prio ~= 2*nblocks - a
|
|
|
* double-antidiagonal number:
|
|
|
* - a=0 contains (0,0) plus (1,0)
|
|
|
* - a=1 contains (2,0), (1,1) plus (3,0), (2, 1)
|
|
@@ -285,41 +386,47 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
|
|
|
{
|
|
|
starpu_iteration_push(a);
|
|
|
|
|
|
- for (k = 0; k < nblocks; k++)
|
|
|
+ for (k = 0; k < (int) nblocks; k++)
|
|
|
{
|
|
|
n = k;
|
|
|
/* Should be m = a-k-n; for potrf and trsm to respect
|
|
|
priorities, but needs to be this for dependencies */
|
|
|
m = a-2*k-n;
|
|
|
|
|
|
- if (m < 0 || m >= nblocks)
|
|
|
- continue;
|
|
|
-
|
|
|
if (m == n)
|
|
|
{
|
|
|
/* diagonal block, factorize */
|
|
|
starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
|
|
|
STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
|
|
|
STARPU_RW, data_handles[k][k],
|
|
|
+ STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
|
|
|
0);
|
|
|
}
|
|
|
- else
|
|
|
+ else if (m >= n && m < (int) nblocks)
|
|
|
{
|
|
|
/* non-diagonal block, solve */
|
|
|
starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
|
|
|
STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
|
|
|
STARPU_R, data_handles[k][k],
|
|
|
STARPU_RW, data_handles[m][k],
|
|
|
+ STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
|
|
|
0);
|
|
|
}
|
|
|
|
|
|
+ if (m == (int) nblocks - 1)
|
|
|
+ {
|
|
|
+ /* We do not need the potrf result any more */
|
|
|
+ starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][n]);
|
|
|
+ starpu_data_wont_use(data_handles[n][n]);
|
|
|
+ }
|
|
|
+
|
|
|
/* column within antidiagonal for a */
|
|
|
- for (n = k + 1; n < nblocks; n++)
|
|
|
+ for (n = k + 1; n < (int) nblocks; n++)
|
|
|
{
|
|
|
/* row */
|
|
|
m = a-2*k-n;
|
|
|
|
|
|
- if (m >= n && m < nblocks)
|
|
|
+ if (m >= n && m < (int) nblocks)
|
|
|
{
|
|
|
/* Update */
|
|
|
starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
|
|
@@ -327,7 +434,14 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
|
|
|
STARPU_R, data_handles[n][k],
|
|
|
STARPU_R, data_handles[m][k],
|
|
|
STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
|
|
|
+ STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
|
|
|
0);
|
|
|
+ if (m == (int) nblocks - 1)
|
|
|
+ {
|
|
|
+ /* Nobody else will need it */
|
|
|
+ starpu_data_wont_use(data_handles[n][k]);
|
|
|
+ starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[n][k]);
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -335,12 +449,6 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
|
|
|
|
|
|
starpu_iteration_pop();
|
|
|
}
|
|
|
-
|
|
|
- /* Submit flushes, StarPU will fit them according to the progress */
|
|
|
- starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
|
|
|
- for (m = 0; m < nblocks; m++)
|
|
|
- for (n = 0; n < nblocks ; n++)
|
|
|
- starpu_data_wont_use(data_handles[m][n]);
|
|
|
}
|
|
|
|
|
|
/*
|
|
@@ -423,7 +531,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
|
|
|
if (rank == 0)
|
|
|
{
|
|
|
*timing = end - start;
|
|
|
- *flops = (1.0f*size*size*size)/3.0f;
|
|
|
+ *flops = FLOPS_SPOTRF(size);
|
|
|
}
|
|
|
}
|
|
|
|