|
@@ -20,6 +20,64 @@
|
|
|
#include <limits.h>
|
|
|
#include <math.h>
|
|
|
|
|
|
+/* This is from magma
|
|
|
+
|
|
|
+ -- Innovative Computing Laboratory
|
|
|
+ -- Electrical Engineering and Computer Science Department
|
|
|
+ -- University of Tennessee
|
|
|
+ -- (C) Copyright 2009
|
|
|
+
|
|
|
+ Redistribution and use in source and binary forms, with or without
|
|
|
+ modification, are permitted provided that the following conditions
|
|
|
+ are met:
|
|
|
+
|
|
|
+ * Redistributions of source code must retain the above copyright
|
|
|
+ notice, this list of conditions and the following disclaimer.
|
|
|
+ * Redistributions in binary form must reproduce the above copyright
|
|
|
+ notice, this list of conditions and the following disclaimer in the
|
|
|
+ documentation and/or other materials provided with the distribution.
|
|
|
+ * Neither the name of the University of Tennessee, Knoxville nor the
|
|
|
+ names of its contributors may be used to endorse or promote products
|
|
|
+ derived from this software without specific prior written permission.
|
|
|
+
|
|
|
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
|
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
|
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
|
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
|
+ HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
|
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
|
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
|
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
|
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
|
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
|
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
+
|
|
|
+ */
|
|
|
+
|
|
|
+#define FMULS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n) + 0.5) * (double)(__n) + (1. / 3.))) /* multiplications: n^3/6 + n^2/2 + n/3 */
|
|
|
+#define FADDS_POTRF(__n) ((double)(__n) * (((1. / 6.) * (double)(__n) ) * (double)(__n) - (1. / 6.))) /* additions: n^3/6 - n/6 */
|
|
|
+
|
|
|
+#define FLOPS_SPOTRF(__n) ( FMULS_POTRF((__n)) + FADDS_POTRF((__n)) ) /* total flop count: multiplications plus additions */
|
|
|
+
|
|
|
+#define FMULS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)+1.)) /* multiplications: n*m*(m+1)/2 */
|
|
|
+#define FADDS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)-1.)) /* additions: n*m*(m-1)/2 */
|
|
|
+
|
|
|
+#define FMULS_TRMM(__m, __n) ( /*( (__side) == PlasmaLeft ) ? FMULS_TRMM_2((__m), (__n)) :*/ FMULS_TRMM_2((__n), (__m)) ) /* the side test is commented out: always evaluates FMULS_TRMM_2 with the arguments swapped, i.e. (__n, __m) */
|
|
|
+#define FADDS_TRMM(__m, __n) ( /*( (__side) == PlasmaLeft ) ? FADDS_TRMM_2((__m), (__n)) :*/ FADDS_TRMM_2((__n), (__m)) ) /* the side test is commented out: always evaluates FADDS_TRMM_2 with the arguments swapped, i.e. (__n, __m) */
|
|
|
+
|
|
|
+#define FMULS_TRSM FMULS_TRMM /* the TRSM multiplication count reuses the TRMM formula */
|
|
|
+#define FADDS_TRSM FADDS_TRMM /* the TRSM addition count reuses the TRMM addition formula */
|
|
|
+
|
|
|
+#define FLOPS_STRSM(__m, __n) ( FMULS_TRSM((__m), (__n)) + FADDS_TRSM((__m), (__n)) ) /* sum of the FMULS_TRSM and FADDS_TRSM counts */
|
|
|
+
|
|
|
+
|
|
|
+#define FMULS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k)) /* multiplications: m*n*k */
|
|
|
+#define FADDS_GEMM(__m, __n, __k) ((double)(__m) * (double)(__n) * (double)(__k)) /* additions: m*n*k */
|
|
|
+
|
|
|
+#define FLOPS_SGEMM(__m, __n, __k) ( FMULS_GEMM((__m), (__n), (__k)) + FADDS_GEMM((__m), (__n), (__k)) ) /* total flop count: 2*m*n*k */
|
|
|
+
|
|
|
+/* End of magma code */
|
|
|
+
|
|
|
/*
|
|
|
* Create the codelets
|
|
|
*/
|
|
@@ -72,6 +130,7 @@ static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int node
|
|
|
{
|
|
|
unsigned k, m, n;
|
|
|
unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
|
|
|
+ unsigned nn = size/nblocks;
|
|
|
|
|
|
for (k = 0; k < nblocks; k++)
|
|
|
{
|
|
@@ -80,6 +139,7 @@ static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int node
|
|
|
starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
|
|
|
STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
|
|
|
STARPU_RW, data_handles[k][k],
|
|
|
+ STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
|
|
|
0);
|
|
|
|
|
|
for (m = k+1; m<nblocks; m++)
|
|
@@ -88,6 +148,7 @@ static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int node
|
|
|
STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
|
|
|
STARPU_R, data_handles[k][k],
|
|
|
STARPU_RW, data_handles[m][k],
|
|
|
+ STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
|
|
|
0);
|
|
|
|
|
|
starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
|
|
@@ -104,6 +165,7 @@ static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int node
|
|
|
STARPU_R, data_handles[n][k],
|
|
|
STARPU_R, data_handles[m][k],
|
|
|
STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
|
|
|
+ STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
|
|
|
0);
|
|
|
}
|
|
|
|
|
@@ -120,6 +182,7 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
|
|
|
{
|
|
|
unsigned k, m, n;
|
|
|
unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
|
|
|
+ unsigned nn = size/nblocks;
|
|
|
|
|
|
/* Column */
|
|
|
for (n = 0; n<nblocks; n++)
|
|
@@ -137,6 +200,7 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
|
|
|
STARPU_R, data_handles[n][k],
|
|
|
STARPU_R, data_handles[m][k],
|
|
|
STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
|
|
|
+ STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
|
|
|
0);
|
|
|
}
|
|
|
k = n;
|
|
@@ -147,6 +211,7 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
|
|
|
STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
|
|
|
STARPU_R, data_handles[k][k],
|
|
|
STARPU_RW, data_handles[m][k],
|
|
|
+ STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
|
|
|
0);
|
|
|
}
|
|
|
else
|
|
@@ -155,6 +220,7 @@ static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, i
|
|
|
starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
|
|
|
STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
|
|
|
STARPU_RW, data_handles[k][k],
|
|
|
+ STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
|
|
|
0);
|
|
|
}
|
|
|
}
|
|
@@ -175,6 +241,7 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
|
|
|
unsigned a, c;
|
|
|
unsigned k, m, n;
|
|
|
unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
|
|
|
+ unsigned nn = size/nblocks;
|
|
|
|
|
|
/* double-antidiagonal number:
|
|
|
* - a=0 contains (0,0) plus (1,0)
|
|
@@ -205,6 +272,7 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
|
|
|
STARPU_R, data_handles[n][k],
|
|
|
STARPU_R, data_handles[m][k],
|
|
|
STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
|
|
|
+ STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
|
|
|
0);
|
|
|
}
|
|
|
|
|
@@ -216,6 +284,7 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
|
|
|
STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
|
|
|
STARPU_R, data_handles[k][k],
|
|
|
STARPU_RW, data_handles[m][k],
|
|
|
+ STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
|
|
|
0);
|
|
|
}
|
|
|
else
|
|
@@ -224,6 +293,7 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
|
|
|
starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
|
|
|
STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
|
|
|
STARPU_RW, data_handles[k][k],
|
|
|
+ STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
|
|
|
0);
|
|
|
}
|
|
|
}
|
|
@@ -246,6 +316,7 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
|
|
|
STARPU_R, data_handles[n][k],
|
|
|
STARPU_R, data_handles[m][k],
|
|
|
STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
|
|
|
+ STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
|
|
|
0);
|
|
|
}
|
|
|
/* non-diagonal block, solve */
|
|
@@ -254,6 +325,7 @@ static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int r
|
|
|
STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
|
|
|
STARPU_R, data_handles[k][k],
|
|
|
STARPU_RW, data_handles[m][k],
|
|
|
+ STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
|
|
|
0);
|
|
|
}
|
|
|
|
|
@@ -273,6 +345,7 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
|
|
|
unsigned a;
|
|
|
int k, m, n;
|
|
|
unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
|
|
|
+ unsigned nn = size/nblocks;
|
|
|
|
|
|
/*
|
|
|
* This is basically similar to above, except that we shift k according to the priorities set in the algorithm, so that prio ~ 2*a or 2*a+1
|
|
@@ -301,6 +374,7 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
|
|
|
starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
|
|
|
STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
|
|
|
STARPU_RW, data_handles[k][k],
|
|
|
+ STARPU_FLOPS, (double) FLOPS_SPOTRF(nn),
|
|
|
0);
|
|
|
}
|
|
|
else
|
|
@@ -310,6 +384,7 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
|
|
|
STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
|
|
|
STARPU_R, data_handles[k][k],
|
|
|
STARPU_RW, data_handles[m][k],
|
|
|
+ STARPU_FLOPS, (double) FLOPS_STRSM(nn, nn),
|
|
|
0);
|
|
|
}
|
|
|
|
|
@@ -327,6 +402,7 @@ static void run_cholesky_prio(starpu_data_handle_t **data_handles, int rank, int
|
|
|
STARPU_R, data_handles[n][k],
|
|
|
STARPU_R, data_handles[m][k],
|
|
|
STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
|
|
|
+ STARPU_FLOPS, (double) FLOPS_SGEMM(nn, nn, nn),
|
|
|
0);
|
|
|
}
|
|
|
}
|