Selaa lähdekoodia

cholesky: Add pieces for compiler-generated code

Samuel Thibault 6 vuotta sitten
vanhempi
commit
a50f0acd93
1 muutettua tiedostoa jossa 34 lisäystä ja 4 poistoa
  1. 34 4
      examples/cholesky/cholesky_compil.c

+ 34 - 4
examples/cholesky/cholesky_compil.c

@@ -26,6 +26,7 @@
 
 #include "cholesky.h"
 #include "../sched_ctx_utils/sched_ctx_utils.h"
+#include <math.h>
 
 #if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_MAGMA)
 #include "magma.h"
@@ -47,8 +48,14 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 	double start;
 	double end;
 
-	unsigned long N = starpu_matrix_get_nx(dataA);
-	unsigned long nn = N/nblocks;
+	unsigned long nelems = starpu_matrix_get_nx(dataA);
+	unsigned long nn = nelems/nblocks;
+	int N = nblocks;
+	int M = nblocks;
+
+	int lambda_b = starpu_get_env_number_default("CHOLESKY_LAMBDA_B", nblocks);
+	int lambda_o_u = starpu_get_env_number_default("CHOLESKY_LAMBDA_O_U", 0);
+	int lambda_o_d = starpu_get_env_number_default("CHOLESKY_LAMBDA_O_D", 0);
 
 	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
 
@@ -58,7 +65,13 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
 	start = starpu_timing_now();
 
+#define min(x,y)  ((x)<(y)?(x):(y)) /* smaller of x and y; the selected argument is evaluated twice */
+#define max(x,y)  ((x)<(y)?(y):(x)) /* larger of x and y; the selected argument is evaluated twice */
+#define ceild(n,d)  ceil(((double)(n))/((double)(d))) /* n/d rounded up, computed in double */
+#define floord(n,d) floor(((double)(n))/((double)(d))) /* n/d rounded down, computed in double */
+
 #define A(i,j) starpu_data_get_sub_data(dataA, 2, j, i)
+
 #define _POTRF(cl, A, prio) do { \
 		int ret = starpu_task_insert(cl, \
 					 STARPU_PRIORITY, noprio_p ? STARPU_DEFAULT_PRIO : unbound_prio ? (int) (prio) : (int) STARPU_MAX_PRIO, \
@@ -120,6 +133,23 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 #define SYRK_CPU(A, B, prio)	_SYRK(&cl22_cpu, A, B, prio)
 #define GEMM_CPU(A, B, C, prio)	_GEMM(&cl22_cpu, A, B, C, prio)
 
+#define potrf_oreille_up(k)		{ POTRF_GPU(A(k,k),(2*N - 2*(k))); } /* macro args are parenthesized in the priority so expression arguments keep their value */
+#define potrf_oreille_down(k)		{ POTRF_GPU(A(k,k),(2*N - 2*(k))); }
+#define potrf_cpu(k)			{ POTRF_CPU(A(k,k),(2*N - 2*(k))); }
+#define potrf_bande(k)			{ POTRF(A(k,k),(2*N - 2*(k))); }
+#define trsm_oreille_up(k,m)		{ TRSM_GPU(A(k,k),A(m,k), (2*nblocks - 2*(k) - (m))); }
+#define trsm_oreille_down(k,m)		{ TRSM_GPU(A(k,k),A(m,k), (2*nblocks - 2*(k) - (m))); }
+#define trsm_cpu(k,m)			{ TRSM_CPU(A(k,k),A(m,k), (2*nblocks - 2*(k) - (m))); }
+#define trsm_bande(k,m)			{ TRSM(A(k,k),A(m,k), (2*nblocks - 2*(k) - (m))); }
+#define herk_oreille_up(k,n)		{ SYRK_GPU(A(n,k),A(n,n), (2*nblocks - 2*(k) - (n))); }
+#define herk_oreille_down(k,n)		{ SYRK_GPU(A(n,k),A(n,n), (2*nblocks - 2*(k) - (n))); }
+#define herk_cpu(k,n)			{ SYRK_CPU(A(n,k),A(n,n), (2*nblocks - 2*(k) - (n))); }
+#define herk_bande(k,n)			{ SYRK(A(n,k),A(n,n), (2*nblocks - 2*(k) - (n))); }
+#define gemm_oreille_up(k,n,m)		{ GEMM_GPU(A(m,k),A(n,k),A(m,n), (2*nblocks - 2*(k) - (n) - (m))); }
+#define gemm_oreille_down(k,n,m)	{ GEMM_GPU(A(m,k),A(n,k),A(m,n), (2*nblocks - 2*(k) - (n) - (m))); }
+#define gemm_cpu(k,n,m)			{ GEMM_CPU(A(m,k),A(n,k),A(m,n), (2*nblocks - 2*(k) - (n) - (m))); }
+#define gemm_bande(k,n,m)		{ GEMM(A(m,k),A(n,k),A(m,n), (2*nblocks - 2*(k) - (n) - (m))); }
+
 #include "cholesky_compiled.c"
 
 	starpu_task_wait_for_all();
@@ -132,7 +162,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 
 	double timing = end - start;
 
-	double flop = FLOPS_SPOTRF(N);
+	double flop = FLOPS_SPOTRF(nelems);
 
 	if(with_ctxs_p || with_noctxs_p || chole1_p || chole2_p)
 		update_sched_ctx_timing_results((flop/timing/1000.0f), (timing/1000000.0f));
@@ -143,7 +173,7 @@ static int _cholesky(starpu_data_handle_t dataA, unsigned nblocks)
 			PRINTF("\tTms\tTGFlops");
 		PRINTF("\n");
 
-		PRINTF("%lu\t%.0f\t%.1f", N, timing/1000, (flop/timing/1000.0f));
+		PRINTF("%lu\t%.0f\t%.1f", nelems, timing/1000, (flop/timing/1000.0f));
 		if (bound_lp_p)
 		{
 			FILE *f = fopen("cholesky.lp", "w");