|
@@ -1,7 +1,7 @@
|
|
|
/* StarPU --- Runtime system for heterogeneous multicore architectures.
|
|
|
*
|
|
|
* Copyright (C) 2009, 2010 Université de Bordeaux 1
|
|
|
- * Copyright (C) 2010 Centre National de la Recherche Scientifique
|
|
|
+ * Copyright (C) 2010, 2011 Centre National de la Recherche Scientifique
|
|
|
*
|
|
|
* StarPU is free software; you can redistribute it and/or modify
|
|
|
* it under the terms of the GNU Lesser General Public License as published by
|
|
@@ -25,7 +25,7 @@
|
|
|
#endif
|
|
|
|
|
|
/*
|
|
|
- * U22
|
|
|
+ * U22
|
|
|
*/
|
|
|
|
|
|
static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __attribute__((unused)) void *_args)
|
|
@@ -43,31 +43,31 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, __at
|
|
|
unsigned ld12 = STARPU_MATRIX_GET_LD(descr[1]);
|
|
|
unsigned ld22 = STARPU_MATRIX_GET_LD(descr[2]);
|
|
|
|
|
|
-//#ifdef STARPU_USE_CUDA
|
|
|
-// cublasStatus st;
|
|
|
-//#endif
|
|
|
-//
|
|
|
-// switch (s) {
|
|
|
-// case 0:
|
|
|
-// SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21,
|
|
|
-// right, ld12, 1.0f, center, ld22);
|
|
|
-// break;
|
|
|
-//#ifdef STARPU_USE_CUDA
|
|
|
-// case 1:
|
|
|
-// cublasSgemm('n', 't', dy, dx, dz,
|
|
|
-// -1.0f, left, ld21, right, ld12,
|
|
|
-// 1.0f, center, ld22);
|
|
|
-// st = cublasGetError();
|
|
|
-// STARPU_ASSERT(!st);
|
|
|
-//
|
|
|
-// cudaThreadSynchronize();
|
|
|
-//
|
|
|
-// break;
|
|
|
-//#endif
|
|
|
-// default:
|
|
|
-// STARPU_ABORT();
|
|
|
-// break;
|
|
|
-// }
|
|
|
+#ifdef STARPU_USE_CUDA
|
|
|
+ cublasStatus st;
|
|
|
+#endif
|
|
|
+
|
|
|
+ switch (s) {
|
|
|
+ case 0:
|
|
|
+ SGEMM("N", "T", dy, dx, dz, -1.0f, left, ld21,
|
|
|
+ right, ld12, 1.0f, center, ld22);
|
|
|
+ break;
|
|
|
+#ifdef STARPU_USE_CUDA
|
|
|
+ case 1:
|
|
|
+ cublasSgemm('n', 't', dy, dx, dz,
|
|
|
+ -1.0f, left, ld21, right, ld12,
|
|
|
+ 1.0f, center, ld22);
|
|
|
+ st = cublasGetError();
|
|
|
+ STARPU_ASSERT(!st);
|
|
|
+
|
|
|
+ cudaThreadSynchronize();
|
|
|
+
|
|
|
+ break;
|
|
|
+#endif
|
|
|
+ default:
|
|
|
+ STARPU_ABORT();
|
|
|
+ break;
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
void chol_cpu_codelet_update_u22(void *descr[], void *_args)
|
|
@@ -82,7 +82,7 @@ void chol_cublas_codelet_update_u22(void *descr[], void *_args)
|
|
|
}
|
|
|
#endif// STARPU_USE_CUDA
|
|
|
|
|
|
-/*
|
|
|
+/*
|
|
|
* U21
|
|
|
*/
|
|
|
|
|
@@ -101,20 +101,20 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, __attrib
|
|
|
unsigned nx21 = STARPU_MATRIX_GET_NY(descr[1]);
|
|
|
unsigned ny21 = STARPU_MATRIX_GET_NX(descr[1]);
|
|
|
|
|
|
-// switch (s) {
|
|
|
-// case 0:
|
|
|
-// STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
|
|
|
-// break;
|
|
|
-//#ifdef STARPU_USE_CUDA
|
|
|
-// case 1:
|
|
|
-// cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
|
|
|
-// cudaThreadSynchronize();
|
|
|
-// break;
|
|
|
-//#endif
|
|
|
-// default:
|
|
|
-// STARPU_ABORT();
|
|
|
-// break;
|
|
|
-// }
|
|
|
+ switch (s) {
|
|
|
+ case 0:
|
|
|
+ STRSM("R", "L", "T", "N", nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
|
|
|
+ break;
|
|
|
+#ifdef STARPU_USE_CUDA
|
|
|
+ case 1:
|
|
|
+ cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
|
|
|
+ cudaThreadSynchronize();
|
|
|
+ break;
|
|
|
+#endif
|
|
|
+ default:
|
|
|
+ STARPU_ABORT();
|
|
|
+ break;
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
void chol_cpu_codelet_update_u21(void *descr[], void *_args)
|
|
@@ -127,77 +127,77 @@ void chol_cublas_codelet_update_u21(void *descr[], void *_args)
|
|
|
{
|
|
|
chol_common_codelet_update_u21(descr, 1, _args);
|
|
|
}
|
|
|
-#endif
|
|
|
+#endif
|
|
|
|
|
|
/*
|
|
|
* U11
|
|
|
*/
|
|
|
|
|
|
-static inline void chol_common_codelet_update_u11(void *descr[], int s, __attribute__((unused)) void *_args)
|
|
|
+static inline void chol_common_codelet_update_u11(void *descr[], int s, __attribute__((unused)) void *_args)
|
|
|
{
|
|
|
// printf("11\n");
|
|
|
float *sub11;
|
|
|
|
|
|
- sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
|
|
|
+ sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
|
|
|
|
|
|
unsigned nx = STARPU_MATRIX_GET_NY(descr[0]);
|
|
|
unsigned ld = STARPU_MATRIX_GET_LD(descr[0]);
|
|
|
|
|
|
unsigned z;
|
|
|
|
|
|
-// switch (s) {
|
|
|
-// case 0:
|
|
|
-//
|
|
|
-// /*
|
|
|
-// * - alpha 11 <- lambda 11 = sqrt(alpha11)
|
|
|
-// * - alpha 21 <- l 21 = alpha 21 / lambda 11
|
|
|
-// * - A22 <- A22 - l21 trans(l21)
|
|
|
-// */
|
|
|
-//
|
|
|
-// for (z = 0; z < nx; z++)
|
|
|
-// {
|
|
|
-// float lambda11;
|
|
|
-// lambda11 = sqrt(sub11[z+z*ld]);
|
|
|
-// sub11[z+z*ld] = lambda11;
|
|
|
-//
|
|
|
-// STARPU_ASSERT(lambda11 != 0.0f);
|
|
|
-//
|
|
|
-// SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
|
|
|
-//
|
|
|
-// SSYR("L", nx - z - 1, -1.0f,
|
|
|
-// &sub11[(z+1)+z*ld], 1,
|
|
|
-// &sub11[(z+1)+(z+1)*ld], ld);
|
|
|
-// }
|
|
|
-// break;
|
|
|
-//#ifdef STARPU_USE_CUDA
|
|
|
-// case 1:
|
|
|
-// for (z = 0; z < nx; z++)
|
|
|
-// {
|
|
|
-// float lambda11;
|
|
|
-// cudaMemcpy(&lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost);
|
|
|
-// cudaStreamSynchronize(0);
|
|
|
-//
|
|
|
-// STARPU_ASSERT(lambda11 != 0.0f);
|
|
|
-//
|
|
|
-// lambda11 = sqrt(lambda11);
|
|
|
-//
|
|
|
-// cublasSetVector(1, sizeof(float), &lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float));
|
|
|
-//
|
|
|
-// cublasSscal(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
|
|
|
-//
|
|
|
-// cublasSsyr('U', nx - z - 1, -1.0f,
|
|
|
-// &sub11[(z+1)+z*ld], 1,
|
|
|
-// &sub11[(z+1)+(z+1)*ld], ld);
|
|
|
-// }
|
|
|
-//
|
|
|
-// cudaThreadSynchronize();
|
|
|
-//
|
|
|
-// break;
|
|
|
-//#endif
|
|
|
-// default:
|
|
|
-// STARPU_ABORT();
|
|
|
-// break;
|
|
|
-// }
|
|
|
+ switch (s) {
|
|
|
+ case 0:
|
|
|
+
|
|
|
+ /*
|
|
|
+ * - alpha 11 <- lambda 11 = sqrt(alpha11)
|
|
|
+ * - alpha 21 <- l 21 = alpha 21 / lambda 11
|
|
|
+ * - A22 <- A22 - l21 trans(l21)
|
|
|
+ */
|
|
|
+
|
|
|
+ for (z = 0; z < nx; z++)
|
|
|
+ {
|
|
|
+ float lambda11;
|
|
|
+ lambda11 = sqrt(sub11[z+z*ld]);
|
|
|
+ sub11[z+z*ld] = lambda11;
|
|
|
+
|
|
|
+ STARPU_ASSERT(lambda11 != 0.0f);
|
|
|
+
|
|
|
+ SSCAL(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
|
|
|
+
|
|
|
+ SSYR("L", nx - z - 1, -1.0f,
|
|
|
+ &sub11[(z+1)+z*ld], 1,
|
|
|
+ &sub11[(z+1)+(z+1)*ld], ld);
|
|
|
+ }
|
|
|
+ break;
|
|
|
+#ifdef STARPU_USE_CUDA
|
|
|
+ case 1:
|
|
|
+ for (z = 0; z < nx; z++)
|
|
|
+ {
|
|
|
+ float lambda11;
|
|
|
+ cudaMemcpy(&lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost);
|
|
|
+ cudaStreamSynchronize(0);
|
|
|
+
|
|
|
+ STARPU_ASSERT(lambda11 != 0.0f);
|
|
|
+
|
|
|
+ lambda11 = sqrt(lambda11);
|
|
|
+
|
|
|
+ cublasSetVector(1, sizeof(float), &lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float));
|
|
|
+
|
|
|
+ cublasSscal(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
|
|
|
+
|
|
|
+ cublasSsyr('U', nx - z - 1, -1.0f,
|
|
|
+ &sub11[(z+1)+z*ld], 1,
|
|
|
+ &sub11[(z+1)+(z+1)*ld], ld);
|
|
|
+ }
|
|
|
+
|
|
|
+ cudaThreadSynchronize();
|
|
|
+
|
|
|
+ break;
|
|
|
+#endif
|
|
|
+ default:
|
|
|
+ STARPU_ABORT();
|
|
|
+ break;
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
|