|
@@ -1,7 +1,7 @@
|
|
|
/* StarPU --- Runtime system for heterogeneous multicore architectures.
|
|
|
*
|
|
|
* Copyright (C) 2012 Inria
|
|
|
- * Copyright (C) 2009-2014 Université de Bordeaux
|
|
|
+ * Copyright (C) 2009-2014, 2018 Université de Bordeaux
|
|
|
* Copyright (C) 2010-2013,2015,2017 CNRS
|
|
|
*
|
|
|
* StarPU is free software; you can redistribute it and/or modify
|
|
@@ -23,6 +23,7 @@
|
|
|
#include <cuda.h>
|
|
|
#include <cuda_runtime.h>
|
|
|
#include <cublas.h>
|
|
|
+#include <starpu_cublas_v2.h>
|
|
|
#ifdef STARPU_HAVE_MAGMA
|
|
|
#include "magma.h"
|
|
|
#include "magma_lapack.h"
|
|
@@ -33,6 +34,11 @@
|
|
|
* U22
|
|
|
*/
|
|
|
|
|
|
+#if defined(STARPU_USE_CUDA)
|
|
|
+static const float p1 = 1.0;
|
|
|
+static const float m1 = -1.0;
|
|
|
+#endif
|
|
|
+
|
|
|
static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, void *_args)
|
|
|
{
|
|
|
(void)_args;
|
|
@@ -49,10 +55,6 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, void
|
|
|
unsigned ld12 = STARPU_MATRIX_GET_LD(descr[1]);
|
|
|
unsigned ld22 = STARPU_MATRIX_GET_LD(descr[2]);
|
|
|
|
|
|
-#ifdef STARPU_USE_CUDA
|
|
|
- cublasStatus st;
|
|
|
-#endif
|
|
|
-
|
|
|
switch (s)
|
|
|
{
|
|
|
case 0:
|
|
@@ -61,19 +63,16 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, void
|
|
|
break;
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
case 1:
|
|
|
-#ifdef STARPU_HAVE_MAGMA
|
|
|
- cublasSetKernelStream(starpu_cuda_get_local_stream());
|
|
|
-#endif
|
|
|
- cublasSgemm('n', 't', dy, dx, dz,
|
|
|
- -1.0f, left, ld21, right, ld12,
|
|
|
- 1.0f, center, ld22);
|
|
|
- st = cublasGetError();
|
|
|
- if (STARPU_UNLIKELY(st != CUBLAS_STATUS_SUCCESS))
|
|
|
- STARPU_CUBLAS_REPORT_ERROR(st);
|
|
|
-
|
|
|
- cudaStreamSynchronize(starpu_cuda_get_local_stream());
|
|
|
+ {
|
|
|
+ cublasStatus_t status = cublasSgemm(starpu_cublas_get_local_handle(),
|
|
|
+ CUBLAS_OP_N, CUBLAS_OP_T, dy, dx, dz,
|
|
|
+ &m1, left, ld21, right, ld12,
|
|
|
+ &p1, center, ld22);
|
|
|
+ if (status != CUBLAS_STATUS_SUCCESS)
|
|
|
+ STARPU_CUBLAS_REPORT_ERROR(status);
|
|
|
|
|
|
break;
|
|
|
+ }
|
|
|
#endif
|
|
|
default:
|
|
|
STARPU_ABORT();
|
|
@@ -113,6 +112,10 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, void *_a
|
|
|
unsigned nx21 = STARPU_MATRIX_GET_NY(descr[1]);
|
|
|
unsigned ny21 = STARPU_MATRIX_GET_NX(descr[1]);
|
|
|
|
|
|
+#ifdef STARPU_USE_CUDA
|
|
|
+ cublasStatus status;
|
|
|
+#endif
|
|
|
+
|
|
|
switch (s)
|
|
|
{
|
|
|
case 0:
|
|
@@ -120,11 +123,11 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, void *_a
|
|
|
break;
|
|
|
#ifdef STARPU_USE_CUDA
|
|
|
case 1:
|
|
|
-#ifdef STARPU_HAVE_MAGMA
|
|
|
- cublasSetKernelStream(starpu_cuda_get_local_stream());
|
|
|
-#endif
|
|
|
- cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
|
|
|
- cudaStreamSynchronize(starpu_cuda_get_local_stream());
|
|
|
+ status = cublasStrsm(starpu_cublas_get_local_handle(),
|
|
|
+ CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT,
|
|
|
+ nx21, ny21, &p1, sub11, ld11, sub21, ld21);
|
|
|
+ if (status != CUBLAS_STATUS_SUCCESS)
|
|
|
+ STARPU_CUBLAS_REPORT_ERROR(status);
|
|
|
break;
|
|
|
#endif
|
|
|
default:
|
|
@@ -197,36 +200,65 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, void *_a
|
|
|
{
|
|
|
int ret;
|
|
|
int info;
|
|
|
+ cudaStream_t stream = starpu_cuda_get_local_stream();
|
|
|
+#if (MAGMA_VERSION_MAJOR > 1) || (MAGMA_VERSION_MAJOR == 1 && MAGMA_VERSION_MINOR >= 4)
|
|
|
+ cublasSetKernelStream(stream);
|
|
|
+ magmablasSetKernelStream(stream);
|
|
|
+#else
|
|
|
+ starpu_cublas_set_stream();
|
|
|
+#endif
|
|
|
ret = magma_spotrf_gpu(MagmaLower, nx, sub11, ld, &info);
|
|
|
if (ret != MAGMA_SUCCESS)
|
|
|
{
|
|
|
fprintf(stderr, "Error in Magma: %d\n", ret);
|
|
|
STARPU_ABORT();
|
|
|
}
|
|
|
+#if (MAGMA_VERSION_MAJOR > 1) || (MAGMA_VERSION_MAJOR == 1 && MAGMA_VERSION_MINOR >= 4)
|
|
|
+ cudaError_t cures = cudaStreamSynchronize(stream);
|
|
|
+#else
|
|
|
cudaError_t cures = cudaThreadSynchronize();
|
|
|
+#endif
|
|
|
STARPU_ASSERT(!cures);
|
|
|
}
|
|
|
#else
|
|
|
+ {
|
|
|
+
|
|
|
+ float *lambda11;
|
|
|
+ cublasStatus_t status;
|
|
|
+ cudaStream_t stream = starpu_cuda_get_local_stream();
|
|
|
+ cublasHandle_t handle = starpu_cublas_get_local_handle();
|
|
|
+ cudaHostAlloc((void **)&lambda11, sizeof(float), 0);
|
|
|
+
|
|
|
for (z = 0; z < nx; z++)
|
|
|
{
|
|
|
- float lambda11;
|
|
|
- cudaMemcpyAsync(&lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
|
|
|
- cudaStreamSynchronize(starpu_cuda_get_local_stream());
|
|
|
+ cudaMemcpyAsync(lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, stream);
|
|
|
+ cudaStreamSynchronize(stream);
|
|
|
|
|
|
- STARPU_ASSERT(lambda11 != 0.0f);
|
|
|
+ STARPU_ASSERT(*lambda11 != 0.0f);
|
|
|
|
|
|
- lambda11 = sqrt(lambda11);
|
|
|
+ *lambda11 = sqrt(*lambda11);
|
|
|
|
|
|
- cublasSetVector(1, sizeof(float), &lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float));
|
|
|
+/* cublasSetVector(1, sizeof(float), lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float)); */
|
|
|
+ cudaMemcpyAsync(&sub11[z+z*ld], lambda11, sizeof(float), cudaMemcpyHostToDevice, stream);
|
|
|
+ float scal = 1.0f/(*lambda11);
|
|
|
|
|
|
- cublasSscal(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
|
|
|
+ status = cublasSscal(handle,
|
|
|
+ nx - z - 1, &scal, &sub11[(z+1)+z*ld], 1);
|
|
|
+ if (status != CUBLAS_STATUS_SUCCESS)
|
|
|
+ STARPU_CUBLAS_REPORT_ERROR(status);
|
|
|
|
|
|
- cublasSsyr('U', nx - z - 1, -1.0f,
|
|
|
+ status = cublasSsyr(handle,
|
|
|
+ CUBLAS_FILL_MODE_UPPER,
|
|
|
+ nx - z - 1, &m1,
|
|
|
&sub11[(z+1)+z*ld], 1,
|
|
|
&sub11[(z+1)+(z+1)*ld], ld);
|
|
|
+ if (status != CUBLAS_STATUS_SUCCESS)
|
|
|
+ STARPU_CUBLAS_REPORT_ERROR(status);
|
|
|
}
|
|
|
|
|
|
- cudaStreamSynchronize(starpu_cuda_get_local_stream());
|
|
|
+ cudaStreamSynchronize(stream);
|
|
|
+ cudaFreeHost(lambda11);
|
|
|
+ }
|
|
|
#endif
|
|
|
break;
|
|
|
#endif
|