Переглянути джерело

Harmonize distributed cholesky codelets with mono-node cholesky codelets

Samuel Thibault 6 роки тому
батько
коміт
eb2d5ff746

+ 8 - 6
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -35,7 +35,7 @@ static struct starpu_codelet cl11 =
 #endif
 	.nbuffers = 1,
 	.modes = {STARPU_RW},
-	.model = &chol_model_11
+	.model = &chol_model_11,
 	.color = 0xffff00,
 };
 
@@ -47,9 +47,10 @@ static struct starpu_codelet cl21 =
 #elif defined(STARPU_SIMGRID)
 	.cuda_funcs = {(void*)1},
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_RW},
-	.model = &chol_model_21
+	.model = &chol_model_21,
 	.color = 0x8080ff,
 };
 
@@ -61,9 +62,10 @@ static struct starpu_codelet cl22 =
 #elif defined(STARPU_SIMGRID)
 	.cuda_funcs = {(void*)1},
 #endif
+	.cuda_flags = {STARPU_CUDA_ASYNC},
 	.nbuffers = 3,
 	.modes = {STARPU_R, STARPU_R, STARPU_RW | STARPU_COMMUTE},
-	.model = &chol_model_22
+	.model = &chol_model_22,
 	.color = 0x00ff00,
 };
 
@@ -160,6 +162,9 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 
 	starpu_task_wait_for_all();
 
+	starpu_mpi_barrier(MPI_COMM_WORLD);
+	end = starpu_timing_now();
+
 	for(x = 0; x < nblocks ; x++)
 	{
 		for (y = 0; y < nblocks; y++)
@@ -171,9 +176,6 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 	}
 	free(data_handles);
 
-	starpu_mpi_barrier(MPI_COMM_WORLD);
-	end = starpu_timing_now();
-
 	if (rank == 0)
 	{
 		*timing = end - start;

+ 62 - 30
mpi/examples/matrix_decomposition/mpi_cholesky_kernels.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2012                                     Inria
- * Copyright (C) 2009-2014                                Université de Bordeaux
+ * Copyright (C) 2009-2014, 2018                                Université de Bordeaux
  * Copyright (C) 2010-2013,2015,2017                      CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -23,6 +23,7 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cublas.h>
+#include <starpu_cublas_v2.h>
 #ifdef STARPU_HAVE_MAGMA
 #include "magma.h"
 #include "magma_lapack.h"
@@ -33,6 +34,11 @@
  * U22
  */
 
+#if defined(STARPU_USE_CUDA)
+static const float p1 =  1.0;
+static const float m1 = -1.0;
+#endif
+
 static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, void *_args)
 {
 	(void)_args;
@@ -49,10 +55,6 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, void
 	unsigned ld12 = STARPU_MATRIX_GET_LD(descr[1]);
 	unsigned ld22 = STARPU_MATRIX_GET_LD(descr[2]);
 
-#ifdef STARPU_USE_CUDA
-	cublasStatus st;
-#endif
-
 	switch (s)
 	{
 		case 0:
@@ -61,19 +63,16 @@ static inline void chol_common_cpu_codelet_update_u22(void *descr[], int s, void
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-#ifdef STARPU_HAVE_MAGMA
-			cublasSetKernelStream(starpu_cuda_get_local_stream());
-#endif
-			cublasSgemm('n', 't', dy, dx, dz,
-					-1.0f, left, ld21, right, ld12,
-					 1.0f, center, ld22);
-			st = cublasGetError();
-			if (STARPU_UNLIKELY(st != CUBLAS_STATUS_SUCCESS))
-				STARPU_CUBLAS_REPORT_ERROR(st);
-
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+		{
+			cublasStatus_t status = cublasSgemm(starpu_cublas_get_local_handle(),
+					CUBLAS_OP_N, CUBLAS_OP_T, dy, dx, dz,
+					&m1, left, ld21, right, ld12,
+					&p1, center, ld22);
+			if (status != CUBLAS_STATUS_SUCCESS)
+				STARPU_CUBLAS_REPORT_ERROR(status);
 
 			break;
+		}
 #endif
 		default:
 			STARPU_ABORT();
@@ -113,6 +112,10 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, void *_a
 	unsigned nx21 = STARPU_MATRIX_GET_NY(descr[1]);
 	unsigned ny21 = STARPU_MATRIX_GET_NX(descr[1]);
 
+#ifdef STARPU_USE_CUDA
+	cublasStatus status;
+#endif
+
 	switch (s)
 	{
 		case 0:
@@ -120,11 +123,11 @@ static inline void chol_common_codelet_update_u21(void *descr[], int s, void *_a
 			break;
 #ifdef STARPU_USE_CUDA
 		case 1:
-#ifdef STARPU_HAVE_MAGMA
-			cublasSetKernelStream(starpu_cuda_get_local_stream());
-#endif
-			cublasStrsm('R', 'L', 'T', 'N', nx21, ny21, 1.0f, sub11, ld11, sub21, ld21);
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			status = cublasStrsm(starpu_cublas_get_local_handle(),
+					CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT,
+					nx21, ny21, &p1, sub11, ld11, sub21, ld21);
+			if (status != CUBLAS_STATUS_SUCCESS)
+				STARPU_CUBLAS_REPORT_ERROR(status);
 			break;
 #endif
 		default:
@@ -197,36 +200,65 @@ static inline void chol_common_codelet_update_u11(void *descr[], int s, void *_a
 			{
 				int ret;
 				int info;
+			cudaStream_t stream = starpu_cuda_get_local_stream();
+#if (MAGMA_VERSION_MAJOR > 1) || (MAGMA_VERSION_MAJOR == 1 && MAGMA_VERSION_MINOR >= 4)
+			cublasSetKernelStream(stream);
+			magmablasSetKernelStream(stream);
+#else
+			starpu_cublas_set_stream();
+#endif
 				ret = magma_spotrf_gpu(MagmaLower, nx, sub11, ld, &info);
 				if (ret != MAGMA_SUCCESS)
 				{
 					fprintf(stderr, "Error in Magma: %d\n", ret);
 					STARPU_ABORT();
 				}
+#if (MAGMA_VERSION_MAJOR > 1) || (MAGMA_VERSION_MAJOR == 1 && MAGMA_VERSION_MINOR >= 4)
+			cudaError_t cures = cudaStreamSynchronize(stream);
+#else
 				cudaError_t cures = cudaThreadSynchronize();
+#endif
 				STARPU_ASSERT(!cures);
 			}
 #else
+			{
+
+			float *lambda11;
+			cublasStatus_t status;
+			cudaStream_t stream = starpu_cuda_get_local_stream();
+			cublasHandle_t handle = starpu_cublas_get_local_handle();
+			cudaHostAlloc((void **)&lambda11, sizeof(float), 0);
+
 			for (z = 0; z < nx; z++)
 			{
-				float lambda11;
-				cudaMemcpyAsync(&lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
-				cudaStreamSynchronize(starpu_cuda_get_local_stream());
+				cudaMemcpyAsync(lambda11, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, stream);
+				cudaStreamSynchronize(stream);
 
-				STARPU_ASSERT(lambda11 != 0.0f);
+				STARPU_ASSERT(*lambda11 != 0.0f);
 
-				lambda11 = sqrt(lambda11);
+				*lambda11 = sqrt(*lambda11);
 
-				cublasSetVector(1, sizeof(float), &lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float));
+/*				cublasSetVector(1, sizeof(float), lambda11, sizeof(float), &sub11[z+z*ld], sizeof(float)); */
+				cudaMemcpyAsync(&sub11[z+z*ld], lambda11, sizeof(float), cudaMemcpyHostToDevice, stream);
+				float scal = 1.0f/(*lambda11);
 
-				cublasSscal(nx - z - 1, 1.0f/lambda11, &sub11[(z+1)+z*ld], 1);
+				status = cublasSscal(handle,
+						     nx - z - 1, &scal, &sub11[(z+1)+z*ld], 1);
+				if (status != CUBLAS_STATUS_SUCCESS)
+					STARPU_CUBLAS_REPORT_ERROR(status);
 
-				cublasSsyr('U', nx - z - 1, -1.0f,
+				status = cublasSsyr(handle,
+						    CUBLAS_FILL_MODE_UPPER,
+						    nx - z - 1, &m1,
 							&sub11[(z+1)+z*ld], 1,
 							&sub11[(z+1)+(z+1)*ld], ld);
+				if (status != CUBLAS_STATUS_SUCCESS)
+					STARPU_CUBLAS_REPORT_ERROR(status);
 			}
 
-			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+			cudaStreamSynchronize(stream);
+			cudaFreeHost(lambda11);
+			}
 #endif
 			break;
 #endif