Просмотр исходного кода

Replace cublas calls with plain cuda functions in the BLAS, CSR and BCSR
interfaces.

Cédric Augonnet лет назад: 16
Родитель
Сommit
a95033f93a

+ 42 - 28
src/datawizard/interfaces/bcsr_interface.c

@@ -29,16 +29,16 @@
 
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 #ifdef USE_CUDA
-static int copy_ram_to_cublas(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
-static int copy_cublas_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 #endif
 
 static const struct copy_data_methods_s bcsr_copy_data_methods_s = {
 	.ram_to_ram = dummy_copy_ram_to_ram,
 	.ram_to_spu = NULL,
 #ifdef USE_CUDA
-	.ram_to_cuda = copy_ram_to_cublas,
-	.cuda_to_ram = copy_cublas_to_ram,
+	.ram_to_cuda = copy_ram_to_cuda,
+	.cuda_to_ram = copy_cuda_to_ram,
 #endif
 	.cuda_to_cuda = NULL,
 	.cuda_to_spu = NULL,
@@ -261,15 +261,15 @@ static size_t allocate_bcsr_buffer_on_node(starpu_data_handle handle, uint32_t d
 			break;
 #ifdef USE_CUDA
 		case CUDA_RAM:
-			cublasAlloc(nnz*r*c, elemsize, (void **)&addr_nzval);
+			cudaMalloc((void **)&addr_nzval, nnz*r*c*elemsize);
 			if (!addr_nzval)
 				goto fail_nzval;
 
-			cublasAlloc(nnz, sizeof(uint32_t), (void **)&addr_colind);
+			cudaMalloc((void **)&addr_colind, nnz*sizeof(uint32_t));
 			if (!addr_colind)
 				goto fail_colind;
 
-			cublasAlloc((nrow+1), sizeof(uint32_t), (void **)&addr_rowptr);
+			cudaMalloc((void **)&addr_rowptr, (nrow+1)*sizeof(uint32_t));
 			if (!addr_rowptr)
 				goto fail_rowptr;
 
@@ -296,7 +296,7 @@ fail_rowptr:
 			free((void *)addr_colind);
 #ifdef USE_CUDA
 		case CUDA_RAM:
-			cublasFree((void*)addr_colind);
+			cudaFree((void*)addr_colind);
 			break;
 #endif
 		default:
@@ -309,7 +309,7 @@ fail_colind:
 			free((void *)addr_nzval);
 #ifdef USE_CUDA
 		case CUDA_RAM:
-			cublasFree((void*)addr_nzval);
+			cudaFree((void*)addr_nzval);
 			break;
 #endif
 		default:
@@ -337,9 +337,9 @@ static void liberate_bcsr_buffer_on_node(void *interface, uint32_t node)
 			break;
 #ifdef USE_CUDA
 		case CUDA_RAM:
-			cublasFree((void*)bcsr_interface->nzval);
-			cublasFree((void*)bcsr_interface->colind);
-			cublasFree((void*)bcsr_interface->rowptr);
+			cudaFree((void*)bcsr_interface->nzval);
+			cudaFree((void*)bcsr_interface->colind);
+			cudaFree((void*)bcsr_interface->rowptr);
 			break;
 #endif
 		default:
@@ -348,7 +348,7 @@ static void liberate_bcsr_buffer_on_node(void *interface, uint32_t node)
 }
 
 #ifdef USE_CUDA
-static int copy_cublas_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
+static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 {
 	starpu_bcsr_interface_t *src_bcsr;
 	starpu_bcsr_interface_t *dst_bcsr;
@@ -363,21 +363,28 @@ static int copy_cublas_to_ram(starpu_data_handle handle, uint32_t src_node, uint
 	uint32_t r = src_bcsr->r;
 	uint32_t c = src_bcsr->c;
 
-	cublasGetVector(nnz*r*c, elemsize, (uint8_t *)src_bcsr->nzval, 1, 
-			 		   (uint8_t *)dst_bcsr->nzval, 1);
+	cudaError_t cures;
 
-	cublasGetVector(nnz, sizeof(uint32_t), (uint8_t *)src_bcsr->colind, 1, 
-						(uint8_t *)dst_bcsr->colind, 1);
+	cures = cudaMemcpy((char *)dst_bcsr->nzval, (char *)src_bcsr->nzval, nnz*r*c*elemsize, cudaMemcpyDeviceToHost);
+	if (STARPU_UNLIKELY(cures))
+		CUDA_REPORT_ERROR(cures);
+
+	cures = cudaMemcpy((char *)dst_bcsr->colind, (char *)src_bcsr->colind, nnz*sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	if (STARPU_UNLIKELY(cures))
+		CUDA_REPORT_ERROR(cures);
+
+	cures = cudaMemcpy((char *)dst_bcsr->rowptr, (char *)src_bcsr->rowptr, (nrow+1)*sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	if (STARPU_UNLIKELY(cures))
+		CUDA_REPORT_ERROR(cures);
+
+	cudaThreadSynchronize();
 
-	cublasGetVector((nrow+1), sizeof(uint32_t), (uint8_t *)src_bcsr->rowptr, 1, 
-						(uint8_t *)dst_bcsr->rowptr, 1);
-	
 	TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 
 	return 0;
 }
 
-static int copy_ram_to_cublas(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
+static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 {
 	starpu_bcsr_interface_t *src_bcsr;
 	starpu_bcsr_interface_t *dst_bcsr;
@@ -392,15 +399,22 @@ static int copy_ram_to_cublas(starpu_data_handle handle, uint32_t src_node, uint
 	uint32_t r = src_bcsr->r;
 	uint32_t c = src_bcsr->c;
 
-	cublasSetVector(nnz*r*c, elemsize, (uint8_t *)src_bcsr->nzval, 1, 
-					(uint8_t *)dst_bcsr->nzval, 1);
+	cudaError_t cures;
 
-	cublasSetVector(nnz, sizeof(uint32_t), (uint8_t *)src_bcsr->colind, 1, 
-						(uint8_t *)dst_bcsr->colind, 1);
+	cures = cudaMemcpy((char *)dst_bcsr->nzval, (char *)src_bcsr->nzval, nnz*r*c*elemsize, cudaMemcpyHostToDevice);
+	if (STARPU_UNLIKELY(cures))
+		CUDA_REPORT_ERROR(cures);
+
+	cures = cudaMemcpy((char *)dst_bcsr->colind, (char *)src_bcsr->colind, nnz*sizeof(uint32_t), cudaMemcpyHostToDevice);
+	if (STARPU_UNLIKELY(cures))
+		CUDA_REPORT_ERROR(cures);
+
+	cures = cudaMemcpy((char *)dst_bcsr->rowptr, (char *)src_bcsr->rowptr, (nrow+1)*sizeof(uint32_t), cudaMemcpyHostToDevice);
+	if (STARPU_UNLIKELY(cures))
+		CUDA_REPORT_ERROR(cures);
+
+	cudaThreadSynchronize();
 
-	cublasSetVector((nrow+1), sizeof(uint32_t), (uint8_t *)src_bcsr->rowptr, 1, 
-						(uint8_t *)dst_bcsr->rowptr, 1);
-	
 	TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 
 	return 0;

+ 12 - 12
src/datawizard/interfaces/blas_interface.c

@@ -31,20 +31,20 @@
 
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 #ifdef USE_CUDA
-static int copy_ram_to_cublas(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
-static int copy_cublas_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
-static int copy_ram_to_cublas_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
-static int copy_cublas_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
+static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
+static int copy_cuda_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
 #endif
 
 static const struct copy_data_methods_s blas_copy_data_methods_s = {
 	.ram_to_ram = dummy_copy_ram_to_ram,
 	.ram_to_spu = NULL,
 #ifdef USE_CUDA
-	.ram_to_cuda = copy_ram_to_cublas,
-	.cuda_to_ram = copy_cublas_to_ram,
-	.ram_to_cuda_async = copy_ram_to_cublas_async,
-	.cuda_to_ram_async = copy_cublas_to_ram_async,
+	.ram_to_cuda = copy_ram_to_cuda,
+	.cuda_to_ram = copy_cuda_to_ram,
+	.ram_to_cuda_async = copy_ram_to_cuda_async,
+	.cuda_to_ram_async = copy_cuda_to_ram_async,
 #endif
 	.cuda_to_cuda = NULL,
 	.cuda_to_spu = NULL,
@@ -304,7 +304,7 @@ static void liberate_blas_buffer_on_node(void *interface, uint32_t node)
 }
 
 #ifdef USE_CUDA
-static int copy_cublas_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
+static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 {
 	starpu_blas_interface_t *src_blas;
 	starpu_blas_interface_t *dst_blas;
@@ -326,7 +326,7 @@ static int copy_cublas_to_ram(starpu_data_handle handle, uint32_t src_node, uint
 	return 0;
 }
 
-static int copy_ram_to_cublas(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
+static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 {
 	starpu_blas_interface_t *src_blas;
 	starpu_blas_interface_t *dst_blas;
@@ -352,7 +352,7 @@ static int copy_ram_to_cublas(starpu_data_handle handle, uint32_t src_node, uint
 	return 0;
 }
 
-static int copy_cublas_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream)
+static int copy_cuda_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream)
 {
 	starpu_blas_interface_t *src_blas;
 	starpu_blas_interface_t *dst_blas;
@@ -390,7 +390,7 @@ static int copy_cublas_to_ram_async(starpu_data_handle handle, uint32_t src_node
 	return EAGAIN;
 }
 
-static int copy_ram_to_cublas_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream)
+static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream)
 {
 	starpu_blas_interface_t *src_blas;
 	starpu_blas_interface_t *dst_blas;

+ 42 - 28
src/datawizard/interfaces/csr_interface.c

@@ -25,16 +25,16 @@
 
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 #ifdef USE_CUDA
-static int copy_ram_to_cublas(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
-static int copy_cublas_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 #endif
 
 static const struct copy_data_methods_s csr_copy_data_methods_s = {
 	.ram_to_ram = dummy_copy_ram_to_ram,
 	.ram_to_spu = NULL,
 #ifdef USE_CUDA
-	.ram_to_cuda = copy_ram_to_cublas,
-	.cuda_to_ram = copy_cublas_to_ram,
+	.ram_to_cuda = copy_ram_to_cuda,
+	.cuda_to_ram = copy_cuda_to_ram,
 #endif
 	.cuda_to_cuda = NULL,
 	.cuda_to_spu = NULL,
@@ -232,15 +232,15 @@ static size_t allocate_csr_buffer_on_node(starpu_data_handle handle, uint32_t ds
 			break;
 #ifdef USE_CUDA
 		case CUDA_RAM:
-			cublasAlloc(nnz, elemsize, (void **)&addr_nzval);
+			cudaMalloc((void **)&addr_nzval, nnz*elemsize);
 			if (!addr_nzval)
 				goto fail_nzval;
 
-			cublasAlloc(nnz, sizeof(uint32_t), (void **)&addr_colind);
+			cudaMalloc((void **)&addr_colind, nnz*sizeof(uint32_t));
 			if (!addr_colind)
 				goto fail_colind;
 
-			cublasAlloc((nrow+1), sizeof(uint32_t), (void **)&addr_rowptr);
+			cudaMalloc((void **)&addr_rowptr, (nrow+1)*sizeof(uint32_t));
 			if (!addr_rowptr)
 				goto fail_rowptr;
 
@@ -267,7 +267,7 @@ fail_rowptr:
 			free((void *)addr_colind);
 #ifdef USE_CUDA
 		case CUDA_RAM:
-			cublasFree((void*)addr_colind);
+			cudaFree((void*)addr_colind);
 			break;
 #endif
 		default:
@@ -280,7 +280,7 @@ fail_colind:
 			free((void *)addr_nzval);
 #ifdef USE_CUDA
 		case CUDA_RAM:
-			cublasFree((void*)addr_nzval);
+			cudaFree((void*)addr_nzval);
 			break;
 #endif
 		default:
@@ -308,9 +308,9 @@ static void liberate_csr_buffer_on_node(void *interface, uint32_t node)
 			break;
 #ifdef USE_CUDA
 		case CUDA_RAM:
-			cublasFree((void*)csr_interface->nzval);
-			cublasFree((void*)csr_interface->colind);
-			cublasFree((void*)csr_interface->rowptr);
+			cudaFree((void*)csr_interface->nzval);
+			cudaFree((void*)csr_interface->colind);
+			cudaFree((void*)csr_interface->rowptr);
 			break;
 #endif
 		default:
@@ -319,7 +319,7 @@ static void liberate_csr_buffer_on_node(void *interface, uint32_t node)
 }
 
 #ifdef USE_CUDA
-static int copy_cublas_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
+static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 {
 	starpu_csr_interface_t *src_csr;
 	starpu_csr_interface_t *dst_csr;
@@ -331,21 +331,28 @@ static int copy_cublas_to_ram(starpu_data_handle handle, uint32_t src_node, uint
 	uint32_t nrow = src_csr->nrow;
 	size_t elemsize = src_csr->elemsize;
 
-	cublasGetVector(nnz, elemsize, (uint8_t *)src_csr->nzval, 1, 
-					(uint8_t *)dst_csr->nzval, 1);
+	cudaError_t cures;
 
-	cublasGetVector(nnz, sizeof(uint32_t), (uint8_t *)src_csr->colind, 1, 
-						(uint8_t *)dst_csr->colind, 1);
+	cures = cudaMemcpy((char *)dst_csr->nzval, (char *)src_csr->nzval, nnz*elemsize, cudaMemcpyDeviceToHost);
+	if (STARPU_UNLIKELY(cures))
+		CUDA_REPORT_ERROR(cures);
+
+	cures = cudaMemcpy((char *)dst_csr->colind, (char *)src_csr->colind, nnz*sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	if (STARPU_UNLIKELY(cures))
+		CUDA_REPORT_ERROR(cures);
+
+	cures = cudaMemcpy((char *)dst_csr->rowptr, (char *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	if (STARPU_UNLIKELY(cures))
+		CUDA_REPORT_ERROR(cures);
+
+	cudaThreadSynchronize();
 
-	cublasGetVector((nrow+1), sizeof(uint32_t), (uint8_t *)src_csr->rowptr, 1, 
-						(uint8_t *)dst_csr->rowptr, 1);
-	
 	TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 
 	return 0;
 }
 
-static int copy_ram_to_cublas(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
+static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 {
 	starpu_csr_interface_t *src_csr;
 	starpu_csr_interface_t *dst_csr;
@@ -357,15 +364,22 @@ static int copy_ram_to_cublas(starpu_data_handle handle, uint32_t src_node, uint
 	uint32_t nrow = src_csr->nrow;
 	size_t elemsize = src_csr->elemsize;
 
-	cublasSetVector(nnz, elemsize, (uint8_t *)src_csr->nzval, 1, 
-					(uint8_t *)dst_csr->nzval, 1);
+	cudaError_t cures;
 
-	cublasSetVector(nnz, sizeof(uint32_t), (uint8_t *)src_csr->colind, 1, 
-						(uint8_t *)dst_csr->colind, 1);
+	cures = cudaMemcpy((char *)dst_csr->nzval, (char *)src_csr->nzval, nnz*elemsize, cudaMemcpyHostToDevice);
+	if (STARPU_UNLIKELY(cures))
+		CUDA_REPORT_ERROR(cures);
+
+	cures = cudaMemcpy((char *)dst_csr->colind, (char *)src_csr->colind, nnz*sizeof(uint32_t), cudaMemcpyHostToDevice);
+	if (STARPU_UNLIKELY(cures))
+		CUDA_REPORT_ERROR(cures);
+
+	cures = cudaMemcpy((char *)dst_csr->rowptr, (char *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t), cudaMemcpyHostToDevice);
+	if (STARPU_UNLIKELY(cures))
+		CUDA_REPORT_ERROR(cures);
+
+	cudaThreadSynchronize();
 
-	cublasSetVector((nrow+1), sizeof(uint32_t), (uint8_t *)src_csr->rowptr, 1, 
-						(uint8_t *)dst_csr->rowptr, 1);
-	
 	TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 
 	return 0;