Browse Source

Don't use CUBLAS function to perform data transfers with the vector interface.

Cédric Augonnet 16 years ago
parent
commit
2927c638b3
1 changed files with 47 additions and 29 deletions
  1. 47 29
      src/datawizard/interfaces/vector_interface.c

+ 47 - 29
src/datawizard/interfaces/vector_interface.c

@@ -30,20 +30,20 @@
 
 static int dummy_copy_ram_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
 #ifdef USE_CUDA
-static int copy_ram_to_cublas(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
-static int copy_cublas_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
-static int copy_ram_to_cublas_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
-static int copy_cublas_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
+static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node);
+static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
+static int copy_cuda_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream);
 #endif
 
 static const struct copy_data_methods_s vector_copy_data_methods_s = {
 	.ram_to_ram = dummy_copy_ram_to_ram,
 	.ram_to_spu = NULL,
 #ifdef USE_CUDA
-	.ram_to_cuda = copy_ram_to_cublas,
-	.cuda_to_ram = copy_cublas_to_ram,
-	.ram_to_cuda_async = copy_ram_to_cublas_async,
-	.cuda_to_ram_async = copy_cublas_to_ram_async,
+	.ram_to_cuda = copy_ram_to_cuda,
+	.cuda_to_ram = copy_cuda_to_ram,
+	.ram_to_cuda_async = copy_ram_to_cuda_async,
+	.cuda_to_ram_async = copy_cuda_to_ram_async,
 #endif
 	.cuda_to_cuda = NULL,
 	.cuda_to_spu = NULL,
@@ -187,6 +187,7 @@ static size_t allocate_vector_buffer_on_node(starpu_data_handle handle, uint32_t
 	starpu_vector_interface_t *interface =
 		starpu_data_get_interface_on_node(handle, dst_node);
 
+	unsigned fail = 0;
 	uintptr_t addr = 0;
 	size_t allocated_memory;
 
@@ -195,29 +196,40 @@ static size_t allocate_vector_buffer_on_node(starpu_data_handle handle, uint32_t
 
 	node_kind kind = get_node_kind(dst_node);
 
+#ifdef USE_CUDA
+	cudaError_t status;
+#endif
+
 	switch(kind) {
 		case RAM:
 			addr = (uintptr_t)malloc(nx*elemsize);
+			if (!addr)
+				fail = 1;
 			break;
 #ifdef USE_CUDA
 		case CUDA_RAM:
-			cublasAlloc(nx, elemsize, (void **)&addr);
+			status = cudaMalloc((void **)&addr, nx*elemsize);
+			if (!addr || (status != cudaSuccess))
+			{
+				if (STARPU_UNLIKELY(status != cudaErrorMemoryAllocation))
+					CUDA_REPORT_ERROR(status);
+
+				fail = 1;
+			}
 			break;
 #endif
 		default:
 			assert(0);
 	}
 
-	if (addr) {
-		/* allocation succeeded */
-		allocated_memory = nx*elemsize;
+	if (fail)
+		return 0;
 
-		/* update the data properly in consequence */
-		interface->ptr = addr;
-	} else {
-		/* allocation failed */
-		allocated_memory = 0;
-	}
+	/* allocation succeeded */
+	allocated_memory = nx*elemsize;
+
+	/* update the data properly in consequence */
+	interface->ptr = addr;
 	
 	return allocated_memory;
 }
@@ -233,7 +245,7 @@ static void liberate_vector_buffer_on_node(void *interface, uint32_t node)
 			break;
 #ifdef USE_CUDA
 		case CUDA_RAM:
-			cublasFree((void*)vector_interface->ptr);
+			cudaFree((void*)vector_interface->ptr);
 			break;
 #endif
 		default:
@@ -242,7 +254,7 @@ static void liberate_vector_buffer_on_node(void *interface, uint32_t node)
 }
 
 #ifdef USE_CUDA
-static int copy_cublas_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
+static int copy_cuda_to_ram(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 {
 	starpu_vector_interface_t *src_vector;
 	starpu_vector_interface_t *dst_vector;
@@ -250,16 +262,19 @@ static int copy_cublas_to_ram(starpu_data_handle handle, uint32_t src_node, uint
 	src_vector = starpu_data_get_interface_on_node(handle, src_node);
 	dst_vector = starpu_data_get_interface_on_node(handle, dst_node);
 
-	cublasGetVector(src_vector->nx, src_vector->elemsize,
-		(uint8_t *)src_vector->ptr, 1,
-		(uint8_t *)dst_vector->ptr, 1);
+	cudaError_t cures;
+	cures = cudaMemcpy((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, cudaMemcpyDeviceToHost);
+	cudaThreadSynchronize();
+
+	if (STARPU_UNLIKELY(cures))
+		CUDA_REPORT_ERROR(cures);
 
 	TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
 
 	return 0;
 }
 
-static int copy_ram_to_cublas(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
+static int copy_ram_to_cuda(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node)
 {
 	starpu_vector_interface_t *src_vector;
 	starpu_vector_interface_t *dst_vector;
@@ -267,16 +282,19 @@ static int copy_ram_to_cublas(starpu_data_handle handle, uint32_t src_node, uint
 	src_vector = starpu_data_get_interface_on_node(handle, src_node);
 	dst_vector = starpu_data_get_interface_on_node(handle, dst_node);
 
-	cublasSetVector(src_vector->nx, src_vector->elemsize,
-		(uint8_t *)src_vector->ptr, 1,
-		(uint8_t *)dst_vector->ptr, 1);
+	cudaError_t cures;
+	cures = cudaMemcpy((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, cudaMemcpyHostToDevice);
+	cudaThreadSynchronize();
+
+	if (STARPU_UNLIKELY(cures))
+		CUDA_REPORT_ERROR(cures);
 
 	TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
 
 	return 0;
 }
 
-static int copy_cublas_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream)
+static int copy_cuda_to_ram_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream)
 {
 	starpu_vector_interface_t *src_vector;
 	starpu_vector_interface_t *dst_vector;
@@ -303,7 +321,7 @@ static int copy_cublas_to_ram_async(starpu_data_handle handle, uint32_t src_node
 	return EAGAIN;
 }
 
-static int copy_ram_to_cublas_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream)
+static int copy_ram_to_cuda_async(starpu_data_handle handle, uint32_t src_node, uint32_t dst_node, cudaStream_t *stream)
 {
 	starpu_vector_interface_t *src_vector;
 	starpu_vector_interface_t *dst_vector;