15 years ago · ddc2d2df2c
--- a/src/datawizard/interfaces/block_interface.c
+++ b/src/datawizard/interfaces/block_interface.c
@@ -566,7 +566,6 @@ static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attri
 
																 	size_t elemsize = src_block->elemsize;
															
 
																 	cudaError_t cures;
															
 
																-
															
 
																 	int ret;
															
 
																 	/* We may have a contiguous buffer for the entire block, or contiguous
															
@@ -672,8 +671,6 @@ no_async_default:
 
																 static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
															
 
																 {
															
 
																-	cudaError_t cures;
															
 
																-
															
 
																 	starpu_block_interface_t *src_block = src_interface;
															
 
																 	starpu_block_interface_t *dst_block = dst_interface;
															
@@ -682,15 +679,30 @@ static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__
 
																 	uint32_t nz = src_block->nz;
															
 
																 	size_t elemsize = src_block->elemsize;
															
 
																-	if ((src_block->nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
															
 
																+	cudaError_t cures;
															
 
																+        int ret;
															
 
																+
															
 
																+	/* We may have a contiguous buffer for the entire block, or contiguous
															
 
																+	 * plans within the block, we can avoid many small transfers that way */
															
 
																+	if ((nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
															
 
																 	{
															
 
																-		/* we are lucky */
															
 
																-		cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
															
 
																-						nx*ny*nz*elemsize, cudaMemcpyHostToDevice);
															
 
																+		/* Is that a single contiguous buffer ? */
															
 
																+		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
															
 
																+		{
															
 
																+			cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
															
 
																+                                           nx*ny*nz*elemsize, cudaMemcpyHostToDevice, *stream);
															
 
																+                }
															
 
																+                else {
															
 
																+			/* Are all plans contiguous */
															
 
																+			cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
															
 
																+                                             (char *)src_block->ptr, src_block->ldz*elemsize,
															
 
																+                                             nx*ny*elemsize, nz, cudaMemcpyHostToDevice, *stream);
															
 
																+                }
															
 
																 		if (STARPU_UNLIKELY(cures))
															
 
																 			STARPU_CUDA_REPORT_ERROR(cures);
															
 
																 	}
															
 
																 	else {
															
 
																+		/* Default case: we transfer all lines one by one: ny*nz transfers */
															
 
																 		unsigned layer;
															
 
																 		for (layer = 0; layer < src_block->nz; layer++)
															
 
																 		{