Prechádzať zdrojové kódy

Block interface: update synchronous method to match the asynchronous one

Nathalie Furmento 15 rokov pred
rodič
commit
1c4b284f00
1 zmenil súbory, kde vykonal 21 pridanie a 9 odobranie
  1. 21 9
      src/datawizard/interfaces/block_interface.c

+ 21 - 9
src/datawizard/interfaces/block_interface.c

@@ -383,8 +383,6 @@ static void free_block_buffer_on_node(void *interface, uint32_t node)
 #ifdef STARPU_USE_CUDA
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
 {
-	cudaError_t cures;
-
 	starpu_block_interface_t *src_block = src_interface;
 	starpu_block_interface_t *dst_block = dst_interface;
 
@@ -393,15 +391,29 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__
 	uint32_t nz = src_block->nz;
 	size_t elemsize = src_block->elemsize;
 
-	if ((src_block->nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
+	cudaError_t cures;
+
+	if ((nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
 	{
-		/* we are lucky */
-		cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
-					nx*ny*nz*elemsize, cudaMemcpyDeviceToHost);
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
+		/* Is that a single contiguous buffer ? */
+		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
+		{
+                        cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
+                                           nx*ny*nz*elemsize, cudaMemcpyDeviceToHost);
+                        if (STARPU_UNLIKELY(cures))
+                                STARPU_CUDA_REPORT_ERROR(cures);
+                }
+		else {
+			/* Are all plans contiguous */
+                        cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
+                                             (char *)src_block->ptr, src_block->ldz*elemsize,
+                                             nx*ny*elemsize, nz, cudaMemcpyDeviceToHost);
+                        if (STARPU_UNLIKELY(cures))
+                                STARPU_CUDA_REPORT_ERROR(cures);
+                }
 	}
 	else {
+		/* Default case: we transfer all lines one by one: ny*nz transfers */
 		unsigned layer;
 		for (layer = 0; layer < src_block->nz; layer++)
 		{
@@ -418,7 +430,7 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__
 				STARPU_CUDA_REPORT_ERROR(cures);
 		}
 	}
-	
+
 	cudaThreadSynchronize();
 
 	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->elemsize*src_block->elemsize);