Browse Source

Block interface: fix src and dst pointers when copying non contiguous data to/from CUDA devices

Nathalie Furmento 15 years ago
parent
commit
f4b5814bc3
1 changed files with 24 additions and 38 deletions
  1. 24 38
      src/datawizard/interfaces/block_interface.c

+ 24 - 38
src/datawizard/interfaces/block_interface.c

@@ -417,14 +417,12 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__
 		unsigned layer;
 		for (layer = 0; layer < src_block->nz; layer++)
 		{
-			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) 
-						+ src_block->ldz*src_block->elemsize;
-			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) 
-						+ dst_block->ldz*dst_block->elemsize;
+			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
+			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
 
 			cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
-				(char *)src_ptr, src_block->ldy*elemsize,
-				nx*elemsize, ny, cudaMemcpyDeviceToHost);
+                                             (char *)src_ptr, src_block->ldy*elemsize,
+                                             nx*elemsize, ny, cudaMemcpyDeviceToHost);
 
 			if (STARPU_UNLIKELY(cures))
 				STARPU_CUDA_REPORT_ERROR(cures);
@@ -502,14 +500,12 @@ static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attri
 		unsigned layer;
 		for (layer = 0; layer < src_block->nz; layer++)
 		{
-			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) 
-						+ src_block->ldz*src_block->elemsize;
-			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) 
-						+ dst_block->ldz*dst_block->elemsize;
+			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
+			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
 
 			cures = cudaMemcpy2DAsync((char *)dst_ptr, dst_block->ldy*elemsize,
-					(char *)src_ptr, src_block->ldy*elemsize,
-					nx*elemsize, ny, cudaMemcpyDeviceToHost, *stream);
+                                                  (char *)src_ptr, src_block->ldy*elemsize,
+                                                  nx*elemsize, ny, cudaMemcpyDeviceToHost, *stream);
 
 			if (STARPU_UNLIKELY(cures))
 			{
@@ -518,7 +514,7 @@ static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attri
 			}
 
 		}
-		
+
 		ret = EAGAIN;
 
 	}
@@ -533,18 +529,15 @@ no_async_default:
 	unsigned layer;
 	for (layer = 0; layer < src_block->nz; layer++)
 	{
-		uint8_t *src_ptr = ((uint8_t *)src_block->ptr) 
-					+ src_block->ldz*src_block->elemsize;
-		uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) 
-					+ dst_block->ldz*dst_block->elemsize;
+		uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
+		uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
 
 		cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
-				(char *)src_ptr, src_block->ldy*elemsize,
-				nx*elemsize, ny, cudaMemcpyDeviceToHost);
+                                     (char *)src_ptr, src_block->ldy*elemsize,
+                                     nx*elemsize, ny, cudaMemcpyDeviceToHost);
 
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
-		
 	}
 	cudaThreadSynchronize();
 
@@ -618,10 +611,8 @@ static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attri
 		unsigned layer;
 		for (layer = 0; layer < src_block->nz; layer++)
 		{
-			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) 
-						+ src_block->ldz*src_block->elemsize;
-			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) 
-						+ dst_block->ldz*dst_block->elemsize;
+			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
+			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
 
 			cures = cudaMemcpy2DAsync((char *)dst_ptr, dst_block->ldy*elemsize,
 					(char *)src_ptr, src_block->ldy*elemsize,
@@ -634,7 +625,7 @@ static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attri
 			}
 
 		}
-		
+
 		ret = EAGAIN;
 
 	}
@@ -649,18 +640,15 @@ no_async_default:
 	unsigned layer;
 	for (layer = 0; layer < src_block->nz; layer++)
 	{
-		uint8_t *src_ptr = ((uint8_t *)src_block->ptr) 
-					+ src_block->ldz*src_block->elemsize;
-		uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) 
-					+ dst_block->ldz*dst_block->elemsize;
+		uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
+		uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
 
 		cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
-				(char *)src_ptr, src_block->ldy*elemsize,
-				nx*elemsize, ny, cudaMemcpyHostToDevice);
+                                     (char *)src_ptr, src_block->ldy*elemsize,
+                                     nx*elemsize, ny, cudaMemcpyHostToDevice);
 
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
-		
 	}
 	cudaThreadSynchronize();
 
@@ -706,14 +694,12 @@ static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__
 		unsigned layer;
 		for (layer = 0; layer < src_block->nz; layer++)
 		{
-			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) 
-						+ src_block->ldz*src_block->elemsize;
-			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) 
-						+ dst_block->ldz*dst_block->elemsize;
+			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
+			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
 
 			cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
-				(char *)src_ptr, src_block->ldy*elemsize,
-				nx*elemsize, ny, cudaMemcpyHostToDevice);
+                                             (char *)src_ptr, src_block->ldy*elemsize,
+                                             nx*elemsize, ny, cudaMemcpyHostToDevice);
 
 			if (STARPU_UNLIKELY(cures))
 				STARPU_CUDA_REPORT_ERROR(cures);