Browse Source

- Factorize functions which only differ by one parameter
- Implement CUDA<->CUDA (intra-device) copy methods for existing interfaces

Cédric Augonnet 14 years ago
parent
commit
19f09aba2d

+ 16 - 33
src/datawizard/interfaces/bcsr_interface.c

@@ -34,6 +34,7 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__(
 #ifdef STARPU_USE_CUDA
 static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 #endif
 #ifdef STARPU_USE_OPENCL
 static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
@@ -46,12 +47,12 @@ static const struct starpu_data_copy_methods bcsr_copy_data_methods_s = {
 #ifdef STARPU_USE_CUDA
 	.ram_to_cuda = copy_ram_to_cuda,
 	.cuda_to_ram = copy_cuda_to_ram,
+	.cuda_to_cuda = copy_cuda_to_cuda,
 #endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = copy_ram_to_opencl,
 	.opencl_to_ram = copy_opencl_to_ram,
 #endif
-	.cuda_to_cuda = NULL,
 	.cuda_to_spu = NULL,
 	.spu_to_ram = NULL,
 	.spu_to_cuda = NULL,
@@ -410,7 +411,7 @@ static void free_bcsr_buffer_on_node(void *interface, uint32_t node)
 }
 
 #ifdef STARPU_USE_CUDA
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), enum cudaMemcpyKind kind)
 {
 	starpu_bcsr_interface_t *src_bcsr = src_interface;
 	starpu_bcsr_interface_t *dst_bcsr = dst_interface;
@@ -424,15 +425,15 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__
 
 	cudaError_t cures;
 
-	cures = cudaMemcpy((char *)dst_bcsr->nzval, (char *)src_bcsr->nzval, nnz*r*c*elemsize, cudaMemcpyDeviceToHost);
+	cures = cudaMemcpy((char *)dst_bcsr->nzval, (char *)src_bcsr->nzval, nnz*r*c*elemsize, kind);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
-	cures = cudaMemcpy((char *)dst_bcsr->colind, (char *)src_bcsr->colind, nnz*sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	cures = cudaMemcpy((char *)dst_bcsr->colind, (char *)src_bcsr->colind, nnz*sizeof(uint32_t), kind);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
-	cures = cudaMemcpy((char *)dst_bcsr->rowptr, (char *)src_bcsr->rowptr, (nrow+1)*sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	cures = cudaMemcpy((char *)dst_bcsr->rowptr, (char *)src_bcsr->rowptr, (nrow+1)*sizeof(uint32_t), kind);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
@@ -443,37 +444,19 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__
 	return 0;
 }
 
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
 {
-	starpu_bcsr_interface_t *src_bcsr = src_interface;
-	starpu_bcsr_interface_t *dst_bcsr = dst_interface;
-
-	uint32_t nnz = src_bcsr->nnz;
-	uint32_t nrow = src_bcsr->nrow;
-	size_t elemsize = src_bcsr->elemsize;
-
-	uint32_t r = src_bcsr->r;
-	uint32_t c = src_bcsr->c;
-
-	cudaError_t cures;
-
-	cures = cudaMemcpy((char *)dst_bcsr->nzval, (char *)src_bcsr->nzval, nnz*r*c*elemsize, cudaMemcpyHostToDevice);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-
-	cures = cudaMemcpy((char *)dst_bcsr->colind, (char *)src_bcsr->colind, nnz*sizeof(uint32_t), cudaMemcpyHostToDevice);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-
-	cures = cudaMemcpy((char *)dst_bcsr->rowptr, (char *)src_bcsr->rowptr, (nrow+1)*sizeof(uint32_t), cudaMemcpyHostToDevice);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-
-	cudaThreadSynchronize();
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
+}
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
+}
 
-	return 0;
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
 }
 #endif // STARPU_USE_CUDA
 

+ 28 - 168
src/datawizard/interfaces/block_interface.c

@@ -32,6 +32,7 @@ static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 #endif
 #ifdef STARPU_USE_OPENCL
 static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
@@ -48,6 +49,7 @@ static const struct starpu_data_copy_methods block_copy_data_methods_s = {
 	.cuda_to_ram = copy_cuda_to_ram,
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
+	.cuda_to_cuda = copy_cuda_to_cuda,
 #endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = copy_ram_to_opencl,
@@ -55,7 +57,6 @@ static const struct starpu_data_copy_methods block_copy_data_methods_s = {
         .ram_to_opencl_async = copy_ram_to_opencl_async,
 	.opencl_to_ram_async = copy_opencl_to_ram_async,
 #endif
-	.cuda_to_cuda = NULL,
 	.cuda_to_spu = NULL,
 	.spu_to_ram = NULL,
 	.spu_to_cuda = NULL,
@@ -380,7 +381,7 @@ static void free_block_buffer_on_node(void *interface, uint32_t node)
 }
 
 #ifdef STARPU_USE_CUDA
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), enum cudaMemcpyKind kind)
 {
 	starpu_block_interface_t *src_block = src_interface;
 	starpu_block_interface_t *dst_block = dst_interface;
@@ -398,7 +399,7 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__
 		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
 		{
                         cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
-                                           nx*ny*nz*elemsize, cudaMemcpyDeviceToHost);
+                                           nx*ny*nz*elemsize, kind);
                         if (STARPU_UNLIKELY(cures))
                                 STARPU_CUDA_REPORT_ERROR(cures);
                 }
@@ -406,7 +407,7 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__
 			/* Are all plans contiguous */
                         cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
                                              (char *)src_block->ptr, src_block->ldz*elemsize,
-                                             nx*ny*elemsize, nz, cudaMemcpyDeviceToHost);
+                                             nx*ny*elemsize, nz, kind);
                         if (STARPU_UNLIKELY(cures))
                                 STARPU_CUDA_REPORT_ERROR(cures);
                 }
@@ -421,7 +422,7 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__
 
 			cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
                                              (char *)src_ptr, src_block->ldy*elemsize,
-                                             nx*elemsize, ny, cudaMemcpyDeviceToHost);
+                                             nx*elemsize, ny, kind);
 
 			if (STARPU_UNLIKELY(cures))
 				STARPU_CUDA_REPORT_ERROR(cures);
@@ -435,7 +436,7 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__
 	return 0;
 }
 
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream, enum cudaMemcpyKind kind)
 {
 	starpu_block_interface_t *src_block = src_interface;
 	starpu_block_interface_t *dst_block = dst_interface;
@@ -457,11 +458,11 @@ static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attri
 		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
 		{
 			cures = cudaMemcpyAsync((char *)dst_block->ptr, (char *)src_block->ptr,
-					nx*ny*nz*elemsize, cudaMemcpyDeviceToHost, *stream);
+					nx*ny*nz*elemsize, kind, *stream);
 			if (STARPU_UNLIKELY(cures))
 			{
 				cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
-					nx*ny*nz*elemsize, cudaMemcpyDeviceToHost);
+					nx*ny*nz*elemsize, kind);
 				if (STARPU_UNLIKELY(cures))
 					STARPU_CUDA_REPORT_ERROR(cures);
 				cudaThreadSynchronize();
@@ -477,12 +478,12 @@ static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attri
 			/* Are all plans contiguous */
 			cures = cudaMemcpy2DAsync((char *)dst_block->ptr, dst_block->ldz*elemsize,
 					(char *)src_block->ptr, src_block->ldz*elemsize,
-					nx*ny*elemsize, nz, cudaMemcpyDeviceToHost, *stream);
+					nx*ny*elemsize, nz, kind, *stream);
 			if (STARPU_UNLIKELY(cures))
 			{
 				cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
 						(char *)src_block->ptr, src_block->ldz*elemsize,
-						nx*ny*elemsize, nz, cudaMemcpyDeviceToHost);
+						nx*ny*elemsize, nz, kind);
 				if (STARPU_UNLIKELY(cures))
 					STARPU_CUDA_REPORT_ERROR(cures);
 				cudaThreadSynchronize();
@@ -504,7 +505,7 @@ static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attri
 
 			cures = cudaMemcpy2DAsync((char *)dst_ptr, dst_block->ldy*elemsize,
                                                   (char *)src_ptr, src_block->ldy*elemsize,
-                                                  nx*elemsize, ny, cudaMemcpyDeviceToHost, *stream);
+                                                  nx*elemsize, ny, kind, *stream);
 
 			if (STARPU_UNLIKELY(cures))
 			{
@@ -533,7 +534,7 @@ no_async_default:
 
 		cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
                                      (char *)src_ptr, src_block->ldy*elemsize,
-                                     nx*elemsize, ny, cudaMemcpyDeviceToHost);
+                                     nx*elemsize, ny, kind);
 
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
@@ -545,170 +546,29 @@ no_async_default:
 	}
 }
 
-
-
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
 {
-	starpu_block_interface_t *src_block = src_interface;
-	starpu_block_interface_t *dst_block = dst_interface;
-
-	uint32_t nx = src_block->nx;
-	uint32_t ny = src_block->ny;
-	uint32_t nz = src_block->nz;
-	size_t elemsize = src_block->elemsize;
-
-	cudaError_t cures;
-	int ret;
-
-	/* We may have a contiguous buffer for the entire block, or contiguous
-	 * plans within the block, we can avoid many small transfers that way */
-	if ((nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
-	{
-		/* Is that a single contiguous buffer ? */
-		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
-		{
-			cures = cudaMemcpyAsync((char *)dst_block->ptr, (char *)src_block->ptr,
-					nx*ny*nz*elemsize, cudaMemcpyHostToDevice, *stream);
-			if (STARPU_UNLIKELY(cures))
-			{
-				cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
-					nx*ny*nz*elemsize, cudaMemcpyHostToDevice);
-				if (STARPU_UNLIKELY(cures))
-					STARPU_CUDA_REPORT_ERROR(cures);
-				cudaThreadSynchronize();
-
-				ret = 0;
-			}
-			else {
-				ret = -EAGAIN;
-			}
-			
-		}
-		else {
-			/* Are all plans contiguous */
-			cures = cudaMemcpy2DAsync((char *)dst_block->ptr, dst_block->ldz*elemsize,
-					(char *)src_block->ptr, src_block->ldz*elemsize,
-					nx*ny*elemsize, nz, cudaMemcpyHostToDevice, *stream);
-			if (STARPU_UNLIKELY(cures))
-			{
-				cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
-						(char *)src_block->ptr, src_block->ldz*elemsize,
-						nx*ny*elemsize, nz, cudaMemcpyHostToDevice);
-				if (STARPU_UNLIKELY(cures))
-					STARPU_CUDA_REPORT_ERROR(cures);
-				cudaThreadSynchronize();
-
-				ret = 0;
-			}
-			else {
-				ret = -EAGAIN;
-			}
-		}
-	}
-	else {
-		/* Default case: we transfer all lines one by one: ny*nz transfers */
-		unsigned layer;
-		for (layer = 0; layer < src_block->nz; layer++)
-		{
-			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
-			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
-
-			cures = cudaMemcpy2DAsync((char *)dst_ptr, dst_block->ldy*elemsize,
-					(char *)src_ptr, src_block->ldy*elemsize,
-					nx*elemsize, ny, cudaMemcpyHostToDevice, *stream);
-
-			if (STARPU_UNLIKELY(cures))
-			{
-				/* I don't know how to do that "better" */
-				goto no_async_default;
-			}
-
-		}
-
-		ret = -EAGAIN;
-
-	}
-
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
-
-	return ret;
-
-no_async_default:
-
-	{
-	unsigned layer;
-	for (layer = 0; layer < src_block->nz; layer++)
-	{
-		uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
-		uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
-
-		cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
-                                     (char *)src_ptr, src_block->ldy*elemsize,
-                                     nx*elemsize, ny, cudaMemcpyHostToDevice);
-
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-	}
-	cudaThreadSynchronize();
-
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
-	return 0;
-	}
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
 }
 
 static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
 {
-	starpu_block_interface_t *src_block = src_interface;
-	starpu_block_interface_t *dst_block = dst_interface;
-
-	uint32_t nx = src_block->nx;
-	uint32_t ny = src_block->ny;
-	uint32_t nz = src_block->nz;
-	size_t elemsize = src_block->elemsize;
-
-	cudaError_t cures;
-
-	/* We may have a contiguous buffer for the entire block, or contiguous
-	 * plans within the block, we can avoid many small transfers that way */
-	if ((nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
-	{
-		/* Is that a single contiguous buffer ? */
-		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
-		{
-			cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
-                                           nx*ny*nz*elemsize, cudaMemcpyHostToDevice);
-                }
-                else {
-			/* Are all plans contiguous */
-			cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
-                                             (char *)src_block->ptr, src_block->ldz*elemsize,
-                                             nx*ny*elemsize, nz, cudaMemcpyHostToDevice);
-                }
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-	}
-	else {
-		/* Default case: we transfer all lines one by one: ny*nz transfers */
-		unsigned layer;
-		for (layer = 0; layer < src_block->nz; layer++)
-		{
-			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
-			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
-
-			cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
-                                             (char *)src_ptr, src_block->ldy*elemsize,
-                                             nx*elemsize, ny, cudaMemcpyHostToDevice);
-
-			if (STARPU_UNLIKELY(cures))
-				STARPU_CUDA_REPORT_ERROR(cures);
-		}
-	}
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
+}
 
-	cudaThreadSynchronize();
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
+}
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+{
+	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
+}
 
-	return 0;
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+{
+	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
 }
 #endif // STARPU_USE_CUDA
 

+ 16 - 30
src/datawizard/interfaces/csr_interface.c

@@ -30,6 +30,7 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__(
 #ifdef STARPU_USE_CUDA
 static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 #endif
 #ifdef STARPU_USE_OPENCL
 static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
@@ -42,12 +43,12 @@ static const struct starpu_data_copy_methods csr_copy_data_methods_s = {
 #ifdef STARPU_USE_CUDA
 	.ram_to_cuda = copy_ram_to_cuda,
 	.cuda_to_ram = copy_cuda_to_ram,
+	.cuda_to_cuda = copy_cuda_to_cuda,
 #endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = copy_ram_to_opencl,
 	.opencl_to_ram = copy_opencl_to_ram,
 #endif
-	.cuda_to_cuda = NULL,
 	.cuda_to_spu = NULL,
 	.spu_to_ram = NULL,
 	.spu_to_cuda = NULL,
@@ -377,7 +378,7 @@ static void free_csr_buffer_on_node(void *interface, uint32_t node)
 }
 
 #ifdef STARPU_USE_CUDA
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), enum cudaMemcpyKind kind)
 {
 	starpu_csr_interface_t *src_csr = src_interface;
 	starpu_csr_interface_t *dst_csr = dst_interface;
@@ -388,15 +389,15 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__
 
 	cudaError_t cures;
 
-	cures = cudaMemcpy((char *)dst_csr->nzval, (char *)src_csr->nzval, nnz*elemsize, cudaMemcpyDeviceToHost);
+	cures = cudaMemcpy((char *)dst_csr->nzval, (char *)src_csr->nzval, nnz*elemsize, kind);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
-	cures = cudaMemcpy((char *)dst_csr->colind, (char *)src_csr->colind, nnz*sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	cures = cudaMemcpy((char *)dst_csr->colind, (char *)src_csr->colind, nnz*sizeof(uint32_t), kind);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
-	cures = cudaMemcpy((char *)dst_csr->rowptr, (char *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	cures = cudaMemcpy((char *)dst_csr->rowptr, (char *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t), kind);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
@@ -407,34 +408,19 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__
 	return 0;
 }
 
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
 {
-	starpu_csr_interface_t *src_csr = src_interface;
-	starpu_csr_interface_t *dst_csr = dst_interface;
-
-	uint32_t nnz = src_csr->nnz;
-	uint32_t nrow = src_csr->nrow;
-	size_t elemsize = src_csr->elemsize;
-
-	cudaError_t cures;
-
-	cures = cudaMemcpy((char *)dst_csr->nzval, (char *)src_csr->nzval, nnz*elemsize, cudaMemcpyHostToDevice);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-
-	cures = cudaMemcpy((char *)dst_csr->colind, (char *)src_csr->colind, nnz*sizeof(uint32_t), cudaMemcpyHostToDevice);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-
-	cures = cudaMemcpy((char *)dst_csr->rowptr, (char *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t), cudaMemcpyHostToDevice);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-
-	cudaThreadSynchronize();
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
+}
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
+}
 
-	return 0;
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
 }
 #endif // STARPU_USE_CUDA
 

+ 15 - 21
src/datawizard/interfaces/matrix_interface.c

@@ -28,6 +28,7 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node __attribute__(
 #ifdef STARPU_USE_CUDA
 static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
 #endif
@@ -46,6 +47,7 @@ static const struct starpu_data_copy_methods matrix_copy_data_methods_s = {
 	.cuda_to_ram = copy_cuda_to_ram,
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
+	.cuda_to_cuda = copy_cuda_to_cuda,
 #endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = copy_ram_to_opencl,
@@ -53,7 +55,6 @@ static const struct starpu_data_copy_methods matrix_copy_data_methods_s = {
         .ram_to_opencl_async = copy_ram_to_opencl_async,
 	.opencl_to_ram_async = copy_opencl_to_ram_async,
 #endif
-	.cuda_to_cuda = NULL,
 	.cuda_to_spu = NULL,
 	.spu_to_ram = NULL,
 	.spu_to_cuda = NULL,
@@ -348,7 +349,7 @@ static void free_matrix_buffer_on_node(void *interface, uint32_t node)
 }
 
 #ifdef STARPU_USE_CUDA
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), enum cudaMemcpyKind kind)
 {
 	starpu_matrix_interface_t *src_matrix = src_interface;
 	starpu_matrix_interface_t *dst_matrix = dst_interface;
@@ -358,7 +359,7 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__
 	cudaError_t cures;
 	cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
 			(char *)src_matrix->ptr, src_matrix->ld*elemsize,
-			src_matrix->nx*elemsize, src_matrix->ny, cudaMemcpyDeviceToHost);
+			src_matrix->nx*elemsize, src_matrix->ny, kind);
 	if (STARPU_UNLIKELY(cures))
 		STARPU_CUDA_REPORT_ERROR(cures);
 
@@ -367,27 +368,20 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__
 	return 0;
 }
 
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
-{
-	starpu_matrix_interface_t *src_matrix = src_interface;
-	starpu_matrix_interface_t *dst_matrix = dst_interface;
 
-	size_t elemsize = src_matrix->elemsize;
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
+}
 
-	cudaError_t cures;
-	cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
-			(char *)src_matrix->ptr, src_matrix->ld*elemsize,
-			src_matrix->nx*elemsize, src_matrix->ny, cudaMemcpyHostToDevice);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-		
-	cures = cudaThreadSynchronize();
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-		
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
+}
 
-	return 0;
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
 }
 
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)

+ 34 - 40
src/datawizard/interfaces/variable_interface.c

@@ -31,6 +31,7 @@ static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_in
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
 #endif
 #ifdef STARPU_USE_OPENCL
 static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node __attribute__((unused)));
@@ -47,6 +48,7 @@ static const struct starpu_data_copy_methods variable_copy_data_methods_s = {
 	.cuda_to_ram = copy_cuda_to_ram,
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
+	.cuda_to_cuda = copy_cuda_to_cuda,
 #endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = copy_ram_to_opencl,
@@ -54,7 +56,6 @@ static const struct starpu_data_copy_methods variable_copy_data_methods_s = {
         .ram_to_opencl_async = copy_ram_to_opencl_async,
 	.opencl_to_ram_async = copy_opencl_to_ram_async,
 #endif
-	.cuda_to_cuda = NULL,
 	.cuda_to_spu = NULL,
 	.spu_to_ram = NULL,
 	.spu_to_cuda = NULL,
@@ -264,13 +265,14 @@ static void free_variable_buffer_on_node(void *interface, uint32_t node)
 }
 
 #ifdef STARPU_USE_CUDA
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__((unused)),
+				void *dst_interface, unsigned dst_node __attribute__((unused)), enum cudaMemcpyKind kind)
 {
 	starpu_variable_interface_t *src_variable = src_interface;
 	starpu_variable_interface_t *dst_variable = dst_interface;
 
 	cudaError_t cures;
-	cures = cudaMemcpy((char *)dst_variable->ptr, (char *)src_variable->ptr, src_variable->elemsize, cudaMemcpyDeviceToHost);
+	cures = cudaMemcpy((char *)dst_variable->ptr, (char *)src_variable->ptr, src_variable->elemsize, kind);
 	cudaThreadSynchronize();
 
 	if (STARPU_UNLIKELY(cures))
@@ -281,34 +283,38 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__
 	return 0;
 }
 
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
-{
-	starpu_variable_interface_t *src_variable = src_interface;
-	starpu_variable_interface_t *dst_variable = dst_interface;
-
-	cudaError_t cures;
-	cures = cudaMemcpy((char *)dst_variable->ptr, (char *)src_variable->ptr, src_variable->elemsize, cudaMemcpyHostToDevice);
-	cudaThreadSynchronize();
 
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)),
+				void *dst_interface, unsigned dst_node __attribute__((unused)))
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
+}
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)),
+				void *dst_interface, unsigned dst_node __attribute__((unused)))
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
+}
 
-	return 0;
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)),
+				void *dst_interface, unsigned dst_node __attribute__((unused)))
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
 }
 
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)),
+					void *dst_interface, unsigned dst_node __attribute__((unused)),
+					cudaStream_t *stream, enum cudaMemcpyKind kind)
 {
 	starpu_variable_interface_t *src_variable = src_interface;
 	starpu_variable_interface_t *dst_variable = dst_interface;
 
 	cudaError_t cures;
-	cures = cudaMemcpyAsync((char *)dst_variable->ptr, (char *)src_variable->ptr, src_variable->elemsize, cudaMemcpyDeviceToHost, *stream);
+	cures = cudaMemcpyAsync((char *)dst_variable->ptr, (char *)src_variable->ptr, src_variable->elemsize, kind, *stream);
 	if (cures)
 	{
 		/* do it in a synchronous fashion */
-		cures = cudaMemcpy((char *)dst_variable->ptr, (char *)src_variable->ptr, src_variable->elemsize, cudaMemcpyDeviceToHost);
+		cures = cudaMemcpy((char *)dst_variable->ptr, (char *)src_variable->ptr, src_variable->elemsize, kind);
 		cudaThreadSynchronize();
 
 		if (STARPU_UNLIKELY(cures))
@@ -317,34 +323,22 @@ static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attri
 		return 0;
 	}
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
+	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->elemsize);
 
 	return -EAGAIN;
 }
 
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
-{
-	starpu_variable_interface_t *src_variable = src_interface;
-	starpu_variable_interface_t *dst_variable = dst_interface;
 
-	cudaError_t cures;
-	
-	cures = cudaMemcpyAsync((char *)dst_variable->ptr, (char *)src_variable->ptr, src_variable->elemsize, cudaMemcpyHostToDevice, *stream);
-	if (cures)
-	{
-		/* do it in a synchronous fashion */
-		cures = cudaMemcpy((char *)dst_variable->ptr, (char *)src_variable->ptr, src_variable->elemsize, cudaMemcpyHostToDevice);
-		cudaThreadSynchronize();
-
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-
-		return 0;
-	}
-
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)),
+					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+{
+	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
+}
 
-	return -EAGAIN;
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)),
+					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+{
+	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
 }
 #endif // STARPU_USE_CUDA
 

+ 32 - 41
src/datawizard/interfaces/vector_interface.c

@@ -30,6 +30,7 @@ static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node);
 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, cudaStream_t *stream);
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node, cudaStream_t *stream);
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 #endif
 #ifdef STARPU_USE_OPENCL
 static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node);
@@ -46,6 +47,7 @@ static const struct starpu_data_copy_methods vector_copy_data_methods_s = {
 	.cuda_to_ram = copy_cuda_to_ram,
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
+	.cuda_to_cuda = copy_cuda_to_cuda,
 #endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = copy_ram_to_opencl,
@@ -53,7 +55,6 @@ static const struct starpu_data_copy_methods vector_copy_data_methods_s = {
         .ram_to_opencl_async = copy_ram_to_opencl_async,
 	.opencl_to_ram_async = copy_opencl_to_ram_async,
 #endif
-	.cuda_to_cuda = NULL,
 	.cuda_to_spu = NULL,
 	.spu_to_ram = NULL,
 	.spu_to_cuda = NULL,
@@ -299,14 +300,14 @@ static void free_vector_buffer_on_node(void *interface, uint32_t node)
 }
 
 #ifdef STARPU_USE_CUDA
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)),
-				void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__((unused)),
+				void *dst_interface, unsigned dst_node __attribute__((unused)), enum cudaMemcpyKind kind)
 {
 	starpu_vector_interface_t *src_vector = src_interface;
 	starpu_vector_interface_t *dst_vector = dst_interface;
 
 	cudaError_t cures;
-	cures = cudaMemcpy((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, cudaMemcpyDeviceToHost);
+	cures = cudaMemcpy((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, kind);
 	cudaThreadSynchronize();
 
 	if (STARPU_UNLIKELY(cures))
@@ -317,36 +318,38 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__
 	return 0;
 }
 
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)),
+
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)),
 				void *dst_interface, unsigned dst_node __attribute__((unused)))
 {
-	starpu_vector_interface_t *src_vector = src_interface;
-	starpu_vector_interface_t *dst_vector = dst_interface;
-
-	cudaError_t cures;
-	cures = cudaMemcpy((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, cudaMemcpyHostToDevice);
-	cudaThreadSynchronize();
-
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
+}
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
+static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)),
+				void *dst_interface, unsigned dst_node __attribute__((unused)))
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
+}
 
-	return 0;
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)),
+				void *dst_interface, unsigned dst_node __attribute__((unused)))
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
 }
 
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)),
-					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)),
+					void *dst_interface, unsigned dst_node __attribute__((unused)),
+					cudaStream_t *stream, enum cudaMemcpyKind kind)
 {
 	starpu_vector_interface_t *src_vector = src_interface;
 	starpu_vector_interface_t *dst_vector = dst_interface;
 
 	cudaError_t cures;
-	cures = cudaMemcpyAsync((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, cudaMemcpyDeviceToHost, *stream);
+	cures = cudaMemcpyAsync((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, kind, *stream);
 	if (cures)
 	{
 		/* do it in a synchronous fashion */
-		cures = cudaMemcpy((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, cudaMemcpyDeviceToHost);
+		cures = cudaMemcpy((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, kind);
 		cudaThreadSynchronize();
 
 		if (STARPU_UNLIKELY(cures))
@@ -360,31 +363,19 @@ static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attri
 	return -EAGAIN;
 }
 
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)),
+
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)),
 					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
 {
-	starpu_vector_interface_t *src_vector = src_interface;
-	starpu_vector_interface_t *dst_vector = dst_interface;
-
-	cudaError_t cures;
-	
-	cures = cudaMemcpyAsync((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, cudaMemcpyHostToDevice, *stream);
-	if (cures)
-	{
-		/* do it in a synchronous fashion */
-		cures = cudaMemcpy((char *)dst_vector->ptr, (char *)src_vector->ptr, src_vector->nx*src_vector->elemsize, cudaMemcpyHostToDevice);
-		cudaThreadSynchronize();
-
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-
-		return 0;
-	}
-
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
+	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
+}
 
-	return -EAGAIN;
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)),
+					void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+{
+	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
 }
+
 #endif // STARPU_USE_CUDA
 
 #ifdef STARPU_USE_OPENCL

+ 6 - 6
src/datawizard/interfaces/void_interface.c

@@ -34,7 +34,7 @@ static int dummy_opencl_copy_async(void *src_interface, unsigned src_node, void
 
 static const struct starpu_data_copy_methods void_copy_data_methods_s = {
 	.ram_to_ram = dummy_copy,
-	.ram_to_spu = NULL,
+	.ram_to_spu = dummy_copy,
 #ifdef STARPU_USE_CUDA
 	.ram_to_cuda = dummy_copy,
 	.cuda_to_ram = dummy_copy,
@@ -47,11 +47,11 @@ static const struct starpu_data_copy_methods void_copy_data_methods_s = {
         .ram_to_opencl_async = dummy_opencl_copy_async,
 	.opencl_to_ram_async = dummy_opencl_copy_async,
 #endif
-	.cuda_to_cuda = NULL,
-	.cuda_to_spu = NULL,
-	.spu_to_ram = NULL,
-	.spu_to_cuda = NULL,
-	.spu_to_spu = NULL
+	.cuda_to_cuda = dummy_copy,
+	.cuda_to_spu = dummy_copy,
+	.spu_to_ram = dummy_copy,
+	.spu_to_cuda = dummy_copy,
+	.spu_to_spu = dummy_copy
 };
 
 static void register_void_handle(starpu_data_handle handle, uint32_t home_node, void *interface);