@@ -32,6 +32,7 @@ static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__
 static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
 static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream);
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
 #endif
 #ifdef STARPU_USE_OPENCL
 static int copy_ram_to_opencl(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)));
@@ -48,6 +49,7 @@ static const struct starpu_data_copy_methods block_copy_data_methods_s = {
 	.cuda_to_ram = copy_cuda_to_ram,
 	.ram_to_cuda_async = copy_ram_to_cuda_async,
 	.cuda_to_ram_async = copy_cuda_to_ram_async,
+	.cuda_to_cuda = copy_cuda_to_cuda,
 #endif
 #ifdef STARPU_USE_OPENCL
 	.ram_to_opencl = copy_ram_to_opencl,
@@ -55,7 +57,6 @@ static const struct starpu_data_copy_methods block_copy_data_methods_s = {
 	.ram_to_opencl_async = copy_ram_to_opencl_async,
 	.opencl_to_ram_async = copy_opencl_to_ram_async,
 #endif
-	.cuda_to_cuda = NULL,
 	.cuda_to_spu = NULL,
 	.spu_to_ram = NULL,
 	.spu_to_cuda = NULL,
@@ -380,7 +381,7 @@ static void free_block_buffer_on_node(void *interface, uint32_t node)
 }
 
 #ifdef STARPU_USE_CUDA
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+static int copy_cuda_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), enum cudaMemcpyKind kind)
 {
 	starpu_block_interface_t *src_block = src_interface;
 	starpu_block_interface_t *dst_block = dst_interface;
@@ -398,7 +399,7 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__
 		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
 		{
 			cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
-				nx*ny*nz*elemsize, cudaMemcpyDeviceToHost);
+				nx*ny*nz*elemsize, kind);
 			if (STARPU_UNLIKELY(cures))
 				STARPU_CUDA_REPORT_ERROR(cures);
 		}
@@ -406,7 +407,7 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__
 			/* Are all plans contiguous */
 			cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
 				(char *)src_block->ptr, src_block->ldz*elemsize,
-				nx*ny*elemsize, nz, cudaMemcpyDeviceToHost);
+				nx*ny*elemsize, nz, kind);
 			if (STARPU_UNLIKELY(cures))
 				STARPU_CUDA_REPORT_ERROR(cures);
 		}
@@ -421,7 +422,7 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__
 
 			cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
 				(char *)src_ptr, src_block->ldy*elemsize,
-				nx*elemsize, ny, cudaMemcpyDeviceToHost);
+				nx*elemsize, ny, kind);
 
 			if (STARPU_UNLIKELY(cures))
 				STARPU_CUDA_REPORT_ERROR(cures);
@@ -435,7 +436,7 @@ static int copy_cuda_to_ram(void *src_interface, unsigned src_node __attribute__
 	return 0;
 }
 
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+static int copy_cuda_async_common(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream, enum cudaMemcpyKind kind)
 {
 	starpu_block_interface_t *src_block = src_interface;
 	starpu_block_interface_t *dst_block = dst_interface;
@@ -457,11 +458,11 @@ static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attri
 		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
 		{
 			cures = cudaMemcpyAsync((char *)dst_block->ptr, (char *)src_block->ptr,
-				nx*ny*nz*elemsize, cudaMemcpyDeviceToHost, *stream);
+				nx*ny*nz*elemsize, kind, *stream);
 			if (STARPU_UNLIKELY(cures))
 			{
 				cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
-					nx*ny*nz*elemsize, cudaMemcpyDeviceToHost);
+					nx*ny*nz*elemsize, kind);
 				if (STARPU_UNLIKELY(cures))
 					STARPU_CUDA_REPORT_ERROR(cures);
 				cudaThreadSynchronize();
@@ -477,12 +478,12 @@ static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attri
 			/* Are all plans contiguous */
 			cures = cudaMemcpy2DAsync((char *)dst_block->ptr, dst_block->ldz*elemsize,
 				(char *)src_block->ptr, src_block->ldz*elemsize,
-				nx*ny*elemsize, nz, cudaMemcpyDeviceToHost, *stream);
+				nx*ny*elemsize, nz, kind, *stream);
 			if (STARPU_UNLIKELY(cures))
 			{
 				cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
 					(char *)src_block->ptr, src_block->ldz*elemsize,
-					nx*ny*elemsize, nz, cudaMemcpyDeviceToHost);
+					nx*ny*elemsize, nz, kind);
 				if (STARPU_UNLIKELY(cures))
 					STARPU_CUDA_REPORT_ERROR(cures);
 				cudaThreadSynchronize();
@@ -504,7 +505,7 @@ static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attri
 
 			cures = cudaMemcpy2DAsync((char *)dst_ptr, dst_block->ldy*elemsize,
 				(char *)src_ptr, src_block->ldy*elemsize,
-				nx*elemsize, ny, cudaMemcpyDeviceToHost, *stream);
+				nx*elemsize, ny, kind, *stream);
 
 			if (STARPU_UNLIKELY(cures))
 			{
@@ -533,7 +534,7 @@ no_async_default:
 
 		cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
 			(char *)src_ptr, src_block->ldy*elemsize,
-			nx*elemsize, ny, cudaMemcpyDeviceToHost);
+			nx*elemsize, ny, kind);
 
 		if (STARPU_UNLIKELY(cures))
 			STARPU_CUDA_REPORT_ERROR(cures);
@@ -545,170 +546,29 @@ no_async_default:
 	}
 }
 
-
-
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
 {
-	starpu_block_interface_t *src_block = src_interface;
-	starpu_block_interface_t *dst_block = dst_interface;
-
-	uint32_t nx = src_block->nx;
-	uint32_t ny = src_block->ny;
-	uint32_t nz = src_block->nz;
-	size_t elemsize = src_block->elemsize;
-
-	cudaError_t cures;
-	int ret;
-
-	/* We may have a contiguous buffer for the entire block, or contiguous
-	 * plans within the block, we can avoid many small transfers that way */
-	if ((nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
-	{
-		/* Is that a single contiguous buffer ? */
-		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
-		{
-			cures = cudaMemcpyAsync((char *)dst_block->ptr, (char *)src_block->ptr,
-				nx*ny*nz*elemsize, cudaMemcpyHostToDevice, *stream);
-			if (STARPU_UNLIKELY(cures))
-			{
-				cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
-					nx*ny*nz*elemsize, cudaMemcpyHostToDevice);
-				if (STARPU_UNLIKELY(cures))
-					STARPU_CUDA_REPORT_ERROR(cures);
-				cudaThreadSynchronize();
-
-				ret = 0;
-			}
-			else {
-				ret = -EAGAIN;
-			}
-
-		}
-		else {
-			/* Are all plans contiguous */
-			cures = cudaMemcpy2DAsync((char *)dst_block->ptr, dst_block->ldz*elemsize,
-				(char *)src_block->ptr, src_block->ldz*elemsize,
-				nx*ny*elemsize, nz, cudaMemcpyHostToDevice, *stream);
-			if (STARPU_UNLIKELY(cures))
-			{
-				cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
-					(char *)src_block->ptr, src_block->ldz*elemsize,
-					nx*ny*elemsize, nz, cudaMemcpyHostToDevice);
-				if (STARPU_UNLIKELY(cures))
-					STARPU_CUDA_REPORT_ERROR(cures);
-				cudaThreadSynchronize();
-
-				ret = 0;
-			}
-			else {
-				ret = -EAGAIN;
-			}
-		}
-	}
-	else {
-		/* Default case: we transfer all lines one by one: ny*nz transfers */
-		unsigned layer;
-		for (layer = 0; layer < src_block->nz; layer++)
-		{
-			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
-			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
-
-			cures = cudaMemcpy2DAsync((char *)dst_ptr, dst_block->ldy*elemsize,
-				(char *)src_ptr, src_block->ldy*elemsize,
-				nx*elemsize, ny, cudaMemcpyHostToDevice, *stream);
-
-			if (STARPU_UNLIKELY(cures))
-			{
-				/* I don't know how to do that "better" */
-				goto no_async_default;
-			}
-
-		}
-
-		ret = -EAGAIN;
-
-	}
-
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
-
-	return ret;
-
-no_async_default:
-
-	{
-	unsigned layer;
-	for (layer = 0; layer < src_block->nz; layer++)
-	{
-		uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
-		uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
-
-		cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
-			(char *)src_ptr, src_block->ldy*elemsize,
-			nx*elemsize, ny, cudaMemcpyHostToDevice);
-
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-	}
-	cudaThreadSynchronize();
-
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
-	return 0;
-	}
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
 }
 
 static int copy_ram_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
 {
-	starpu_block_interface_t *src_block = src_interface;
-	starpu_block_interface_t *dst_block = dst_interface;
-
-	uint32_t nx = src_block->nx;
-	uint32_t ny = src_block->ny;
-	uint32_t nz = src_block->nz;
-	size_t elemsize = src_block->elemsize;
-
-	cudaError_t cures;
-
-	/* We may have a contiguous buffer for the entire block, or contiguous
-	 * plans within the block, we can avoid many small transfers that way */
-	if ((nx == src_block->ldy) && (src_block->ldy == dst_block->ldy))
-	{
-		/* Is that a single contiguous buffer ? */
-		if (((nx*ny) == src_block->ldz) && (src_block->ldz == dst_block->ldz))
-		{
-			cures = cudaMemcpy((char *)dst_block->ptr, (char *)src_block->ptr,
-				nx*ny*nz*elemsize, cudaMemcpyHostToDevice);
-		}
-		else {
-			/* Are all plans contiguous */
-			cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
-				(char *)src_block->ptr, src_block->ldz*elemsize,
-				nx*ny*elemsize, nz, cudaMemcpyHostToDevice);
-		}
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-	}
-	else {
-		/* Default case: we transfer all lines one by one: ny*nz transfers */
-		unsigned layer;
-		for (layer = 0; layer < src_block->nz; layer++)
-		{
-			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
-			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
-
-			cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
-				(char *)src_ptr, src_block->ldy*elemsize,
-				nx*elemsize, ny, cudaMemcpyHostToDevice);
-
-			if (STARPU_UNLIKELY(cures))
-				STARPU_CUDA_REPORT_ERROR(cures);
-		}
-	}
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
+}
 
-	cudaThreadSynchronize();
+static int copy_cuda_to_cuda(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)))
+{
+	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
+}
 
-	STARPU_TRACE_DATA_COPY(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
+static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+{
+	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
+}
 
-	return 0;
+static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node __attribute__((unused)), void *dst_interface, unsigned dst_node __attribute__((unused)), cudaStream_t *stream)
+{
+	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
 }
 #endif // STARPU_USE_CUDA
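
For readers skimming the patch, the shape of the refactoring is easier to see in isolation: the direction-specific copy routines differed only in the cudaMemcpyKind constant they passed, so one helper takes the kind as a parameter and each direction collapses to a one-line wrapper, which is what makes the new cuda_to_cuda method a three-line addition. Below is a minimal, self-contained sketch of that pattern against the plain CUDA runtime API; the toy_* names are hypothetical illustrations, not StarPU functions, and error checks on allocation are omitted for brevity.

	#include <stdio.h>
	#include <cuda_runtime.h>

	/* One copy routine, parameterized by direction, mirroring
	 * copy_cuda_common() in the patch above. */
	static int toy_copy_common(void *dst, const void *src, size_t size,
	                           enum cudaMemcpyKind kind)
	{
		cudaError_t cures = cudaMemcpy(dst, src, size, kind);
		if (cures != cudaSuccess) {
			fprintf(stderr, "cudaMemcpy: %s\n", cudaGetErrorString(cures));
			return -1;
		}
		return 0;
	}

	/* Each direction becomes a thin wrapper, just as the patch does for
	 * copy_ram_to_cuda, copy_cuda_to_ram and the new copy_cuda_to_cuda. */
	static int toy_ram_to_cuda(void *d, const void *s, size_t n)
	{ return toy_copy_common(d, s, n, cudaMemcpyHostToDevice); }

	static int toy_cuda_to_ram(void *d, const void *s, size_t n)
	{ return toy_copy_common(d, s, n, cudaMemcpyDeviceToHost); }

	static int toy_cuda_to_cuda(void *d, const void *s, size_t n)
	{ return toy_copy_common(d, s, n, cudaMemcpyDeviceToDevice); }

	int main(void)
	{
		float host_in[4] = {1, 2, 3, 4}, host_out[4];
		float *dev_a, *dev_b;
		cudaMalloc((void **)&dev_a, sizeof(host_in));
		cudaMalloc((void **)&dev_b, sizeof(host_in));

		toy_ram_to_cuda(dev_a, host_in, sizeof(host_in));
		toy_cuda_to_cuda(dev_b, dev_a, sizeof(host_in)); /* the new path */
		toy_cuda_to_ram(host_out, dev_b, sizeof(host_in));

		printf("round trip: %g %g %g %g\n",
		       host_out[0], host_out[1], host_out[2], host_out[3]);
		cudaFree(dev_a);
		cudaFree(dev_b);
		return 0;
	}

The design point mirrored here is that cudaMemcpy, cudaMemcpy2D and their async variants already take the transfer direction as an argument, so the block-layout logic (the ldy/ldz contiguity tests and the per-plane fallback loop) only needs to exist once; threading kind through it is what lets the @@ -545,170 +546,29 @@ hunk delete roughly 140 duplicated lines.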