5 years ago · d5a8eb5e00
--- a/ChangeLog
+++ b/ChangeLog
@@ -31,6 +31,9 @@ Small changes:
 
				   * Use the S4U interface of Simgrid instead of xbt and MSG.
			
 
				   * Add starpu_interface_copy2d, 3d, and 4d to easily request data copies from
			
 
				     data interfaces.
			
 
				+  * Move optimized cuda 2d copy from interfaces to new
			
 
				+    starpu_cuda_copy2d_async_sync and starpu_cuda_copy3d_async_sync, and use
			
 
				+    them from starpu_interface_copy2d and 3d.
			
 
				 
			
 
				 StarPU 1.3.4 (git revision xxx)
			
 
				 ==============================================
			
--- a/include/starpu_cuda.h
+++ b/include/starpu_cuda.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2012,2014                           Université de Bordeaux
			
 
				+ * Copyright (C) 2010-2012,2014,2020                      Université de Bordeaux
			
 
				  * Copyright (C) 2011                                     Inria
			
 
				  * Copyright (C) 2010-2013,2015,2017,2019                 CNRS
			
 
				  *
			
@@ -87,6 +87,44 @@ const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid
 
				 int starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node, void *dst_ptr, unsigned dst_node, size_t ssize, cudaStream_t stream, enum cudaMemcpyKind kind);
			
 
				 
			
 
				 /**
			
 
				+   Copy \p numblocks blocks of \p blocksize bytes from the pointer \p src_ptr on
			
 
				+   \p src_node to the pointer \p dst_ptr on \p dst_node.
			
 
				+
			
 
				+   The blocks start at addresses which are ld_src (resp. ld_dst) bytes apart in
			
 
				+   the source (resp. destination) interface.
			
 
				+
			
 
				+   The function first tries to copy the data asynchronous (unless \p stream is
			
 
				+   <c>NULL</c>). If the asynchronous copy fails or if \p stream is <c>NULL</c>,
			
 
				+   it copies the data synchronously. The function returns <c>-EAGAIN</c> if the
			
 
				+   asynchronous launch was successfull. It returns 0 if the synchronous copy was
			
 
				+   successful, or fails otherwise.
			
 
				+*/
			
 
				+int starpu_cuda_copy2d_async_sync(void *src_ptr, unsigned src_node, void *dst_ptr, unsigned dst_node,
			
 
				+				  size_t blocksize,
			
 
				+				  size_t numblocks, size_t ld_src, size_t ld_dst,
			
 
				+				  cudaStream_t stream, enum cudaMemcpyKind kind);
			
 
				+
			
 
				+/**
			
 
				+   Copy \p numblocks_1 * \p numblocks_2 blocks of \p blocksize bytes from the
			
 
				+   pointer \p src_ptr on \p src_node to the pointer \p dst_ptr on \p dst_node.
			
 
				+
			
 
				+   The blocks are grouped by \p numblocks_1 blocks whose start addresses are
			
 
				+   ld1_src (resp. ld1_dst) bytes apart in the source (resp. destination)
			
 
				+   interface.
			
 
				+
			
 
				+   The function first tries to copy the data asynchronous (unless \p stream is
			
 
				+   <c>NULL</c>). If the asynchronous copy fails or if \p stream is <c>NULL</c>,
			
 
				+   it copies the data synchronously. The function returns <c>-EAGAIN</c> if the
			
 
				+   asynchronous launch was successfull. It returns 0 if the synchronous copy was
			
 
				+   successful, or fails otherwise.
			
 
				+*/
			
 
				+int starpu_cuda_copy3d_async_sync(void *src_ptr, unsigned src_node, void *dst_ptr, unsigned dst_node,
			
 
				+				  size_t blocksize,
			
 
				+				  size_t numblocks_1, size_t ld1_src, size_t ld1_dst,
			
 
				+				  size_t numblocks_2, size_t ld2_src, size_t ld2_dst,
			
 
				+				  cudaStream_t stream, enum cudaMemcpyKind kind);
			
 
				+
			
 
				+/**
			
 
				    Call <c>cudaSetDevice(\p devid)</c> or <c>cudaGLSetGLDevice(\p devid)</c>,
			
 
				    according to whether \p devid is among the field
			
 
				    starpu_conf::cuda_opengl_interoperability.
			
--- a/src/datawizard/copy_driver.c
+++ b/src/datawizard/copy_driver.c
@@ -324,6 +324,9 @@ int starpu_interface_copy2d(uintptr_t src, size_t src_offset, unsigned src_node,
 
				 {
			
 
				 	int ret = 0;
			
 
				 	unsigned i;
			
 
				+	struct _starpu_async_channel *async_channel = async_data;
			
 
				+	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);
			
 
				+	struct _starpu_node_ops *node_ops = _starpu_memory_node_get_node_ops(src_node);
			
 
				 
			
 
				 	STARPU_ASSERT_MSG(ld_src >= blocksize, "block size %lu is bigger than ld %lu in source", (unsigned long) blocksize, (unsigned long) ld_src);
			
 
				 	STARPU_ASSERT_MSG(ld_dst >= blocksize, "block size %lu is bigger than ld %lu in destination", (unsigned long) blocksize, (unsigned long) ld_dst);
			
@@ -334,7 +337,13 @@ int starpu_interface_copy2d(uintptr_t src, size_t src_offset, unsigned src_node,
 
				 					     dst, dst_offset, dst_node,
			
 
				 					     blocksize * numblocks, async_data);
			
 
				 
			
 
				-	/* TODO: introduce and call node_ops->copy2d_data_to when available */
			
 
				+	if (node_ops && node_ops->copy2d_data_to[dst_kind])
			
 
				+		/* Hardware-optimized non-contiguous case */
			
 
				+		return node_ops->copy2d_data_to[dst_kind](src, src_offset, src_node,
			
 
				+							     dst, dst_offset, dst_node,
			
 
				+							     blocksize,
			
 
				+							     numblocks, ld_src, ld_dst,
			
 
				+							     async_channel);
			
 
				 
			
 
				 	for (i = 0; i < numblocks; i++)
			
 
				 	{
			
@@ -356,6 +365,9 @@ int starpu_interface_copy3d(uintptr_t src, size_t src_offset, unsigned src_node,
 
				 {
			
 
				 	int ret = 0;
			
 
				 	unsigned i;
			
 
				+	struct _starpu_async_channel *async_channel = async_data;
			
 
				+	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);
			
 
				+	struct _starpu_node_ops *node_ops = _starpu_memory_node_get_node_ops(src_node);
			
 
				 
			
 
				 	STARPU_ASSERT_MSG(ld1_src >= blocksize, "block size %lu is bigger than ld %lu in source", (unsigned long) blocksize, (unsigned long) ld1_src);
			
 
				 	STARPU_ASSERT_MSG(ld1_dst >= blocksize, "block size %lu is bigger than ld %lu in destination", (unsigned long) blocksize, (unsigned long) ld1_dst);
			
@@ -371,7 +383,15 @@ int starpu_interface_copy3d(uintptr_t src, size_t src_offset, unsigned src_node,
 
				 					     blocksize * numblocks_1 * numblocks_2,
			
 
				 					     async_data);
			
 
				 
			
 
				-	/* TODO: introduce and call node_ops->copy3d_data_to when available */
			
 
				+	if (node_ops && node_ops->copy3d_data_to[dst_kind])
			
 
				+		/* Hardware-optimized non-contiguous case */
			
 
				+		return node_ops->copy3d_data_to[dst_kind](src, src_offset, src_node,
			
 
				+							     dst, dst_offset, dst_node,
			
 
				+							     blocksize,
			
 
				+							     numblocks_1, ld1_src, ld1_dst,
			
 
				+							     numblocks_2, ld2_src, ld2_dst,
			
 
				+							     async_channel);
			
 
				+
			
 
				 
			
 
				 	for (i = 0; i < numblocks_2; i++)
			
 
				 	{
			
--- a/src/datawizard/interfaces/block_interface.c
+++ b/src/datawizard/interfaces/block_interface.c
@@ -18,40 +18,10 @@
 
				 
			
 
				 #include <starpu.h>
			
 
				 
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
			
 
				-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
			
 
				-static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
			
 
				-#endif
			
 
				 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
			
 
				 
			
 
				 static const struct starpu_data_copy_methods block_copy_data_methods_s =
			
 
				 {
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	.ram_to_cuda = copy_ram_to_cuda,
			
 
				-	.cuda_to_ram = copy_cuda_to_ram,
			
 
				-	.ram_to_cuda_async = copy_ram_to_cuda_async,
			
 
				-	.cuda_to_ram_async = copy_cuda_to_ram_async,
			
 
				-	.cuda_to_cuda = copy_cuda_to_cuda,
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-	.ram_to_opencl = copy_ram_to_opencl,
			
 
				-	.opencl_to_ram = copy_opencl_to_ram,
			
 
				-	.opencl_to_opencl = copy_opencl_to_opencl,
			
 
				-        .ram_to_opencl_async = copy_ram_to_opencl_async,
			
 
				-	.opencl_to_ram_async = copy_opencl_to_ram_async,
			
 
				-	.opencl_to_opencl_async = copy_opencl_to_opencl_async,
			
 
				-#endif
			
 
				 	.any_to_any = copy_any_to_any,
			
 
				 };
			
 
				 
			
@@ -494,279 +464,6 @@ static void free_block_buffer_on_node(void *data_interface, unsigned node)
 
				 	starpu_free_on_node(node, block_interface->dev_handle, nx*ny*nz*elemsize);
			
 
				 }
			
 
				 
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
			
 
				-{
			
 
				-	struct starpu_block_interface *src_block = src_interface;
			
 
				-	struct starpu_block_interface *dst_block = dst_interface;
			
 
				-
			
 
				-	uint32_t nx = src_block->nx;
			
 
				-	uint32_t ny = src_block->ny;
			
 
				-	uint32_t nz = src_block->nz;
			
 
				-	size_t elemsize = src_block->elemsize;
			
 
				-
			
 
				-	cudaError_t cures;
			
 
				-
			
 
				-	if (IS_CONTIGUOUS_MATRIX(nx, ny, src_block->ldy) && (src_block->ldy == dst_block->ldy))
			
 
				-	{
			
 
				-		/* Is that a single contiguous buffer ? */
			
 
				-		if (IS_CONTIGUOUS_BLOCK(nx, ny, nz, src_block->ldy, src_block->ldz) &&
			
 
				-		    IS_CONTIGUOUS_BLOCK(nx, ny, nz, dst_block->ldy, dst_block->ldz))
			
 
				-		{
			
 
				-			starpu_cuda_copy_async_sync((void *)src_block->ptr, src_node, (void *)dst_block->ptr, dst_node, nx*ny*nz*elemsize, NULL, kind);
			
 
				-                }
			
 
				-		else
			
 
				-		{
			
 
				-			/* Are all plans contiguous */
			
 
				-                        cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
			
 
				-                                             (char *)src_block->ptr, src_block->ldz*elemsize,
			
 
				-                                             nx*ny*elemsize, nz, kind);
			
 
				-			if (!cures)
			
 
				-				cures = cudaDeviceSynchronize();
			
 
				-                        if (STARPU_UNLIKELY(cures))
			
 
				-                                STARPU_CUDA_REPORT_ERROR(cures);
			
 
				-                }
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		/* Default case: we transfer all blocks one by one: nz transfers */
			
 
				-		/* TODO: use cudaMemcpy3D now that it works (except on cuda 4.2) */
			
 
				-		unsigned layer;
			
 
				-		for (layer = 0; layer < src_block->nz; layer++)
			
 
				-		{
			
 
				-			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
			
 
				-			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
			
 
				-
			
 
				-			cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
			
 
				-                                             (char *)src_ptr, src_block->ldy*elemsize,
			
 
				-                                             nx*elemsize, ny, kind);
			
 
				-
			
 
				-			if (!cures)
			
 
				-				cures = cudaDeviceSynchronize();
			
 
				-			if (STARPU_UNLIKELY(cures))
			
 
				-				STARPU_CUDA_REPORT_ERROR(cures);
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	starpu_interface_data_copy(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream, enum cudaMemcpyKind kind)
			
 
				-{
			
 
				-	struct starpu_block_interface *src_block = src_interface;
			
 
				-	struct starpu_block_interface *dst_block = dst_interface;
			
 
				-
			
 
				-	uint32_t nx = src_block->nx;
			
 
				-	uint32_t ny = src_block->ny;
			
 
				-	uint32_t nz = src_block->nz;
			
 
				-	size_t elemsize = src_block->elemsize;
			
 
				-
			
 
				-	cudaError_t cures;
			
 
				-
			
 
				-	int ret;
			
 
				-
			
 
				-	/* We may have a contiguous buffer for the entire block, or contiguous
			
 
				-	 * plans within the block, we can avoid many small transfers that way */
			
 
				-	if (IS_CONTIGUOUS_MATRIX(nx, ny, src_block->ldy) && (src_block->ldy == dst_block->ldy))
			
 
				-	{
			
 
				-		/* Is that a single contiguous buffer ? */
			
 
				-		if (IS_CONTIGUOUS_BLOCK(nx, ny, nz, src_block->ldy, src_block->ldz) &&
			
 
				-		    IS_CONTIGUOUS_BLOCK(nx, ny, nz, dst_block->ldy, dst_block->ldz))
			
 
				-		{
			
 
				-			ret = starpu_cuda_copy_async_sync((void *)src_block->ptr, src_node, (void *)dst_block->ptr, dst_node, nx*ny*nz*elemsize, stream, kind);
			
 
				-		}
			
 
				-		else
			
 
				-		{
			
 
				-			double start;
			
 
				-			/* Are all plans contiguous */
			
 
				-			starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
			
 
				-			cures = cudaMemcpy2DAsync((char *)dst_block->ptr, dst_block->ldz*elemsize,
			
 
				-					(char *)src_block->ptr, src_block->ldz*elemsize,
			
 
				-					nx*ny*elemsize, nz, kind, stream);
			
 
				-			starpu_interface_end_driver_copy_async(src_node, dst_node, start);
			
 
				-			if (STARPU_UNLIKELY(cures))
			
 
				-			{
			
 
				-				cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
			
 
				-						(char *)src_block->ptr, src_block->ldz*elemsize,
			
 
				-						nx*ny*elemsize, nz, kind);
			
 
				-				if (!cures)
			
 
				-					cures = cudaDeviceSynchronize();
			
 
				-				if (STARPU_UNLIKELY(cures))
			
 
				-					STARPU_CUDA_REPORT_ERROR(cures);
			
 
				-
			
 
				-				ret = 0;
			
 
				-			}
			
 
				-			else
			
 
				-			{
			
 
				-				ret = -EAGAIN;
			
 
				-			}
			
 
				-		}
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		/* Default case: we transfer all blocks one by one: nz 2D transfers */
			
 
				-		/* TODO: use cudaMemcpy3D now that it works (except on cuda 4.2) */
			
 
				-		unsigned layer;
			
 
				-		for (layer = 0; layer < src_block->nz; layer++)
			
 
				-		{
			
 
				-			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
			
 
				-			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
			
 
				-			double start;
			
 
				-
			
 
				-			starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
			
 
				-			cures = cudaMemcpy2DAsync((char *)dst_ptr, dst_block->ldy*elemsize,
			
 
				-                                                  (char *)src_ptr, src_block->ldy*elemsize,
			
 
				-                                                  nx*elemsize, ny, kind, stream);
			
 
				-			starpu_interface_end_driver_copy_async(src_node, dst_node, start);
			
 
				-
			
 
				-			if (STARPU_UNLIKELY(cures))
			
 
				-			{
			
 
				-				/* I don't know how to do that "better" */
			
 
				-				goto no_async_default;
			
 
				-			}
			
 
				-
			
 
				-		}
			
 
				-
			
 
				-		ret = -EAGAIN;
			
 
				-
			
 
				-	}
			
 
				-
			
 
				-	starpu_interface_data_copy(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
			
 
				-
			
 
				-	return ret;
			
 
				-
			
 
				-no_async_default:
			
 
				-
			
 
				-	{
			
 
				-	unsigned layer;
			
 
				-	for (layer = 0; layer < src_block->nz; layer++)
			
 
				-	{
			
 
				-		uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
			
 
				-		uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
			
 
				-
			
 
				-		cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
			
 
				-                                     (char *)src_ptr, src_block->ldy*elemsize,
			
 
				-                                     nx*elemsize, ny, kind);
			
 
				-
			
 
				-		if (!cures)
			
 
				-			cures = cudaDeviceSynchronize();
			
 
				-		if (STARPU_UNLIKELY(cures))
			
 
				-			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				-	}
			
 
				-
			
 
				-	starpu_interface_data_copy(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
			
 
				-	return 0;
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
			
 
				-{
			
 
				-	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
			
 
				-{
			
 
				-	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
			
 
				-}
			
 
				-#endif // STARPU_USE_CUDA
			
 
				-
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-static int copy_opencl_common(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
			
 
				-{
			
 
				-	struct starpu_block_interface *src_block = src_interface;
			
 
				-	struct starpu_block_interface *dst_block = dst_interface;
			
 
				-        int ret = 0;
			
 
				-
			
 
				-	uint32_t nx = src_block->nx;
			
 
				-	uint32_t ny = src_block->ny;
			
 
				-
			
 
				-	/* We may have a contiguous buffer for the entire block, or contiguous
			
 
				-	 * plans within the block, we can avoid many small transfers that way */
			
 
				-	if (IS_CONTIGUOUS_BLOCK(nx, ny, nz, src_block->ldy, src_block->ldz) &&
			
 
				-	    IS_CONTIGUOUS_BLOCK(nx, ny, nz, dst_block->ldy, dst_block->ldz))
			
 
				-		/* Is that a single contiguous buffer ? */
			
 
				-	{
			
 
				-		ret = starpu_opencl_copy_async_sync(src_block->dev_handle, src_block->offset, src_node,
			
 
				-						    dst_block->dev_handle, dst_block->offset, dst_node,
			
 
				-						    src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
			
 
				-						    event);
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		/* Default case: we transfer all lines one by one: ny*nz transfers */
			
 
				-		/* TODO: rect support */
			
 
				-		unsigned layer;
			
 
				-		for (layer = 0; layer < src_block->nz; layer++)
			
 
				-		{
			
 
				-                        unsigned j;
			
 
				-                        for(j=0 ; j<src_block->ny ; j++)
			
 
				-			{
			
 
				-				ret = starpu_opencl_copy_async_sync(src_block->dev_handle,
			
 
				-								    src_block->offset + layer*src_block->ldz*src_block->elemsize + j*src_block->ldy*src_block->elemsize,
			
 
				-								    src_node,
			
 
				-								    dst_block->dev_handle,
			
 
				-								    dst_block->offset + layer*dst_block->ldz*dst_block->elemsize + j*dst_block->ldy*dst_block->elemsize,
			
 
				-								    dst_node,
			
 
				-								    src_block->nx*src_block->elemsize,
			
 
				-								    event);
			
 
				-                        }
			
 
				-                }
			
 
				-        }
			
 
				-
			
 
				-	starpu_interface_data_copy(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
			
 
				-
			
 
				-	return ret;
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-        return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-        return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-	return copy_opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				-}
			
 
				-
			
 
				-#endif
			
 
				-
			
 
				 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)
			
 
				 {
			
 
				 	struct starpu_block_interface *src_block = (struct starpu_block_interface *) src_interface;
			
--- a/src/datawizard/interfaces/matrix_interface.c
+++ b/src/datawizard/interfaces/matrix_interface.c
@@ -18,58 +18,10 @@
 
				 
			
 
				 #include <starpu.h>
			
 
				 
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-/* At least CUDA 4.2 still didn't have working memcpy3D */
			
 
				-#if CUDART_VERSION < 5000
			
 
				-#define BUGGED_MEMCPY3D
			
 
				-#endif
			
 
				-
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
			
 
				-#ifndef BUGGED_MEMCPY3D
			
 
				-static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
			
 
				-#endif
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
			
 
				-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
			
 
				-static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
			
 
				-#endif
			
 
				 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
			
 
				 
			
 
				 static const struct starpu_data_copy_methods matrix_copy_data_methods_s =
			
 
				 {
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	.ram_to_cuda = copy_ram_to_cuda,
			
 
				-	.cuda_to_ram = copy_cuda_to_ram,
			
 
				-	.ram_to_cuda_async = copy_ram_to_cuda_async,
			
 
				-	.cuda_to_ram_async = copy_cuda_to_ram_async,
			
 
				-	.cuda_to_cuda = copy_cuda_to_cuda,
			
 
				-#ifndef BUGGED_MEMCPY3D
			
 
				-	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
			
 
				-#endif
			
 
				-#else
			
 
				-#ifdef STARPU_SIMGRID
			
 
				-#ifndef BUGGED_MEMCPY3D
			
 
				-	/* Enable GPU-GPU transfers in simgrid */
			
 
				-	.cuda_to_cuda_async = (void *)1,
			
 
				-#endif
			
 
				-#endif
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-	.ram_to_opencl = copy_ram_to_opencl,
			
 
				-	.opencl_to_ram = copy_opencl_to_ram,
			
 
				-	.opencl_to_opencl = copy_opencl_to_opencl,
			
 
				-        .ram_to_opencl_async = copy_ram_to_opencl_async,
			
 
				-	.opencl_to_ram_async = copy_opencl_to_ram_async,
			
 
				-	.opencl_to_opencl_async = copy_opencl_to_opencl_async,
			
 
				-#endif
			
 
				 	.any_to_any = copy_any_to_any,
			
 
				 };
			
 
				 
			
@@ -537,185 +489,6 @@ static void free_matrix_buffer_on_node(void *data_interface, unsigned node)
 
				 	starpu_free_on_node(node, matrix_interface->dev_handle, matrix_interface->allocsize);
			
 
				 }
			
 
				 
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind, int is_async, cudaStream_t stream)
			
 
				-{
			
 
				-	struct starpu_matrix_interface *src_matrix = src_interface;
			
 
				-	struct starpu_matrix_interface *dst_matrix = dst_interface;
			
 
				-
			
 
				-	size_t elemsize = src_matrix->elemsize;
			
 
				-	cudaError_t cures;
			
 
				-
			
 
				-	if (is_async)
			
 
				-	{
			
 
				-		double start;
			
 
				-		starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
			
 
				-		cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
			
 
				-			(char *)src_matrix->ptr, src_matrix->ld*elemsize,
			
 
				-			src_matrix->nx*elemsize, src_matrix->ny, kind, stream);
			
 
				-		starpu_interface_end_driver_copy_async(src_node, dst_node, start);
			
 
				-		if (!cures)
			
 
				-			return -EAGAIN;
			
 
				-	}
			
 
				-
			
 
				-	cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
			
 
				-		(char *)src_matrix->ptr, src_matrix->ld*elemsize,
			
 
				-		src_matrix->nx*elemsize, src_matrix->ny, kind);
			
 
				-	if (!cures)
			
 
				-		cures = cudaDeviceSynchronize();
			
 
				-	if (STARPU_UNLIKELY(cures))
			
 
				-	{
			
 
				-		int ret = copy_any_to_any(src_interface, src_node, dst_interface, dst_node, (void*)(uintptr_t)is_async);
			
 
				-		if (ret == -EAGAIN) return ret;
			
 
				-		if (ret) STARPU_CUDA_REPORT_ERROR(cures);
			
 
				-	}
			
 
				-
			
 
				-	starpu_interface_data_copy(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-#ifndef BUGGED_MEMCPY3D
			
 
				-static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, int is_async, cudaStream_t stream)
			
 
				-{
			
 
				-#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
			
 
				-	struct starpu_matrix_interface *src_matrix = src_interface;
			
 
				-	struct starpu_matrix_interface *dst_matrix = dst_interface;
			
 
				-
			
 
				-	size_t elemsize = src_matrix->elemsize;
			
 
				-	cudaError_t cures;
			
 
				-
			
 
				-	int src_dev = starpu_memory_node_get_devid(src_node);
			
 
				-	int dst_dev = starpu_memory_node_get_devid(dst_node);
			
 
				-
			
 
				-	struct cudaMemcpy3DPeerParms p;
			
 
				-	memset(&p, 0, sizeof(p));
			
 
				-
			
 
				-	p.srcDevice = src_dev;
			
 
				-	p.dstDevice = dst_dev;
			
 
				-	p.srcPtr = make_cudaPitchedPtr((char *)src_matrix->ptr, src_matrix->ld * elemsize, src_matrix->nx, src_matrix->ny);
			
 
				-	p.dstPtr = make_cudaPitchedPtr((char *)dst_matrix->ptr, dst_matrix->ld * elemsize, dst_matrix->nx, dst_matrix->ny);
			
 
				-	p.extent = make_cudaExtent(src_matrix->nx * elemsize, src_matrix->ny, 1);
			
 
				-
			
 
				-	if (is_async)
			
 
				-	{
			
 
				-		double start;
			
 
				-		starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
			
 
				-		cures = cudaMemcpy3DPeerAsync(&p, stream);
			
 
				-		starpu_interface_end_driver_copy_async(src_node, dst_node, start);
			
 
				-		if (!cures)
			
 
				-			return -EAGAIN;
			
 
				-	}
			
 
				-
			
 
				-	cures = cudaMemcpy3DPeer(&p);
			
 
				-	if (!cures)
			
 
				-		cures = cudaDeviceSynchronize();
			
 
				-	if (STARPU_UNLIKELY(cures))
			
 
				-		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				-
			
 
				-	starpu_interface_data_copy(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
			
 
				-
			
 
				-	return 0;
			
 
				-#else
			
 
				-	STARPU_ABORT_MSG("CUDA memcpy 3D peer not available, but core triggered one ?!");
			
 
				-#endif
			
 
				-}
			
 
				-#endif
			
 
				-
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, 0, 0);
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, 0, 0);
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-	if (src_node == dst_node)
			
 
				-		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, 0, 0);
			
 
				-	else
			
 
				-#ifdef BUGGED_MEMCPY3D
			
 
				-		STARPU_ABORT_MSG("CUDA memcpy 3D peer not available, but core triggered one?!");
			
 
				-#else
			
 
				-		return copy_cuda_peer(src_interface, src_node, dst_interface, dst_node, 0, 0);
			
 
				-#endif
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
			
 
				-{
			
 
				-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, 1, stream);
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
			
 
				-{
			
 
				-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, 1, stream);
			
 
				-}
			
 
				-
			
 
				-#ifndef BUGGED_MEMCPY3D
			
 
				-static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
			
 
				-{
			
 
				-	if (src_node == dst_node)
			
 
				-		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, 1, stream);
			
 
				-	else
			
 
				-		return copy_cuda_peer(src_interface, src_node, dst_interface, dst_node, 1, stream);
			
 
				-}
			
 
				-#endif
			
 
				-#endif // STARPU_USE_CUDA
			
 
				-
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-static int copy_opencl_common(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
			
 
				-{
			
 
				-	struct starpu_matrix_interface *src_matrix = src_interface;
			
 
				-	struct starpu_matrix_interface *dst_matrix = dst_interface;
			
 
				-        int ret;
			
 
				-
			
 
				-	STARPU_ASSERT_MSG((src_matrix->ld == src_matrix->nx) && (dst_matrix->ld == dst_matrix->nx), "XXX non contiguous buffers are not properly supported in OpenCL yet. (TODO)");
			
 
				-
			
 
				-	ret = starpu_opencl_copy_async_sync(src_matrix->dev_handle, src_matrix->offset, src_node,
			
 
				-					    dst_matrix->dev_handle, dst_matrix->offset, dst_node,
			
 
				-					    src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
			
 
				-					    event);
			
 
				-
			
 
				-	starpu_interface_data_copy(src_node, dst_node, src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
			
 
				-
			
 
				-	return ret;
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-        return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-        return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-	return copy_opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				-}
			
 
				-
			
 
				-#endif
			
 
				-
			
 
				 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)
			
 
				 {
			
 
				 	struct starpu_matrix_interface *src_matrix = (struct starpu_matrix_interface *) src_interface;
			
--- a/src/datawizard/interfaces/tensor_interface.c
+++ b/src/datawizard/interfaces/tensor_interface.c
@@ -18,40 +18,10 @@
 
				 
			
 
				 #include <starpu.h>
			
 
				 
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
			
 
				-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
			
 
				-static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
			
 
				-#endif
			
 
				 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
			
 
				 
			
 
				 static const struct starpu_data_copy_methods tensor_copy_data_methods_s =
			
 
				 {
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	.ram_to_cuda = copy_ram_to_cuda,
			
 
				-	.cuda_to_ram = copy_cuda_to_ram,
			
 
				-	.ram_to_cuda_async = copy_ram_to_cuda_async,
			
 
				-	.cuda_to_ram_async = copy_cuda_to_ram_async,
			
 
				-	.cuda_to_cuda = copy_cuda_to_cuda,
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-	.ram_to_opencl = copy_ram_to_opencl,
			
 
				-	.opencl_to_ram = copy_opencl_to_ram,
			
 
				-	.opencl_to_opencl = copy_opencl_to_opencl,
			
 
				-        .ram_to_opencl_async = copy_ram_to_opencl_async,
			
 
				-	.opencl_to_ram_async = copy_opencl_to_ram_async,
			
 
				-	.opencl_to_opencl_async = copy_opencl_to_opencl_async,
			
 
				-#endif
			
 
				 	.any_to_any = copy_any_to_any,
			
 
				 };
			
 
				 
			
@@ -566,255 +536,6 @@ static void free_tensor_buffer_on_node(void *data_interface, unsigned node)
 
				 	starpu_free_on_node(node, tensor_interface->dev_handle, nx*ny*nz*nt*elemsize);
			
 
				 }
			
 
				 
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
			
 
				-{
			
 
				-	struct starpu_tensor_interface *src_block = src_interface;
			
 
				-	struct starpu_tensor_interface *dst_block = dst_interface;
			
 
				-
			
 
				-	uint32_t nx = src_block->nx;
			
 
				-	uint32_t ny = src_block->ny;
			
 
				-	uint32_t nz = src_block->nz;
			
 
				-	uint32_t nt = src_block->nt;
			
 
				-	size_t elemsize = src_block->elemsize;
			
 
				-
			
 
				-	cudaError_t cures;
			
 
				-
			
 
				-	if (IS_CONTIGUOUS_TENSOR(nx, ny, nz, nt, src_block->ldy, src_block->ldz, src_block->ldt) &&
			
 
				-	    IS_CONTIGUOUS_TENSOR(nx, ny, nz, nt, dst_block->ldy, dst_block->ldz, dst_block->ldt))
			
 
				-	{
			
 
				-		/* Same lds on both sides, and contiguous, simple */
			
 
				-		starpu_cuda_copy_async_sync((void *)src_block->ptr, src_node, (void *)dst_block->ptr, dst_node, nx*ny*nz*nt*elemsize, NULL, kind);
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		/* TODO: use cudaMemcpy2D for whole 3D blocks etc. when they are contiguous */
			
 
				-
			
 
				-		/* Default case: we transfer all blocks one by one: nz transfers */
			
 
				-		/* TODO: use cudaMemcpy3D now that it works (except on cuda 4.2) */
			
 
				-		unsigned t;
			
 
				-		for (t = 0; t < src_block->nt; t++)
			
 
				-		{
			
 
				-		    unsigned z;
			
 
				-		    for (z = 0; z < src_block->nz; z++)
			
 
				-		    {
			
 
				-			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + t*src_block->ldt*src_block->elemsize + z*src_block->ldz*src_block->elemsize;
			
 
				-			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + t*dst_block->ldt*src_block->elemsize + z*dst_block->ldz*dst_block->elemsize;
			
 
				-
			
 
				-			cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
			
 
				-                                             (char *)src_ptr, src_block->ldy*elemsize,
			
 
				-                                             nx*elemsize, ny, kind);
			
 
				-
			
 
				-			if (!cures)
			
 
				-				cures = cudaDeviceSynchronize();
			
 
				-			if (STARPU_UNLIKELY(cures))
			
 
				-				STARPU_CUDA_REPORT_ERROR(cures);
			
 
				-		    }
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	starpu_interface_data_copy(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->nt*src_block->elemsize);
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream, enum cudaMemcpyKind kind)
			
 
				-{
			
 
				-	struct starpu_tensor_interface *src_block = src_interface;
			
 
				-	struct starpu_tensor_interface *dst_block = dst_interface;
			
 
				-
			
 
				-	uint32_t nx = src_block->nx;
			
 
				-	uint32_t ny = src_block->ny;
			
 
				-	uint32_t nz = src_block->nz;
			
 
				-	uint32_t nt = src_block->nt;
			
 
				-	size_t elemsize = src_block->elemsize;
			
 
				-
			
 
				-	cudaError_t cures;
			
 
				-
			
 
				-	int ret;
			
 
				-
			
 
				-	if (IS_CONTIGUOUS_TENSOR(nx, ny, nz, nt, src_block->ldy, src_block->ldz, src_block->ldt) &&
			
 
				-	    IS_CONTIGUOUS_TENSOR(nx, ny, nz, nt, dst_block->ldy, dst_block->ldz, dst_block->ldt))
			
 
				-	{
			
 
				-		/* Same lds on both sides, and contiguous, simple */
			
 
				-		ret = starpu_cuda_copy_async_sync((void *)src_block->ptr, src_node, (void *)dst_block->ptr, dst_node, nx*ny*nz*nt*elemsize, stream, kind);
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		/* TODO: use cudaMemcpy2D for whole 3D blocks etc. when they are contiguous */
			
 
				-
			
 
				-		/* Default case: we transfer all blocks one by one: nz transfers */
			
 
				-		/* TODO: use cudaMemcpy3D now that it works (except on cuda 4.2) */
			
 
				-		unsigned t;
			
 
				-		for (t = 0; t < src_block->nt; t++)
			
 
				-		{
			
 
				-		    unsigned z;
			
 
				-		    for (z = 0; z < src_block->nz; z++)
			
 
				-		    {
			
 
				-			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + t*src_block->ldt*src_block->elemsize + z*src_block->ldz*src_block->elemsize;
			
 
				-			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + t*dst_block->ldt*dst_block->elemsize + z*dst_block->ldz*dst_block->elemsize;
			
 
				-			double start;
			
 
				-
			
 
				-			starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
			
 
				-			cures = cudaMemcpy2DAsync((char *)dst_ptr, dst_block->ldy*elemsize,
			
 
				-                                                  (char *)src_ptr, src_block->ldy*elemsize,
			
 
				-                                                  nx*elemsize, ny, kind, stream);
			
 
				-			starpu_interface_end_driver_copy_async(src_node, dst_node, start);
			
 
				-
			
 
				-			if (STARPU_UNLIKELY(cures))
			
 
				-			{
			
 
				-				/* I don't know how to do that "better" */
			
 
				-				goto no_async_default;
			
 
				-			}
			
 
				-		    }
			
 
				-		}
			
 
				-
			
 
				-		ret = -EAGAIN;
			
 
				-
			
 
				-	}
			
 
				-
			
 
				-	starpu_interface_data_copy(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->nt*src_block->elemsize);
			
 
				-
			
 
				-	return ret;
			
 
				-
			
 
				-no_async_default:
			
 
				-
			
 
				-	{
			
 
				-	unsigned t;
			
 
				-	for (t = 0; t < src_block->nt; t++)
			
 
				-	{
			
 
				-	    unsigned z;
			
 
				-	    for (z = 0; z < src_block->nz; z++)
			
 
				-	    {
			
 
				-		uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + t*src_block->ldt*src_block->elemsize + z*src_block->ldz*src_block->elemsize;
			
 
				-		uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + t*dst_block->ldt*dst_block->elemsize + z*dst_block->ldz*dst_block->elemsize;
			
 
				-
			
 
				-		cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
			
 
				-                                     (char *)src_ptr, src_block->ldy*elemsize,
			
 
				-                                     nx*elemsize, ny, kind);
			
 
				-
			
 
				-		if (!cures)
			
 
				-			cures = cudaDeviceSynchronize();
			
 
				-		if (STARPU_UNLIKELY(cures))
			
 
				-			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				-	    }
			
 
				-	}
			
 
				-
			
 
				-	starpu_interface_data_copy(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->nt*src_block->elemsize);
			
 
				-	return 0;
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
			
 
				-{
			
 
				-	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
			
 
				-{
			
 
				-	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
			
 
				-}
			
 
				-#endif // STARPU_USE_CUDA
			
 
				-
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-static int copy_opencl_common(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
			
 
				-{
			
 
				-	struct starpu_tensor_interface *src_block = src_interface;
			
 
				-	struct starpu_tensor_interface *dst_block = dst_interface;
			
 
				-        int ret = 0;
			
 
				-
			
 
				-	uint32_t nx = src_block->nx;
			
 
				-	uint32_t ny = src_block->ny;
			
 
				-	uint32_t nz = src_block->nz;
			
 
				-
			
 
				-	/* We may have a contiguous buffer for the entire block, or contiguous
			
 
				-	 * plans within the block, we can avoid many small transfers that way */
			
 
				-	if (IS_CONTIGUOUS_TENSOR(nx, ny, nz, nt, src_block->ldy, src_block->ldz, src_block->ldt) &&
			
 
				-	    IS_CONTIGUOUS_TENSOR(nx, ny, nz, nt, dst_block->ldy, dst_block->ldz, dst_block->ldt))
			
 
				-	{
			
 
				-		ret = starpu_opencl_copy_async_sync(src_block->dev_handle, src_block->offset, src_node,
			
 
				-						    dst_block->dev_handle, dst_block->offset, dst_node,
			
 
				-						    src_block->nx*src_block->ny*src_block->nz*src_block->nt*src_block->elemsize,
			
 
				-						    event);
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		/* Default case: we transfer all lines one by one: ny*nz transfers */
			
 
				-		/* TODO: rect support */
			
 
				-		unsigned t;
			
 
				-		for (t = 0; t < src_block->nt; t++)
			
 
				-		{
			
 
				-		    unsigned z;
			
 
				-		    for (z = 0; z < src_block->nz; z++)
			
 
				-		    {
			
 
				-                        unsigned j;
			
 
				-                        for(j=0 ; j<src_block->ny ; j++)
			
 
				-			{
			
 
				-				ret = starpu_opencl_copy_async_sync(src_block->dev_handle,
			
 
				-								    src_block->offset + t*src_block->ldt*src_block->elemsize + z*src_block->ldz*src_block->elemsize + j*src_block->ldy*src_block->elemsize,
			
 
				-								    src_node,
			
 
				-								    dst_block->dev_handle,
			
 
				-								    dst_block->offset + t*dst_block->ldt*dst_block->elemsize + z*dst_block->ldz*dst_block->elemsize + j*dst_block->ldy*dst_block->elemsize,
			
 
				-								    dst_node,
			
 
				-								    src_block->nx*src_block->elemsize,
			
 
				-								    event);
			
 
				-                        }
			
 
				-		    }
			
 
				-                }
			
 
				-        }
			
 
				-
			
 
				-	starpu_interface_data_copy(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->nt*src_block->elemsize);
			
 
				-
			
 
				-	return ret;
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-        return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-        return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-	return copy_opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				-}
			
 
				-
			
 
				-#endif
			
 
				-
			
 
				 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)
			
 
				 {
			
 
				 	struct starpu_tensor_interface *src_block = (struct starpu_tensor_interface *) src_interface;
			
--- a/src/datawizard/node_ops.h
+++ b/src/datawizard/node_ops.h
@@ -31,10 +31,25 @@ typedef int (*copy_data_t)(uintptr_t src_ptr, size_t src_offset, unsigned src_no
 
				 				uintptr_t dst_ptr, size_t dst_offset, unsigned dst_node,
			
 
				 				size_t ssize, struct _starpu_async_channel *async_channel);
			
 
				 
			
 
				+typedef int (*copy2d_data_t)(uintptr_t src_ptr, size_t src_offset, unsigned src_node,
			
 
				+				uintptr_t dst_ptr, size_t dst_offset, unsigned dst_node,
			
 
				+				size_t blocksize,
			
 
				+				size_t numblocks, size_t ld_src, size_t ld_dst,
			
 
				+				struct _starpu_async_channel *async_channel);
			
 
				+
			
 
				+typedef int (*copy3d_data_t)(uintptr_t src_ptr, size_t src_offset, unsigned src_node,
			
 
				+				uintptr_t dst_ptr, size_t dst_offset, unsigned dst_node,
			
 
				+				size_t blocksize,
			
 
				+				size_t numblocks_1, size_t ld1_src, size_t ld1_dst,
			
 
				+				size_t numblocks_2, size_t ld2_src, size_t ld2_dst,
			
 
				+				struct _starpu_async_channel *async_channel);
			
 
				+
			
 
				 struct _starpu_node_ops
			
 
				 {
			
 
				 	copy_interface_func_t copy_interface_to[STARPU_MPI_MS_RAM+1];
			
 
				 	copy_data_t copy_data_to[STARPU_MPI_MS_RAM+1];
			
 
				+	copy2d_data_t copy2d_data_to[STARPU_MPI_MS_RAM+1];
			
 
				+	copy3d_data_t copy3d_data_to[STARPU_MPI_MS_RAM+1];
			
 
				 	void (*wait_request_completion)(struct _starpu_async_channel *async_channel);
			
 
				 	unsigned (*test_request_completion)(struct _starpu_async_channel *async_channel);
			
 
				 	int (*is_direct_access_supported)(unsigned node, unsigned handling_node);
			
--- a/src/drivers/cpu/driver_cpu.c
+++ b/src/drivers/cpu/driver_cpu.c
@@ -564,6 +564,34 @@ struct _starpu_node_ops _starpu_driver_cpu_node_ops =
 
				 	.copy_data_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				 #endif
			
 
				 
			
 
				+	.copy2d_data_to[STARPU_UNUSED] = NULL,
			
 
				+	.copy2d_data_to[STARPU_CPU_RAM] = NULL,
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	.copy2d_data_to[STARPU_CUDA_RAM] = _starpu_cuda_copy2d_data_from_cpu_to_cuda,
			
 
				+#else
			
 
				+	.copy2d_data_to[STARPU_CUDA_RAM] = NULL,
			
 
				+#endif
			
 
				+	.copy2d_data_to[STARPU_OPENCL_RAM] = NULL,
			
 
				+	.copy2d_data_to[STARPU_DISK_RAM] = NULL,
			
 
				+	.copy2d_data_to[STARPU_MIC_RAM] = NULL,
			
 
				+	.copy2d_data_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				+
			
 
				+	.copy3d_data_to[STARPU_UNUSED] = NULL,
			
 
				+	.copy3d_data_to[STARPU_CPU_RAM] = NULL,
			
 
				+#if 0
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	.copy3d_data_to[STARPU_CUDA_RAM] = _starpu_cuda_copy3d_data_from_cpu_to_cuda,
			
 
				+#else
			
 
				+	.copy3d_data_to[STARPU_CUDA_RAM] = NULL,
			
 
				+#endif
			
 
				+#else
			
 
				+	.copy3d_data_to[STARPU_CUDA_RAM] = NULL,
			
 
				+#endif
			
 
				+	.copy3d_data_to[STARPU_OPENCL_RAM] = NULL,
			
 
				+	.copy3d_data_to[STARPU_DISK_RAM] = NULL,
			
 
				+	.copy3d_data_to[STARPU_MIC_RAM] = NULL,
			
 
				+	.copy3d_data_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				+
			
 
				 	.wait_request_completion = NULL,
			
 
				 	.test_request_completion = NULL,
			
 
				 	.is_direct_access_supported = _starpu_cpu_is_direct_access_supported,
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -53,6 +53,11 @@
 
				 #else
			
 
				 #define starpu_cudaStreamCreate(stream) cudaStreamCreate(stream)
			
 
				 #endif
			
 
				+
			
 
				+/* At least CUDA 4.2 still didn't have working memcpy3D */
			
 
				+#if CUDART_VERSION < 5000
			
 
				+#define BUGGED_MEMCPY3D
			
 
				+#endif
			
 
				 #endif
			
 
				 
			
 
				 /* the number of CUDA devices */
			
@@ -1187,6 +1192,195 @@ starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node,
 
				 
			
 
				 	return -EAGAIN;
			
 
				 }
			
 
				+
			
 
				+int
			
 
				+starpu_cuda_copy2d_async_sync(void *src_ptr, unsigned src_node,
			
 
				+			      void *dst_ptr, unsigned dst_node,
			
 
				+			      size_t blocksize,
			
 
				+			      size_t numblocks, size_t ld_src, size_t ld_dst,
			
 
				+			      cudaStream_t stream, enum cudaMemcpyKind kind)
			
 
				+{
			
 
				+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
			
 
				+	int peer_copy = 0;
			
 
				+	int src_dev = -1, dst_dev = -1;
			
 
				+#endif
			
 
				+	cudaError_t cures = 0;
			
 
				+
			
 
				+	if (kind == cudaMemcpyDeviceToDevice && src_node != dst_node)
			
 
				+	{
			
 
				+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
			
 
				+#  ifdef BUGGED_MEMCPY3D
			
 
				+		STARPU_ABORT_MSG("CUDA memcpy 3D peer buggy, but core triggered one?!");
			
 
				+#  endif
			
 
				+		peer_copy = 1;
			
 
				+		src_dev = starpu_memory_node_get_devid(src_node);
			
 
				+		dst_dev = starpu_memory_node_get_devid(dst_node);
			
 
				+#else
			
 
				+		STARPU_ABORT_MSG("CUDA memcpy 3D peer not available, but core triggered one ?!");
			
 
				+#endif
			
 
				+	}
			
 
				+
			
 
				+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
			
 
				+	if (peer_copy)
			
 
				+	{
			
 
				+		struct cudaMemcpy3DPeerParms p;
			
 
				+		memset(&p, 0, sizeof(p));
			
 
				+
			
 
				+		p.srcDevice = src_dev;
			
 
				+		p.dstDevice = dst_dev;
			
 
				+		p.srcPtr = make_cudaPitchedPtr((char *)src_ptr, ld_src, blocksize, numblocks);
			
 
				+		p.dstPtr = make_cudaPitchedPtr((char *)dst_ptr, ld_dst, blocksize, numblocks);
			
 
				+		p.extent = make_cudaExtent(blocksize, numblocks, 1);
			
 
				+
			
 
				+
			
 
				+		if (stream)
			
 
				+		{
			
 
				+			double start;
			
 
				+			starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
			
 
				+			cures = cudaMemcpy3DPeerAsync(&p, stream);
			
 
				+		}
			
 
				+
			
 
				+		/* Test if the asynchronous copy has failed or if the caller only asked for a synchronous copy */
			
 
				+		if (stream == NULL || cures)
			
 
				+		{
			
 
				+			cures = cudaMemcpy3DPeer(&p);
			
 
				+
			
 
				+			if (!cures)
			
 
				+				cures = cudaDeviceSynchronize();
			
 
				+			if (STARPU_UNLIKELY(cures))
			
 
				+				STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+
			
 
				+			return 0;
			
 
				+		}
			
 
				+	}
			
 
				+	else
			
 
				+#endif
			
 
				+	{
			
 
				+		if (stream)
			
 
				+		{
			
 
				+			double start;
			
 
				+			starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
			
 
				+			cures = cudaMemcpy2DAsync((char *)dst_ptr, ld_dst, (char *)src_ptr, ld_src,
			
 
				+				blocksize, numblocks, kind, stream);
			
 
				+			starpu_interface_end_driver_copy_async(src_node, dst_node, start);
			
 
				+		}
			
 
				+
			
 
				+		/* Test if the asynchronous copy has failed or if the caller only asked for a synchronous copy */
			
 
				+		if (stream == NULL || cures)
			
 
				+		{
			
 
				+			cures = cudaMemcpy2D((char *)dst_ptr, ld_dst, (char *)src_ptr, ld_src,
			
 
				+					blocksize, numblocks, kind);
			
 
				+			if (!cures)
			
 
				+				cures = cudaDeviceSynchronize();
			
 
				+			if (STARPU_UNLIKELY(cures))
			
 
				+				STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+
			
 
				+			return 0;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+
			
 
				+	return -EAGAIN;
			
 
				+}
			
 
				+
			
 
				+#if 0
			
 
				+/* CUDA doesn't seem to be providing a way to set ld2?? */
			
 
				+int
			
 
				+starpu_cuda_copy3d_async_sync(void *src_ptr, unsigned src_node,
			
 
				+			      void *dst_ptr, unsigned dst_node,
			
 
				+			      size_t blocksize,
			
 
				+			      size_t numblocks_1, size_t ld1_src, size_t ld1_dst,
			
 
				+			      size_t numblocks_2, size_t ld2_src, size_t ld2_dst,
			
 
				+			      cudaStream_t stream, enum cudaMemcpyKind kind)
			
 
				+{
			
 
				+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
			
 
				+	int peer_copy = 0;
			
 
				+	int src_dev = -1, dst_dev = -1;
			
 
				+#endif
			
 
				+	cudaError_t cures = 0;
			
 
				+
			
 
				+	if (kind == cudaMemcpyDeviceToDevice && src_node != dst_node)
			
 
				+	{
			
 
				+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
			
 
				+		peer_copy = 1;
			
 
				+		src_dev = starpu_memory_node_get_devid(src_node);
			
 
				+		dst_dev = starpu_memory_node_get_devid(dst_node);
			
 
				+#else
			
 
				+		STARPU_ABORT_MSG("CUDA memcpy 3D peer not available, but core triggered one ?!");
			
 
				+#endif
			
 
				+	}
			
 
				+
			
 
				+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
			
 
				+	if (peer_copy)
			
 
				+	{
			
 
				+		struct cudaMemcpy3DPeerParms p;
			
 
				+		memset(&p, 0, sizeof(p));
			
 
				+
			
 
				+		p.srcDevice = src_dev;
			
 
				+		p.dstDevice = dst_dev;
			
 
				+		p.srcPtr = make_cudaPitchedPtr((char *)src_ptr, ld1_src, blocksize, numblocks);
			
 
				+		p.dstPtr = make_cudaPitchedPtr((char *)dst_ptr, ld1_dst, blocksize, numblocks);
			
 
				+		// FIXME: how to pass ld2_src / ld2_dst ??
			
 
				+		p.extent = make_cudaExtent(blocksize, numblocks_1, numblocks_2);
			
 
				+
			
 
				+
			
 
				+		if (stream)
			
 
				+		{
			
 
				+			double start;
			
 
				+			starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
			
 
				+			cures = cudaMemcpy3DPeerAsync(&p, stream);
			
 
				+		}
			
 
				+
			
 
				+		/* Test if the asynchronous copy has failed or if the caller only asked for a synchronous copy */
			
 
				+		if (stream == NULL || cures)
			
 
				+		{
			
 
				+			cures = cudaMemcpy3DPeer(&p);
			
 
				+
			
 
				+			if (!cures)
			
 
				+				cures = cudaDeviceSynchronize();
			
 
				+			if (STARPU_UNLIKELY(cures))
			
 
				+				STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+
			
 
				+			return 0;
			
 
				+		}
			
 
				+	}
			
 
				+	else
			
 
				+#endif
			
 
				+	{
			
 
				+		struct cudaMemcpy3DParms p;
			
 
				+		memset(&p, 0, sizeof(p));
			
 
				+
			
 
				+		p.srcPtr = make_cudaPitchedPtr((char *)src_ptr, ld1_src, blocksize, numblocks);
			
 
				+		p.dstPtr = make_cudaPitchedPtr((char *)dst_ptr, ld1_dst, blocksize, numblocks);
			
 
				+		// FIXME: how to pass ld2_src / ld2_dst ??
			
 
				+		p.extent = make_cudaExtent(blocksize, numblocks, 1);
			
 
				+		p.kind = kind;
			
 
				+
			
 
				+		if (stream)
			
 
				+		{
			
 
				+			double start;
			
 
				+			starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
			
 
				+			cures = cudaMemcpy3DAsync(&p, stream);
			
 
				+			starpu_interface_end_driver_copy_async(src_node, dst_node, start);
			
 
				+		}
			
 
				+
			
 
				+		/* Test if the asynchronous copy has failed or if the caller only asked for a synchronous copy */
			
 
				+		if (stream == NULL || cures)
			
 
				+		{
			
 
				+			cures = cudaMemcpy3D(&p);
			
 
				+			if (!cures)
			
 
				+				cures = cudaDeviceSynchronize();
			
 
				+			if (STARPU_UNLIKELY(cures))
			
 
				+				STARPU_CUDA_REPORT_ERROR(cures);
			
 
				+
			
 
				+			return 0;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+
			
 
				+	return -EAGAIN;
			
 
				+}
			
 
				+#endif
			
 
				 #endif /* STARPU_USE_CUDA */
			
 
				 
			
 
				 int _starpu_run_cuda(struct _starpu_worker_set *workerarg)
			
@@ -1429,6 +1623,57 @@ int _starpu_cuda_copy_data_from_cpu_to_cuda(uintptr_t src, size_t src_offset, un
 
				 					   cudaMemcpyHostToDevice);
			
 
				 }
			
 
				 
			
 
				+int _starpu_cuda_copy2d_data_from_cuda_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node,
			
 
				+					      uintptr_t dst, size_t dst_offset, unsigned dst_node,
			
 
				+					      size_t blocksize, size_t numblocks, size_t ld_src, size_t ld_dst,
			
 
				+					      struct _starpu_async_channel *async_channel)
			
 
				+{
			
 
				+	int src_kind = starpu_node_get_kind(src_node);
			
 
				+	int dst_kind = starpu_node_get_kind(dst_node);
			
 
				+
			
 
				+	STARPU_ASSERT(src_kind == STARPU_CUDA_RAM && dst_kind == STARPU_CPU_RAM);
			
 
				+
			
 
				+	return starpu_cuda_copy2d_async_sync((void*) (src + src_offset), src_node,
			
 
				+					   (void*) (dst + dst_offset), dst_node,
			
 
				+					   blocksize, numblocks, ld_src, ld_dst,
			
 
				+					   async_channel?starpu_cuda_get_out_transfer_stream(src_node):NULL,
			
 
				+					   cudaMemcpyDeviceToHost);
			
 
				+}
			
 
				+
			
 
				+int _starpu_cuda_copy2d_data_from_cuda_to_cuda(uintptr_t src, size_t src_offset, unsigned src_node,
			
 
				+					       uintptr_t dst, size_t dst_offset, unsigned dst_node,
			
 
				+					       size_t blocksize, size_t numblocks, size_t ld_src, size_t ld_dst,
			
 
				+					       struct _starpu_async_channel *async_channel)
			
 
				+{
			
 
				+	int src_kind = starpu_node_get_kind(src_node);
			
 
				+	int dst_kind = starpu_node_get_kind(dst_node);
			
 
				+
			
 
				+	STARPU_ASSERT(src_kind == STARPU_CUDA_RAM && dst_kind == STARPU_CUDA_RAM);
			
 
				+
			
 
				+	return starpu_cuda_copy2d_async_sync((void*) (src + src_offset), src_node,
			
 
				+					   (void*) (dst + dst_offset), dst_node,
			
 
				+					   blocksize, numblocks, ld_src, ld_dst,
			
 
				+					   async_channel?starpu_cuda_get_peer_transfer_stream(src_node, dst_node):NULL,
			
 
				+					   cudaMemcpyDeviceToDevice);
			
 
				+}
			
 
				+
			
 
				+int _starpu_cuda_copy2d_data_from_cpu_to_cuda(uintptr_t src, size_t src_offset, unsigned src_node,
			
 
				+					      uintptr_t dst, size_t dst_offset, unsigned dst_node,
			
 
				+					      size_t blocksize, size_t numblocks, size_t ld_src, size_t ld_dst,
			
 
				+					      struct _starpu_async_channel *async_channel)
			
 
				+{
			
 
				+	int src_kind = starpu_node_get_kind(src_node);
			
 
				+	int dst_kind = starpu_node_get_kind(dst_node);
			
 
				+
			
 
				+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_CUDA_RAM);
			
 
				+
			
 
				+	return starpu_cuda_copy2d_async_sync((void*) (src + src_offset), src_node,
			
 
				+					   (void*) (dst + dst_offset), dst_node,
			
 
				+					   blocksize, numblocks, ld_src, ld_dst,
			
 
				+					   async_channel?starpu_cuda_get_in_transfer_stream(dst_node):NULL,
			
 
				+					   cudaMemcpyHostToDevice);
			
 
				+}
			
 
				+
			
 
				 #endif /* STARPU_USE_CUDA */
			
 
				 
			
 
				 int _starpu_cuda_is_direct_access_supported(unsigned node, unsigned handling_node)
			
@@ -1586,6 +1831,22 @@ struct _starpu_node_ops _starpu_driver_cuda_node_ops =
 
				 	.copy_data_to[STARPU_MIC_RAM] = NULL,
			
 
				 	.copy_data_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				 
			
 
				+	.copy2d_data_to[STARPU_UNUSED] = NULL,
			
 
				+	.copy2d_data_to[STARPU_CPU_RAM] = NULL,
			
 
				+	.copy2d_data_to[STARPU_CUDA_RAM] = NULL,
			
 
				+	.copy2d_data_to[STARPU_OPENCL_RAM] = NULL,
			
 
				+	.copy2d_data_to[STARPU_DISK_RAM] = NULL,
			
 
				+	.copy2d_data_to[STARPU_MIC_RAM] = NULL,
			
 
				+	.copy2d_data_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				+
			
 
				+	.copy3d_data_to[STARPU_UNUSED] = NULL,
			
 
				+	.copy3d_data_to[STARPU_CPU_RAM] = NULL,
			
 
				+	.copy3d_data_to[STARPU_CUDA_RAM] = NULL,
			
 
				+	.copy3d_data_to[STARPU_OPENCL_RAM] = NULL,
			
 
				+	.copy3d_data_to[STARPU_DISK_RAM] = NULL,
			
 
				+	.copy3d_data_to[STARPU_MIC_RAM] = NULL,
			
 
				+	.copy3d_data_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				+
			
 
				 	.wait_request_completion = NULL,
			
 
				 	.test_request_completion = NULL,
			
 
				 	.is_direct_access_supported = _starpu_cuda_is_direct_access_supported,
			
@@ -1612,6 +1873,27 @@ struct _starpu_node_ops _starpu_driver_cuda_node_ops =
 
				 	.copy_data_to[STARPU_MIC_RAM] = NULL,
			
 
				 	.copy_data_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				 
			
 
				+	.copy2d_data_to[STARPU_UNUSED] = NULL,
			
 
				+	.copy2d_data_to[STARPU_CPU_RAM] = _starpu_cuda_copy2d_data_from_cuda_to_cpu,
			
 
				+	.copy2d_data_to[STARPU_CUDA_RAM] = _starpu_cuda_copy2d_data_from_cuda_to_cuda,
			
 
				+	.copy2d_data_to[STARPU_OPENCL_RAM] = NULL,
			
 
				+	.copy2d_data_to[STARPU_DISK_RAM] = NULL,
			
 
				+	.copy2d_data_to[STARPU_MIC_RAM] = NULL,
			
 
				+	.copy2d_data_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				+
			
 
				+	.copy3d_data_to[STARPU_UNUSED] = NULL,
			
 
				+#if 0
			
 
				+	.copy3d_data_to[STARPU_CPU_RAM] = _starpu_cuda_copy3d_data_from_cuda_to_cpu,
			
 
				+	.copy3d_data_to[STARPU_CUDA_RAM] = _starpu_cuda_copy3d_data_from_cuda_to_cuda,
			
 
				+#else
			
 
				+	.copy3d_data_to[STARPU_CPU_RAM] = NULL,
			
 
				+	.copy3d_data_to[STARPU_CUDA_RAM] = NULL,
			
 
				+#endif
			
 
				+	.copy3d_data_to[STARPU_OPENCL_RAM] = NULL,
			
 
				+	.copy3d_data_to[STARPU_DISK_RAM] = NULL,
			
 
				+	.copy3d_data_to[STARPU_MIC_RAM] = NULL,
			
 
				+	.copy3d_data_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				+
			
 
				 	.wait_request_completion = _starpu_cuda_wait_request_completion,
			
 
				 	.test_request_completion = _starpu_cuda_test_request_completion,
			
 
				 	.is_direct_access_supported = _starpu_cuda_is_direct_access_supported,
			
--- a/src/drivers/cuda/driver_cuda.h
+++ b/src/drivers/cuda/driver_cuda.h
@@ -65,6 +65,14 @@ int _starpu_cuda_copy_data_from_cuda_to_cuda(uintptr_t src, size_t src_offset, u
 
				 int _starpu_cuda_copy_data_from_cuda_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
			
 
				 int _starpu_cuda_copy_data_from_cpu_to_cuda(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
			
 
				 
			
 
				+int _starpu_cuda_copy2d_data_from_cuda_to_cuda(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t blocksize, size_t numblocks, size_t ld_src, size_t ld_dst, struct _starpu_async_channel *async_channel);
			
 
				+int _starpu_cuda_copy2d_data_from_cuda_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t blocksize, size_t numblocks, size_t ld_src, size_t ld_dst, struct _starpu_async_channel *async_channel);
			
 
				+int _starpu_cuda_copy2d_data_from_cpu_to_cuda(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t blocksize, size_t numblocks, size_t ld_src, size_t ld_dst, struct _starpu_async_channel *async_channel);
			
 
				+
			
 
				+int _starpu_cuda_copy3d_data_from_cuda_to_cuda(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t blocksize, size_t numblocks_1, size_t ld1_src, size_t ld1_dst, size_t numblocks_2, size_t ld2_src, size_t ld2_dst, struct _starpu_async_channel *async_channel);
			
 
				+int _starpu_cuda_copy3d_data_from_cuda_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t blocksize, size_t numblocks_1, size_t ld1_src, size_t ld1_dst, size_t numblocks_2, size_t ld2_src, size_t ld2_dst, struct _starpu_async_channel *async_channel);
			
 
				+int _starpu_cuda_copy3d_data_from_cpu_to_cuda(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t blocksize, size_t numblocks_1, size_t ld1_src, size_t ld1_dst, size_t numblocks_2, size_t ld2_src, size_t ld2_dst, struct _starpu_async_channel *async_channel);
			
 
				+
			
 
				 int _starpu_cuda_is_direct_access_supported(unsigned node, unsigned handling_node);
			
 
				 uintptr_t _starpu_cuda_malloc_on_node(unsigned dst_node, size_t size, int flags);
			
 
				 void _starpu_cuda_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags);
			
--- a/src/drivers/disk/driver_disk.c
+++ b/src/drivers/disk/driver_disk.c
@@ -268,6 +268,8 @@ struct _starpu_node_ops _starpu_driver_disk_node_ops =
 
				 	.copy_data_to[STARPU_MIC_RAM] = NULL,
			
 
				 	.copy_data_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				 
			
 
				+	/* TODO: copy2D/3D? */
			
 
				+
			
 
				 	.wait_request_completion = _starpu_disk_wait_request_completion,
			
 
				 	.test_request_completion = _starpu_disk_test_request_completion,
			
 
				 	.is_direct_access_supported = _starpu_disk_is_direct_access_supported,
			
--- a/src/drivers/mic/driver_mic_source.c
+++ b/src/drivers/mic/driver_mic_source.c
@@ -704,6 +704,8 @@ struct _starpu_node_ops _starpu_driver_mic_node_ops =
 
				 	.copy_data_to[STARPU_MIC_RAM] = NULL,
			
 
				 	.copy_data_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				 
			
 
				+	/* TODO: copy2D/3D? */
			
 
				+
			
 
				 	.wait_request_completion = _starpu_mic_wait_request_completion,
			
 
				 	.test_request_completion = _starpu_mic_test_request_completion,
			
 
				 	.is_direct_access_supported = _starpu_mic_is_direct_access_supported,
			
--- a/src/drivers/mpi/driver_mpi_source.c
+++ b/src/drivers/mpi/driver_mpi_source.c
@@ -564,6 +564,8 @@ struct _starpu_node_ops _starpu_driver_mpi_node_ops =
 
				 	.copy_data_to[STARPU_MIC_RAM] = NULL,
			
 
				 	.copy_data_to[STARPU_MPI_MS_RAM] = _starpu_mpi_copy_data_from_mpi_to_mpi,
			
 
				 
			
 
				+	/* TODO: copy2D/3D? */
			
 
				+
			
 
				 	.wait_request_completion = _starpu_mpi_common_wait_request_completion,
			
 
				 	.test_request_completion = _starpu_mpi_common_test_event,
			
 
				 	.is_direct_access_supported = _starpu_mpi_is_direct_access_supported,
			
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -1394,6 +1394,8 @@ struct _starpu_node_ops _starpu_driver_opencl_node_ops =
 
				 	.copy_data_to[STARPU_MIC_RAM] = NULL,
			
 
				 	.copy_data_to[STARPU_MPI_MS_RAM] = NULL,
			
 
				 
			
 
				+	/* TODO: copy2D/3D? */
			
 
				+
			
 
				 	.wait_request_completion = _starpu_opencl_wait_request_completion,
			
 
				 	.test_request_completion = _starpu_opencl_test_request_completion,
			
 
				 	.is_direct_access_supported = _starpu_opencl_is_direct_access_supported,