Преглед на файлове

Move optimized cuda 2d copy from interfaces to new starpu_cuda_copy2/3d_async_sync

and use them from starpu_interface_copy2d and 3d
Samuel Thibault преди 5 години
родител
ревизия
d5a8eb5e00

+ 3 - 0
ChangeLog

@@ -31,6 +31,9 @@ Small changes:
   * Use the S4U interface of Simgrid instead of xbt and MSG.
   * Add starpu_interface_copy2d, 3d, and 4d to easily request data copies from
     data interfaces.
+  * Move optimized cuda 2d copy from interfaces to new
+    starpu_cuda_copy2d_async_sync and starpu_cuda_copy3d_async_sync, and use
+    them from starpu_interface_copy2d and 3d.
 
 StarPU 1.3.4 (git revision xxx)
 ==============================================

+ 39 - 1
include/starpu_cuda.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2010-2012,2014                           Université de Bordeaux
+ * Copyright (C) 2010-2012,2014,2020                      Université de Bordeaux
  * Copyright (C) 2011                                     Inria
  * Copyright (C) 2010-2013,2015,2017,2019                 CNRS
  *
@@ -87,6 +87,44 @@ const struct cudaDeviceProp *starpu_cuda_get_device_properties(unsigned workerid
 int starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node, void *dst_ptr, unsigned dst_node, size_t ssize, cudaStream_t stream, enum cudaMemcpyKind kind);
 
 /**
+   Copy \p numblocks blocks of \p blocksize bytes from the pointer \p src_ptr on
+   \p src_node to the pointer \p dst_ptr on \p dst_node.
+
+   The blocks start at addresses which are ld_src (resp. ld_dst) bytes apart in
+   the source (resp. destination) interface.
+
+   The function first tries to copy the data asynchronous (unless \p stream is
+   <c>NULL</c>). If the asynchronous copy fails or if \p stream is <c>NULL</c>,
+   it copies the data synchronously. The function returns <c>-EAGAIN</c> if the
+   asynchronous launch was successfull. It returns 0 if the synchronous copy was
+   successful, or fails otherwise.
+*/
+int starpu_cuda_copy2d_async_sync(void *src_ptr, unsigned src_node, void *dst_ptr, unsigned dst_node,
+				  size_t blocksize,
+				  size_t numblocks, size_t ld_src, size_t ld_dst,
+				  cudaStream_t stream, enum cudaMemcpyKind kind);
+
+/**
+   Copy \p numblocks_1 * \p numblocks_2 blocks of \p blocksize bytes from the
+   pointer \p src_ptr on \p src_node to the pointer \p dst_ptr on \p dst_node.
+
+   The blocks are grouped by \p numblocks_1 blocks whose start addresses are
+   ld1_src (resp. ld1_dst) bytes apart in the source (resp. destination)
+   interface.
+
+   The function first tries to copy the data asynchronous (unless \p stream is
+   <c>NULL</c>). If the asynchronous copy fails or if \p stream is <c>NULL</c>,
+   it copies the data synchronously. The function returns <c>-EAGAIN</c> if the
+   asynchronous launch was successfull. It returns 0 if the synchronous copy was
+   successful, or fails otherwise.
+*/
+int starpu_cuda_copy3d_async_sync(void *src_ptr, unsigned src_node, void *dst_ptr, unsigned dst_node,
+				  size_t blocksize,
+				  size_t numblocks_1, size_t ld1_src, size_t ld1_dst,
+				  size_t numblocks_2, size_t ld2_src, size_t ld2_dst,
+				  cudaStream_t stream, enum cudaMemcpyKind kind);
+
+/**
    Call <c>cudaSetDevice(\p devid)</c> or <c>cudaGLSetGLDevice(\p devid)</c>,
    according to whether \p devid is among the field
    starpu_conf::cuda_opengl_interoperability.

+ 22 - 2
src/datawizard/copy_driver.c

@@ -324,6 +324,9 @@ int starpu_interface_copy2d(uintptr_t src, size_t src_offset, unsigned src_node,
 {
 	int ret = 0;
 	unsigned i;
+	struct _starpu_async_channel *async_channel = async_data;
+	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);
+	struct _starpu_node_ops *node_ops = _starpu_memory_node_get_node_ops(src_node);
 
 	STARPU_ASSERT_MSG(ld_src >= blocksize, "block size %lu is bigger than ld %lu in source", (unsigned long) blocksize, (unsigned long) ld_src);
 	STARPU_ASSERT_MSG(ld_dst >= blocksize, "block size %lu is bigger than ld %lu in destination", (unsigned long) blocksize, (unsigned long) ld_dst);
@@ -334,7 +337,13 @@ int starpu_interface_copy2d(uintptr_t src, size_t src_offset, unsigned src_node,
 					     dst, dst_offset, dst_node,
 					     blocksize * numblocks, async_data);
 
-	/* TODO: introduce and call node_ops->copy2d_data_to when available */
+	if (node_ops && node_ops->copy2d_data_to[dst_kind])
+		/* Hardware-optimized non-contiguous case */
+		return node_ops->copy2d_data_to[dst_kind](src, src_offset, src_node,
+							     dst, dst_offset, dst_node,
+							     blocksize,
+							     numblocks, ld_src, ld_dst,
+							     async_channel);
 
 	for (i = 0; i < numblocks; i++)
 	{
@@ -356,6 +365,9 @@ int starpu_interface_copy3d(uintptr_t src, size_t src_offset, unsigned src_node,
 {
 	int ret = 0;
 	unsigned i;
+	struct _starpu_async_channel *async_channel = async_data;
+	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);
+	struct _starpu_node_ops *node_ops = _starpu_memory_node_get_node_ops(src_node);
 
 	STARPU_ASSERT_MSG(ld1_src >= blocksize, "block size %lu is bigger than ld %lu in source", (unsigned long) blocksize, (unsigned long) ld1_src);
 	STARPU_ASSERT_MSG(ld1_dst >= blocksize, "block size %lu is bigger than ld %lu in destination", (unsigned long) blocksize, (unsigned long) ld1_dst);
@@ -371,7 +383,15 @@ int starpu_interface_copy3d(uintptr_t src, size_t src_offset, unsigned src_node,
 					     blocksize * numblocks_1 * numblocks_2,
 					     async_data);
 
-	/* TODO: introduce and call node_ops->copy3d_data_to when available */
+	if (node_ops && node_ops->copy3d_data_to[dst_kind])
+		/* Hardware-optimized non-contiguous case */
+		return node_ops->copy3d_data_to[dst_kind](src, src_offset, src_node,
+							     dst, dst_offset, dst_node,
+							     blocksize,
+							     numblocks_1, ld1_src, ld1_dst,
+							     numblocks_2, ld2_src, ld2_dst,
+							     async_channel);
+
 
 	for (i = 0; i < numblocks_2; i++)
 	{

+ 0 - 303
src/datawizard/interfaces/block_interface.c

@@ -18,40 +18,10 @@
 
 #include <starpu.h>
 
-#ifdef STARPU_USE_CUDA
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-#endif
-#ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
-static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
-#endif
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 
 static const struct starpu_data_copy_methods block_copy_data_methods_s =
 {
-#ifdef STARPU_USE_CUDA
-	.ram_to_cuda = copy_ram_to_cuda,
-	.cuda_to_ram = copy_cuda_to_ram,
-	.ram_to_cuda_async = copy_ram_to_cuda_async,
-	.cuda_to_ram_async = copy_cuda_to_ram_async,
-	.cuda_to_cuda = copy_cuda_to_cuda,
-#endif
-#ifdef STARPU_USE_OPENCL
-	.ram_to_opencl = copy_ram_to_opencl,
-	.opencl_to_ram = copy_opencl_to_ram,
-	.opencl_to_opencl = copy_opencl_to_opencl,
-        .ram_to_opencl_async = copy_ram_to_opencl_async,
-	.opencl_to_ram_async = copy_opencl_to_ram_async,
-	.opencl_to_opencl_async = copy_opencl_to_opencl_async,
-#endif
 	.any_to_any = copy_any_to_any,
 };
 
@@ -494,279 +464,6 @@ static void free_block_buffer_on_node(void *data_interface, unsigned node)
 	starpu_free_on_node(node, block_interface->dev_handle, nx*ny*nz*elemsize);
 }
 
-#ifdef STARPU_USE_CUDA
-static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
-{
-	struct starpu_block_interface *src_block = src_interface;
-	struct starpu_block_interface *dst_block = dst_interface;
-
-	uint32_t nx = src_block->nx;
-	uint32_t ny = src_block->ny;
-	uint32_t nz = src_block->nz;
-	size_t elemsize = src_block->elemsize;
-
-	cudaError_t cures;
-
-	if (IS_CONTIGUOUS_MATRIX(nx, ny, src_block->ldy) && (src_block->ldy == dst_block->ldy))
-	{
-		/* Is that a single contiguous buffer ? */
-		if (IS_CONTIGUOUS_BLOCK(nx, ny, nz, src_block->ldy, src_block->ldz) &&
-		    IS_CONTIGUOUS_BLOCK(nx, ny, nz, dst_block->ldy, dst_block->ldz))
-		{
-			starpu_cuda_copy_async_sync((void *)src_block->ptr, src_node, (void *)dst_block->ptr, dst_node, nx*ny*nz*elemsize, NULL, kind);
-                }
-		else
-		{
-			/* Are all plans contiguous */
-                        cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
-                                             (char *)src_block->ptr, src_block->ldz*elemsize,
-                                             nx*ny*elemsize, nz, kind);
-			if (!cures)
-				cures = cudaDeviceSynchronize();
-                        if (STARPU_UNLIKELY(cures))
-                                STARPU_CUDA_REPORT_ERROR(cures);
-                }
-	}
-	else
-	{
-		/* Default case: we transfer all blocks one by one: nz transfers */
-		/* TODO: use cudaMemcpy3D now that it works (except on cuda 4.2) */
-		unsigned layer;
-		for (layer = 0; layer < src_block->nz; layer++)
-		{
-			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
-			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
-
-			cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
-                                             (char *)src_ptr, src_block->ldy*elemsize,
-                                             nx*elemsize, ny, kind);
-
-			if (!cures)
-				cures = cudaDeviceSynchronize();
-			if (STARPU_UNLIKELY(cures))
-				STARPU_CUDA_REPORT_ERROR(cures);
-		}
-	}
-
-	starpu_interface_data_copy(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
-
-	return 0;
-}
-
-static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream, enum cudaMemcpyKind kind)
-{
-	struct starpu_block_interface *src_block = src_interface;
-	struct starpu_block_interface *dst_block = dst_interface;
-
-	uint32_t nx = src_block->nx;
-	uint32_t ny = src_block->ny;
-	uint32_t nz = src_block->nz;
-	size_t elemsize = src_block->elemsize;
-
-	cudaError_t cures;
-
-	int ret;
-
-	/* We may have a contiguous buffer for the entire block, or contiguous
-	 * plans within the block, we can avoid many small transfers that way */
-	if (IS_CONTIGUOUS_MATRIX(nx, ny, src_block->ldy) && (src_block->ldy == dst_block->ldy))
-	{
-		/* Is that a single contiguous buffer ? */
-		if (IS_CONTIGUOUS_BLOCK(nx, ny, nz, src_block->ldy, src_block->ldz) &&
-		    IS_CONTIGUOUS_BLOCK(nx, ny, nz, dst_block->ldy, dst_block->ldz))
-		{
-			ret = starpu_cuda_copy_async_sync((void *)src_block->ptr, src_node, (void *)dst_block->ptr, dst_node, nx*ny*nz*elemsize, stream, kind);
-		}
-		else
-		{
-			double start;
-			/* Are all plans contiguous */
-			starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
-			cures = cudaMemcpy2DAsync((char *)dst_block->ptr, dst_block->ldz*elemsize,
-					(char *)src_block->ptr, src_block->ldz*elemsize,
-					nx*ny*elemsize, nz, kind, stream);
-			starpu_interface_end_driver_copy_async(src_node, dst_node, start);
-			if (STARPU_UNLIKELY(cures))
-			{
-				cures = cudaMemcpy2D((char *)dst_block->ptr, dst_block->ldz*elemsize,
-						(char *)src_block->ptr, src_block->ldz*elemsize,
-						nx*ny*elemsize, nz, kind);
-				if (!cures)
-					cures = cudaDeviceSynchronize();
-				if (STARPU_UNLIKELY(cures))
-					STARPU_CUDA_REPORT_ERROR(cures);
-
-				ret = 0;
-			}
-			else
-			{
-				ret = -EAGAIN;
-			}
-		}
-	}
-	else
-	{
-		/* Default case: we transfer all blocks one by one: nz 2D transfers */
-		/* TODO: use cudaMemcpy3D now that it works (except on cuda 4.2) */
-		unsigned layer;
-		for (layer = 0; layer < src_block->nz; layer++)
-		{
-			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
-			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
-			double start;
-
-			starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
-			cures = cudaMemcpy2DAsync((char *)dst_ptr, dst_block->ldy*elemsize,
-                                                  (char *)src_ptr, src_block->ldy*elemsize,
-                                                  nx*elemsize, ny, kind, stream);
-			starpu_interface_end_driver_copy_async(src_node, dst_node, start);
-
-			if (STARPU_UNLIKELY(cures))
-			{
-				/* I don't know how to do that "better" */
-				goto no_async_default;
-			}
-
-		}
-
-		ret = -EAGAIN;
-
-	}
-
-	starpu_interface_data_copy(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
-
-	return ret;
-
-no_async_default:
-
-	{
-	unsigned layer;
-	for (layer = 0; layer < src_block->nz; layer++)
-	{
-		uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + layer*src_block->ldz*src_block->elemsize;
-		uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + layer*dst_block->ldz*dst_block->elemsize;
-
-		cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
-                                     (char *)src_ptr, src_block->ldy*elemsize,
-                                     nx*elemsize, ny, kind);
-
-		if (!cures)
-			cures = cudaDeviceSynchronize();
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-	}
-
-	starpu_interface_data_copy(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
-	return 0;
-	}
-}
-
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
-}
-
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
-}
-
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
-}
-
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
-{
-	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
-}
-
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
-{
-	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
-}
-#endif // STARPU_USE_CUDA
-
-#ifdef STARPU_USE_OPENCL
-static int copy_opencl_common(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
-{
-	struct starpu_block_interface *src_block = src_interface;
-	struct starpu_block_interface *dst_block = dst_interface;
-        int ret = 0;
-
-	uint32_t nx = src_block->nx;
-	uint32_t ny = src_block->ny;
-
-	/* We may have a contiguous buffer for the entire block, or contiguous
-	 * plans within the block, we can avoid many small transfers that way */
-	if (IS_CONTIGUOUS_BLOCK(nx, ny, nz, src_block->ldy, src_block->ldz) &&
-	    IS_CONTIGUOUS_BLOCK(nx, ny, nz, dst_block->ldy, dst_block->ldz))
-		/* Is that a single contiguous buffer ? */
-	{
-		ret = starpu_opencl_copy_async_sync(src_block->dev_handle, src_block->offset, src_node,
-						    dst_block->dev_handle, dst_block->offset, dst_node,
-						    src_block->nx*src_block->ny*src_block->nz*src_block->elemsize,
-						    event);
-	}
-	else
-	{
-		/* Default case: we transfer all lines one by one: ny*nz transfers */
-		/* TODO: rect support */
-		unsigned layer;
-		for (layer = 0; layer < src_block->nz; layer++)
-		{
-                        unsigned j;
-                        for(j=0 ; j<src_block->ny ; j++)
-			{
-				ret = starpu_opencl_copy_async_sync(src_block->dev_handle,
-								    src_block->offset + layer*src_block->ldz*src_block->elemsize + j*src_block->ldy*src_block->elemsize,
-								    src_node,
-								    dst_block->dev_handle,
-								    dst_block->offset + layer*dst_block->ldz*dst_block->elemsize + j*dst_block->ldy*dst_block->elemsize,
-								    dst_node,
-								    src_block->nx*src_block->elemsize,
-								    event);
-                        }
-                }
-        }
-
-	starpu_interface_data_copy(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->elemsize);
-
-	return ret;
-}
-
-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
-}
-
-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
-}
-
-static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
-}
-
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-        return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
-}
-
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-        return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
-}
-
-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-	return copy_opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
-}
-
-#endif
-
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)
 {
 	struct starpu_block_interface *src_block = (struct starpu_block_interface *) src_interface;

+ 0 - 227
src/datawizard/interfaces/matrix_interface.c

@@ -18,58 +18,10 @@
 
 #include <starpu.h>
 
-#ifdef STARPU_USE_CUDA
-/* At least CUDA 4.2 still didn't have working memcpy3D */
-#if CUDART_VERSION < 5000
-#define BUGGED_MEMCPY3D
-#endif
-
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
-#ifndef BUGGED_MEMCPY3D
-static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
-#endif
-#endif
-#ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
-static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
-#endif
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 
 static const struct starpu_data_copy_methods matrix_copy_data_methods_s =
 {
-#ifdef STARPU_USE_CUDA
-	.ram_to_cuda = copy_ram_to_cuda,
-	.cuda_to_ram = copy_cuda_to_ram,
-	.ram_to_cuda_async = copy_ram_to_cuda_async,
-	.cuda_to_ram_async = copy_cuda_to_ram_async,
-	.cuda_to_cuda = copy_cuda_to_cuda,
-#ifndef BUGGED_MEMCPY3D
-	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
-#endif
-#else
-#ifdef STARPU_SIMGRID
-#ifndef BUGGED_MEMCPY3D
-	/* Enable GPU-GPU transfers in simgrid */
-	.cuda_to_cuda_async = (void *)1,
-#endif
-#endif
-#endif
-#ifdef STARPU_USE_OPENCL
-	.ram_to_opencl = copy_ram_to_opencl,
-	.opencl_to_ram = copy_opencl_to_ram,
-	.opencl_to_opencl = copy_opencl_to_opencl,
-        .ram_to_opencl_async = copy_ram_to_opencl_async,
-	.opencl_to_ram_async = copy_opencl_to_ram_async,
-	.opencl_to_opencl_async = copy_opencl_to_opencl_async,
-#endif
 	.any_to_any = copy_any_to_any,
 };
 
@@ -537,185 +489,6 @@ static void free_matrix_buffer_on_node(void *data_interface, unsigned node)
 	starpu_free_on_node(node, matrix_interface->dev_handle, matrix_interface->allocsize);
 }
 
-#ifdef STARPU_USE_CUDA
-static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind, int is_async, cudaStream_t stream)
-{
-	struct starpu_matrix_interface *src_matrix = src_interface;
-	struct starpu_matrix_interface *dst_matrix = dst_interface;
-
-	size_t elemsize = src_matrix->elemsize;
-	cudaError_t cures;
-
-	if (is_async)
-	{
-		double start;
-		starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
-		cures = cudaMemcpy2DAsync((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
-			(char *)src_matrix->ptr, src_matrix->ld*elemsize,
-			src_matrix->nx*elemsize, src_matrix->ny, kind, stream);
-		starpu_interface_end_driver_copy_async(src_node, dst_node, start);
-		if (!cures)
-			return -EAGAIN;
-	}
-
-	cures = cudaMemcpy2D((char *)dst_matrix->ptr, dst_matrix->ld*elemsize,
-		(char *)src_matrix->ptr, src_matrix->ld*elemsize,
-		src_matrix->nx*elemsize, src_matrix->ny, kind);
-	if (!cures)
-		cures = cudaDeviceSynchronize();
-	if (STARPU_UNLIKELY(cures))
-	{
-		int ret = copy_any_to_any(src_interface, src_node, dst_interface, dst_node, (void*)(uintptr_t)is_async);
-		if (ret == -EAGAIN) return ret;
-		if (ret) STARPU_CUDA_REPORT_ERROR(cures);
-	}
-
-	starpu_interface_data_copy(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
-
-	return 0;
-}
-
-#ifndef BUGGED_MEMCPY3D
-static int copy_cuda_peer(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, int is_async, cudaStream_t stream)
-{
-#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
-	struct starpu_matrix_interface *src_matrix = src_interface;
-	struct starpu_matrix_interface *dst_matrix = dst_interface;
-
-	size_t elemsize = src_matrix->elemsize;
-	cudaError_t cures;
-
-	int src_dev = starpu_memory_node_get_devid(src_node);
-	int dst_dev = starpu_memory_node_get_devid(dst_node);
-
-	struct cudaMemcpy3DPeerParms p;
-	memset(&p, 0, sizeof(p));
-
-	p.srcDevice = src_dev;
-	p.dstDevice = dst_dev;
-	p.srcPtr = make_cudaPitchedPtr((char *)src_matrix->ptr, src_matrix->ld * elemsize, src_matrix->nx, src_matrix->ny);
-	p.dstPtr = make_cudaPitchedPtr((char *)dst_matrix->ptr, dst_matrix->ld * elemsize, dst_matrix->nx, dst_matrix->ny);
-	p.extent = make_cudaExtent(src_matrix->nx * elemsize, src_matrix->ny, 1);
-
-	if (is_async)
-	{
-		double start;
-		starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
-		cures = cudaMemcpy3DPeerAsync(&p, stream);
-		starpu_interface_end_driver_copy_async(src_node, dst_node, start);
-		if (!cures)
-			return -EAGAIN;
-	}
-
-	cures = cudaMemcpy3DPeer(&p);
-	if (!cures)
-		cures = cudaDeviceSynchronize();
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-
-	starpu_interface_data_copy(src_node, dst_node, (size_t)src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
-
-	return 0;
-#else
-	STARPU_ABORT_MSG("CUDA memcpy 3D peer not available, but core triggered one ?!");
-#endif
-}
-#endif
-
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, 0, 0);
-}
-
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, 0, 0);
-}
-
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-	if (src_node == dst_node)
-		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, 0, 0);
-	else
-#ifdef BUGGED_MEMCPY3D
-		STARPU_ABORT_MSG("CUDA memcpy 3D peer not available, but core triggered one?!");
-#else
-		return copy_cuda_peer(src_interface, src_node, dst_interface, dst_node, 0, 0);
-#endif
-}
-
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
-{
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, 1, stream);
-}
-
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
-{
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, 1, stream);
-}
-
-#ifndef BUGGED_MEMCPY3D
-static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
-{
-	if (src_node == dst_node)
-		return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, 1, stream);
-	else
-		return copy_cuda_peer(src_interface, src_node, dst_interface, dst_node, 1, stream);
-}
-#endif
-#endif // STARPU_USE_CUDA
-
-#ifdef STARPU_USE_OPENCL
-static int copy_opencl_common(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
-{
-	struct starpu_matrix_interface *src_matrix = src_interface;
-	struct starpu_matrix_interface *dst_matrix = dst_interface;
-        int ret;
-
-	STARPU_ASSERT_MSG((src_matrix->ld == src_matrix->nx) && (dst_matrix->ld == dst_matrix->nx), "XXX non contiguous buffers are not properly supported in OpenCL yet. (TODO)");
-
-	ret = starpu_opencl_copy_async_sync(src_matrix->dev_handle, src_matrix->offset, src_node,
-					    dst_matrix->dev_handle, dst_matrix->offset, dst_node,
-					    src_matrix->nx*src_matrix->ny*src_matrix->elemsize,
-					    event);
-
-	starpu_interface_data_copy(src_node, dst_node, src_matrix->nx*src_matrix->ny*src_matrix->elemsize);
-
-	return ret;
-}
-
-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
-}
-
-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
-}
-
-static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
-}
-
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-        return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
-}
-
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-        return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
-}
-
-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-	return copy_opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
-}
-
-#endif
-
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)
 {
 	struct starpu_matrix_interface *src_matrix = (struct starpu_matrix_interface *) src_interface;

+ 0 - 279
src/datawizard/interfaces/tensor_interface.c

@@ -18,40 +18,10 @@
 
 #include <starpu.h>
 
-#ifdef STARPU_USE_CUDA
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-#endif
-#ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
-static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
-#endif
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 
 static const struct starpu_data_copy_methods tensor_copy_data_methods_s =
 {
-#ifdef STARPU_USE_CUDA
-	.ram_to_cuda = copy_ram_to_cuda,
-	.cuda_to_ram = copy_cuda_to_ram,
-	.ram_to_cuda_async = copy_ram_to_cuda_async,
-	.cuda_to_ram_async = copy_cuda_to_ram_async,
-	.cuda_to_cuda = copy_cuda_to_cuda,
-#endif
-#ifdef STARPU_USE_OPENCL
-	.ram_to_opencl = copy_ram_to_opencl,
-	.opencl_to_ram = copy_opencl_to_ram,
-	.opencl_to_opencl = copy_opencl_to_opencl,
-        .ram_to_opencl_async = copy_ram_to_opencl_async,
-	.opencl_to_ram_async = copy_opencl_to_ram_async,
-	.opencl_to_opencl_async = copy_opencl_to_opencl_async,
-#endif
 	.any_to_any = copy_any_to_any,
 };
 
@@ -566,255 +536,6 @@ static void free_tensor_buffer_on_node(void *data_interface, unsigned node)
 	starpu_free_on_node(node, tensor_interface->dev_handle, nx*ny*nz*nt*elemsize);
 }
 
-#ifdef STARPU_USE_CUDA
-static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
-{
-	struct starpu_tensor_interface *src_block = src_interface;
-	struct starpu_tensor_interface *dst_block = dst_interface;
-
-	uint32_t nx = src_block->nx;
-	uint32_t ny = src_block->ny;
-	uint32_t nz = src_block->nz;
-	uint32_t nt = src_block->nt;
-	size_t elemsize = src_block->elemsize;
-
-	cudaError_t cures;
-
-	if (IS_CONTIGUOUS_TENSOR(nx, ny, nz, nt, src_block->ldy, src_block->ldz, src_block->ldt) &&
-	    IS_CONTIGUOUS_TENSOR(nx, ny, nz, nt, dst_block->ldy, dst_block->ldz, dst_block->ldt))
-	{
-		/* Same lds on both sides, and contiguous, simple */
-		starpu_cuda_copy_async_sync((void *)src_block->ptr, src_node, (void *)dst_block->ptr, dst_node, nx*ny*nz*nt*elemsize, NULL, kind);
-	}
-	else
-	{
-		/* TODO: use cudaMemcpy2D for whole 3D blocks etc. when they are contiguous */
-
-		/* Default case: we transfer all blocks one by one: nz transfers */
-		/* TODO: use cudaMemcpy3D now that it works (except on cuda 4.2) */
-		unsigned t;
-		for (t = 0; t < src_block->nt; t++)
-		{
-		    unsigned z;
-		    for (z = 0; z < src_block->nz; z++)
-		    {
-			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + t*src_block->ldt*src_block->elemsize + z*src_block->ldz*src_block->elemsize;
-			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + t*dst_block->ldt*src_block->elemsize + z*dst_block->ldz*dst_block->elemsize;
-
-			cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
-                                             (char *)src_ptr, src_block->ldy*elemsize,
-                                             nx*elemsize, ny, kind);
-
-			if (!cures)
-				cures = cudaDeviceSynchronize();
-			if (STARPU_UNLIKELY(cures))
-				STARPU_CUDA_REPORT_ERROR(cures);
-		    }
-		}
-	}
-
-	starpu_interface_data_copy(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->nt*src_block->elemsize);
-
-	return 0;
-}
-
-static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream, enum cudaMemcpyKind kind)
-{
-	struct starpu_tensor_interface *src_block = src_interface;
-	struct starpu_tensor_interface *dst_block = dst_interface;
-
-	uint32_t nx = src_block->nx;
-	uint32_t ny = src_block->ny;
-	uint32_t nz = src_block->nz;
-	uint32_t nt = src_block->nt;
-	size_t elemsize = src_block->elemsize;
-
-	cudaError_t cures;
-
-	int ret;
-
-	if (IS_CONTIGUOUS_TENSOR(nx, ny, nz, nt, src_block->ldy, src_block->ldz, src_block->ldt) &&
-	    IS_CONTIGUOUS_TENSOR(nx, ny, nz, nt, dst_block->ldy, dst_block->ldz, dst_block->ldt))
-	{
-		/* Same lds on both sides, and contiguous, simple */
-		ret = starpu_cuda_copy_async_sync((void *)src_block->ptr, src_node, (void *)dst_block->ptr, dst_node, nx*ny*nz*nt*elemsize, stream, kind);
-	}
-	else
-	{
-		/* TODO: use cudaMemcpy2D for whole 3D blocks etc. when they are contiguous */
-
-		/* Default case: we transfer all blocks one by one: nz transfers */
-		/* TODO: use cudaMemcpy3D now that it works (except on cuda 4.2) */
-		unsigned t;
-		for (t = 0; t < src_block->nt; t++)
-		{
-		    unsigned z;
-		    for (z = 0; z < src_block->nz; z++)
-		    {
-			uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + t*src_block->ldt*src_block->elemsize + z*src_block->ldz*src_block->elemsize;
-			uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + t*dst_block->ldt*dst_block->elemsize + z*dst_block->ldz*dst_block->elemsize;
-			double start;
-
-			starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
-			cures = cudaMemcpy2DAsync((char *)dst_ptr, dst_block->ldy*elemsize,
-                                                  (char *)src_ptr, src_block->ldy*elemsize,
-                                                  nx*elemsize, ny, kind, stream);
-			starpu_interface_end_driver_copy_async(src_node, dst_node, start);
-
-			if (STARPU_UNLIKELY(cures))
-			{
-				/* I don't know how to do that "better" */
-				goto no_async_default;
-			}
-		    }
-		}
-
-		ret = -EAGAIN;
-
-	}
-
-	starpu_interface_data_copy(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->nt*src_block->elemsize);
-
-	return ret;
-
-no_async_default:
-
-	{
-	unsigned t;
-	for (t = 0; t < src_block->nt; t++)
-	{
-	    unsigned z;
-	    for (z = 0; z < src_block->nz; z++)
-	    {
-		uint8_t *src_ptr = ((uint8_t *)src_block->ptr) + t*src_block->ldt*src_block->elemsize + z*src_block->ldz*src_block->elemsize;
-		uint8_t *dst_ptr = ((uint8_t *)dst_block->ptr) + t*dst_block->ldt*dst_block->elemsize + z*dst_block->ldz*dst_block->elemsize;
-
-		cures = cudaMemcpy2D((char *)dst_ptr, dst_block->ldy*elemsize,
-                                     (char *)src_ptr, src_block->ldy*elemsize,
-                                     nx*elemsize, ny, kind);
-
-		if (!cures)
-			cures = cudaDeviceSynchronize();
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-	    }
-	}
-
-	starpu_interface_data_copy(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->nt*src_block->elemsize);
-	return 0;
-	}
-}
-
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
-}
-
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
-}
-
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
-}
-
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
-{
-	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
-}
-
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream)
-{
-	return copy_cuda_async_common(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
-}
-#endif // STARPU_USE_CUDA
-
-#ifdef STARPU_USE_OPENCL
-static int copy_opencl_common(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
-{
-	struct starpu_tensor_interface *src_block = src_interface;
-	struct starpu_tensor_interface *dst_block = dst_interface;
-        int ret = 0;
-
-	uint32_t nx = src_block->nx;
-	uint32_t ny = src_block->ny;
-	uint32_t nz = src_block->nz;
-
-	/* We may have a contiguous buffer for the entire block, or contiguous
-	 * plans within the block, we can avoid many small transfers that way */
-	if (IS_CONTIGUOUS_TENSOR(nx, ny, nz, nt, src_block->ldy, src_block->ldz, src_block->ldt) &&
-	    IS_CONTIGUOUS_TENSOR(nx, ny, nz, nt, dst_block->ldy, dst_block->ldz, dst_block->ldt))
-	{
-		ret = starpu_opencl_copy_async_sync(src_block->dev_handle, src_block->offset, src_node,
-						    dst_block->dev_handle, dst_block->offset, dst_node,
-						    src_block->nx*src_block->ny*src_block->nz*src_block->nt*src_block->elemsize,
-						    event);
-	}
-	else
-	{
-		/* Default case: we transfer all lines one by one: ny*nz transfers */
-		/* TODO: rect support */
-		unsigned t;
-		for (t = 0; t < src_block->nt; t++)
-		{
-		    unsigned z;
-		    for (z = 0; z < src_block->nz; z++)
-		    {
-                        unsigned j;
-                        for(j=0 ; j<src_block->ny ; j++)
-			{
-				ret = starpu_opencl_copy_async_sync(src_block->dev_handle,
-								    src_block->offset + t*src_block->ldt*src_block->elemsize + z*src_block->ldz*src_block->elemsize + j*src_block->ldy*src_block->elemsize,
-								    src_node,
-								    dst_block->dev_handle,
-								    dst_block->offset + t*dst_block->ldt*dst_block->elemsize + z*dst_block->ldz*dst_block->elemsize + j*dst_block->ldy*dst_block->elemsize,
-								    dst_node,
-								    src_block->nx*src_block->elemsize,
-								    event);
-                        }
-		    }
-                }
-        }
-
-	starpu_interface_data_copy(src_node, dst_node, src_block->nx*src_block->ny*src_block->nz*src_block->nt*src_block->elemsize);
-
-	return ret;
-}
-
-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
-}
-
-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
-}
-
-static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
-}
-
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-        return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
-}
-
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-        return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
-}
-
-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-	return copy_opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
-}
-
-#endif
-
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)
 {
 	struct starpu_tensor_interface *src_block = (struct starpu_tensor_interface *) src_interface;

+ 15 - 0
src/datawizard/node_ops.h

@@ -31,10 +31,25 @@ typedef int (*copy_data_t)(uintptr_t src_ptr, size_t src_offset, unsigned src_no
 				uintptr_t dst_ptr, size_t dst_offset, unsigned dst_node,
 				size_t ssize, struct _starpu_async_channel *async_channel);
 
+typedef int (*copy2d_data_t)(uintptr_t src_ptr, size_t src_offset, unsigned src_node,
+				uintptr_t dst_ptr, size_t dst_offset, unsigned dst_node,
+				size_t blocksize,
+				size_t numblocks, size_t ld_src, size_t ld_dst,
+				struct _starpu_async_channel *async_channel);
+
+typedef int (*copy3d_data_t)(uintptr_t src_ptr, size_t src_offset, unsigned src_node,
+				uintptr_t dst_ptr, size_t dst_offset, unsigned dst_node,
+				size_t blocksize,
+				size_t numblocks_1, size_t ld1_src, size_t ld1_dst,
+				size_t numblocks_2, size_t ld2_src, size_t ld2_dst,
+				struct _starpu_async_channel *async_channel);
+
 struct _starpu_node_ops
 {
 	copy_interface_func_t copy_interface_to[STARPU_MPI_MS_RAM+1];
 	copy_data_t copy_data_to[STARPU_MPI_MS_RAM+1];
+	copy2d_data_t copy2d_data_to[STARPU_MPI_MS_RAM+1];
+	copy3d_data_t copy3d_data_to[STARPU_MPI_MS_RAM+1];
 	void (*wait_request_completion)(struct _starpu_async_channel *async_channel);
 	unsigned (*test_request_completion)(struct _starpu_async_channel *async_channel);
 	int (*is_direct_access_supported)(unsigned node, unsigned handling_node);

+ 28 - 0
src/drivers/cpu/driver_cpu.c

@@ -564,6 +564,34 @@ struct _starpu_node_ops _starpu_driver_cpu_node_ops =
 	.copy_data_to[STARPU_MPI_MS_RAM] = NULL,
 #endif
 
+	.copy2d_data_to[STARPU_UNUSED] = NULL,
+	.copy2d_data_to[STARPU_CPU_RAM] = NULL,
+#ifdef STARPU_USE_CUDA
+	.copy2d_data_to[STARPU_CUDA_RAM] = _starpu_cuda_copy2d_data_from_cpu_to_cuda,
+#else
+	.copy2d_data_to[STARPU_CUDA_RAM] = NULL,
+#endif
+	.copy2d_data_to[STARPU_OPENCL_RAM] = NULL,
+	.copy2d_data_to[STARPU_DISK_RAM] = NULL,
+	.copy2d_data_to[STARPU_MIC_RAM] = NULL,
+	.copy2d_data_to[STARPU_MPI_MS_RAM] = NULL,
+
+	.copy3d_data_to[STARPU_UNUSED] = NULL,
+	.copy3d_data_to[STARPU_CPU_RAM] = NULL,
+#if 0
+#ifdef STARPU_USE_CUDA
+	.copy3d_data_to[STARPU_CUDA_RAM] = _starpu_cuda_copy3d_data_from_cpu_to_cuda,
+#else
+	.copy3d_data_to[STARPU_CUDA_RAM] = NULL,
+#endif
+#else
+	.copy3d_data_to[STARPU_CUDA_RAM] = NULL,
+#endif
+	.copy3d_data_to[STARPU_OPENCL_RAM] = NULL,
+	.copy3d_data_to[STARPU_DISK_RAM] = NULL,
+	.copy3d_data_to[STARPU_MIC_RAM] = NULL,
+	.copy3d_data_to[STARPU_MPI_MS_RAM] = NULL,
+
 	.wait_request_completion = NULL,
 	.test_request_completion = NULL,
 	.is_direct_access_supported = _starpu_cpu_is_direct_access_supported,

+ 282 - 0
src/drivers/cuda/driver_cuda.c

@@ -53,6 +53,11 @@
 #else
 #define starpu_cudaStreamCreate(stream) cudaStreamCreate(stream)
 #endif
+
+/* At least CUDA 4.2 still didn't have working memcpy3D */
+#if CUDART_VERSION < 5000
+#define BUGGED_MEMCPY3D
+#endif
 #endif
 
 /* the number of CUDA devices */
@@ -1187,6 +1192,195 @@ starpu_cuda_copy_async_sync(void *src_ptr, unsigned src_node,
 
 	return -EAGAIN;
 }
+
+int
+starpu_cuda_copy2d_async_sync(void *src_ptr, unsigned src_node,
+			      void *dst_ptr, unsigned dst_node,
+			      size_t blocksize,
+			      size_t numblocks, size_t ld_src, size_t ld_dst,
+			      cudaStream_t stream, enum cudaMemcpyKind kind)
+{
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
+	int peer_copy = 0;
+	int src_dev = -1, dst_dev = -1;
+#endif
+	cudaError_t cures = 0;
+
+	if (kind == cudaMemcpyDeviceToDevice && src_node != dst_node)
+	{
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
+#  ifdef BUGGED_MEMCPY3D
+		STARPU_ABORT_MSG("CUDA memcpy 3D peer buggy, but core triggered one?!");
+#  endif
+		peer_copy = 1;
+		src_dev = starpu_memory_node_get_devid(src_node);
+		dst_dev = starpu_memory_node_get_devid(dst_node);
+#else
+		STARPU_ABORT_MSG("CUDA memcpy 3D peer not available, but core triggered one ?!");
+#endif
+	}
+
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
+	if (peer_copy)
+	{
+		struct cudaMemcpy3DPeerParms p;
+		memset(&p, 0, sizeof(p));
+
+		p.srcDevice = src_dev;
+		p.dstDevice = dst_dev;
+		p.srcPtr = make_cudaPitchedPtr((char *)src_ptr, ld_src, blocksize, numblocks);
+		p.dstPtr = make_cudaPitchedPtr((char *)dst_ptr, ld_dst, blocksize, numblocks);
+		p.extent = make_cudaExtent(blocksize, numblocks, 1);
+
+
+		if (stream)
+		{
+			double start;
+			starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
+			cures = cudaMemcpy3DPeerAsync(&p, stream);
+		}
+
+		/* Test if the asynchronous copy has failed or if the caller only asked for a synchronous copy */
+		if (stream == NULL || cures)
+		{
+			cures = cudaMemcpy3DPeer(&p);
+
+			if (!cures)
+				cures = cudaDeviceSynchronize();
+			if (STARPU_UNLIKELY(cures))
+				STARPU_CUDA_REPORT_ERROR(cures);
+
+			return 0;
+		}
+	}
+	else
+#endif
+	{
+		if (stream)
+		{
+			double start;
+			starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
+			cures = cudaMemcpy2DAsync((char *)dst_ptr, ld_dst, (char *)src_ptr, ld_src,
+				blocksize, numblocks, kind, stream);
+			starpu_interface_end_driver_copy_async(src_node, dst_node, start);
+		}
+
+		/* Test if the asynchronous copy has failed or if the caller only asked for a synchronous copy */
+		if (stream == NULL || cures)
+		{
+			cures = cudaMemcpy2D((char *)dst_ptr, ld_dst, (char *)src_ptr, ld_src,
+					blocksize, numblocks, kind);
+			if (!cures)
+				cures = cudaDeviceSynchronize();
+			if (STARPU_UNLIKELY(cures))
+				STARPU_CUDA_REPORT_ERROR(cures);
+
+			return 0;
+		}
+	}
+
+
+	return -EAGAIN;
+}
+
+#if 0
+/* CUDA doesn't seem to be providing a way to set ld2?? */
+int
+starpu_cuda_copy3d_async_sync(void *src_ptr, unsigned src_node,
+			      void *dst_ptr, unsigned dst_node,
+			      size_t blocksize,
+			      size_t numblocks_1, size_t ld1_src, size_t ld1_dst,
+			      size_t numblocks_2, size_t ld2_src, size_t ld2_dst,
+			      cudaStream_t stream, enum cudaMemcpyKind kind)
+{
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
+	int peer_copy = 0;
+	int src_dev = -1, dst_dev = -1;
+#endif
+	cudaError_t cures = 0;
+
+	if (kind == cudaMemcpyDeviceToDevice && src_node != dst_node)
+	{
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
+		peer_copy = 1;
+		src_dev = starpu_memory_node_get_devid(src_node);
+		dst_dev = starpu_memory_node_get_devid(dst_node);
+#else
+		STARPU_ABORT_MSG("CUDA memcpy 3D peer not available, but core triggered one ?!");
+#endif
+	}
+
+#ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
+	if (peer_copy)
+	{
+		struct cudaMemcpy3DPeerParms p;
+		memset(&p, 0, sizeof(p));
+
+		p.srcDevice = src_dev;
+		p.dstDevice = dst_dev;
+		p.srcPtr = make_cudaPitchedPtr((char *)src_ptr, ld1_src, blocksize, numblocks);
+		p.dstPtr = make_cudaPitchedPtr((char *)dst_ptr, ld1_dst, blocksize, numblocks);
+		// FIXME: how to pass ld2_src / ld2_dst ??
+		p.extent = make_cudaExtent(blocksize, numblocks_1, numblocks_2);
+
+
+		if (stream)
+		{
+			double start;
+			starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
+			cures = cudaMemcpy3DPeerAsync(&p, stream);
+		}
+
+		/* Test if the asynchronous copy has failed or if the caller only asked for a synchronous copy */
+		if (stream == NULL || cures)
+		{
+			cures = cudaMemcpy3DPeer(&p);
+
+			if (!cures)
+				cures = cudaDeviceSynchronize();
+			if (STARPU_UNLIKELY(cures))
+				STARPU_CUDA_REPORT_ERROR(cures);
+
+			return 0;
+		}
+	}
+	else
+#endif
+	{
+		struct cudaMemcpy3DParms p;
+		memset(&p, 0, sizeof(p));
+
+		p.srcPtr = make_cudaPitchedPtr((char *)src_ptr, ld1_src, blocksize, numblocks);
+		p.dstPtr = make_cudaPitchedPtr((char *)dst_ptr, ld1_dst, blocksize, numblocks);
+		// FIXME: how to pass ld2_src / ld2_dst ??
+		p.extent = make_cudaExtent(blocksize, numblocks, 1);
+		p.kind = kind;
+
+		if (stream)
+		{
+			double start;
+			starpu_interface_start_driver_copy_async(src_node, dst_node, &start);
+			cures = cudaMemcpy3DAsync(&p, stream);
+			starpu_interface_end_driver_copy_async(src_node, dst_node, start);
+		}
+
+		/* Test if the asynchronous copy has failed or if the caller only asked for a synchronous copy */
+		if (stream == NULL || cures)
+		{
+			cures = cudaMemcpy3D(&p);
+			if (!cures)
+				cures = cudaDeviceSynchronize();
+			if (STARPU_UNLIKELY(cures))
+				STARPU_CUDA_REPORT_ERROR(cures);
+
+			return 0;
+		}
+	}
+
+
+	return -EAGAIN;
+}
+#endif
 #endif /* STARPU_USE_CUDA */
 
 int _starpu_run_cuda(struct _starpu_worker_set *workerarg)
@@ -1429,6 +1623,57 @@ int _starpu_cuda_copy_data_from_cpu_to_cuda(uintptr_t src, size_t src_offset, un
 					   cudaMemcpyHostToDevice);
 }
 
+int _starpu_cuda_copy2d_data_from_cuda_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node,
+					      uintptr_t dst, size_t dst_offset, unsigned dst_node,
+					      size_t blocksize, size_t numblocks, size_t ld_src, size_t ld_dst,
+					      struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+
+	STARPU_ASSERT(src_kind == STARPU_CUDA_RAM && dst_kind == STARPU_CPU_RAM);
+
+	return starpu_cuda_copy2d_async_sync((void*) (src + src_offset), src_node,
+					   (void*) (dst + dst_offset), dst_node,
+					   blocksize, numblocks, ld_src, ld_dst,
+					   async_channel?starpu_cuda_get_out_transfer_stream(src_node):NULL,
+					   cudaMemcpyDeviceToHost);
+}
+
+int _starpu_cuda_copy2d_data_from_cuda_to_cuda(uintptr_t src, size_t src_offset, unsigned src_node,
+					       uintptr_t dst, size_t dst_offset, unsigned dst_node,
+					       size_t blocksize, size_t numblocks, size_t ld_src, size_t ld_dst,
+					       struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+
+	STARPU_ASSERT(src_kind == STARPU_CUDA_RAM && dst_kind == STARPU_CUDA_RAM);
+
+	return starpu_cuda_copy2d_async_sync((void*) (src + src_offset), src_node,
+					   (void*) (dst + dst_offset), dst_node,
+					   blocksize, numblocks, ld_src, ld_dst,
+					   async_channel?starpu_cuda_get_peer_transfer_stream(src_node, dst_node):NULL,
+					   cudaMemcpyDeviceToDevice);
+}
+
+int _starpu_cuda_copy2d_data_from_cpu_to_cuda(uintptr_t src, size_t src_offset, unsigned src_node,
+					      uintptr_t dst, size_t dst_offset, unsigned dst_node,
+					      size_t blocksize, size_t numblocks, size_t ld_src, size_t ld_dst,
+					      struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_CUDA_RAM);
+
+	return starpu_cuda_copy2d_async_sync((void*) (src + src_offset), src_node,
+					   (void*) (dst + dst_offset), dst_node,
+					   blocksize, numblocks, ld_src, ld_dst,
+					   async_channel?starpu_cuda_get_in_transfer_stream(dst_node):NULL,
+					   cudaMemcpyHostToDevice);
+}
+
 #endif /* STARPU_USE_CUDA */
 
 int _starpu_cuda_is_direct_access_supported(unsigned node, unsigned handling_node)
@@ -1586,6 +1831,22 @@ struct _starpu_node_ops _starpu_driver_cuda_node_ops =
 	.copy_data_to[STARPU_MIC_RAM] = NULL,
 	.copy_data_to[STARPU_MPI_MS_RAM] = NULL,
 
+	.copy2d_data_to[STARPU_UNUSED] = NULL,
+	.copy2d_data_to[STARPU_CPU_RAM] = NULL,
+	.copy2d_data_to[STARPU_CUDA_RAM] = NULL,
+	.copy2d_data_to[STARPU_OPENCL_RAM] = NULL,
+	.copy2d_data_to[STARPU_DISK_RAM] = NULL,
+	.copy2d_data_to[STARPU_MIC_RAM] = NULL,
+	.copy2d_data_to[STARPU_MPI_MS_RAM] = NULL,
+
+	.copy3d_data_to[STARPU_UNUSED] = NULL,
+	.copy3d_data_to[STARPU_CPU_RAM] = NULL,
+	.copy3d_data_to[STARPU_CUDA_RAM] = NULL,
+	.copy3d_data_to[STARPU_OPENCL_RAM] = NULL,
+	.copy3d_data_to[STARPU_DISK_RAM] = NULL,
+	.copy3d_data_to[STARPU_MIC_RAM] = NULL,
+	.copy3d_data_to[STARPU_MPI_MS_RAM] = NULL,
+
 	.wait_request_completion = NULL,
 	.test_request_completion = NULL,
 	.is_direct_access_supported = _starpu_cuda_is_direct_access_supported,
@@ -1612,6 +1873,27 @@ struct _starpu_node_ops _starpu_driver_cuda_node_ops =
 	.copy_data_to[STARPU_MIC_RAM] = NULL,
 	.copy_data_to[STARPU_MPI_MS_RAM] = NULL,
 
+	.copy2d_data_to[STARPU_UNUSED] = NULL,
+	.copy2d_data_to[STARPU_CPU_RAM] = _starpu_cuda_copy2d_data_from_cuda_to_cpu,
+	.copy2d_data_to[STARPU_CUDA_RAM] = _starpu_cuda_copy2d_data_from_cuda_to_cuda,
+	.copy2d_data_to[STARPU_OPENCL_RAM] = NULL,
+	.copy2d_data_to[STARPU_DISK_RAM] = NULL,
+	.copy2d_data_to[STARPU_MIC_RAM] = NULL,
+	.copy2d_data_to[STARPU_MPI_MS_RAM] = NULL,
+
+	.copy3d_data_to[STARPU_UNUSED] = NULL,
+#if 0
+	.copy3d_data_to[STARPU_CPU_RAM] = _starpu_cuda_copy3d_data_from_cuda_to_cpu,
+	.copy3d_data_to[STARPU_CUDA_RAM] = _starpu_cuda_copy3d_data_from_cuda_to_cuda,
+#else
+	.copy3d_data_to[STARPU_CPU_RAM] = NULL,
+	.copy3d_data_to[STARPU_CUDA_RAM] = NULL,
+#endif
+	.copy3d_data_to[STARPU_OPENCL_RAM] = NULL,
+	.copy3d_data_to[STARPU_DISK_RAM] = NULL,
+	.copy3d_data_to[STARPU_MIC_RAM] = NULL,
+	.copy3d_data_to[STARPU_MPI_MS_RAM] = NULL,
+
 	.wait_request_completion = _starpu_cuda_wait_request_completion,
 	.test_request_completion = _starpu_cuda_test_request_completion,
 	.is_direct_access_supported = _starpu_cuda_is_direct_access_supported,

+ 8 - 0
src/drivers/cuda/driver_cuda.h

@@ -65,6 +65,14 @@ int _starpu_cuda_copy_data_from_cuda_to_cuda(uintptr_t src, size_t src_offset, u
 int _starpu_cuda_copy_data_from_cuda_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
 int _starpu_cuda_copy_data_from_cpu_to_cuda(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
 
+int _starpu_cuda_copy2d_data_from_cuda_to_cuda(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t blocksize, size_t numblocks, size_t ld_src, size_t ld_dst, struct _starpu_async_channel *async_channel);
+int _starpu_cuda_copy2d_data_from_cuda_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t blocksize, size_t numblocks, size_t ld_src, size_t ld_dst, struct _starpu_async_channel *async_channel);
+int _starpu_cuda_copy2d_data_from_cpu_to_cuda(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t blocksize, size_t numblocks, size_t ld_src, size_t ld_dst, struct _starpu_async_channel *async_channel);
+
+int _starpu_cuda_copy3d_data_from_cuda_to_cuda(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t blocksize, size_t numblocks_1, size_t ld1_src, size_t ld1_dst, size_t numblocks_2, size_t ld2_src, size_t ld2_dst, struct _starpu_async_channel *async_channel);
+int _starpu_cuda_copy3d_data_from_cuda_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t blocksize, size_t numblocks_1, size_t ld1_src, size_t ld1_dst, size_t numblocks_2, size_t ld2_src, size_t ld2_dst, struct _starpu_async_channel *async_channel);
+int _starpu_cuda_copy3d_data_from_cpu_to_cuda(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t blocksize, size_t numblocks_1, size_t ld1_src, size_t ld1_dst, size_t numblocks_2, size_t ld2_src, size_t ld2_dst, struct _starpu_async_channel *async_channel);
+
 int _starpu_cuda_is_direct_access_supported(unsigned node, unsigned handling_node);
 uintptr_t _starpu_cuda_malloc_on_node(unsigned dst_node, size_t size, int flags);
 void _starpu_cuda_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags);

+ 2 - 0
src/drivers/disk/driver_disk.c

@@ -268,6 +268,8 @@ struct _starpu_node_ops _starpu_driver_disk_node_ops =
 	.copy_data_to[STARPU_MIC_RAM] = NULL,
 	.copy_data_to[STARPU_MPI_MS_RAM] = NULL,
 
+	/* TODO: copy2D/3D? */
+
 	.wait_request_completion = _starpu_disk_wait_request_completion,
 	.test_request_completion = _starpu_disk_test_request_completion,
 	.is_direct_access_supported = _starpu_disk_is_direct_access_supported,

+ 2 - 0
src/drivers/mic/driver_mic_source.c

@@ -704,6 +704,8 @@ struct _starpu_node_ops _starpu_driver_mic_node_ops =
 	.copy_data_to[STARPU_MIC_RAM] = NULL,
 	.copy_data_to[STARPU_MPI_MS_RAM] = NULL,
 
+	/* TODO: copy2D/3D? */
+
 	.wait_request_completion = _starpu_mic_wait_request_completion,
 	.test_request_completion = _starpu_mic_test_request_completion,
 	.is_direct_access_supported = _starpu_mic_is_direct_access_supported,

+ 2 - 0
src/drivers/mpi/driver_mpi_source.c

@@ -564,6 +564,8 @@ struct _starpu_node_ops _starpu_driver_mpi_node_ops =
 	.copy_data_to[STARPU_MIC_RAM] = NULL,
 	.copy_data_to[STARPU_MPI_MS_RAM] = _starpu_mpi_copy_data_from_mpi_to_mpi,
 
+	/* TODO: copy2D/3D? */
+
 	.wait_request_completion = _starpu_mpi_common_wait_request_completion,
 	.test_request_completion = _starpu_mpi_common_test_event,
 	.is_direct_access_supported = _starpu_mpi_is_direct_access_supported,

+ 2 - 0
src/drivers/opencl/driver_opencl.c

@@ -1394,6 +1394,8 @@ struct _starpu_node_ops _starpu_driver_opencl_node_ops =
 	.copy_data_to[STARPU_MIC_RAM] = NULL,
 	.copy_data_to[STARPU_MPI_MS_RAM] = NULL,
 
+	/* TODO: copy2D/3D? */
+
 	.wait_request_completion = _starpu_opencl_wait_request_completion,
 	.test_request_completion = _starpu_opencl_test_request_completion,
 	.is_direct_access_supported = _starpu_opencl_is_direct_access_supported,