Преглед изворни кода

Introduce the starpu_interface_copy helper, and the any_to_any copy method, which permits to factorize a lot of interface code.

Samuel Thibault пре 12 година
родитељ
комит
de6deb09f5

+ 40 - 35
doc/chapters/advanced-api.texi

@@ -354,51 +354,56 @@ Unpack the data handle from the contiguous buffer at the address @code{ptr} of s
 @end deftp
 
 @deftp {Data Type} {struct starpu_data_copy_methods}
-Defines the per-interface methods.
+Defines the per-interface methods. If the @code{any_to_any} method is provided,
+it will be used by default if no more specific method is provided. It can still
+be useful to provide more specific method in case of e.g. available particular
+CUDA or OpenCL support.
+
 @table @asis
-@item @code{int @{ram,cuda,opencl@}_to_@{ram,cuda,opencl@}(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)}
+@item @code{int (*@{ram,cuda,opencl@}_to_@{ram,cuda,opencl@})(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)}
 These 12 functions define how to copy data from the @var{src_interface}
 interface on the @var{src_node} node to the @var{dst_interface} interface
 on the @var{dst_node} node. They return 0 on success.
 
-@item @code{int (*ram_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)}
-Define how to copy data from the @var{src_interface} interface on the
-@var{src_node} node (in RAM) to the @var{dst_interface} interface on the
-@var{dst_node} node (on a CUDA device), using the given @var{stream}. Return 0
-on success.
-
-@item @code{int (*cuda_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)}
-Define how to copy data from the @var{src_interface} interface on the
-@var{src_node} node (on a CUDA device) to the @var{dst_interface} interface on the
-@var{dst_node} node (in RAM), using the given @var{stream}. Return 0
-on success.
-
-@item @code{int (*cuda_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)}
-Define how to copy data from the @var{src_interface} interface on the
-@var{src_node} node (on a CUDA device) to the @var{dst_interface} interface on
-the @var{dst_node} node (on another CUDA device), using the given @var{stream}.
-Return 0 on success.
-
-@item @code{int (*ram_to_opencl_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, /* cl_event * */ void *event)}
+@item @code{int (*@{ram,cuda@}_to_@{ram,cuda@}_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)}
+These 3 functions (@code{ram_to_ram} is not among these) define how to copy
+data from the @var{src_interface} interface on the @var{src_node} node to the
+@var{dst_interface} interface on the @var{dst_node} node, using the given
+@var{stream}. Must return 0 if the transfer was actually completed completely
+synchronously, or -EAGAIN if at least some transfers are still ongoing and
+should be awaited for by the core.
+
+@item @code{int (*@{ram,opencl@}_to_@{ram,opencl@}_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, /* cl_event * */ void *event)}
+These 3 functions (@code{ram_to_ram} is not among them) define how to copy
+data from the @var{src_interface} interface on the @var{src_node} node to the
+@var{dst_interface} interface on the @var{dst_node} node, by recording in
+@var{event}, a pointer to a cl_event, the event of the last submitted transfer.
+Must return 0 if the transfer was actually completed completely synchronously,
+or -EAGAIN if at least some transfers are still ongoing and should be awaited
+for by the core.
+
+@item @code{int (*any_to_any)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)}
 Define how to copy data from the @var{src_interface} interface on the
-@var{src_node} node (in RAM) to the @var{dst_interface} interface on the
-@var{dst_node} node (on an OpenCL device), using @var{event}, a pointer to a
-cl_event. Return 0 on success.
+@var{src_node} node to the @var{dst_interface} interface on the @var{dst_node}
+node. This is meant to be implemented through the @var{starpu_interface_copy}
+helper, to which @var{async_data} should be passed as such, and will be used to
+manage asynchronicity. This must return -EAGAIN if any of the
+@var{starpu_interface_copy} calls has returned -EAGAIN (i.e. at least some
+transfer is still ongoing), and return 0 otherwise.
 
-@item @code{int (*opencl_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, /* cl_event * */ void *event)}
-Define how to copy data from the @var{src_interface} interface on the
-@var{src_node} node (on an OpenCL device) to the @var{dst_interface} interface
-on the @var{dst_node} node (in RAM), using the given @var{event}, a pointer to
-a cl_event. Return 0 on success.
-
-@item @code{int (*opencl_to_opencl_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, /* cl_event * */ void *event)}
-Define how to copy data from the @var{src_interface} interface on the
-@var{src_node} node (on an OpenCL device) to the @var{dst_interface} interface
-on the @var{dst_node} node (on another OpenCL device), using the given
-@var{event}, a pointer to a cl_event. Return 0 on success.
 @end table
 @end deftp
 
+@deftypefun int starpu_interface_copy(uintptr_t @var{src}, unsigned @var{src_node}, size_t @var{src_offset}, uintptr_t @var{dst}, unsigned @var{dst_node}, size_t @var{dst_offset}, size_t @var{size}, {void *}@var{async_data});
+Copy @var{size} bytes from byte offset @var{src_offset} of @var{src} on
+@var{src_node} to byte offset @var{dst_offset} of @var{dst} on @var{dst_node}.
+This is to be used in the @var{any_to_any} copy method, which is provided with
+the @var{async_data} to be pased to @var{starpu_interface_copy}. this returns
+-EAGAIN if the transfer is still ongoing, or 0 if the transfer is already
+completed.
+@end deftypefun
+
+
 @deftypefun uint32_t starpu_crc32_be_n ({void *}@var{input}, size_t @var{n}, uint32_t @var{inputcrc})
 Compute the CRC of a byte buffer seeded by the inputcrc "current
 state". The return value should be considered as the new "current

+ 4 - 0
include/starpu_data_interfaces.h

@@ -73,8 +73,12 @@ struct starpu_data_copy_methods
 	int (*opencl_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event);
 	int (*opencl_to_opencl_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event);
 #endif
+
+	int (*any_to_any)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 };
 
+int starpu_interface_copy(uintptr_t src, unsigned src_node, size_t src_offset, uintptr_t dst, unsigned dst_node, size_t dst_offset, size_t size, void *async_data);
+
 enum starpu_data_interface_id
 {
 	STARPU_MATRIX_INTERFACE_ID=0,

+ 118 - 30
src/datawizard/copy_driver.c

@@ -134,8 +134,10 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 	{
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
 		/* STARPU_CPU_RAM -> STARPU_CPU_RAM */
-		STARPU_ASSERT(copy_methods->ram_to_ram);
-		copy_methods->ram_to_ram(src_interface, src_node, dst_interface, dst_node);
+		if (copy_methods->ram_to_ram)
+			copy_methods->ram_to_ram(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
 		break;
 #ifdef STARPU_USE_CUDA
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
@@ -143,10 +145,10 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 #if !defined(HAVE_CUDA_MEMCPY_PEER)
 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == src_node);
 #endif
-		STARPU_ASSERT(copy_methods->cuda_to_ram);
-		if (!req || !copy_methods->cuda_to_ram_async)
+		if (!req || !(copy_methods->cuda_to_ram_async || copy_methods->any_to_any))
 		{
 			/* this is not associated to a request so it's synchronous */
+			STARPU_ASSERT(copy_methods->cuda_to_ram);
 			copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
 		}
 		else
@@ -156,7 +158,13 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 
 			stream = starpu_cuda_get_local_out_transfer_stream();
-			ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
+			if (copy_methods->cuda_to_ram_async)
+				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
+			else
+			{
+				STARPU_ASSERT(copy_methods->any_to_any);
+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+			}
 
 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
@@ -168,10 +176,10 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 #if !defined(HAVE_CUDA_MEMCPY_PEER)
 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node);
 #endif
-		STARPU_ASSERT(copy_methods->ram_to_cuda);
-		if (!req || !copy_methods->ram_to_cuda_async)
+		if (!req || !(copy_methods->ram_to_cuda_async || copy_methods->any_to_any))
 		{
 			/* this is not associated to a request so it's synchronous */
+			STARPU_ASSERT(copy_methods->ram_to_cuda);
 			copy_methods->ram_to_cuda(src_interface, src_node, dst_interface, dst_node);
 		}
 		else
@@ -182,7 +190,13 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 				STARPU_CUDA_REPORT_ERROR(cures);
 
 			stream = starpu_cuda_get_local_in_transfer_stream();
-			ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
+			if (copy_methods->ram_to_cuda_async)
+				ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
+			else
+			{
+				STARPU_ASSERT(copy_methods->any_to_any);
+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+			}
 
 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
 			if (STARPU_UNLIKELY(cures != cudaSuccess))
@@ -191,8 +205,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 		break;
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
 		/* CUDA - CUDA transfer */
-		STARPU_ASSERT(copy_methods->cuda_to_cuda || copy_methods->cuda_to_cuda_async);
-		if (!req || !copy_methods->cuda_to_cuda_async)
+		if (!req || !(copy_methods->cuda_to_cuda_async || copy_methods->any_to_any))
 		{
 			STARPU_ASSERT(copy_methods->cuda_to_cuda);
 			/* this is not associated to a request so it's synchronous */
@@ -205,7 +218,13 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 
 			stream = starpu_cuda_get_local_peer_transfer_stream();
-			ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
+			if (copy_methods->cuda_to_cuda_async)
+				ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
+			else
+			{
+				STARPU_ASSERT(copy_methods->any_to_any);
+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+			}
 
 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
@@ -215,54 +234,65 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 #ifdef STARPU_USE_OPENCL
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
 		/* OpenCL -> RAM */
-		if (_starpu_memory_node_get_local_key() == src_node)
+		STARPU_ASSERT(_starpu_memory_node_get_local_key() == src_node);
+		if (!req || !(copy_methods->opencl_to_ram_async || copy_methods->any_to_any))
 		{
 			STARPU_ASSERT(copy_methods->opencl_to_ram);
-			if (!req || !copy_methods->opencl_to_ram_async)
-			{
-				/* this is not associated to a request so it's synchronous */
-				copy_methods->opencl_to_ram(src_interface, src_node, dst_interface, dst_node);
-			}
-			else
-			{
-				req->async_channel.type = STARPU_OPENCL_RAM;
-				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
-			}
+			/* this is not associated to a request so it's synchronous */
+			copy_methods->opencl_to_ram(src_interface, src_node, dst_interface, dst_node);
 		}
 		else
 		{
-			/* we should not have a blocking call ! */
-			STARPU_ABORT();
+			req->async_channel.type = STARPU_OPENCL_RAM;
+			if (copy_methods->opencl_to_ram_async)
+				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
+			else
+			{
+				STARPU_ASSERT(copy_methods->any_to_any);
+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+			}
 		}
 		break;
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
 		/* STARPU_CPU_RAM -> STARPU_OPENCL_RAM */
 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node);
-		STARPU_ASSERT(copy_methods->ram_to_opencl);
-		if (!req || !copy_methods->ram_to_opencl_async)
+		if (!req || !(copy_methods->ram_to_opencl_async || copy_methods->any_to_any))
 		{
+			STARPU_ASSERT(copy_methods->ram_to_opencl);
 			/* this is not associated to a request so it's synchronous */
 			copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
 		}
 		else
 		{
 			req->async_channel.type = STARPU_OPENCL_RAM;
-			ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
+			if (copy_methods->ram_to_opencl_async)
+				ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
+			else
+			{
+				STARPU_ASSERT(copy_methods->any_to_any);
+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+			}
 		}
 		break;
 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_OPENCL_RAM):
 		/* STARPU_OPENCL_RAM -> STARPU_OPENCL_RAM */
 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node || _starpu_memory_node_get_local_key() == src_node);
-		STARPU_ASSERT(copy_methods->opencl_to_opencl);
-		if (!req || !copy_methods->opencl_to_opencl_async)
+		if (!req || !(copy_methods->opencl_to_opencl_async || copy_methods->any_to_any))
 		{
+			STARPU_ASSERT(copy_methods->opencl_to_opencl);
 			/* this is not associated to a request so it's synchronous */
 			copy_methods->opencl_to_opencl(src_interface, src_node, dst_interface, dst_node);
 		}
 		else
 		{
 			req->async_channel.type = STARPU_OPENCL_RAM;
-			ret = copy_methods->opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
+			if (copy_methods->opencl_to_opencl_async)
+				ret = copy_methods->opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
+			else
+			{
+				STARPU_ASSERT(copy_methods->any_to_any);
+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+			}
 		}
 		break;
 #endif
@@ -331,6 +361,64 @@ int __attribute__((warn_unused_result)) _starpu_driver_copy_data_1_to_1(starpu_d
 	return 0;
 }
 
+/* This can be used by interfaces to easily transfer a piece of data without
+ * caring about the particular CUDA/OpenCL methods.  */
+
+int starpu_interface_copy(uintptr_t src, unsigned src_node, size_t src_offset, uintptr_t dst, unsigned dst_node, size_t dst_offset, size_t size, void *async_data)
+{
+	struct _starpu_async_channel *async_channel = async_data;
+	enum starpu_node_kind src_kind = starpu_node_get_kind(src_node);
+	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);
+
+	switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind))
+	{
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
+		memcpy((void *) dst + dst_offset, (void *) src + src_offset, size);
+		return 0;
+
+#ifdef STARPU_USE_CUDA
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
+		return starpu_cuda_copy_async_sync(
+				(void*) src + src_offset, src_node,
+				(void*) dst + dst_offset, dst_node,
+				size,
+				async_channel?starpu_cuda_get_local_out_transfer_stream():NULL,
+				cudaMemcpyDeviceToHost);
+
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
+		return starpu_cuda_copy_async_sync(
+				(void*) src + src_offset, src_node,
+				(void*) dst + dst_offset, dst_node,
+				size,
+				async_channel?starpu_cuda_get_local_in_transfer_stream():NULL,
+				cudaMemcpyHostToDevice);
+
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
+		return starpu_cuda_copy_async_sync(
+				(void*) src + src_offset, src_node,
+				(void*) dst + dst_offset, dst_node,
+				size,
+				async_channel?starpu_cuda_get_local_peer_transfer_stream():NULL,
+				cudaMemcpyDeviceToDevice);
+
+#endif
+#ifdef STARPU_USE_OPENCL
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_OPENCL_RAM):
+		return starpu_opencl_copy_async_sync(
+				src, src_node, src_offset,
+				dst, dst_node, dst_offset,
+				size,
+				&async_channel->event.opencl_event);
+#endif
+	default:
+		STARPU_ABORT();
+		return -1;
+	}
+	return 0;
+}
+
 void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel)
 {
 #ifdef STARPU_SIMGRID

+ 12 - 125
src/datawizard/interfaces/bcsr_interface.c

@@ -31,31 +31,11 @@
  * BCSR : blocked CSR, we use blocks of size (r x c)
  */
 
-static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-#ifdef STARPU_USE_CUDA
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-#endif
-#ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-#endif
+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 
 static struct starpu_data_copy_methods bcsr_copy_data_methods_s =
 {
-	.ram_to_ram = copy_ram_to_ram,
-#ifdef STARPU_USE_CUDA
-	.ram_to_cuda = copy_ram_to_cuda,
-	.cuda_to_ram = copy_cuda_to_ram,
-	.cuda_to_cuda = copy_cuda_to_cuda,
-#endif
-#ifdef STARPU_USE_OPENCL
-	.ram_to_opencl = copy_ram_to_opencl,
-	.opencl_to_ram = copy_opencl_to_ram,
-	.opencl_to_opencl = copy_opencl_to_opencl,
-#endif
+	.any_to_any = copy_any_to_any,
 };
 
 static void register_bcsr_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
@@ -315,105 +295,7 @@ static void free_bcsr_buffer_on_node(void *data_interface, unsigned node)
 	starpu_free_buffer_on_node(node, (uintptr_t) bcsr_interface->rowptr, (nrow+1)*sizeof(uint32_t));
 }
 
-#ifdef STARPU_USE_CUDA
-static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
-{
-	struct starpu_bcsr_interface *src_bcsr = src_interface;
-	struct starpu_bcsr_interface *dst_bcsr = dst_interface;
-
-	uint32_t nnz = src_bcsr->nnz;
-	uint32_t nrow = src_bcsr->nrow;
-	size_t elemsize = src_bcsr->elemsize;
-
-	uint32_t r = src_bcsr->r;
-	uint32_t c = src_bcsr->c;
-
-	cudaError_t cures;
-
-	cures = cudaMemcpy((char *)dst_bcsr->nzval, (char *)src_bcsr->nzval, nnz*r*c*elemsize, kind);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-
-	cures = cudaMemcpy((char *)dst_bcsr->colind, (char *)src_bcsr->colind, nnz*sizeof(uint32_t), kind);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-
-	cures = cudaMemcpy((char *)dst_bcsr->rowptr, (char *)src_bcsr->rowptr, (nrow+1)*sizeof(uint32_t), kind);
-	if (STARPU_UNLIKELY(cures))
-		STARPU_CUDA_REPORT_ERROR(cures);
-
-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
-
-	return 0;
-}
-
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
-}
-
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
-}
-
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
-}
-#endif // STARPU_USE_CUDA
-
-#ifdef STARPU_USE_OPENCL
-static int copy_opencl_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-	struct starpu_bcsr_interface *src_bcsr = src_interface;
-	struct starpu_bcsr_interface *dst_bcsr = dst_interface;
-
-	uint32_t nnz = src_bcsr->nnz;
-	uint32_t nrow = src_bcsr->nrow;
-	size_t elemsize = src_bcsr->elemsize;
-
-	uint32_t r = src_bcsr->r;
-	uint32_t c = src_bcsr->c;
-
-        int err;
-
-	err = starpu_opencl_copy_async_sync(src_bcsr->nzval, src_node, 0, dst_bcsr->nzval, dst_node, 0, nnz*r*c*elemsize, NULL);
-	if (STARPU_UNLIKELY(err))
-		STARPU_OPENCL_REPORT_ERROR(err);
-
-	err = starpu_opencl_copy_async_sync((uintptr_t)src_bcsr->colind, src_node, 0, (uintptr_t)dst_bcsr->colind, dst_node, 0, nnz*sizeof(uint32_t), NULL);
-	if (STARPU_UNLIKELY(err))
-		STARPU_OPENCL_REPORT_ERROR(err);
-
-	err = starpu_opencl_copy_async_sync((uintptr_t)src_bcsr->rowptr, src_node, 0, (uintptr_t)dst_bcsr->rowptr, dst_node, 0, (nrow+1)*sizeof(uint32_t), NULL);
-	if (STARPU_UNLIKELY(err))
-		STARPU_OPENCL_REPORT_ERROR(err);
-
-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
-
-	return 0;
-}
-
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
-}
-
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
-}
-
-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
-}
-
-#endif // STARPU_USE_OPENCL
-
-/* as not all platform easily have a BLAS lib installed ... */
-static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)
 {
 	struct starpu_bcsr_interface *src_bcsr = (struct starpu_bcsr_interface *) src_interface;
 	struct starpu_bcsr_interface *dst_bcsr = (struct starpu_bcsr_interface *) dst_interface;
@@ -425,13 +307,18 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBU
 	uint32_t r = src_bcsr->r;
 	uint32_t c = src_bcsr->c;
 
-	memcpy((void *)dst_bcsr->nzval, (void *)src_bcsr->nzval, nnz*elemsize*r*c);
+	int ret = 0;
+
+	if (starpu_interface_copy(src_bcsr->nzval, src_node, 0, dst_bcsr->nzval, dst_node, 0, nnz*elemsize*r*c, async_data))
+		ret = -EAGAIN;
 
-	memcpy((void *)dst_bcsr->colind, (void *)src_bcsr->colind, nnz*sizeof(uint32_t));
+	if (starpu_interface_copy((uintptr_t)src_bcsr->colind, src_node, 0, (uintptr_t)dst_bcsr->colind, dst_node, 0, nnz*sizeof(uint32_t), async_data))
+		ret = -EAGAIN;
 
-	memcpy((void *)dst_bcsr->rowptr, (void *)src_bcsr->rowptr, (nrow+1)*sizeof(uint32_t));
+	if (starpu_interface_copy((uintptr_t)src_bcsr->rowptr, src_node, 0, (uintptr_t)dst_bcsr->rowptr, dst_node, 0, (nrow+1)*sizeof(uint32_t), async_data))
+		ret = -EAGAIN;
 
 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize*r*c + (nnz+nrow+1)*sizeof(uint32_t));
 
-	return 0;
+	return ret;
 }

+ 1 - 1
src/datawizard/interfaces/block_interface.c

@@ -420,7 +420,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 	}
 	else
 	{
-		/* Default case: we transfer all lines one by one: ny*nz transfers */
+		/* Default case: we transfer all blocks one by one: nz 2D transfers */
 		unsigned layer;
 		for (layer = 0; layer < src_block->nz; layer++)
 		{

+ 18 - 246
src/datawizard/interfaces/coo_interface.c

@@ -19,190 +19,36 @@
 #include <datawizard/memalloc.h>
 
 static int
-copy_ram_to_ram(void *src_interface, STARPU_ATTRIBUTE_UNUSED unsigned src_node,
-		void *dst_interface, STARPU_ATTRIBUTE_UNUSED unsigned dst_node)
+copy_any_to_any(void *src_interface, unsigned src_node,
+		void *dst_interface, unsigned dst_node, void *async_data)
 {
 	size_t size = 0;
 	struct starpu_coo_interface *src_coo, *dst_coo;
-
-	src_coo = (struct starpu_coo_interface *) src_interface;
-	dst_coo = (struct starpu_coo_interface *) dst_interface;
-
-	size = src_coo->n_values * sizeof(src_coo->columns[0]);
-	memcpy((void *) dst_coo->columns, (void *) src_coo->columns, size);
-
-	/* sizeof(src_coo->columns[0]) == sizeof(src_coo->rows[0]) */
-	memcpy((void *) dst_coo->rows, (void *) src_coo->rows, size);
-
-	size = src_coo->n_values * src_coo->elemsize;
-	memcpy((void *) dst_coo->values, (void *) src_coo->values, size);
-
-	_STARPU_TRACE_DATA_COPY(src_node, dst_node,
-		src_coo->n_values *
-		(2 * sizeof(src_coo->rows[0]) + src_coo->elemsize));
-
-	return 0;
-}
-
-#ifdef STARPU_USE_CUDA
-static int
-copy_cuda_async_sync(void *src_interface, unsigned src_node,
-		     void *dst_interface, unsigned dst_node,
-		     cudaStream_t stream, enum cudaMemcpyKind kind)
-{
-	int ret;
-	size_t size = 0;
-	struct starpu_coo_interface *src_coo, *dst_coo;
-
-	src_coo = (struct starpu_coo_interface *) src_interface;
-	dst_coo = (struct starpu_coo_interface *) dst_interface;
-
-	size = src_coo->n_values * sizeof(src_coo->columns[0]);
-	ret = starpu_cuda_copy_async_sync(
-		(void *) src_coo->columns,
-		src_node,
-		(void *) dst_coo->columns,
-		dst_node,
-		size,
-		stream,
-		kind);
-	if (ret == 0)
-		stream = NULL;
-
-	/* sizeof(src_coo->columns[0]) == sizeof(src_coo->rows[0]) */
-	ret = starpu_cuda_copy_async_sync(
-		(void *) src_coo->rows,
-		src_node,
-		(void *) dst_coo->rows,
-		dst_node,
-		size,
-		stream,
-		kind);
-	if (ret == 0)
-		stream = NULL;
-
-	size = src_coo->n_values * src_coo->elemsize;
-	ret = starpu_cuda_copy_async_sync(
-		(void *) src_coo->values,
-		src_node,
-		(void *) dst_coo->values,
-		dst_node,
-		size,
-		stream,
-		kind);
-
-	_STARPU_TRACE_DATA_COPY(src_node, dst_node,
-		src_coo->n_values *
-		(2 * sizeof(src_coo->rows[0]) + src_coo->elemsize));
-	return ret;
-}
-
-static int
-copy_ram_to_cuda(void *src_interface, unsigned src_node,
-		 void *dst_interface, unsigned dst_node)
-{
-	return copy_cuda_async_sync(src_interface, src_node,
-				    dst_interface, dst_node,
-				    NULL, cudaMemcpyHostToDevice);
-}
-
-static int
-copy_cuda_to_ram(void *src_interface, unsigned src_node,
-		 void *dst_interface, unsigned dst_node)
-{
-	return copy_cuda_async_sync(src_interface, src_node,
-				    dst_interface, dst_node,
-				    NULL, cudaMemcpyDeviceToHost);
-}
-
-static int
-copy_ram_to_cuda_async(void *src_interface, unsigned src_node,
-		       void *dst_interface, unsigned dst_node,
-		       cudaStream_t stream)
-{
-	return copy_cuda_async_sync(src_interface, src_node,
-				    dst_interface, dst_node,
-				    stream, cudaMemcpyHostToDevice);
-}
-
-static int
-copy_cuda_to_ram_async(void *src_interface, unsigned src_node,
-		       void *dst_interface, unsigned dst_node,
-		       cudaStream_t stream)
-{
-	return copy_cuda_async_sync(src_interface, src_node,
-				    dst_interface, dst_node,
-				    stream, cudaMemcpyDeviceToHost);
-}
-
-static int
-copy_cuda_to_cuda(void *src_interface, unsigned src_node,
-		  void *dst_interface, unsigned dst_node)
-{
-	return copy_cuda_async_sync(src_interface, src_node,
-				    dst_interface, dst_node,
-				    NULL, cudaMemcpyDeviceToDevice);
-}
-
-#ifdef NO_STRIDE
-static int
-copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,
-			void *dst_interface, unsigned dst_node,
-			cudaStream_t stream)
-{
-	return copy_cuda_async_sync(src_interface, src_node,
-				    dst_interface, dst_node,
-				    stream, cudaMemcpyDeviceToDevice);
-}
-#endif /* !NO_STRIDE */
-#endif /* !STARPU_USE_CUDA */
-
-#ifdef STARPU_USE_OPENCL
-static int
-copy_opencl_common(void *src_interface, unsigned src_node,
-			 void *dst_interface, unsigned dst_node,
-			 cl_event *event)
-{
 	int ret = 0;
-	size_t size = 0;
-	struct starpu_coo_interface *src_coo, *dst_coo;
 
 	src_coo = (struct starpu_coo_interface *) src_interface;
 	dst_coo = (struct starpu_coo_interface *) dst_interface;
 
-
 	size = src_coo->n_values * sizeof(src_coo->columns[0]);
-	ret = starpu_opencl_copy_async_sync(
-		(uintptr_t) src_coo->columns,
-		src_node,
-		0,
-		(uintptr_t) dst_coo->columns,
-		dst_node,
-		0,
-		size,
-		NULL);
+	if (starpu_interface_copy(
+		(uintptr_t) src_coo->columns, src_node, 0,
+		(uintptr_t) dst_coo->columns, dst_node, 0,
+		size, async_data))
+		ret = -EAGAIN;
 
 	/* sizeof(src_coo->columns[0]) == sizeof(src_coo->rows[0]) */
-	ret = starpu_opencl_copy_async_sync(
-		(uintptr_t) src_coo->rows,
-		src_node,
-		0,
-		(uintptr_t) dst_coo->rows,
-		dst_node,
-		0,
-		size,
-		NULL);
+	if (starpu_interface_copy(
+		(uintptr_t) src_coo->rows, src_node, 0,
+		(uintptr_t) dst_coo->rows, dst_node, 0,
+		size, async_data))
+		ret = -EAGAIN;
 
 	size = src_coo->n_values * src_coo->elemsize;
-	ret = starpu_opencl_copy_async_sync(
-		src_coo->values,
-		src_node,
-		0,
-		(uintptr_t) dst_coo->values,
-		dst_node,
-		0,
-		size,
-		event);
+	if (starpu_interface_copy(
+		src_coo->values, src_node, 0,
+		dst_coo->values, dst_node, 0,
+		size, async_data))
+		ret = -EAGAIN;
 
 	_STARPU_TRACE_DATA_COPY(src_node, dst_node,
 		src_coo->n_values *
@@ -211,83 +57,9 @@ copy_opencl_common(void *src_interface, unsigned src_node,
 	return ret;
 }
 
-static int
-copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
-			 void *dst_interface, unsigned dst_node,
-			 cl_event *event)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
-}
-
-static int
-copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
-			 void *dst_interface, unsigned dst_node,
-			 cl_event *event)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
-}
-
-static int
-copy_opencl_to_opencl_async(void *src_interface, unsigned src_node,
-			 void *dst_interface, unsigned dst_node,
-			 cl_event *event)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
-}
-
-static int
-copy_ram_to_opencl(void *src_interface, unsigned src_node,
-		   void *dst_interface, unsigned dst_node)
-{
-	return copy_ram_to_opencl_async(src_interface, src_node,
-					dst_interface, dst_node,
-					NULL);
-}
-static int
-copy_opencl_to_ram(void *src_interface, unsigned src_node,
-		   void *dst_interface, unsigned dst_node)
-{
-	return copy_opencl_to_ram_async(src_interface, src_node,
-					dst_interface, dst_node,
-					NULL);
-}
-static int
-copy_opencl_to_opencl(void *src_interface, unsigned src_node,
-		   void *dst_interface, unsigned dst_node)
-{
-	return copy_opencl_to_opencl_async(src_interface, src_node,
-					dst_interface, dst_node,
-					NULL);
-}
-#endif /* !STARPU_USE_OPENCL */
-
 static struct starpu_data_copy_methods coo_copy_data_methods =
 {
-	.ram_to_ram          = copy_ram_to_ram,
-#ifdef STARPU_USE_CUDA
-	.ram_to_cuda         = copy_ram_to_cuda,
-	.cuda_to_ram         = copy_cuda_to_ram,
-	.ram_to_cuda_async   = copy_ram_to_cuda_async,
-	.cuda_to_ram_async   = copy_cuda_to_ram_async,
-	.cuda_to_cuda        = copy_cuda_to_cuda,
-#ifdef NO_STRIDE
-	.cuda_to_cuda_async  = copy_cuda_to_cuda_async,
-#endif
-#else
-#ifdef STARPU_SIMGRID
-#ifdef NO_STRIDE
-	/* Enable GPU-GPU transfers in simgrid */
-	.cuda_to_cuda_async = 1,
-#endif
-#endif
-#endif /* !STARPU_USE_CUDA */
-#ifdef STARPU_USE_OPENCL
-	.ram_to_opencl       = copy_ram_to_opencl,
-	.opencl_to_ram       = copy_opencl_to_ram,
-	.opencl_to_opencl    = copy_opencl_to_opencl,
-	.ram_to_opencl_async = copy_ram_to_opencl_async,
-	.opencl_to_opencl_async = copy_opencl_to_opencl_async,
-#endif /* !STARPU_USE_OPENCL */
+	.any_to_any          = copy_any_to_any,
 };
 
 static void

+ 11 - 218
src/datawizard/interfaces/csr_interface.c

@@ -28,42 +28,11 @@
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 
-static int copy_ram_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
-#ifdef STARPU_USE_CUDA
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
-static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
-#endif
-#ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
-#endif
+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 
 static struct starpu_data_copy_methods csr_copy_data_methods_s =
 {
-	.ram_to_ram = copy_ram_to_ram,
-#ifdef STARPU_USE_CUDA
-	.ram_to_cuda = copy_ram_to_cuda,
-	.cuda_to_ram = copy_cuda_to_ram,
-	.cuda_to_cuda = copy_cuda_to_cuda,
-	.ram_to_cuda_async = copy_ram_to_cuda_async,
-	.cuda_to_ram_async = copy_cuda_to_ram_async,
-	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
-#else
-#ifdef STARPU_SIMGRID
-	/* Enable GPU-GPU transfers in simgrid */
-	.cuda_to_cuda_async = 1,
-#endif
-#endif
-#ifdef STARPU_USE_OPENCL
-	.ram_to_opencl = copy_ram_to_opencl,
-	.opencl_to_ram = copy_opencl_to_ram,
-	.opencl_to_opencl = copy_opencl_to_opencl,
-#endif
+	.any_to_any = copy_any_to_any,
 };
 
 static void register_csr_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
@@ -293,188 +262,8 @@ static void free_csr_buffer_on_node(void *data_interface, unsigned node)
 	starpu_free_buffer_on_node(node, (uintptr_t) csr_interface->rowptr, (nrow+1)*sizeof(uint32_t));
 }
 
-#ifdef STARPU_USE_CUDA
-static int copy_cuda_async_sync(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind, cudaStream_t stream)
-{
-	struct starpu_csr_interface *src_csr = src_interface;
-	struct starpu_csr_interface *dst_csr = dst_interface;
-
-	uint32_t nnz = src_csr->nnz;
-	uint32_t nrow = src_csr->nrow;
-	size_t elemsize = src_csr->elemsize;
-
-	cudaStream_t sstream = stream;
-	int ret;
-
-	ret = starpu_cuda_copy_async_sync((void *)src_csr->nzval, src_node, (void *)dst_csr->nzval, dst_node, nnz*elemsize, sstream, kind);
-	if (ret == 0) sstream = NULL;
-
-	ret = starpu_cuda_copy_async_sync((void *)src_csr->colind, src_node, (void *)dst_csr->colind, dst_node, nnz*sizeof(uint32_t), sstream, kind);
-	if (ret == 0) sstream = NULL;
-
-	ret = starpu_cuda_copy_async_sync((void *)src_csr->rowptr, src_node, (void *)dst_csr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), sstream, kind);
-
-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
-	return ret;
-}
-
-static int copy_cuda_peer_async(void *src_interface STARPU_ATTRIBUTE_UNUSED, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
-				void *dst_interface STARPU_ATTRIBUTE_UNUSED, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream STARPU_ATTRIBUTE_UNUSED)
-{
-#ifdef HAVE_CUDA_MEMCPY_PEER
-	struct starpu_csr_interface *src_csr = src_interface;
-	struct starpu_csr_interface *dst_csr = dst_interface;
-
-	uint32_t nnz = src_csr->nnz;
-	uint32_t nrow = src_csr->nrow;
-	size_t elemsize = src_csr->elemsize;
-
-	cudaError_t cures;
-
-	int src_dev = _starpu_memory_node_get_devid(src_node);
-	int dst_dev = _starpu_memory_node_get_devid(dst_node);
-
-	int synchronous_fallback = 0;
-
-	_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
-	cures = cudaMemcpyPeerAsync((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize, stream);
-	if (cures)
-	{
-		synchronous_fallback = 1;
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
-		cures = cudaMemcpyPeer((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize);
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-	}
-
-	if (!synchronous_fallback)
-	{
-		cures = cudaMemcpyPeerAsync((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t), stream);
-	}
-
-	if (synchronous_fallback || cures != cudaSuccess)
-	{
-		synchronous_fallback = 1;
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
-		cures = cudaMemcpyPeer((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t));
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-	}
-
-	if (!synchronous_fallback)
-	{
-		cures = cudaMemcpyPeerAsync((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t), stream);
-	}
-
-	if (synchronous_fallback || cures != cudaSuccess)
-	{
-		synchronous_fallback = 1;
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
-		cures = cudaMemcpyPeer((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t));
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-	}
-
-	if (synchronous_fallback)
-	{
-		_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
-		return 0;
-	}
-	else
-	{
-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
-		return -EAGAIN;
-	}
-#else
-	/* Illegal without Peer tranfers */
-	STARPU_ABORT();
-	return 0;
-#endif
-}
-
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, NULL);
-}
-
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, NULL);
-}
-
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, NULL);
-}
-
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, stream);
-}
-
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, stream);
-}
-
-static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
-{
-	if (src_node == dst_node)
-		return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, stream);
-	else
-		return copy_cuda_peer_async(src_interface, src_node, dst_interface, dst_node, stream);
-}
-
-#endif // STARPU_USE_CUDA
-
-#ifdef STARPU_USE_OPENCL
-static int copy_opencl_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-	struct starpu_csr_interface *src_csr = src_interface;
-	struct starpu_csr_interface *dst_csr = dst_interface;
-
-	uint32_t nnz = src_csr->nnz;
-	uint32_t nrow = src_csr->nrow;
-	size_t elemsize = src_csr->elemsize;
-
-        int err;
-
-	err = starpu_opencl_copy_async_sync(src_csr->nzval, src_node, 0, dst_csr->nzval, dst_node, 0, nnz*elemsize, NULL);
-	if (STARPU_UNLIKELY(err))
-                STARPU_OPENCL_REPORT_ERROR(err);
-
-	err = starpu_opencl_copy_async_sync((uintptr_t)src_csr->colind, src_node, 0, (uintptr_t)dst_csr->colind, dst_node, 0, nnz*sizeof(uint32_t), NULL);
-        if (STARPU_UNLIKELY(err))
-                STARPU_OPENCL_REPORT_ERROR(err);
-
-	err = starpu_opencl_copy_async_sync((uintptr_t)src_csr->rowptr, src_node, 0, (uintptr_t)dst_csr->rowptr, dst_node, 0, (nrow+1)*sizeof(uint32_t), NULL);
-	if (STARPU_UNLIKELY(err))
-                STARPU_OPENCL_REPORT_ERROR(err);
-
-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
-
-	return 0;
-}
-
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
-}
-
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
-}
-
-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
-}
-
-#endif // STARPU_USE_OPENCL
-
 /* as not all platform easily have a BLAS lib installed ... */
-static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)
 {
 	struct starpu_csr_interface *src_csr = (struct starpu_csr_interface *) src_interface;
 	struct starpu_csr_interface *dst_csr = (struct starpu_csr_interface *) dst_interface;
@@ -482,14 +271,18 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBU
 	uint32_t nnz = src_csr->nnz;
 	uint32_t nrow = src_csr->nrow;
 	size_t elemsize = src_csr->elemsize;
+	int ret = 0;
 
-	memcpy((void *)dst_csr->nzval, (void *)src_csr->nzval, nnz*elemsize);
+	if (starpu_interface_copy(src_csr->nzval, src_node, 0, dst_csr->nzval, dst_node, 0, nnz*elemsize, async_data))
+		ret = -EAGAIN;
 
-	memcpy((void *)dst_csr->colind, (void *)src_csr->colind, nnz*sizeof(uint32_t));
+	if (starpu_interface_copy((uintptr_t)src_csr->colind, src_node, 0, (uintptr_t)dst_csr->colind, dst_node, 0, nnz*sizeof(uint32_t), async_data))
+		ret = -EAGAIN;
 
-	memcpy((void *)dst_csr->rowptr, (void *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t));
+	if (starpu_interface_copy((uintptr_t)src_csr->rowptr, src_node, 0, (uintptr_t)dst_csr->rowptr, dst_node, 0, (nrow+1)*sizeof(uint32_t), async_data))
+		ret = -EAGAIN;
 
 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
 
-	return 0;
+	return ret;
 }

+ 6 - 136
src/datawizard/interfaces/variable_interface.c

@@ -25,48 +25,11 @@
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 
-static int copy_ram_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-#ifdef STARPU_USE_CUDA
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
-static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-#endif
-#ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
-static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
-#endif
+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 
 static struct starpu_data_copy_methods variable_copy_data_methods_s =
 {
-	.ram_to_ram = copy_ram_to_ram,
-#ifdef STARPU_USE_CUDA
-	.ram_to_cuda = copy_ram_to_cuda,
-	.cuda_to_ram = copy_cuda_to_ram,
-	.cuda_to_cuda = copy_cuda_to_cuda,
-	.ram_to_cuda_async = copy_ram_to_cuda_async,
-	.cuda_to_ram_async = copy_cuda_to_ram_async,
-	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
-#else
-#ifdef STARPU_SIMGRID
-	/* Enable GPU-GPU transfers in simgrid */
-	.cuda_to_cuda_async = 1,
-#endif
-#endif
-#ifdef STARPU_USE_OPENCL
-	.ram_to_opencl = copy_ram_to_opencl,
-	.opencl_to_ram = copy_opencl_to_ram,
-	.opencl_to_opencl = copy_opencl_to_opencl,
-        .ram_to_opencl_async = copy_ram_to_opencl_async,
-	.opencl_to_ram_async = copy_opencl_to_ram_async,
-	.opencl_to_opencl_async = copy_opencl_to_opencl_async,
-#endif
+	.any_to_any = copy_any_to_any,
 };
 
 static void register_variable_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
@@ -204,101 +167,7 @@ static void free_variable_buffer_on_node(void *data_interface, unsigned node)
 	starpu_free_buffer_on_node(node, variable_interface->ptr, variable_interface->elemsize);
 }
 
-#ifdef STARPU_USE_CUDA
-static int copy_cuda_async_sync(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream, enum cudaMemcpyKind kind)
-{
-	struct starpu_variable_interface *src_variable = src_interface;
-	struct starpu_variable_interface *dst_variable = dst_interface;
-	int ret;
-
-	ret = starpu_cuda_copy_async_sync((void *)src_variable->ptr, src_node, (void *)dst_variable->ptr, dst_node, src_variable->elemsize, stream, kind);
-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
-	return ret;
-}
-
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, NULL, cudaMemcpyDeviceToHost);
-}
-
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, NULL, cudaMemcpyHostToDevice);
-}
-
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, NULL, cudaMemcpyDeviceToDevice);
-}
-
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
-}
-
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
-}
-
-static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToDevice);
-}
-
-#endif // STARPU_USE_CUDA
-
-#ifdef STARPU_USE_OPENCL
-static int copy_opencl_common(void *src_interface, unsigned src_node,
-				void *dst_interface, unsigned dst_node, cl_event *event)
-{
-	struct starpu_variable_interface *src_variable = src_interface;
-	struct starpu_variable_interface *dst_variable = dst_interface;
-        int ret;
-
-	ret = starpu_opencl_copy_async_sync(src_variable->ptr, src_node, 0, dst_variable->ptr, dst_node, 0, src_variable->elemsize, event);
-
-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
-
-	return ret;
-}
-
-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
-					void *dst_interface, unsigned dst_node, cl_event *event)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
-}
-
-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
-					void *dst_interface, unsigned dst_node, cl_event *event)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
-}
-
-static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node,
-					void *dst_interface, unsigned dst_node, cl_event *event)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
-}
-
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-        return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
-}
-
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-        return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
-}
-
-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
-}
-
-#endif
-
-static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)
 {
 	struct starpu_variable_interface *src_variable = (struct starpu_variable_interface *) src_interface;
 	struct starpu_variable_interface *dst_variable = (struct starpu_variable_interface *) dst_interface;
@@ -307,10 +176,11 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBU
 
 	uintptr_t ptr_src = src_variable->ptr;
 	uintptr_t ptr_dst = dst_variable->ptr;
+	int ret;
 
-	memcpy((void *)ptr_dst, (void *)ptr_src, elemsize);
+	ret = starpu_interface_copy(ptr_src, src_node, 0, ptr_dst, dst_node, 0, elemsize, async_data);
 
 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, elemsize);
 
-	return 0;
+	return ret;
 }

+ 7 - 146
src/datawizard/interfaces/vector_interface.c

@@ -25,48 +25,11 @@
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 
-static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
-#ifdef STARPU_USE_CUDA
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cudaStream_t stream);
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cudaStream_t stream);
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
-static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,					void *dst_interface, unsigned dst_node, cudaStream_t stream);
-#endif
-#ifdef STARPU_USE_OPENCL
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cl_event *event);
-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cl_event *event);
-static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
-#endif
+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 
 static struct starpu_data_copy_methods vector_copy_data_methods_s =
 {
-	.ram_to_ram = copy_ram_to_ram,
-#ifdef STARPU_USE_CUDA
-	.ram_to_cuda = copy_ram_to_cuda,
-	.cuda_to_ram = copy_cuda_to_ram,
-	.ram_to_cuda_async = copy_ram_to_cuda_async,
-	.cuda_to_ram_async = copy_cuda_to_ram_async,
-	.cuda_to_cuda = copy_cuda_to_cuda,
-	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
-#else
-#ifdef STARPU_SIMGRID
-	/* Enable GPU-GPU transfers in simgrid */
-	.cuda_to_cuda_async = 1,
-#endif
-#endif
-#ifdef STARPU_USE_OPENCL
-	.ram_to_opencl = copy_ram_to_opencl,
-	.opencl_to_ram = copy_opencl_to_ram,
-	.opencl_to_opencl = copy_opencl_to_opencl,
-        .ram_to_opencl_async = copy_ram_to_opencl_async,
-	.opencl_to_ram_async = copy_opencl_to_ram_async,
-	.opencl_to_opencl_async = copy_opencl_to_opencl_async,
-#endif
+	.any_to_any = copy_any_to_any,
 };
 
 static void register_vector_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
@@ -252,120 +215,18 @@ static void free_vector_buffer_on_node(void *data_interface, unsigned node)
 	starpu_free_buffer_on_node(node, vector_interface->ptr, nx*elemsize);
 }
 
-#ifdef STARPU_USE_CUDA
-static int copy_cuda_async_sync(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream, enum cudaMemcpyKind kind)
+static int copy_any_to_any(void *src_interface, unsigned src_node,
+                           void *dst_interface, unsigned dst_node, void *async_data)
 {
 	struct starpu_vector_interface *src_vector = src_interface;
 	struct starpu_vector_interface *dst_vector = dst_interface;
 	int ret;
 
-	ret = starpu_cuda_copy_async_sync((void *)src_vector->ptr, src_node, (void *)dst_vector->ptr, dst_node, src_vector->nx*src_vector->elemsize, stream, kind);
-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
-	return ret;
-}
-
-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, NULL, cudaMemcpyDeviceToHost);
-}
-
-static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, NULL, cudaMemcpyHostToDevice);
-}
-
-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, NULL, cudaMemcpyDeviceToDevice);
-}
-
-static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToDevice);
-}
-
-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
-}
-
-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
-{
-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
-}
-
-#endif // STARPU_USE_CUDA
-
-#ifdef STARPU_USE_OPENCL
-static int copy_opencl_common(void *src_interface, unsigned src_node,
-                                    void *dst_interface, unsigned dst_node, cl_event *event)
-{
-	struct starpu_vector_interface *src_vector = src_interface;
-	struct starpu_vector_interface *dst_vector = dst_interface;
-	int ret;
-
-	ret = starpu_opencl_copy_async_sync(src_vector->dev_handle, src_node, src_vector->offset,
-					    dst_vector->dev_handle, dst_node, dst_vector->offset,
-					       src_vector->nx*src_vector->elemsize, event);
+	ret = starpu_interface_copy(src_vector->dev_handle, src_node, src_vector->offset,
+				    dst_vector->dev_handle, dst_node, dst_vector->offset,
+				    src_vector->nx*src_vector->elemsize, async_data);
 
 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
 	return ret;
 }
 
-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
-                                    void *dst_interface, unsigned dst_node, cl_event *event)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
-}
-
-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
-                                    void *dst_interface, unsigned dst_node, cl_event *event)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
-}
-
-static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node,
-                                       void *dst_interface, unsigned dst_node, cl_event *event)
-{
-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
-}
-
-static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
-                              void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-        return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
-}
-
-static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
-				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-        return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
-}
-
-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node,
-				 void *dst_interface, unsigned dst_node)
-{
-	return copy_opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
-}
-
-
-#endif // STARPU_USE_OPENCL
-
-static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
-					void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
-{
-	struct starpu_vector_interface *src_vector = (struct starpu_vector_interface *) src_interface;
-	struct starpu_vector_interface *dst_vector = (struct starpu_vector_interface *) dst_interface;
-
-	uint32_t nx = dst_vector->nx;
-	size_t elemsize = dst_vector->elemsize;
-
-	uintptr_t ptr_src = src_vector->ptr;
-	uintptr_t ptr_dst = dst_vector->ptr;
-
-	memcpy((void *)ptr_dst, (void *)ptr_src, nx*elemsize);
-
-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nx*elemsize);
-
-	return 0;
-}

+ 2 - 49
src/datawizard/interfaces/void_interface.c

@@ -25,36 +25,11 @@
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 
-static int dummy_copy(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
-#ifdef STARPU_USE_CUDA
-static int dummy_cuda_copy_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
-#endif
-#ifdef STARPU_USE_OPENCL
-static int dummy_opencl_copy_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event);
-#endif
+static int dummy_copy(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 
 static struct starpu_data_copy_methods void_copy_data_methods_s =
 {
-	.ram_to_ram = dummy_copy,
-#ifdef STARPU_USE_CUDA
-	.ram_to_cuda = dummy_copy,
-	.cuda_to_ram = dummy_copy,
-	.cuda_to_cuda = dummy_copy,
-	.ram_to_cuda_async = dummy_cuda_copy_async,
-	.cuda_to_ram_async = dummy_cuda_copy_async,
-	.cuda_to_cuda_async = dummy_cuda_copy_async,
-#else
-#ifdef STARPU_SIMGRID
-	/* Enable GPU-GPU transfers in simgrid */
-	.cuda_to_cuda_async = 1,
-#endif
-#endif
-#ifdef STARPU_USE_OPENCL
-	.ram_to_opencl = dummy_copy,
-	.opencl_to_ram = dummy_copy,
-        .ram_to_opencl_async = dummy_opencl_copy_async,
-	.opencl_to_ram_async = dummy_opencl_copy_async,
-#endif
+	.any_to_any = dummy_copy,
 };
 
 static void register_void_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
@@ -139,25 +114,3 @@ static int dummy_copy(void *src_interface STARPU_ATTRIBUTE_UNUSED,
 {
 	return 0;
 }
-
-#ifdef STARPU_USE_CUDA
-static int dummy_cuda_copy_async(void *src_interface STARPU_ATTRIBUTE_UNUSED,
-				unsigned src_node STARPU_ATTRIBUTE_UNUSED,
-				void *dst_interface STARPU_ATTRIBUTE_UNUSED,
-				unsigned dst_node STARPU_ATTRIBUTE_UNUSED,
-				cudaStream_t stream __attribute__ ((unused)))
-{
-	return 0;
-}
-#endif // STARPU_USE_CUDA
-
-#ifdef STARPU_USE_OPENCL
-static int dummy_opencl_copy_async(void *src_interface STARPU_ATTRIBUTE_UNUSED,
-					unsigned src_node STARPU_ATTRIBUTE_UNUSED,
-					void *dst_interface STARPU_ATTRIBUTE_UNUSED,
-					unsigned dst_node STARPU_ATTRIBUTE_UNUSED,
-					cl_event *event STARPU_ATTRIBUTE_UNUSED)
-{
-	return 0;
-}
-#endif // STARPU_USE_OPENCL

+ 4 - 1
tests/datawizard/interfaces/test_interfaces.c

@@ -550,7 +550,10 @@ ram_to_ram(void)
 	/* We do not care about the nodes */
 	src_interface = starpu_data_get_interface_on_node(src, 0);
 	dst_interface = starpu_data_get_interface_on_node(dst, 0);
-	src->ops->copy_methods->ram_to_ram(src_interface, 0, dst_interface, 0);
+	if (src->ops->copy_methods->ram_to_ram)
+		src->ops->copy_methods->ram_to_ram(src_interface, 0, dst_interface, 0);
+	else
+		src->ops->copy_methods->any_to_any(src_interface, 0, dst_interface, 0, NULL);
 
 	err = create_task(&task, STARPU_CPU_WORKER, -1);
 	if (err != 0)