пре 12 година · de6deb09f5
--- a/doc/chapters/advanced-api.texi
+++ b/doc/chapters/advanced-api.texi
@@ -354,51 +354,56 @@ Unpack the data handle from the contiguous buffer at the address @code{ptr} of s
 
				 @end deftp
			
 
				 
			
 
				 @deftp {Data Type} {struct starpu_data_copy_methods}
			
 
				-Defines the per-interface methods.
			
 
				+Defines the per-interface methods. If the @code{any_to_any} method is provided,
			
 
				+it will be used by default if no more specific method is provided. It can still
			
 
				+be useful to provide a more specific method, e.g. to take advantage of particular
			
 
				+CUDA or OpenCL support.
			
 
				+
			
 
				 @table @asis
			
 
				-@item @code{int @{ram,cuda,opencl@}_to_@{ram,cuda,opencl@}(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)}
			
 
				+@item @code{int (*@{ram,cuda,opencl@}_to_@{ram,cuda,opencl@})(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)}
			
 
				 These 12 functions define how to copy data from the @var{src_interface}
			
 
				 interface on the @var{src_node} node to the @var{dst_interface} interface
			
 
				 on the @var{dst_node} node. They return 0 on success.
			
 
				 
			
 
				-@item @code{int (*ram_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)}
			
 
				-Define how to copy data from the @var{src_interface} interface on the
			
 
				-@var{src_node} node (in RAM) to the @var{dst_interface} interface on the
			
 
				-@var{dst_node} node (on a CUDA device), using the given @var{stream}. Return 0
			
 
				-on success.
			
 
				-
			
 
				-@item @code{int (*cuda_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)}
			
 
				-Define how to copy data from the @var{src_interface} interface on the
			
 
				-@var{src_node} node (on a CUDA device) to the @var{dst_interface} interface on the
			
 
				-@var{dst_node} node (in RAM), using the given @var{stream}. Return 0
			
 
				-on success.
			
 
				-
			
 
				-@item @code{int (*cuda_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)}
			
 
				-Define how to copy data from the @var{src_interface} interface on the
			
 
				-@var{src_node} node (on a CUDA device) to the @var{dst_interface} interface on
			
 
				-the @var{dst_node} node (on another CUDA device), using the given @var{stream}.
			
 
				-Return 0 on success.
			
 
				-
			
 
				-@item @code{int (*ram_to_opencl_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, /* cl_event * */ void *event)}
			
 
				+@item @code{int (*@{ram,cuda@}_to_@{ram,cuda@}_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)}
			
 
				+These 3 functions (@code{ram_to_ram} is not among these) define how to copy
			
 
				+data from the @var{src_interface} interface on the @var{src_node} node to the
			
 
				+@var{dst_interface} interface on the @var{dst_node} node, using the given
			
 
				+@var{stream}. Must return 0 if the transfer was actually completed entirely
			
 
				+synchronously, or -EAGAIN if at least some transfers are still ongoing and
			
 
				+should be waited for by the core.
			
 
				+
			
 
				+@item @code{int (*@{ram,opencl@}_to_@{ram,opencl@}_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, /* cl_event * */ void *event)}
			
 
				+These 3 functions (@code{ram_to_ram} is not among them) define how to copy
			
 
				+data from the @var{src_interface} interface on the @var{src_node} node to the
			
 
				+@var{dst_interface} interface on the @var{dst_node} node, by recording in
			
 
				+@var{event}, a pointer to a cl_event, the event of the last submitted transfer.
			
 
				+Must return 0 if the transfer was actually completed entirely synchronously,
			
 
				+or -EAGAIN if at least some transfers are still ongoing and should be waited
			
 
				+for by the core.
			
 
				+
			
 
				+@item @code{int (*any_to_any)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)}
			
 
				 Define how to copy data from the @var{src_interface} interface on the
			
 
				-@var{src_node} node (in RAM) to the @var{dst_interface} interface on the
			
 
				-@var{dst_node} node (on an OpenCL device), using @var{event}, a pointer to a
			
 
				-cl_event. Return 0 on success.
			
 
				+@var{src_node} node to the @var{dst_interface} interface on the @var{dst_node}
			
 
				+node. This is meant to be implemented through the @code{starpu_interface_copy}
			
 
				+helper, to which @var{async_data} should be passed unchanged; it is used to
			
 
				+manage asynchronicity. This must return -EAGAIN if any of the
			
 
				+@code{starpu_interface_copy} calls has returned -EAGAIN (i.e. at least some
			
 
				+transfer is still ongoing), and return 0 otherwise.
			
 
				 
			
 
				-@item @code{int (*opencl_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, /* cl_event * */ void *event)}
			
 
				-Define how to copy data from the @var{src_interface} interface on the
			
 
				-@var{src_node} node (on an OpenCL device) to the @var{dst_interface} interface
			
 
				-on the @var{dst_node} node (in RAM), using the given @var{event}, a pointer to
			
 
				-a cl_event. Return 0 on success.
			
 
				-
			
 
				-@item @code{int (*opencl_to_opencl_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, /* cl_event * */ void *event)}
			
 
				-Define how to copy data from the @var{src_interface} interface on the
			
 
				-@var{src_node} node (on an OpenCL device) to the @var{dst_interface} interface
			
 
				-on the @var{dst_node} node (on another OpenCL device), using the given
			
 
				-@var{event}, a pointer to a cl_event. Return 0 on success.
			
 
				 @end table
			
 
				 @end deftp
			
 
				 
			
 
				+@deftypefun int starpu_interface_copy (uintptr_t @var{src}, unsigned @var{src_node}, size_t @var{src_offset}, uintptr_t @var{dst}, unsigned @var{dst_node}, size_t @var{dst_offset}, size_t @var{size}, {void *}@var{async_data})
			
 
				+Copy @var{size} bytes from byte offset @var{src_offset} of @var{src} on
			
 
				+@var{src_node} to byte offset @var{dst_offset} of @var{dst} on @var{dst_node}.
			
 
				+This is to be used in the @code{any_to_any} copy method, which is provided with
			
 
				+the @var{async_data} to be passed to @code{starpu_interface_copy}. This returns
			
 
				+-EAGAIN if the transfer is still ongoing, or 0 if the transfer is already
			
 
				+completed.
			
 
				+@end deftypefun
			
 
				+
			
 
				+
			
 
				 @deftypefun uint32_t starpu_crc32_be_n ({void *}@var{input}, size_t @var{n}, uint32_t @var{inputcrc})
			
 
				 Compute the CRC of a byte buffer seeded by the inputcrc "current
			
 
				 state". The return value should be considered as the new "current
			
--- a/include/starpu_data_interfaces.h
+++ b/include/starpu_data_interfaces.h
@@ -73,8 +73,12 @@ struct starpu_data_copy_methods
 
				 	int (*opencl_to_ram_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event);
			
 
				 	int (*opencl_to_opencl_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event);
			
 
				 #endif
			
 
				+
			
 
				+	int (*any_to_any)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
			
 
				 };
			
 
				 
			
 
				+int starpu_interface_copy(uintptr_t src, unsigned src_node, size_t src_offset, uintptr_t dst, unsigned dst_node, size_t dst_offset, size_t size, void *async_data);
			
 
				+
			
 
				 enum starpu_data_interface_id
			
 
				 {
			
 
				 	STARPU_MATRIX_INTERFACE_ID=0,
			
--- a/src/datawizard/copy_driver.c
+++ b/src/datawizard/copy_driver.c
@@ -134,8 +134,10 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
				 	{
			
 
				 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
			
 
				 		/* STARPU_CPU_RAM -> STARPU_CPU_RAM */
			
 
				-		STARPU_ASSERT(copy_methods->ram_to_ram);
			
 
				-		copy_methods->ram_to_ram(src_interface, src_node, dst_interface, dst_node);
			
 
				+		if (copy_methods->ram_to_ram)
			
 
				+			copy_methods->ram_to_ram(src_interface, src_node, dst_interface, dst_node);
			
 
				+		else
			
 
				+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			
 
				 		break;
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
			
@@ -143,10 +145,10 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
				 #if !defined(HAVE_CUDA_MEMCPY_PEER)
			
 
				 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == src_node);
			
 
				 #endif
			
 
				-		STARPU_ASSERT(copy_methods->cuda_to_ram);
			
 
				-		if (!req || !copy_methods->cuda_to_ram_async)
			
 
				+		if (!req || !(copy_methods->cuda_to_ram_async || copy_methods->any_to_any))
			
 
				 		{
			
 
				 			/* this is not associated to a request so it's synchronous */
			
 
				+			STARPU_ASSERT(copy_methods->cuda_to_ram);
			
 
				 			copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
			
 
				 		}
			
 
				 		else
			
@@ -156,7 +158,13 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
				 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
 
				 			stream = starpu_cuda_get_local_out_transfer_stream();
			
 
				-			ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
			
 
				+			if (copy_methods->cuda_to_ram_async)
			
 
				+				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
			
 
				+			else
			
 
				+			{
			
 
				+				STARPU_ASSERT(copy_methods->any_to_any);
			
 
				+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			
 
				+			}
			
 
				 
			
 
				 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
			
 
				 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
			
@@ -168,10 +176,10 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
				 #if !defined(HAVE_CUDA_MEMCPY_PEER)
			
 
				 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node);
			
 
				 #endif
			
 
				-		STARPU_ASSERT(copy_methods->ram_to_cuda);
			
 
				-		if (!req || !copy_methods->ram_to_cuda_async)
			
 
				+		if (!req || !(copy_methods->ram_to_cuda_async || copy_methods->any_to_any))
			
 
				 		{
			
 
				 			/* this is not associated to a request so it's synchronous */
			
 
				+			STARPU_ASSERT(copy_methods->ram_to_cuda);
			
 
				 			copy_methods->ram_to_cuda(src_interface, src_node, dst_interface, dst_node);
			
 
				 		}
			
 
				 		else
			
@@ -182,7 +190,13 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
				 				STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
 
				 			stream = starpu_cuda_get_local_in_transfer_stream();
			
 
				-			ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
			
 
				+			if (copy_methods->ram_to_cuda_async)
			
 
				+				ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
			
 
				+			else
			
 
				+			{
			
 
				+				STARPU_ASSERT(copy_methods->any_to_any);
			
 
				+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			
 
				+			}
			
 
				 
			
 
				 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
			
 
				 			if (STARPU_UNLIKELY(cures != cudaSuccess))
			
@@ -191,8 +205,7 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
				 		break;
			
 
				 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
			
 
				 		/* CUDA - CUDA transfer */
			
 
				-		STARPU_ASSERT(copy_methods->cuda_to_cuda || copy_methods->cuda_to_cuda_async);
			
 
				-		if (!req || !copy_methods->cuda_to_cuda_async)
			
 
				+		if (!req || !(copy_methods->cuda_to_cuda_async || copy_methods->any_to_any))
			
 
				 		{
			
 
				 			STARPU_ASSERT(copy_methods->cuda_to_cuda);
			
 
				 			/* this is not associated to a request so it's synchronous */
			
@@ -205,7 +218,13 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
				 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
 
				 			stream = starpu_cuda_get_local_peer_transfer_stream();
			
 
				-			ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
			
 
				+			if (copy_methods->cuda_to_cuda_async)
			
 
				+				ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
			
 
				+			else
			
 
				+			{
			
 
				+				STARPU_ASSERT(copy_methods->any_to_any);
			
 
				+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			
 
				+			}
			
 
				 
			
 
				 			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
			
 
				 			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
			
@@ -215,54 +234,65 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
				 #ifdef STARPU_USE_OPENCL
			
 
				 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
			
 
				 		/* OpenCL -> RAM */
			
 
				-		if (_starpu_memory_node_get_local_key() == src_node)
			
 
				+		STARPU_ASSERT(_starpu_memory_node_get_local_key() == src_node);
			
 
				+		if (!req || !(copy_methods->opencl_to_ram_async || copy_methods->any_to_any))
			
 
				 		{
			
 
				 			STARPU_ASSERT(copy_methods->opencl_to_ram);
			
 
				-			if (!req || !copy_methods->opencl_to_ram_async)
			
 
				-			{
			
 
				-				/* this is not associated to a request so it's synchronous */
			
 
				-				copy_methods->opencl_to_ram(src_interface, src_node, dst_interface, dst_node);
			
 
				-			}
			
 
				-			else
			
 
				-			{
			
 
				-				req->async_channel.type = STARPU_OPENCL_RAM;
			
 
				-				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			
 
				-			}
			
 
				+			/* this is not associated to a request so it's synchronous */
			
 
				+			copy_methods->opencl_to_ram(src_interface, src_node, dst_interface, dst_node);
			
 
				 		}
			
 
				 		else
			
 
				 		{
			
 
				-			/* we should not have a blocking call ! */
			
 
				-			STARPU_ABORT();
			
 
				+			req->async_channel.type = STARPU_OPENCL_RAM;
			
 
				+			if (copy_methods->opencl_to_ram_async)
			
 
				+				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			
 
				+			else
			
 
				+			{
			
 
				+				STARPU_ASSERT(copy_methods->any_to_any);
			
 
				+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			
 
				+			}
			
 
				 		}
			
 
				 		break;
			
 
				 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
			
 
				 		/* STARPU_CPU_RAM -> STARPU_OPENCL_RAM */
			
 
				 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node);
			
 
				-		STARPU_ASSERT(copy_methods->ram_to_opencl);
			
 
				-		if (!req || !copy_methods->ram_to_opencl_async)
			
 
				+		if (!req || !(copy_methods->ram_to_opencl_async || copy_methods->any_to_any))
			
 
				 		{
			
 
				+			STARPU_ASSERT(copy_methods->ram_to_opencl);
			
 
				 			/* this is not associated to a request so it's synchronous */
			
 
				 			copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
			
 
				 		}
			
 
				 		else
			
 
				 		{
			
 
				 			req->async_channel.type = STARPU_OPENCL_RAM;
			
 
				-			ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			
 
				+			if (copy_methods->ram_to_opencl_async)
			
 
				+				ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			
 
				+			else
			
 
				+			{
			
 
				+				STARPU_ASSERT(copy_methods->any_to_any);
			
 
				+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			
 
				+			}
			
 
				 		}
			
 
				 		break;
			
 
				 	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_OPENCL_RAM):
			
 
				 		/* STARPU_OPENCL_RAM -> STARPU_OPENCL_RAM */
			
 
				 		STARPU_ASSERT(_starpu_memory_node_get_local_key() == dst_node || _starpu_memory_node_get_local_key() == src_node);
			
 
				-		STARPU_ASSERT(copy_methods->opencl_to_opencl);
			
 
				-		if (!req || !copy_methods->opencl_to_opencl_async)
			
 
				+		if (!req || !(copy_methods->opencl_to_opencl_async || copy_methods->any_to_any))
			
 
				 		{
			
 
				+			STARPU_ASSERT(copy_methods->opencl_to_opencl);
			
 
				 			/* this is not associated to a request so it's synchronous */
			
 
				 			copy_methods->opencl_to_opencl(src_interface, src_node, dst_interface, dst_node);
			
 
				 		}
			
 
				 		else
			
 
				 		{
			
 
				 			req->async_channel.type = STARPU_OPENCL_RAM;
			
 
				-			ret = copy_methods->opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			
 
				+			if (copy_methods->opencl_to_opencl_async)
			
 
				+				ret = copy_methods->opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
			
 
				+			else
			
 
				+			{
			
 
				+				STARPU_ASSERT(copy_methods->any_to_any);
			
 
				+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
			
 
				+			}
			
 
				 		}
			
 
				 		break;
			
 
				 #endif
			
@@ -331,6 +361,64 @@ int __attribute__((warn_unused_result)) _starpu_driver_copy_data_1_to_1(starpu_d
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+/* This can be used by interfaces to easily transfer a piece of data without
			
 
				+ * caring about the particular CUDA/OpenCL methods.  */
			
 
				+
			
 
				+int starpu_interface_copy(uintptr_t src, unsigned src_node, size_t src_offset, uintptr_t dst, unsigned dst_node, size_t dst_offset, size_t size, void *async_data)
			
 
				+{
			
 
				+	struct _starpu_async_channel *async_channel = async_data;
			
 
				+	enum starpu_node_kind src_kind = starpu_node_get_kind(src_node);
			
 
				+	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);
			
 
				+
			
 
				+	switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind))
			
 
				+	{
			
 
				+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
			
 
				+		memcpy((void *) dst + dst_offset, (void *) src + src_offset, size);
			
 
				+		return 0;
			
 
				+
			
 
				+#ifdef STARPU_USE_CUDA
			
 
				+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
			
 
				+		return starpu_cuda_copy_async_sync(
			
 
				+				(void*) src + src_offset, src_node,
			
 
				+				(void*) dst + dst_offset, dst_node,
			
 
				+				size,
			
 
				+				async_channel?starpu_cuda_get_local_out_transfer_stream():NULL,
			
 
				+				cudaMemcpyDeviceToHost);
			
 
				+
			
 
				+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
			
 
				+		return starpu_cuda_copy_async_sync(
			
 
				+				(void*) src + src_offset, src_node,
			
 
				+				(void*) dst + dst_offset, dst_node,
			
 
				+				size,
			
 
				+				async_channel?starpu_cuda_get_local_in_transfer_stream():NULL,
			
 
				+				cudaMemcpyHostToDevice);
			
 
				+
			
 
				+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
			
 
				+		return starpu_cuda_copy_async_sync(
			
 
				+				(void*) src + src_offset, src_node,
			
 
				+				(void*) dst + dst_offset, dst_node,
			
 
				+				size,
			
 
				+				async_channel?starpu_cuda_get_local_peer_transfer_stream():NULL,
			
 
				+				cudaMemcpyDeviceToDevice);
			
 
				+
			
 
				+#endif
			
 
				+#ifdef STARPU_USE_OPENCL
			
 
				+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
			
 
				+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
			
 
				+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_OPENCL_RAM):
			
 
				+		return starpu_opencl_copy_async_sync(
			
 
				+				src, src_node, src_offset,
			
 
				+				dst, dst_node, dst_offset,
			
 
				+				size,
			
 
				+				&async_channel->event.opencl_event);
			
 
				+#endif
			
 
				+	default:
			
 
				+		STARPU_ABORT();
			
 
				+		return -1;
			
 
				+	}
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				 void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel)
			
 
				 {
			
 
				 #ifdef STARPU_SIMGRID
			
--- a/src/datawizard/interfaces/bcsr_interface.c
+++ b/src/datawizard/interfaces/bcsr_interface.c
@@ -31,31 +31,11 @@
 
				  * BCSR : blocked CSR, we use blocks of size (r x c)
			
 
				  */
			
 
				 
			
 
				-static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-#endif
			
 
				+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
			
 
				 
			
 
				 static struct starpu_data_copy_methods bcsr_copy_data_methods_s =
			
 
				 {
			
 
				-	.ram_to_ram = copy_ram_to_ram,
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	.ram_to_cuda = copy_ram_to_cuda,
			
 
				-	.cuda_to_ram = copy_cuda_to_ram,
			
 
				-	.cuda_to_cuda = copy_cuda_to_cuda,
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-	.ram_to_opencl = copy_ram_to_opencl,
			
 
				-	.opencl_to_ram = copy_opencl_to_ram,
			
 
				-	.opencl_to_opencl = copy_opencl_to_opencl,
			
 
				-#endif
			
 
				+	.any_to_any = copy_any_to_any,
			
 
				 };
			
 
				 
			
 
				 static void register_bcsr_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
			
@@ -315,105 +295,7 @@ static void free_bcsr_buffer_on_node(void *data_interface, unsigned node)
 
				 	starpu_free_buffer_on_node(node, (uintptr_t) bcsr_interface->rowptr, (nrow+1)*sizeof(uint32_t));
			
 
				 }
			
 
				 
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-static int copy_cuda_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind)
			
 
				-{
			
 
				-	struct starpu_bcsr_interface *src_bcsr = src_interface;
			
 
				-	struct starpu_bcsr_interface *dst_bcsr = dst_interface;
			
 
				-
			
 
				-	uint32_t nnz = src_bcsr->nnz;
			
 
				-	uint32_t nrow = src_bcsr->nrow;
			
 
				-	size_t elemsize = src_bcsr->elemsize;
			
 
				-
			
 
				-	uint32_t r = src_bcsr->r;
			
 
				-	uint32_t c = src_bcsr->c;
			
 
				-
			
 
				-	cudaError_t cures;
			
 
				-
			
 
				-	cures = cudaMemcpy((char *)dst_bcsr->nzval, (char *)src_bcsr->nzval, nnz*r*c*elemsize, kind);
			
 
				-	if (STARPU_UNLIKELY(cures))
			
 
				-		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				-
			
 
				-	cures = cudaMemcpy((char *)dst_bcsr->colind, (char *)src_bcsr->colind, nnz*sizeof(uint32_t), kind);
			
 
				-	if (STARPU_UNLIKELY(cures))
			
 
				-		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				-
			
 
				-	cures = cudaMemcpy((char *)dst_bcsr->rowptr, (char *)src_bcsr->rowptr, (nrow+1)*sizeof(uint32_t), kind);
			
 
				-	if (STARPU_UNLIKELY(cures))
			
 
				-		STARPU_CUDA_REPORT_ERROR(cures);
			
 
				-
			
 
				-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost);
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice);
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-	return copy_cuda_common(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice);
			
 
				-}
			
 
				-#endif // STARPU_USE_CUDA
			
 
				-
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-static int copy_opencl_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-	struct starpu_bcsr_interface *src_bcsr = src_interface;
			
 
				-	struct starpu_bcsr_interface *dst_bcsr = dst_interface;
			
 
				-
			
 
				-	uint32_t nnz = src_bcsr->nnz;
			
 
				-	uint32_t nrow = src_bcsr->nrow;
			
 
				-	size_t elemsize = src_bcsr->elemsize;
			
 
				-
			
 
				-	uint32_t r = src_bcsr->r;
			
 
				-	uint32_t c = src_bcsr->c;
			
 
				-
			
 
				-        int err;
			
 
				-
			
 
				-	err = starpu_opencl_copy_async_sync(src_bcsr->nzval, src_node, 0, dst_bcsr->nzval, dst_node, 0, nnz*r*c*elemsize, NULL);
			
 
				-	if (STARPU_UNLIKELY(err))
			
 
				-		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-
			
 
				-	err = starpu_opencl_copy_async_sync((uintptr_t)src_bcsr->colind, src_node, 0, (uintptr_t)dst_bcsr->colind, dst_node, 0, nnz*sizeof(uint32_t), NULL);
			
 
				-	if (STARPU_UNLIKELY(err))
			
 
				-		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-
			
 
				-	err = starpu_opencl_copy_async_sync((uintptr_t)src_bcsr->rowptr, src_node, 0, (uintptr_t)dst_bcsr->rowptr, dst_node, 0, (nrow+1)*sizeof(uint32_t), NULL);
			
 
				-	if (STARPU_UNLIKELY(err))
			
 
				-		STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-
			
 
				-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*r*c*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
			
 
				-}
			
 
				-
			
 
				-#endif // STARPU_USE_OPENCL
			
 
				-
			
 
				-/* as not all platform easily have a BLAS lib installed ... */
			
 
				-static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)
			
 
				 {
			
 
				 	struct starpu_bcsr_interface *src_bcsr = (struct starpu_bcsr_interface *) src_interface;
			
 
				 	struct starpu_bcsr_interface *dst_bcsr = (struct starpu_bcsr_interface *) dst_interface;
			
@@ -425,13 +307,18 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBU
 
				 	uint32_t r = src_bcsr->r;
			
 
				 	uint32_t c = src_bcsr->c;
			
 
				 
			
 
				-	memcpy((void *)dst_bcsr->nzval, (void *)src_bcsr->nzval, nnz*elemsize*r*c);
			
 
				+	int ret = 0;
			
 
				+
			
 
				+	if (starpu_interface_copy(src_bcsr->nzval, src_node, 0, dst_bcsr->nzval, dst_node, 0, nnz*elemsize*r*c, async_data))
			
 
				+		ret = -EAGAIN;
			
 
				 
			
 
				-	memcpy((void *)dst_bcsr->colind, (void *)src_bcsr->colind, nnz*sizeof(uint32_t));
			
 
				+	if (starpu_interface_copy((uintptr_t)src_bcsr->colind, src_node, 0, (uintptr_t)dst_bcsr->colind, dst_node, 0, nnz*sizeof(uint32_t), async_data))
			
 
				+		ret = -EAGAIN;
			
 
				 
			
 
				-	memcpy((void *)dst_bcsr->rowptr, (void *)src_bcsr->rowptr, (nrow+1)*sizeof(uint32_t));
			
 
				+	if (starpu_interface_copy((uintptr_t)src_bcsr->rowptr, src_node, 0, (uintptr_t)dst_bcsr->rowptr, dst_node, 0, (nrow+1)*sizeof(uint32_t), async_data))
			
 
				+		ret = -EAGAIN;
			
 
				 
			
 
				 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize*r*c + (nnz+nrow+1)*sizeof(uint32_t));
			
 
				 
			
 
				-	return 0;
			
 
				+	return ret;
			
 
				 }
			
--- a/src/datawizard/interfaces/block_interface.c
+++ b/src/datawizard/interfaces/block_interface.c
@@ -420,7 +420,7 @@ static int copy_cuda_async_common(void *src_interface, unsigned src_node STARPU_
 
				 	}
			
 
				 	else
			
 
				 	{
			
 
				-		/* Default case: we transfer all lines one by one: ny*nz transfers */
			
 
				+		/* Default case: we transfer all blocks one by one: nz 2D transfers */
			
 
				 		unsigned layer;
			
 
				 		for (layer = 0; layer < src_block->nz; layer++)
			
 
				 		{
			
--- a/src/datawizard/interfaces/coo_interface.c
+++ b/src/datawizard/interfaces/coo_interface.c
@@ -19,190 +19,36 @@
 
				 #include <datawizard/memalloc.h>
			
 
				 
			
 
				 static int
			
 
				-copy_ram_to_ram(void *src_interface, STARPU_ATTRIBUTE_UNUSED unsigned src_node,
			
 
				-		void *dst_interface, STARPU_ATTRIBUTE_UNUSED unsigned dst_node)
			
 
				+copy_any_to_any(void *src_interface, unsigned src_node,
			
 
				+		void *dst_interface, unsigned dst_node, void *async_data)
			
 
				 {
			
 
				 	size_t size = 0;
			
 
				 	struct starpu_coo_interface *src_coo, *dst_coo;
			
 
				-
			
 
				-	src_coo = (struct starpu_coo_interface *) src_interface;
			
 
				-	dst_coo = (struct starpu_coo_interface *) dst_interface;
			
 
				-
			
 
				-	size = src_coo->n_values * sizeof(src_coo->columns[0]);
			
 
				-	memcpy((void *) dst_coo->columns, (void *) src_coo->columns, size);
			
 
				-
			
 
				-	/* sizeof(src_coo->columns[0]) == sizeof(src_coo->rows[0]) */
			
 
				-	memcpy((void *) dst_coo->rows, (void *) src_coo->rows, size);
			
 
				-
			
 
				-	size = src_coo->n_values * src_coo->elemsize;
			
 
				-	memcpy((void *) dst_coo->values, (void *) src_coo->values, size);
			
 
				-
			
 
				-	_STARPU_TRACE_DATA_COPY(src_node, dst_node,
			
 
				-		src_coo->n_values *
			
 
				-		(2 * sizeof(src_coo->rows[0]) + src_coo->elemsize));
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-static int
			
 
				-copy_cuda_async_sync(void *src_interface, unsigned src_node,
			
 
				-		     void *dst_interface, unsigned dst_node,
			
 
				-		     cudaStream_t stream, enum cudaMemcpyKind kind)
			
 
				-{
			
 
				-	int ret;
			
 
				-	size_t size = 0;
			
 
				-	struct starpu_coo_interface *src_coo, *dst_coo;
			
 
				-
			
 
				-	src_coo = (struct starpu_coo_interface *) src_interface;
			
 
				-	dst_coo = (struct starpu_coo_interface *) dst_interface;
			
 
				-
			
 
				-	size = src_coo->n_values * sizeof(src_coo->columns[0]);
			
 
				-	ret = starpu_cuda_copy_async_sync(
			
 
				-		(void *) src_coo->columns,
			
 
				-		src_node,
			
 
				-		(void *) dst_coo->columns,
			
 
				-		dst_node,
			
 
				-		size,
			
 
				-		stream,
			
 
				-		kind);
			
 
				-	if (ret == 0)
			
 
				-		stream = NULL;
			
 
				-
			
 
				-	/* sizeof(src_coo->columns[0]) == sizeof(src_coo->rows[0]) */
			
 
				-	ret = starpu_cuda_copy_async_sync(
			
 
				-		(void *) src_coo->rows,
			
 
				-		src_node,
			
 
				-		(void *) dst_coo->rows,
			
 
				-		dst_node,
			
 
				-		size,
			
 
				-		stream,
			
 
				-		kind);
			
 
				-	if (ret == 0)
			
 
				-		stream = NULL;
			
 
				-
			
 
				-	size = src_coo->n_values * src_coo->elemsize;
			
 
				-	ret = starpu_cuda_copy_async_sync(
			
 
				-		(void *) src_coo->values,
			
 
				-		src_node,
			
 
				-		(void *) dst_coo->values,
			
 
				-		dst_node,
			
 
				-		size,
			
 
				-		stream,
			
 
				-		kind);
			
 
				-
			
 
				-	_STARPU_TRACE_DATA_COPY(src_node, dst_node,
			
 
				-		src_coo->n_values *
			
 
				-		(2 * sizeof(src_coo->rows[0]) + src_coo->elemsize));
			
 
				-	return ret;
			
 
				-}
			
 
				-
			
 
				-static int
			
 
				-copy_ram_to_cuda(void *src_interface, unsigned src_node,
			
 
				-		 void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node,
			
 
				-				    dst_interface, dst_node,
			
 
				-				    NULL, cudaMemcpyHostToDevice);
			
 
				-}
			
 
				-
			
 
				-static int
			
 
				-copy_cuda_to_ram(void *src_interface, unsigned src_node,
			
 
				-		 void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node,
			
 
				-				    dst_interface, dst_node,
			
 
				-				    NULL, cudaMemcpyDeviceToHost);
			
 
				-}
			
 
				-
			
 
				-static int
			
 
				-copy_ram_to_cuda_async(void *src_interface, unsigned src_node,
			
 
				-		       void *dst_interface, unsigned dst_node,
			
 
				-		       cudaStream_t stream)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node,
			
 
				-				    dst_interface, dst_node,
			
 
				-				    stream, cudaMemcpyHostToDevice);
			
 
				-}
			
 
				-
			
 
				-static int
			
 
				-copy_cuda_to_ram_async(void *src_interface, unsigned src_node,
			
 
				-		       void *dst_interface, unsigned dst_node,
			
 
				-		       cudaStream_t stream)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node,
			
 
				-				    dst_interface, dst_node,
			
 
				-				    stream, cudaMemcpyDeviceToHost);
			
 
				-}
			
 
				-
			
 
				-static int
			
 
				-copy_cuda_to_cuda(void *src_interface, unsigned src_node,
			
 
				-		  void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node,
			
 
				-				    dst_interface, dst_node,
			
 
				-				    NULL, cudaMemcpyDeviceToDevice);
			
 
				-}
			
 
				-
			
 
				-#ifdef NO_STRIDE
			
 
				-static int
			
 
				-copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,
			
 
				-			void *dst_interface, unsigned dst_node,
			
 
				-			cudaStream_t stream)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node,
			
 
				-				    dst_interface, dst_node,
			
 
				-				    stream, cudaMemcpyDeviceToDevice);
			
 
				-}
			
 
				-#endif /* !NO_STRIDE */
			
 
				-#endif /* !STARPU_USE_CUDA */
			
 
				-
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-static int
			
 
				-copy_opencl_common(void *src_interface, unsigned src_node,
			
 
				-			 void *dst_interface, unsigned dst_node,
			
 
				-			 cl_event *event)
			
 
				-{
			
 
				 	int ret = 0;
			
 
				-	size_t size = 0;
			
 
				-	struct starpu_coo_interface *src_coo, *dst_coo;
			
 
				 
			
 
				 	src_coo = (struct starpu_coo_interface *) src_interface;
			
 
				 	dst_coo = (struct starpu_coo_interface *) dst_interface;
			
 
				 
			
 
				-
			
 
				 	size = src_coo->n_values * sizeof(src_coo->columns[0]);
			
 
				-	ret = starpu_opencl_copy_async_sync(
			
 
				-		(uintptr_t) src_coo->columns,
			
 
				-		src_node,
			
 
				-		0,
			
 
				-		(uintptr_t) dst_coo->columns,
			
 
				-		dst_node,
			
 
				-		0,
			
 
				-		size,
			
 
				-		NULL);
			
 
				+	if (starpu_interface_copy(
			
 
				+		(uintptr_t) src_coo->columns, src_node, 0,
			
 
				+		(uintptr_t) dst_coo->columns, dst_node, 0,
			
 
				+		size, async_data))
			
 
				+		ret = -EAGAIN;
			
 
				 
			
 
				 	/* sizeof(src_coo->columns[0]) == sizeof(src_coo->rows[0]) */
			
 
				-	ret = starpu_opencl_copy_async_sync(
			
 
				-		(uintptr_t) src_coo->rows,
			
 
				-		src_node,
			
 
				-		0,
			
 
				-		(uintptr_t) dst_coo->rows,
			
 
				-		dst_node,
			
 
				-		0,
			
 
				-		size,
			
 
				-		NULL);
			
 
				+	if (starpu_interface_copy(
			
 
				+		(uintptr_t) src_coo->rows, src_node, 0,
			
 
				+		(uintptr_t) dst_coo->rows, dst_node, 0,
			
 
				+		size, async_data))
			
 
				+		ret = -EAGAIN;
			
 
				 
			
 
				 	size = src_coo->n_values * src_coo->elemsize;
			
 
				-	ret = starpu_opencl_copy_async_sync(
			
 
				-		src_coo->values,
			
 
				-		src_node,
			
 
				-		0,
			
 
				-		(uintptr_t) dst_coo->values,
			
 
				-		dst_node,
			
 
				-		0,
			
 
				-		size,
			
 
				-		event);
			
 
				+	if (starpu_interface_copy(
			
 
				+		src_coo->values, src_node, 0,
			
 
				+		dst_coo->values, dst_node, 0,
			
 
				+		size, async_data))
			
 
				+		ret = -EAGAIN;
			
 
				 
			
 
				 	_STARPU_TRACE_DATA_COPY(src_node, dst_node,
			
 
				 		src_coo->n_values *
			
@@ -211,83 +57,9 @@ copy_opencl_common(void *src_interface, unsigned src_node,
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-static int
			
 
				-copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
			
 
				-			 void *dst_interface, unsigned dst_node,
			
 
				-			 cl_event *event)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
			
 
				-}
			
 
				-
			
 
				-static int
			
 
				-copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
			
 
				-			 void *dst_interface, unsigned dst_node,
			
 
				-			 cl_event *event)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
			
 
				-}
			
 
				-
			
 
				-static int
			
 
				-copy_opencl_to_opencl_async(void *src_interface, unsigned src_node,
			
 
				-			 void *dst_interface, unsigned dst_node,
			
 
				-			 cl_event *event)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
			
 
				-}
			
 
				-
			
 
				-static int
			
 
				-copy_ram_to_opencl(void *src_interface, unsigned src_node,
			
 
				-		   void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_ram_to_opencl_async(src_interface, src_node,
			
 
				-					dst_interface, dst_node,
			
 
				-					NULL);
			
 
				-}
			
 
				-static int
			
 
				-copy_opencl_to_ram(void *src_interface, unsigned src_node,
			
 
				-		   void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_opencl_to_ram_async(src_interface, src_node,
			
 
				-					dst_interface, dst_node,
			
 
				-					NULL);
			
 
				-}
			
 
				-static int
			
 
				-copy_opencl_to_opencl(void *src_interface, unsigned src_node,
			
 
				-		   void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_opencl_to_opencl_async(src_interface, src_node,
			
 
				-					dst_interface, dst_node,
			
 
				-					NULL);
			
 
				-}
			
 
				-#endif /* !STARPU_USE_OPENCL */
			
 
				-
			
 
				 static struct starpu_data_copy_methods coo_copy_data_methods =
			
 
				 {
			
 
				-	.ram_to_ram          = copy_ram_to_ram,
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	.ram_to_cuda         = copy_ram_to_cuda,
			
 
				-	.cuda_to_ram         = copy_cuda_to_ram,
			
 
				-	.ram_to_cuda_async   = copy_ram_to_cuda_async,
			
 
				-	.cuda_to_ram_async   = copy_cuda_to_ram_async,
			
 
				-	.cuda_to_cuda        = copy_cuda_to_cuda,
			
 
				-#ifdef NO_STRIDE
			
 
				-	.cuda_to_cuda_async  = copy_cuda_to_cuda_async,
			
 
				-#endif
			
 
				-#else
			
 
				-#ifdef STARPU_SIMGRID
			
 
				-#ifdef NO_STRIDE
			
 
				-	/* Enable GPU-GPU transfers in simgrid */
			
 
				-	.cuda_to_cuda_async = 1,
			
 
				-#endif
			
 
				-#endif
			
 
				-#endif /* !STARPU_USE_CUDA */
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-	.ram_to_opencl       = copy_ram_to_opencl,
			
 
				-	.opencl_to_ram       = copy_opencl_to_ram,
			
 
				-	.opencl_to_opencl    = copy_opencl_to_opencl,
			
 
				-	.ram_to_opencl_async = copy_ram_to_opencl_async,
			
 
				-	.opencl_to_opencl_async = copy_opencl_to_opencl_async,
			
 
				-#endif /* !STARPU_USE_OPENCL */
			
 
				+	.any_to_any          = copy_any_to_any,
			
 
				 };
			
 
				 
			
 
				 static void
			
--- a/src/datawizard/interfaces/csr_interface.c
+++ b/src/datawizard/interfaces/csr_interface.c
@@ -28,42 +28,11 @@
 
				 #include <starpu_opencl.h>
			
 
				 #include <drivers/opencl/driver_opencl.h>
			
 
				 
			
 
				-static int copy_ram_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
			
 
				-static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				-#endif
			
 
				+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
			
 
				 
			
 
				 static struct starpu_data_copy_methods csr_copy_data_methods_s =
			
 
				 {
			
 
				-	.ram_to_ram = copy_ram_to_ram,
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	.ram_to_cuda = copy_ram_to_cuda,
			
 
				-	.cuda_to_ram = copy_cuda_to_ram,
			
 
				-	.cuda_to_cuda = copy_cuda_to_cuda,
			
 
				-	.ram_to_cuda_async = copy_ram_to_cuda_async,
			
 
				-	.cuda_to_ram_async = copy_cuda_to_ram_async,
			
 
				-	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
			
 
				-#else
			
 
				-#ifdef STARPU_SIMGRID
			
 
				-	/* Enable GPU-GPU transfers in simgrid */
			
 
				-	.cuda_to_cuda_async = 1,
			
 
				-#endif
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-	.ram_to_opencl = copy_ram_to_opencl,
			
 
				-	.opencl_to_ram = copy_opencl_to_ram,
			
 
				-	.opencl_to_opencl = copy_opencl_to_opencl,
			
 
				-#endif
			
 
				+	.any_to_any = copy_any_to_any,
			
 
				 };
			
 
				 
			
 
				 static void register_csr_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
			
@@ -293,188 +262,8 @@ static void free_csr_buffer_on_node(void *data_interface, unsigned node)
 
				 	starpu_free_buffer_on_node(node, (uintptr_t) csr_interface->rowptr, (nrow+1)*sizeof(uint32_t));
			
 
				 }
			
 
				 
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-static int copy_cuda_async_sync(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, enum cudaMemcpyKind kind, cudaStream_t stream)
			
 
				-{
			
 
				-	struct starpu_csr_interface *src_csr = src_interface;
			
 
				-	struct starpu_csr_interface *dst_csr = dst_interface;
			
 
				-
			
 
				-	uint32_t nnz = src_csr->nnz;
			
 
				-	uint32_t nrow = src_csr->nrow;
			
 
				-	size_t elemsize = src_csr->elemsize;
			
 
				-
			
 
				-	cudaStream_t sstream = stream;
			
 
				-	int ret;
			
 
				-
			
 
				-	ret = starpu_cuda_copy_async_sync((void *)src_csr->nzval, src_node, (void *)dst_csr->nzval, dst_node, nnz*elemsize, sstream, kind);
			
 
				-	if (ret == 0) sstream = NULL;
			
 
				-
			
 
				-	ret = starpu_cuda_copy_async_sync((void *)src_csr->colind, src_node, (void *)dst_csr->colind, dst_node, nnz*sizeof(uint32_t), sstream, kind);
			
 
				-	if (ret == 0) sstream = NULL;
			
 
				-
			
 
				-	ret = starpu_cuda_copy_async_sync((void *)src_csr->rowptr, src_node, (void *)dst_csr->rowptr, dst_node, (nrow+1)*sizeof(uint32_t), sstream, kind);
			
 
				-
			
 
				-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
			
 
				-	return ret;
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_peer_async(void *src_interface STARPU_ATTRIBUTE_UNUSED, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				-				void *dst_interface STARPU_ATTRIBUTE_UNUSED, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-#ifdef HAVE_CUDA_MEMCPY_PEER
			
 
				-	struct starpu_csr_interface *src_csr = src_interface;
			
 
				-	struct starpu_csr_interface *dst_csr = dst_interface;
			
 
				-
			
 
				-	uint32_t nnz = src_csr->nnz;
			
 
				-	uint32_t nrow = src_csr->nrow;
			
 
				-	size_t elemsize = src_csr->elemsize;
			
 
				-
			
 
				-	cudaError_t cures;
			
 
				-
			
 
				-	int src_dev = _starpu_memory_node_get_devid(src_node);
			
 
				-	int dst_dev = _starpu_memory_node_get_devid(dst_node);
			
 
				-
			
 
				-	int synchronous_fallback = 0;
			
 
				-
			
 
				-	_STARPU_TRACE_START_DRIVER_COPY_ASYNC(src_node, dst_node);
			
 
				-	cures = cudaMemcpyPeerAsync((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize, stream);
			
 
				-	if (cures)
			
 
				-	{
			
 
				-		synchronous_fallback = 1;
			
 
				-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
			
 
				-		cures = cudaMemcpyPeer((char *)dst_csr->nzval, dst_dev, (char *)src_csr->nzval, src_dev, nnz*elemsize);
			
 
				-		if (STARPU_UNLIKELY(cures))
			
 
				-			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				-	}
			
 
				-
			
 
				-	if (!synchronous_fallback)
			
 
				-	{
			
 
				-		cures = cudaMemcpyPeerAsync((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t), stream);
			
 
				-	}
			
 
				-
			
 
				-	if (synchronous_fallback || cures != cudaSuccess)
			
 
				-	{
			
 
				-		synchronous_fallback = 1;
			
 
				-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
			
 
				-		cures = cudaMemcpyPeer((char *)dst_csr->colind, dst_dev, (char *)src_csr->colind, src_dev, nnz*sizeof(uint32_t));
			
 
				-		if (STARPU_UNLIKELY(cures))
			
 
				-			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				-	}
			
 
				-
			
 
				-	if (!synchronous_fallback)
			
 
				-	{
			
 
				-		cures = cudaMemcpyPeerAsync((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t), stream);
			
 
				-	}
			
 
				-
			
 
				-	if (synchronous_fallback || cures != cudaSuccess)
			
 
				-	{
			
 
				-		synchronous_fallback = 1;
			
 
				-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
			
 
				-		cures = cudaMemcpyPeer((char *)dst_csr->rowptr, dst_dev, (char *)src_csr->rowptr, src_dev, (nrow+1)*sizeof(uint32_t));
			
 
				-		if (STARPU_UNLIKELY(cures))
			
 
				-			STARPU_CUDA_REPORT_ERROR(cures);
			
 
				-	}
			
 
				-
			
 
				-	if (synchronous_fallback)
			
 
				-	{
			
 
				-		_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
			
 
				-		return 0;
			
 
				-	}
			
 
				-	else
			
 
				-	{
			
 
				-		_STARPU_TRACE_END_DRIVER_COPY_ASYNC(src_node, dst_node);
			
 
				-		return -EAGAIN;
			
 
				-	}
			
 
				-#else
			
 
				-	/* Illegal without Peer tranfers */
			
 
				-	STARPU_ABORT();
			
 
				-	return 0;
			
 
				-#endif
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, NULL);
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, NULL);
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, NULL);
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToHost, stream);
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyHostToDevice, stream);
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
			
 
				-{
			
 
				-	if (src_node == dst_node)
			
 
				-		return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, cudaMemcpyDeviceToDevice, stream);
			
 
				-	else
			
 
				-		return copy_cuda_peer_async(src_interface, src_node, dst_interface, dst_node, stream);
			
 
				-}
			
 
				-
			
 
				-#endif // STARPU_USE_CUDA
			
 
				-
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-static int copy_opencl_common(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-	struct starpu_csr_interface *src_csr = src_interface;
			
 
				-	struct starpu_csr_interface *dst_csr = dst_interface;
			
 
				-
			
 
				-	uint32_t nnz = src_csr->nnz;
			
 
				-	uint32_t nrow = src_csr->nrow;
			
 
				-	size_t elemsize = src_csr->elemsize;
			
 
				-
			
 
				-        int err;
			
 
				-
			
 
				-	err = starpu_opencl_copy_async_sync(src_csr->nzval, src_node, 0, dst_csr->nzval, dst_node, 0, nnz*elemsize, NULL);
			
 
				-	if (STARPU_UNLIKELY(err))
			
 
				-                STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-
			
 
				-	err = starpu_opencl_copy_async_sync((uintptr_t)src_csr->colind, src_node, 0, (uintptr_t)dst_csr->colind, dst_node, 0, nnz*sizeof(uint32_t), NULL);
			
 
				-        if (STARPU_UNLIKELY(err))
			
 
				-                STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-
			
 
				-	err = starpu_opencl_copy_async_sync((uintptr_t)src_csr->rowptr, src_node, 0, (uintptr_t)dst_csr->rowptr, dst_node, 0, (nrow+1)*sizeof(uint32_t), NULL);
			
 
				-	if (STARPU_UNLIKELY(err))
			
 
				-                STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-
			
 
				-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node);
			
 
				-}
			
 
				-
			
 
				-#endif // STARPU_USE_OPENCL
			
 
				-
			
 
				 /* as not all platform easily have a BLAS lib installed ... */
			
 
				-static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)
			
 
				 {
			
 
				 	struct starpu_csr_interface *src_csr = (struct starpu_csr_interface *) src_interface;
			
 
				 	struct starpu_csr_interface *dst_csr = (struct starpu_csr_interface *) dst_interface;
			
@@ -482,14 +271,18 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBU
 
				 	uint32_t nnz = src_csr->nnz;
			
 
				 	uint32_t nrow = src_csr->nrow;
			
 
				 	size_t elemsize = src_csr->elemsize;
			
 
				+	int ret = 0;
			
 
				 
			
 
				-	memcpy((void *)dst_csr->nzval, (void *)src_csr->nzval, nnz*elemsize);
			
 
				+	if (starpu_interface_copy(src_csr->nzval, src_node, 0, dst_csr->nzval, dst_node, 0, nnz*elemsize, async_data))
			
 
				+		ret = -EAGAIN;
			
 
				 
			
 
				-	memcpy((void *)dst_csr->colind, (void *)src_csr->colind, nnz*sizeof(uint32_t));
			
 
				+	if (starpu_interface_copy((uintptr_t)src_csr->colind, src_node, 0, (uintptr_t)dst_csr->colind, dst_node, 0, nnz*sizeof(uint32_t), async_data))
			
 
				+		ret = -EAGAIN;
			
 
				 
			
 
				-	memcpy((void *)dst_csr->rowptr, (void *)src_csr->rowptr, (nrow+1)*sizeof(uint32_t));
			
 
				+	if (starpu_interface_copy((uintptr_t)src_csr->rowptr, src_node, 0, (uintptr_t)dst_csr->rowptr, dst_node, 0, (nrow+1)*sizeof(uint32_t), async_data))
			
 
				+		ret = -EAGAIN;
			
 
				 
			
 
				 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nnz*elemsize + (nnz+nrow+1)*sizeof(uint32_t));
			
 
				 
			
 
				-	return 0;
			
 
				+	return ret;
			
 
				 }
			
--- a/src/datawizard/interfaces/variable_interface.c
+++ b/src/datawizard/interfaces/variable_interface.c
@@ -25,48 +25,11 @@
 
				 #include <starpu_opencl.h>
			
 
				 #include <drivers/opencl/driver_opencl.h>
			
 
				 
			
 
				-static int copy_ram_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
			
 
				-static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cudaStream_t stream);
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
			
 
				-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
			
 
				-static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
			
 
				-#endif
			
 
				+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
			
 
				 
			
 
				 static struct starpu_data_copy_methods variable_copy_data_methods_s =
			
 
				 {
			
 
				-	.ram_to_ram = copy_ram_to_ram,
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	.ram_to_cuda = copy_ram_to_cuda,
			
 
				-	.cuda_to_ram = copy_cuda_to_ram,
			
 
				-	.cuda_to_cuda = copy_cuda_to_cuda,
			
 
				-	.ram_to_cuda_async = copy_ram_to_cuda_async,
			
 
				-	.cuda_to_ram_async = copy_cuda_to_ram_async,
			
 
				-	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
			
 
				-#else
			
 
				-#ifdef STARPU_SIMGRID
			
 
				-	/* Enable GPU-GPU transfers in simgrid */
			
 
				-	.cuda_to_cuda_async = 1,
			
 
				-#endif
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-	.ram_to_opencl = copy_ram_to_opencl,
			
 
				-	.opencl_to_ram = copy_opencl_to_ram,
			
 
				-	.opencl_to_opencl = copy_opencl_to_opencl,
			
 
				-        .ram_to_opencl_async = copy_ram_to_opencl_async,
			
 
				-	.opencl_to_ram_async = copy_opencl_to_ram_async,
			
 
				-	.opencl_to_opencl_async = copy_opencl_to_opencl_async,
			
 
				-#endif
			
 
				+	.any_to_any = copy_any_to_any,
			
 
				 };
			
 
				 
			
 
				 static void register_variable_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
			
@@ -204,101 +167,7 @@ static void free_variable_buffer_on_node(void *data_interface, unsigned node)
 
				 	starpu_free_buffer_on_node(node, variable_interface->ptr, variable_interface->elemsize);
			
 
				 }
			
 
				 
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-static int copy_cuda_async_sync(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream, enum cudaMemcpyKind kind)
			
 
				-{
			
 
				-	struct starpu_variable_interface *src_variable = src_interface;
			
 
				-	struct starpu_variable_interface *dst_variable = dst_interface;
			
 
				-	int ret;
			
 
				-
			
 
				-	ret = starpu_cuda_copy_async_sync((void *)src_variable->ptr, src_node, (void *)dst_variable->ptr, dst_node, src_variable->elemsize, stream, kind);
			
 
				-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
			
 
				-	return ret;
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, NULL, cudaMemcpyDeviceToHost);
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, NULL, cudaMemcpyHostToDevice);
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, NULL, cudaMemcpyDeviceToDevice);
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToDevice);
			
 
				-}
			
 
				-
			
 
				-#endif // STARPU_USE_CUDA
			
 
				-
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-static int copy_opencl_common(void *src_interface, unsigned src_node,
			
 
				-				void *dst_interface, unsigned dst_node, cl_event *event)
			
 
				-{
			
 
				-	struct starpu_variable_interface *src_variable = src_interface;
			
 
				-	struct starpu_variable_interface *dst_variable = dst_interface;
			
 
				-        int ret;
			
 
				-
			
 
				-	ret = starpu_opencl_copy_async_sync(src_variable->ptr, src_node, 0, dst_variable->ptr, dst_node, 0, src_variable->elemsize, event);
			
 
				-
			
 
				-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_variable->elemsize);
			
 
				-
			
 
				-	return ret;
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
			
 
				-					void *dst_interface, unsigned dst_node, cl_event *event)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
			
 
				-					void *dst_interface, unsigned dst_node, cl_event *event)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node,
			
 
				-					void *dst_interface, unsigned dst_node, cl_event *event)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-        return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-        return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				-}
			
 
				-
			
 
				-#endif
			
 
				-
			
 
				-static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)
			
 
				 {
			
 
				 	struct starpu_variable_interface *src_variable = (struct starpu_variable_interface *) src_interface;
			
 
				 	struct starpu_variable_interface *dst_variable = (struct starpu_variable_interface *) dst_interface;
			
@@ -307,10 +176,11 @@ static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBU
 
				 
			
 
				 	uintptr_t ptr_src = src_variable->ptr;
			
 
				 	uintptr_t ptr_dst = dst_variable->ptr;
			
 
				+	int ret;
			
 
				 
			
 
				-	memcpy((void *)ptr_dst, (void *)ptr_src, elemsize);
			
 
				+	ret = starpu_interface_copy(ptr_src, src_node, 0, ptr_dst, dst_node, 0, elemsize, async_data);
			
 
				 
			
 
				 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, elemsize);
			
 
				 
			
 
				-	return 0;
			
 
				+	return ret;
			
 
				 }
			
--- a/src/datawizard/interfaces/vector_interface.c
+++ b/src/datawizard/interfaces/vector_interface.c
@@ -25,48 +25,11 @@
 
				 #include <starpu_opencl.h>
			
 
				 #include <drivers/opencl/driver_opencl.h>
			
 
				 
			
 
				-static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cudaStream_t stream);
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cudaStream_t stream);
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
			
 
				-static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node,					void *dst_interface, unsigned dst_node, cudaStream_t stream);
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
			
 
				-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
			
 
				-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cl_event *event);
			
 
				-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cl_event *event);
			
 
				-static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
			
 
				-#endif
			
 
				+static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
			
 
				 
			
 
				 static struct starpu_data_copy_methods vector_copy_data_methods_s =
			
 
				 {
			
 
				-	.ram_to_ram = copy_ram_to_ram,
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	.ram_to_cuda = copy_ram_to_cuda,
			
 
				-	.cuda_to_ram = copy_cuda_to_ram,
			
 
				-	.ram_to_cuda_async = copy_ram_to_cuda_async,
			
 
				-	.cuda_to_ram_async = copy_cuda_to_ram_async,
			
 
				-	.cuda_to_cuda = copy_cuda_to_cuda,
			
 
				-	.cuda_to_cuda_async = copy_cuda_to_cuda_async,
			
 
				-#else
			
 
				-#ifdef STARPU_SIMGRID
			
 
				-	/* Enable GPU-GPU transfers in simgrid */
			
 
				-	.cuda_to_cuda_async = 1,
			
 
				-#endif
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-	.ram_to_opencl = copy_ram_to_opencl,
			
 
				-	.opencl_to_ram = copy_opencl_to_ram,
			
 
				-	.opencl_to_opencl = copy_opencl_to_opencl,
			
 
				-        .ram_to_opencl_async = copy_ram_to_opencl_async,
			
 
				-	.opencl_to_ram_async = copy_opencl_to_ram_async,
			
 
				-	.opencl_to_opencl_async = copy_opencl_to_opencl_async,
			
 
				-#endif
			
 
				+	.any_to_any = copy_any_to_any,
			
 
				 };
			
 
				 
			
 
				 static void register_vector_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
			
@@ -252,120 +215,18 @@ static void free_vector_buffer_on_node(void *data_interface, unsigned node)
 
				 	starpu_free_buffer_on_node(node, vector_interface->ptr, nx*elemsize);
			
 
				 }
			
 
				 
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-static int copy_cuda_async_sync(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream, enum cudaMemcpyKind kind)
			
 
				+static int copy_any_to_any(void *src_interface, unsigned src_node,
			
 
				+                           void *dst_interface, unsigned dst_node, void *async_data)
			
 
				 {
			
 
				 	struct starpu_vector_interface *src_vector = src_interface;
			
 
				 	struct starpu_vector_interface *dst_vector = dst_interface;
			
 
				 	int ret;
			
 
				 
			
 
				-	ret = starpu_cuda_copy_async_sync((void *)src_vector->ptr, src_node, (void *)dst_vector->ptr, dst_node, src_vector->nx*src_vector->elemsize, stream, kind);
			
 
				-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
			
 
				-	return ret;
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, NULL, cudaMemcpyDeviceToHost);
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, NULL, cudaMemcpyHostToDevice);
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_cuda(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, NULL, cudaMemcpyDeviceToDevice);
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToDevice);
			
 
				-}
			
 
				-
			
 
				-static int copy_cuda_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyDeviceToHost);
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_cuda_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream)
			
 
				-{
			
 
				-	return copy_cuda_async_sync(src_interface, src_node, dst_interface, dst_node, stream, cudaMemcpyHostToDevice);
			
 
				-}
			
 
				-
			
 
				-#endif // STARPU_USE_CUDA
			
 
				-
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-static int copy_opencl_common(void *src_interface, unsigned src_node,
			
 
				-                                    void *dst_interface, unsigned dst_node, cl_event *event)
			
 
				-{
			
 
				-	struct starpu_vector_interface *src_vector = src_interface;
			
 
				-	struct starpu_vector_interface *dst_vector = dst_interface;
			
 
				-	int ret;
			
 
				-
			
 
				-	ret = starpu_opencl_copy_async_sync(src_vector->dev_handle, src_node, src_vector->offset,
			
 
				-					    dst_vector->dev_handle, dst_node, dst_vector->offset,
			
 
				-					       src_vector->nx*src_vector->elemsize, event);
			
 
				+	ret = starpu_interface_copy(src_vector->dev_handle, src_node, src_vector->offset,
			
 
				+				    dst_vector->dev_handle, dst_node, dst_vector->offset,
			
 
				+				    src_vector->nx*src_vector->elemsize, async_data);
			
 
				 
			
 
				 	_STARPU_TRACE_DATA_COPY(src_node, dst_node, src_vector->nx*src_vector->elemsize);
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node,
			
 
				-                                    void *dst_interface, unsigned dst_node, cl_event *event)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node,
			
 
				-                                    void *dst_interface, unsigned dst_node, cl_event *event)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node,
			
 
				-                                       void *dst_interface, unsigned dst_node, cl_event *event)
			
 
				-{
			
 
				-	return copy_opencl_common(src_interface, src_node, dst_interface, dst_node, event);
			
 
				-}
			
 
				-
			
 
				-static int copy_ram_to_opencl(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				-                              void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-        return copy_ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				-				void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-        return copy_opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				-}
			
 
				-
			
 
				-static int copy_opencl_to_opencl(void *src_interface, unsigned src_node,
			
 
				-				 void *dst_interface, unsigned dst_node)
			
 
				-{
			
 
				-	return copy_opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				-}
			
 
				-
			
 
				-
			
 
				-#endif // STARPU_USE_OPENCL
			
 
				-
			
 
				-static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				-					void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-	struct starpu_vector_interface *src_vector = (struct starpu_vector_interface *) src_interface;
			
 
				-	struct starpu_vector_interface *dst_vector = (struct starpu_vector_interface *) dst_interface;
			
 
				-
			
 
				-	uint32_t nx = dst_vector->nx;
			
 
				-	size_t elemsize = dst_vector->elemsize;
			
 
				-
			
 
				-	uintptr_t ptr_src = src_vector->ptr;
			
 
				-	uintptr_t ptr_dst = dst_vector->ptr;
			
 
				-
			
 
				-	memcpy((void *)ptr_dst, (void *)ptr_src, nx*elemsize);
			
 
				-
			
 
				-	_STARPU_TRACE_DATA_COPY(src_node, dst_node, nx*elemsize);
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
--- a/src/datawizard/interfaces/void_interface.c
+++ b/src/datawizard/interfaces/void_interface.c
@@ -25,36 +25,11 @@
 
				 #include <starpu_opencl.h>
			
 
				 #include <drivers/opencl/driver_opencl.h>
			
 
				 
			
 
				-static int dummy_copy(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-static int dummy_cuda_copy_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-static int dummy_opencl_copy_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event);
			
 
				-#endif
			
 
				+static int dummy_copy(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
			
 
				 
			
 
				 static struct starpu_data_copy_methods void_copy_data_methods_s =
			
 
				 {
			
 
				-	.ram_to_ram = dummy_copy,
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-	.ram_to_cuda = dummy_copy,
			
 
				-	.cuda_to_ram = dummy_copy,
			
 
				-	.cuda_to_cuda = dummy_copy,
			
 
				-	.ram_to_cuda_async = dummy_cuda_copy_async,
			
 
				-	.cuda_to_ram_async = dummy_cuda_copy_async,
			
 
				-	.cuda_to_cuda_async = dummy_cuda_copy_async,
			
 
				-#else
			
 
				-#ifdef STARPU_SIMGRID
			
 
				-	/* Enable GPU-GPU transfers in simgrid */
			
 
				-	.cuda_to_cuda_async = 1,
			
 
				-#endif
			
 
				-#endif
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-	.ram_to_opencl = dummy_copy,
			
 
				-	.opencl_to_ram = dummy_copy,
			
 
				-        .ram_to_opencl_async = dummy_opencl_copy_async,
			
 
				-	.opencl_to_ram_async = dummy_opencl_copy_async,
			
 
				-#endif
			
 
				+	.any_to_any = dummy_copy,
			
 
				 };
			
 
				 
			
 
				 static void register_void_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
			
@@ -139,25 +114,3 @@ static int dummy_copy(void *src_interface STARPU_ATTRIBUTE_UNUSED,
 
				 {
			
 
				 	return 0;
			
 
				 }
			
 
				-
			
 
				-#ifdef STARPU_USE_CUDA
			
 
				-static int dummy_cuda_copy_async(void *src_interface STARPU_ATTRIBUTE_UNUSED,
			
 
				-				unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				-				void *dst_interface STARPU_ATTRIBUTE_UNUSED,
			
 
				-				unsigned dst_node STARPU_ATTRIBUTE_UNUSED,
			
 
				-				cudaStream_t stream __attribute__ ((unused)))
			
 
				-{
			
 
				-	return 0;
			
 
				-}
			
 
				-#endif // STARPU_USE_CUDA
			
 
				-
			
 
				-#ifdef STARPU_USE_OPENCL
			
 
				-static int dummy_opencl_copy_async(void *src_interface STARPU_ATTRIBUTE_UNUSED,
			
 
				-					unsigned src_node STARPU_ATTRIBUTE_UNUSED,
			
 
				-					void *dst_interface STARPU_ATTRIBUTE_UNUSED,
			
 
				-					unsigned dst_node STARPU_ATTRIBUTE_UNUSED,
			
 
				-					cl_event *event STARPU_ATTRIBUTE_UNUSED)
			
 
				-{
			
 
				-	return 0;
			
 
				-}
			
 
				-#endif // STARPU_USE_OPENCL
			
--- a/tests/datawizard/interfaces/test_interfaces.c
+++ b/tests/datawizard/interfaces/test_interfaces.c
@@ -550,7 +550,10 @@ ram_to_ram(void)
 
				 	/* We do not care about the nodes */
			
 
				 	src_interface = starpu_data_get_interface_on_node(src, 0);
			
 
				 	dst_interface = starpu_data_get_interface_on_node(dst, 0);
			
 
				-	src->ops->copy_methods->ram_to_ram(src_interface, 0, dst_interface, 0);
			
 
				+	if (src->ops->copy_methods->ram_to_ram)
			
 
				+		src->ops->copy_methods->ram_to_ram(src_interface, 0, dst_interface, 0);
			
 
				+	else
			
 
				+		src->ops->copy_methods->any_to_any(src_interface, 0, dst_interface, 0, NULL);
			
 
				 
			
 
				 	err = create_task(&task, STARPU_CPU_WORKER, -1);
			
 
				 	if (err != 0)