Переглянути джерело

src: updates for node operations:
- move driver functions to appropriate driver files
- specify copy_interface functions with 2 levels

Nathalie Furmento 6 роки тому
батько
коміт
d65545f9b3

+ 6 - 5
src/datawizard/copy_driver.c

@@ -300,13 +300,14 @@ int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, u
 {
 	struct _starpu_async_channel *async_channel = async_data;
 	enum starpu_node_kind src_kind = starpu_node_get_kind(src_node);
+	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);
 
-	if (_node_ops[src_kind].copy_interface)
+	if (_node_ops[src_kind].copy_interface_to[dst_kind])
 	{
-		return _node_ops[src_kind].copy_interface(src, src_offset, src_node,
-							  dst, dst_offset, dst_node,
-							  size,
-							  async_channel);
+		return _node_ops[src_kind].copy_interface_to[dst_kind](src, src_offset, src_node,
+								       dst, dst_offset, dst_node,
+								       size,
+								       async_channel);
 	}
 	else
 	{

+ 39 - 27
src/datawizard/node_ops.c

@@ -20,10 +20,11 @@
 #include <datawizard/node_ops.h>
 #include <drivers/cpu/driver_cpu.h>
 #include <drivers/cuda/driver_cuda.h>
-#include <drivers/mpi/driver_mpi_sink.h>
-#include <drivers/mpi/driver_mpi_source.h>
+#include <drivers/opencl/driver_opencl.h>
 #include <drivers/mpi/driver_mpi_common.h>
+#include <drivers/mpi/driver_mpi_source.h>
 #include <drivers/mic/driver_mic_source.h>
+#include <drivers/scc/driver_scc_source.h>
 #include <drivers/disk/driver_disk.h>
 
 struct _starpu_node_ops _node_ops[STARPU_MPI_MS_RAM+1];
@@ -34,14 +35,8 @@ void _starpu_node_ops_init()
 
 	// CPU
 	// CPU_RAM does not define wait_event operation
-	_node_ops[STARPU_CPU_RAM].copy_data_to[STARPU_CPU_RAM] = _starpu_cpu_copy_data_to_cpu;
-	_node_ops[STARPU_CPU_RAM].copy_data_to[STARPU_CUDA_RAM] = _starpu_cpu_copy_data_to_cuda;
-	_node_ops[STARPU_CPU_RAM].copy_data_to[STARPU_OPENCL_RAM] = _starpu_cpu_copy_data_to_opencl;
-	_node_ops[STARPU_CPU_RAM].copy_data_to[STARPU_DISK_RAM] = _starpu_cpu_copy_data_to_disk;
-	_node_ops[STARPU_CPU_RAM].copy_data_to[STARPU_MPI_MS_RAM] = _starpu_cpu_copy_data_to_mpi_ms;
-	_node_ops[STARPU_CPU_RAM].copy_data_to[STARPU_SCC_RAM] = _starpu_cpu_copy_data_to_scc;
-	_node_ops[STARPU_CPU_RAM].copy_data_to[STARPU_MIC_RAM] = _starpu_cpu_copy_data_to_mic;
-	_node_ops[STARPU_CPU_RAM].copy_interface = _starpu_cpu_copy_interface;
+	_node_ops[STARPU_CPU_RAM].copy_data_to[STARPU_CPU_RAM] = _starpu_cpu_copy_data;
+	_node_ops[STARPU_CPU_RAM].copy_interface_to[STARPU_CPU_RAM] = _starpu_cpu_copy_interface;
 	_node_ops[STARPU_CPU_RAM].direct_access_supported = _starpu_cpu_direct_access_supported;
 	_node_ops[STARPU_CPU_RAM].malloc_on_node = _starpu_cpu_malloc_on_node;
 	_node_ops[STARPU_CPU_RAM].free_on_node = _starpu_cpu_free_on_node;
@@ -49,9 +44,12 @@ void _starpu_node_ops_init()
 #ifdef STARPU_USE_CUDA
 	_node_ops[STARPU_CUDA_RAM].wait_request_completion = _starpu_cuda_wait_request_completion;
 	_node_ops[STARPU_CUDA_RAM].test_request_completion = _starpu_cuda_test_request_completion;
-	_node_ops[STARPU_CUDA_RAM].copy_data_to[STARPU_CUDA_RAM] = _starpu_cuda_copy_data_to_cuda;
-	_node_ops[STARPU_CUDA_RAM].copy_data_to[STARPU_CPU_RAM] = _starpu_cuda_copy_data_to_cpu;
-	_node_ops[STARPU_CUDA_RAM].copy_interface = _starpu_cuda_copy_interface;
+	_node_ops[STARPU_CUDA_RAM].copy_data_to[STARPU_CUDA_RAM] = _starpu_cuda_copy_data_from_cuda_to_cuda;
+	_node_ops[STARPU_CUDA_RAM].copy_data_to[STARPU_CPU_RAM] = _starpu_cuda_copy_data_from_cuda_to_cpu;
+	_node_ops[STARPU_CPU_RAM].copy_data_to[STARPU_CUDA_RAM] = _starpu_cuda_copy_data_from_cpu_to_cuda;
+	_node_ops[STARPU_CUDA_RAM].copy_interface_to[STARPU_CUDA_RAM] = _starpu_cuda_copy_interface_from_cuda_to_cuda;
+	_node_ops[STARPU_CUDA_RAM].copy_interface_to[STARPU_CPU_RAM] = _starpu_cuda_copy_interface_from_cuda_to_cpu;
+	_node_ops[STARPU_CPU_RAM].copy_interface_to[STARPU_CUDA_RAM] = _starpu_cuda_copy_interface_from_cpu_to_cuda;
 	_node_ops[STARPU_CUDA_RAM].direct_access_supported = _starpu_cuda_direct_access_supported;
 	_node_ops[STARPU_CUDA_RAM].malloc_on_node = _starpu_cuda_malloc_on_node;
 	_node_ops[STARPU_CUDA_RAM].free_on_node = _starpu_cuda_free_on_node;
@@ -60,9 +58,12 @@ void _starpu_node_ops_init()
 #ifdef STARPU_USE_OPENCL
 	_node_ops[STARPU_OPENCL_RAM].wait_request_completion = _starpu_opencl_wait_request_completion;
 	_node_ops[STARPU_OPENCL_RAM].test_request_completion = _starpu_opencl_test_request_completion;
-	_node_ops[STARPU_OPENCL_RAM].copy_data_to[STARPU_OPENCL_RAM] = _starpu_opencl_copy_data_to_opencl;
-	_node_ops[STARPU_OPENCL_RAM].copy_data_to[STARPU_CPU_RAM] = _starpu_opencl_copy_data_to_cpu;
-	_node_ops[STARPU_OPENCL_RAM].copy_interface = _starpu_opencl_copy_interface;
+	_node_ops[STARPU_OPENCL_RAM].copy_data_to[STARPU_OPENCL_RAM] = _starpu_opencl_copy_data_from_opencl_to_opencl;
+	_node_ops[STARPU_OPENCL_RAM].copy_data_to[STARPU_CPU_RAM] = _starpu_opencl_copy_data_from_opencl_to_cpu;
+	_node_ops[STARPU_CPU_RAM].copy_data_to[STARPU_OPENCL_RAM] = _starpu_opencl_copy_data_from_cpu_to_opencl;
+	_node_ops[STARPU_OPENCL_RAM].copy_interface_to[STARPU_OPENCL_RAM] = _starpu_opencl_copy_interface_from_opencl_to_opencl;
+	_node_ops[STARPU_OPENCL_RAM].copy_interface_to[STARPU_CPU_RAM] = _starpu_opencl_copy_interface_from_opencl_to_cpu;
+	_node_ops[STARPU_CPU_RAM].copy_interface_to[STARPU_OPENCL_RAM] = _starpu_opencl_copy_interface_from_cpu_to_opencl;
 	_node_ops[STARPU_OPENCL_RAM].direct_access_supported = _starpu_opencl_direct_access_supported;
 	_node_ops[STARPU_OPENCL_RAM].malloc_on_node = _starpu_opencl_malloc_on_node;
 	_node_ops[STARPU_OPENCL_RAM].free_on_node = _starpu_opencl_free_on_node;
@@ -71,9 +72,11 @@ void _starpu_node_ops_init()
 #ifdef STARPU_USE_MIC
 	_node_ops[STARPU_MIC_RAM].wait_request_completion = _starpu_mic_wait_request_completion;
 	_node_ops[STARPU_MIC_RAM].test_request_completion = _starpu_mic_test_request_completion;
-	_node_ops[STARPU_MIC_RAM].copy_data_to[STARPU_CPU_RAM] = _starpu_mic_copy_data_to_cpu;
 	/* TODO: MIC -> MIC */
-	_node_ops[STARPU_MIC_RAM].copy_interface = _starpu_mic_copy_interface;
+	_node_ops[STARPU_MIC_RAM].copy_data_to[STARPU_CPU_RAM] = _starpu_mic_copy_data_from_mic_to_cpu;
+	_node_ops[STARPU_CPU_RAM].copy_data_to[STARPU_MIC_RAM] = _starpu_mic_copy_data_from_cpu_to_mic;
+	_node_ops[STARPU_MIC_RAM].copy_interface_to[STARPU_CPU_RAM] = _starpu_mic_copy_interface_from_mic_to_cpu;
+	_node_ops[STARPU_CPU_RAM].copy_interface_to[STARPU_MIC_RAM] = _starpu_mic_copy_interface_from_cpu_to_mic;
 	_node_ops[STARPU_MIC_RAM].direct_access_supported = _starpu_mic_direct_access_supported;
 	_node_ops[STARPU_MIC_RAM].malloc_on_node = _starpu_mic_malloc_on_node;
 	_node_ops[STARPU_MIC_RAM].free_on_node = _starpu_mic_free_on_node;
@@ -82,9 +85,12 @@ void _starpu_node_ops_init()
 #ifdef STARPU_USE_MPI_MASTER_SLAVE
 	_node_ops[STARPU_MPI_MS_RAM].wait_request_completion = _starpu_mpi_common_wait_request_completion;
 	_node_ops[STARPU_MPI_MS_RAM].test_request_completion = _starpu_mpi_common_test_event;
-	_node_ops[STARPU_MPI_MS_RAM].copy_data_to[STARPU_CPU_RAM] = _starpu_mpi_common_copy_data_to_cpu;
-	_node_ops[STARPU_MPI_MS_RAM].copy_data_to[STARPU_MPI_MS_RAM] = _starpu_mpi_common_copy_data_to_mpi;
-	_node_ops[STARPU_MPI_MS_RAM].copy_interface = _starpu_mpi_copy_interface;
+	_node_ops[STARPU_MPI_MS_RAM].copy_data_to[STARPU_CPU_RAM] = _starpu_mpi_copy_data_from_mpi_to_cpu;
+	_node_ops[STARPU_MPI_MS_RAM].copy_data_to[STARPU_MPI_MS_RAM] = _starpu_mpi_copy_data_from_mpi_to_mpi;
+	_node_ops[STARPU_CPU_RAM].copy_data_to[STARPU_MPI_MS_RAM] = _starpu_mpi_copy_data_from_cpu_to_mpi;
+	_node_ops[STARPU_MPI_MS_RAM].copy_interface_to[STARPU_MPI_MS_RAM] = _starpu_mpi_copy_interface_from_mpi_to_mpi;
+	_node_ops[STARPU_MPI_MS_RAM].copy_interface_to[STARPU_CPU_RAM] = _starpu_mpi_copy_interface_from_mpi_to_cpu;
+	_node_ops[STARPU_CPU_RAM].copy_interface_to[STARPU_MPI_MS_RAM] = _starpu_mpi_copy_interface_from_cpu_to_mpi;
 	_node_ops[STARPU_MPI_MS_RAM].direct_access_supported = _starpu_mpi_direct_access_supported;
 	_node_ops[STARPU_MPI_MS_RAM].malloc_on_node = _starpu_mpi_malloc_on_node;
 	_node_ops[STARPU_MPI_MS_RAM].free_on_node = _starpu_mpi_free_on_node;
@@ -92,17 +98,23 @@ void _starpu_node_ops_init()
 
 	_node_ops[STARPU_DISK_RAM].wait_request_completion = _starpu_disk_wait_request_completion;
 	_node_ops[STARPU_DISK_RAM].test_request_completion = _starpu_disk_test_request_completion;
-	_node_ops[STARPU_DISK_RAM].copy_data_to[STARPU_CPU_RAM] = _starpu_disk_copy_data_to_cpu;
-	_node_ops[STARPU_DISK_RAM].copy_data_to[STARPU_DISK_RAM] = _starpu_disk_copy_data_to_disk;
-	_node_ops[STARPU_DISK_RAM].copy_interface = _starpu_disk_copy_interface;
+	_node_ops[STARPU_DISK_RAM].copy_data_to[STARPU_CPU_RAM] = _starpu_disk_copy_data_from_disk_to_cpu;
+	_node_ops[STARPU_DISK_RAM].copy_data_to[STARPU_DISK_RAM] = _starpu_disk_copy_data_from_disk_to_disk;
+	_node_ops[STARPU_CPU_RAM].copy_data_to[STARPU_DISK_RAM] = _starpu_disk_copy_data_from_cpu_to_disk;
+	_node_ops[STARPU_DISK_RAM].copy_interface_to[STARPU_DISK_RAM] = _starpu_disk_copy_interface_from_disk_to_disk;
+	_node_ops[STARPU_DISK_RAM].copy_interface_to[STARPU_CPU_RAM] = _starpu_disk_copy_interface_from_disk_to_cpu;
+	_node_ops[STARPU_CPU_RAM].copy_interface_to[STARPU_DISK_RAM] = _starpu_disk_copy_interface_from_cpu_to_disk;
 	_node_ops[STARPU_DISK_RAM].direct_access_supported = _starpu_disk_direct_access_supported;
 	_node_ops[STARPU_DISK_RAM].malloc_on_node = _starpu_disk_malloc_on_node;
 	_node_ops[STARPU_DISK_RAM].free_on_node = _starpu_disk_free_on_node;
 
 #ifdef STARPU_USE_SCC
-	_node_ops[STARPU_SCC_RAM].copy_data_to[STARPU_CPU_RAM] = _starpu_scc_common_copy_data_to_cpu;
-	_node_ops[STARPU_SCC_RAM].copy_data_to[STARPU_SCC_RAM] = _starpu_scc_common_copy_data_to_scc;
-	_node_ops[STARPU_SCC_RAM].copy_interface = _starpu_scc_copy_interface;
+	_node_ops[STARPU_SCC_RAM].copy_data_to[STARPU_CPU_RAM] = _starpu_scc_copy_data_from_scc_to_cpu;
+	_node_ops[STARPU_SCC_RAM].copy_data_to[STARPU_SCC_RAM] = _starpu_scc_copy_data_from_scc_to_scc;
+	_node_ops[STARPU_CPU_RAM].copy_data_to[STARPU_SCC_RAM] = _starpu_scc_copy_data_from_cpu_to_scc;
+	_node_ops[STARPU_SCC_RAM].copy_interface_to[STARPU_SCC_RAM] = _starpu_scc_copy_interface_from_scc_to_scc;
+	_node_ops[STARPU_SCC_RAM].copy_interface_to[STARPU_CPU_RAM] = _starpu_scc_copy_interface_from_scc_to_cpu;
+	_node_ops[STARPU_CPU_RAM].copy_interface_to[STARPU_SCC_RAM] = _starpu_scc_copy_interface_from_cpu_to_scc;
 	_node_ops[STARPU_SCC_RAM].direct_access_supported = _starpu_scc_direct_access_supported;
 	_node_ops[STARPU_SCC_RAM].malloc_on_node = _starpu_scc_malloc_on_node;
 	_node_ops[STARPU_SCC_RAM].free_on_node = _starpu_scc_free_on_node;

+ 1 - 1
src/datawizard/node_ops.h

@@ -34,7 +34,7 @@ typedef int (*copy_interface_t)(uintptr_t src_ptr, size_t src_offset, unsigned s
 struct _starpu_node_ops
 {
 	copy_data_func_t copy_data_to[STARPU_MPI_MS_RAM+1];
-	copy_interface_t copy_interface;
+	copy_interface_t copy_interface_to[STARPU_MPI_MS_RAM+1];
 	void (*wait_request_completion)(struct _starpu_async_channel *async_channel);
 	unsigned (*test_request_completion)(struct _starpu_async_channel *async_channel);
 	int (*direct_access_supported)(unsigned node, unsigned handling_node);

+ 7 - 286
src/drivers/cpu/driver_cpu.c

@@ -429,173 +429,7 @@ int _starpu_cpu_driver_run(struct _starpu_worker *worker)
 	return 0;
 }
 
-int _starpu_cpu_copy_data_to_opencl(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
-{
-	int src_kind = starpu_node_get_kind(src_node);
-	int dst_kind = starpu_node_get_kind(dst_node);
-	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_OPENCL_RAM);
-
-	int ret = 1;
-
-#ifdef STARPU_USE_OPENCL
-	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
-	/* STARPU_CPU_RAM -> STARPU_OPENCL_RAM */
-	STARPU_ASSERT(starpu_worker_get_local_memory_node() == dst_node);
-	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() || !(copy_methods->ram_to_opencl_async || copy_methods->any_to_any))
-	{
-		STARPU_ASSERT(copy_methods->ram_to_opencl || copy_methods->any_to_any);
-		/* this is not associated to a request so it's synchronous */
-		if (copy_methods->ram_to_opencl)
-			copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
-		else
-			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-	}
-	else
-	{
-		req->async_channel.type = STARPU_OPENCL_RAM;
-		if (copy_methods->ram_to_opencl_async)
-			ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
-		else
-		{
-			STARPU_ASSERT(copy_methods->any_to_any);
-			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-		}
-	}
-#endif
-	return ret;
-}
-
-int _starpu_cpu_copy_data_to_cuda(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
-{
-	int src_kind = starpu_node_get_kind(src_node);
-	int dst_kind = starpu_node_get_kind(dst_node);
-	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_CUDA_RAM);
-
-	int ret = 1;
-
-#ifdef STARPU_USE_CUDA
-	cudaError_t cures;
-	cudaStream_t stream;
-	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
-
-	/* STARPU_CPU_RAM -> CUBLAS_RAM */
-	/* only the proper CUBLAS thread can initiate this ! */
-#if !defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
-	STARPU_ASSERT(starpu_worker_get_local_memory_node() == dst_node);
-#endif
-	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
-	    !(copy_methods->ram_to_cuda_async || copy_methods->any_to_any))
-	{
-		/* this is not associated to a request so it's synchronous */
-		STARPU_ASSERT(copy_methods->ram_to_cuda || copy_methods->any_to_any);
-		if (copy_methods->ram_to_cuda)
-			copy_methods->ram_to_cuda(src_interface, src_node, dst_interface, dst_node);
-		else
-			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-	}
-	else
-	{
-		req->async_channel.type = STARPU_CUDA_RAM;
-		cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
-		if (STARPU_UNLIKELY(cures != cudaSuccess))
-			STARPU_CUDA_REPORT_ERROR(cures);
-
-		stream = starpu_cuda_get_in_transfer_stream(dst_node);
-		if (copy_methods->ram_to_cuda_async)
-			ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
-		else
-		{
-			STARPU_ASSERT(copy_methods->any_to_any);
-			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-		}
-
-		cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
-		if (STARPU_UNLIKELY(cures != cudaSuccess))
-			STARPU_CUDA_REPORT_ERROR(cures);
-	}
-#endif
-	return ret;
-}
-
-int _starpu_cpu_copy_data_to_mic(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
-{
-	int src_kind = starpu_node_get_kind(src_node);
-	int dst_kind = starpu_node_get_kind(dst_node);
-	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_MIC_RAM);
-
-	int ret = 1;
-
-#ifdef STARPU_USE_MIC
-	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
-	/* RAM -> MIC */
-	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mic_copy_disabled() || !(copy_methods->ram_to_mic_async || copy_methods->any_to_any))
-	{
-		/* this is not associated to a request so it's synchronous */
-		STARPU_ASSERT(copy_methods->ram_to_mic || copy_methods->any_to_any);
-		if (copy_methods->ram_to_mic)
-			copy_methods->ram_to_mic(src_interface, src_node, dst_interface, dst_node);
-		else
-			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-	}
-	else
-	{
-		req->async_channel.type = STARPU_MIC_RAM;
-		if (copy_methods->ram_to_mic_async)
-			ret = copy_methods->ram_to_mic_async(src_interface, src_node, dst_interface, dst_node);
-		else
-		{
-			STARPU_ASSERT(copy_methods->any_to_any);
-			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-		}
-		_starpu_mic_init_event(&(req->async_channel.event.mic_event), dst_node);
-	}
-#endif
-	return ret;
-}
-
-int _starpu_cpu_copy_data_to_disk(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
-{
-	int src_kind = starpu_node_get_kind(src_node);
-	int dst_kind = starpu_node_get_kind(dst_node);
-	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_DISK_RAM);
-
-	int ret = 0;
-	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
-
-	if (req && !starpu_asynchronous_copy_disabled())
-	{
-		req->async_channel.type = STARPU_DISK_RAM;
-		req->async_channel.event.disk_event.requests = NULL;
-		req->async_channel.event.disk_event.ptr = NULL;
-		req->async_channel.event.disk_event.handle = NULL;
-	}
-	if(copy_methods->any_to_any)
-		ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);
-	else
-	{
-		void *obj = starpu_data_handle_to_pointer(handle, dst_node);
-		void * ptr = NULL;
-		starpu_ssize_t size = 0;
-		handle->ops->pack_data(handle, src_node, &ptr, &size);
-		ret = _starpu_disk_full_write(src_node, dst_node, obj, ptr, size, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);
-		if (ret == 0)
-		{
-			/* write is already finished, ptr was allocated in pack_data */
-			_starpu_free_flags_on_node(src_node, ptr, size, 0);
-		}
-		else if (ret == -EAGAIN)
-		{
-			STARPU_ASSERT(req);
-			req->async_channel.event.disk_event.ptr = ptr;
-			req->async_channel.event.disk_event.node = src_node;
-			req->async_channel.event.disk_event.size = size;
-		}
-		STARPU_ASSERT(ret == 0 || ret == -EAGAIN);
-	}
-	return ret;
-}
-
-int _starpu_cpu_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+int _starpu_cpu_copy_data(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
 {
 	int src_kind = starpu_node_get_kind(src_node);
 	int dst_kind = starpu_node_get_kind(dst_node);
@@ -610,135 +444,22 @@ int _starpu_cpu_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interfac
 	return ret;
 }
 
-int _starpu_cpu_copy_data_to_mpi_ms(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
-{
-	int src_kind = starpu_node_get_kind(src_node);
-	int dst_kind = starpu_node_get_kind(dst_node);
-	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_MPI_MS_RAM);
-
-	int ret = 0;
-	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
-
-	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mpi_ms_copy_disabled() || !(copy_methods->ram_to_mpi_ms_async || copy_methods->any_to_any))
-	{
-		/* this is not associated to a request so it's synchronous */
-		STARPU_ASSERT(copy_methods->ram_to_mpi_ms || copy_methods->any_to_any);
-		if (copy_methods->ram_to_mpi_ms)
-			copy_methods->ram_to_mpi_ms(src_interface, src_node, dst_interface, dst_node);
-		else
-			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-	}
-	else
-	{
-		req->async_channel.type = STARPU_MPI_MS_RAM;
-		if(copy_methods->ram_to_mpi_ms_async)
-			ret = copy_methods->ram_to_mpi_ms_async(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-		else
-		{
-			STARPU_ASSERT(copy_methods->any_to_any);
-			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-		}
-	}
-	return ret;
-}
-
-int _starpu_cpu_copy_data_to_scc(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
-{
-	int src_kind = starpu_node_get_kind(src_node);
-	int dst_kind = starpu_node_get_kind(dst_node);
-	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_SCC_RAM);
-
-	int ret = 0;
-	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
-	if (copy_methods->scc_src_to_sink)
-		copy_methods->scc_src_to_sink(src_interface, src_node, dst_interface, dst_node);
-	else
-		copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-	return ret;
-}
-
 int _starpu_cpu_copy_interface(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
 {
 	int src_kind = starpu_node_get_kind(src_node);
-	STARPU_ASSERT(src_kind == STARPU_CPU_RAM);
-
 	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_CPU_RAM);
 
-	if (dst_kind == STARPU_CPU_RAM)
-	{
-		memcpy((void *) (dst + dst_offset), (void *) (src + src_offset), size);
-		return 0;
-	}
-#ifdef STARPU_USE_CUDA
-	else if (dst_kind == STARPU_CUDA_RAM)
-	{
-		return starpu_cuda_copy_async_sync((void*) (src + src_offset), src_node,
-						   (void*) (dst + dst_offset), dst_node,
-						   size,
-						   async_channel?starpu_cuda_get_in_transfer_stream(dst_node):NULL,
-						   cudaMemcpyHostToDevice);
-	}
-#endif
-#ifdef STARPU_USE_OPENCL
-	else if (dst_kind == STARPU_OPENCL_RAM)
-	{
-		return starpu_opencl_copy_async_sync(src, src_offset, src_node,
-						     dst, dst_offset, dst_node,
-						     size,
-						     &async_channel->event.opencl_event);
-
-	}
-#endif
-#ifdef STARPU_USE_MIC
-	else if (dst_kind == STARPU_MIC_RAM)
-	{
-		if (async_channel)
-			return _starpu_mic_copy_ram_to_mic_async((void*) (src + src_offset), src_node,
-								 (void*) (dst + dst_offset), dst_node,
-								 size);
-		else
-			return _starpu_mic_copy_ram_to_mic((void*) (src + src_offset), src_node,
-							   (void*) (dst + dst_offset), dst_node,
-							   size);
+	(void) async_channel;
 
-	}
-#endif
-#ifdef STARPU_USE_SCC
-	else if (dst_kind == STARPU_MIC_RAM)
-	{
-		return _starpu_scc_copy_src_to_sink((void*) (src + src_offset), src_node,
-						    (void*) (dst + dst_offset), dst_node,
-						    size);
-	}
-#endif
-#ifdef STARPU_USE_MPI_MASTER_SLAVE
-	else if (dst_kind == STARPU_MPI_MS_RAM)
-	{
-                if (async_channel)
-                        return _starpu_mpi_copy_ram_to_mpi_async((void*) (src + src_offset), src_node,
-								 (void*) (dst + dst_offset), dst_node,
-								 size, async_channel);
-                else
-                        return _starpu_mpi_copy_ram_to_mpi_sync((void*) (src + src_offset), src_node,
-								(void*) (dst + dst_offset), dst_node,
-								size);
-	}
-#endif
-	else if (dst_kind == STARPU_DISK_RAM)
-	{
-		return _starpu_disk_copy_src_to_disk((void*) (src + src_offset), src_node,
-						     (void*) dst, dst_offset, dst_node,
-						     size, async_channel);
-	}
-	else
-	{
-		STARPU_ABORT();
-		return -1;
-	}
+	memcpy((void *) (dst + dst_offset), (void *) (src + src_offset), size);
+	return 0;
 }
 
 int _starpu_cpu_direct_access_supported(unsigned node, unsigned handling_node)
 {
+	(void) node;
+	(void) handling_node;
 	return 1;
 }
 

+ 2 - 26
src/drivers/cpu/driver_cpu.h

@@ -26,34 +26,10 @@
 extern struct _starpu_driver_ops _starpu_driver_cpu_ops;
 void *_starpu_cpu_worker(void *);
 
-int _starpu_cpu_copy_data_to_cuda(starpu_data_handle_t handle, void *src_interface, unsigned src_node,
-				  void *dst_interface, unsigned dst_node,
-				  struct _starpu_data_request *req);
-int _starpu_cpu_copy_data_to_opencl(starpu_data_handle_t handle, void *src_interface, unsigned src_node,
-				    void *dst_interface, unsigned dst_node,
-				    struct _starpu_data_request *req);
-int _starpu_cpu_copy_data_to_mic(starpu_data_handle_t handle, void *src_interface, unsigned src_node,
-				 void *dst_interface, unsigned dst_node,
-				 struct _starpu_data_request *req);
-int _starpu_cpu_copy_data_to_disk(starpu_data_handle_t handle, void *src_interface, unsigned src_node,
-				  void *dst_interface, unsigned dst_node,
-				  struct _starpu_data_request *req);
-int _starpu_cpu_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node,
-				 void *dst_interface, unsigned dst_node,
-				 struct _starpu_data_request *req);
-int _starpu_cpu_copy_data_to_mpi_ms(starpu_data_handle_t handle, void *src_interface, unsigned src_node,
-				    void *dst_interface, unsigned dst_node,
-				    struct _starpu_data_request *req);
-int _starpu_cpu_copy_data_to_scc(starpu_data_handle_t handle, void *src_interface, unsigned src_node,
-				 void *dst_interface, unsigned dst_node,
-				 struct _starpu_data_request *req);
-
-int _starpu_cpu_copy_interface(uintptr_t src_ptr, size_t src_offset, unsigned src_node,
-			       uintptr_t dst_ptr, size_t dst_offset, unsigned dst_node,
-			       size_t ssize, struct _starpu_async_channel *async_channel);
+int _starpu_cpu_copy_data(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_cpu_copy_interface(uintptr_t src_ptr, size_t src_offset, unsigned src_node, uintptr_t dst_ptr, size_t dst_offset, unsigned dst_node, size_t ssize, struct _starpu_async_channel *async_channel);
 
 int _starpu_cpu_direct_access_supported(unsigned node, unsigned handling_node);
-
 uintptr_t _starpu_cpu_malloc_on_node(unsigned dst_node, size_t size, int flags);
 void _starpu_cpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags);
 

+ 92 - 25
src/drivers/cuda/driver_cuda.c

@@ -1214,6 +1214,7 @@ int _starpu_cuda_driver_deinit_from_worker(struct _starpu_worker *worker)
 	return _starpu_cuda_driver_deinit(worker->set);
 }
 
+#ifdef STARPU_USE_CUDA
 unsigned _starpu_cuda_test_request_completion(struct _starpu_async_channel *async_channel)
 {
 	cudaEvent_t event;
@@ -1248,14 +1249,13 @@ void _starpu_cuda_wait_request_completion(struct _starpu_async_channel *async_ch
 		STARPU_CUDA_REPORT_ERROR(cures);
 }
 
-int _starpu_cuda_copy_data_to_cuda(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+int _starpu_cuda_copy_data_from_cuda_to_cuda(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
 {
 	int src_kind = starpu_node_get_kind(src_node);
 	int dst_kind = starpu_node_get_kind(dst_node);
 	STARPU_ASSERT(src_kind == STARPU_CUDA_RAM && dst_kind == STARPU_CUDA_RAM);
 
 	int ret = 1;
-#ifdef STARPU_USE_CUDA
 	cudaError_t cures;
 	cudaStream_t stream;
 	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
@@ -1287,18 +1287,16 @@ int _starpu_cuda_copy_data_to_cuda(starpu_data_handle_t handle, void *src_interf
 		cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
 		if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 	}
-#endif
 	return ret;
 }
 
-int _starpu_cuda_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+int _starpu_cuda_copy_data_from_cuda_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
 {
 	int src_kind = starpu_node_get_kind(src_node);
 	int dst_kind = starpu_node_get_kind(dst_node);
 	STARPU_ASSERT(src_kind == STARPU_CUDA_RAM && dst_kind == STARPU_CPU_RAM);
 
 	int ret = 1;
-#ifdef STARPU_USE_CUDA
 	cudaError_t cures;
 	cudaStream_t stream;
 	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
@@ -1334,38 +1332,98 @@ int _starpu_cuda_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interfa
 		cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
 		if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
 	}
-#endif
 	return ret;
 }
 
-int _starpu_cuda_copy_interface(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+int _starpu_cuda_copy_data_from_cpu_to_cuda(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
 {
 	int src_kind = starpu_node_get_kind(src_node);
-	STARPU_ASSERT(src_kind == STARPU_CUDA_RAM);
-
 	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_CUDA_RAM);
 
-	if (dst_kind == STARPU_CPU_RAM)
-	{
-		return starpu_cuda_copy_async_sync((void*) (src + src_offset), src_node,
-						   (void*) (dst + dst_offset), dst_node,
-						   size,
-						   async_channel?starpu_cuda_get_out_transfer_stream(src_node):NULL,
-						   cudaMemcpyDeviceToHost);
-	}
-	else if (dst_kind == STARPU_CUDA_RAM)
+	int ret = 1;
+	cudaError_t cures;
+	cudaStream_t stream;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+
+	/* STARPU_CPU_RAM -> CUBLAS_RAM */
+	/* only the proper CUBLAS thread can initiate this ! */
+#if !defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
+	STARPU_ASSERT(starpu_worker_get_local_memory_node() == dst_node);
+#endif
+	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
+	    !(copy_methods->ram_to_cuda_async || copy_methods->any_to_any))
 	{
-		return starpu_cuda_copy_async_sync((void*) (src + src_offset), src_node,
-						   (void*) (dst + dst_offset), dst_node,
-						   size,
-						   async_channel?starpu_cuda_get_peer_transfer_stream(src_node, dst_node):NULL,
-						   cudaMemcpyDeviceToDevice);
+		/* this is not associated to a request so it's synchronous */
+		STARPU_ASSERT(copy_methods->ram_to_cuda || copy_methods->any_to_any);
+		if (copy_methods->ram_to_cuda)
+			copy_methods->ram_to_cuda(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
 	}
 	else
 	{
-		STARPU_ABORT();
-		return -1;
+		req->async_channel.type = STARPU_CUDA_RAM;
+		cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
+		if (STARPU_UNLIKELY(cures != cudaSuccess))
+			STARPU_CUDA_REPORT_ERROR(cures);
+
+		stream = starpu_cuda_get_in_transfer_stream(dst_node);
+		if (copy_methods->ram_to_cuda_async)
+			ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
+		else
+		{
+			STARPU_ASSERT(copy_methods->any_to_any);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		}
+
+		cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
+		if (STARPU_UNLIKELY(cures != cudaSuccess))
+			STARPU_CUDA_REPORT_ERROR(cures);
 	}
+	return ret;
+}
+
+int _starpu_cuda_copy_interface_from_cuda_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+
+	STARPU_ASSERT(src_kind == STARPU_CUDA_RAM && dst_kind == STARPU_CPU_RAM);
+
+	return starpu_cuda_copy_async_sync((void*) (src + src_offset), src_node,
+					   (void*) (dst + dst_offset), dst_node,
+					   size,
+					   async_channel?starpu_cuda_get_out_transfer_stream(src_node):NULL,
+					   cudaMemcpyDeviceToHost);
+}
+
+int _starpu_cuda_copy_interface_from_cuda_to_cuda(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+
+	STARPU_ASSERT(src_kind == STARPU_CUDA_RAM && dst_kind == STARPU_CUDA_RAM);
+
+	return starpu_cuda_copy_async_sync((void*) (src + src_offset), src_node,
+					   (void*) (dst + dst_offset), dst_node,
+					   size,
+					   async_channel?starpu_cuda_get_peer_transfer_stream(src_node, dst_node):NULL,
+					   cudaMemcpyDeviceToDevice);
+}
+
+int _starpu_cuda_copy_interface_from_cpu_to_cuda(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_CUDA_RAM);
+
+	return starpu_cuda_copy_async_sync((void*) (src + src_offset), src_node,
+					   (void*) (dst + dst_offset), dst_node,
+					   size,
+					   async_channel?starpu_cuda_get_in_transfer_stream(dst_node):NULL,
+					   cudaMemcpyHostToDevice);
 }
 
 int _starpu_cuda_direct_access_supported(unsigned node, unsigned handling_node)
@@ -1373,6 +1431,7 @@ int _starpu_cuda_direct_access_supported(unsigned node, unsigned handling_node)
 	/* GPUs not always allow direct remote access: if CUDA4
 	 * is enabled, we allow two CUDA devices to communicate. */
 #ifdef STARPU_SIMGRID
+	(void) node;
 	if (starpu_node_get_kind(handling_node) == STARPU_CUDA_RAM)
 	{
 		msg_host_t host = _starpu_simgrid_get_memnode_host(handling_node);
@@ -1382,10 +1441,13 @@ int _starpu_cuda_direct_access_supported(unsigned node, unsigned handling_node)
 	else
 		return 0;
 #elif defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
+	(void) node;
 	enum starpu_node_kind kind = starpu_node_get_kind(handling_node);
 	return kind == STARPU_CUDA_RAM;
 #else /* STARPU_HAVE_CUDA_MEMCPY_PEER */
 	/* Direct GPU-GPU transfers are not allowed in general */
+	(void) node;
+	(void) handling_node;
 	return 0;
 #endif /* STARPU_HAVE_CUDA_MEMCPY_PEER */
 }
@@ -1393,6 +1455,7 @@ int _starpu_cuda_direct_access_supported(unsigned node, unsigned handling_node)
 uintptr_t _starpu_cuda_malloc_on_node(unsigned dst_node, size_t size, int flags)
 {
 	uintptr_t addr = 0;
+	(void) flags;
 
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 
@@ -1445,6 +1508,9 @@ uintptr_t _starpu_cuda_malloc_on_node(unsigned dst_node, size_t size, int flags)
 
 void _starpu_cuda_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags)
 {
+	(void) size;
+	(void) flags;
+
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 #ifdef STARPU_SIMGRID
 	STARPU_PTHREAD_MUTEX_LOCK(&cuda_alloc_mutex);
@@ -1481,6 +1547,7 @@ void _starpu_cuda_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, i
 #endif /* STARPU_SIMGRID */
 #endif
 }
+#endif
 
 struct _starpu_driver_ops _starpu_driver_cuda_ops =
 {

+ 9 - 3
src/drivers/cuda/driver_cuda.h

@@ -55,9 +55,15 @@ cudaStream_t starpu_cuda_get_peer_transfer_stream(unsigned src_node, unsigned ds
 
 unsigned _starpu_cuda_test_request_completion(struct _starpu_async_channel *async_channel);
 void _starpu_cuda_wait_request_completion(struct _starpu_async_channel *async_channel);
-int _starpu_cuda_copy_data_to_cuda(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
-int _starpu_cuda_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
-int _starpu_cuda_copy_interface(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+
+int _starpu_cuda_copy_data_from_cpu_to_cuda(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_cuda_copy_data_from_cuda_to_cuda(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_cuda_copy_data_from_cuda_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+
+int _starpu_cuda_copy_interface_from_cuda_to_cuda(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_cuda_copy_interface_from_cuda_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_cuda_copy_interface_from_cpu_to_cuda(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+
 int _starpu_cuda_direct_access_supported(unsigned node, unsigned handling_node);
 uintptr_t _starpu_cuda_malloc_on_node(unsigned dst_node, size_t size, int flags);
 void _starpu_cuda_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags);

+ 70 - 17
src/drivers/disk/driver_disk.c

@@ -81,7 +81,7 @@ void _starpu_disk_wait_request_completion(struct _starpu_async_channel *async_ch
 	}
 }
 
-int _starpu_disk_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+int _starpu_disk_copy_data_from_disk_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
 {
 	int src_kind = starpu_node_get_kind(src_node);
 	int dst_kind = starpu_node_get_kind(dst_node);
@@ -124,7 +124,7 @@ int _starpu_disk_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interfa
 	return ret;
 }
 
-int _starpu_disk_copy_data_to_disk(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+int _starpu_disk_copy_data_from_disk_to_disk(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
 {
 	int src_kind = starpu_node_get_kind(src_node);
 	int dst_kind = starpu_node_get_kind(dst_node);
@@ -144,30 +144,81 @@ int _starpu_disk_copy_data_to_disk(starpu_data_handle_t handle, void *src_interf
 	return ret;
 }
 
-int _starpu_disk_copy_interface(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+int _starpu_disk_copy_data_from_cpu_to_disk(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req) /* Copy the data of `handle` from a CPU RAM node to a disk node, via any_to_any when available, else by packing the data and writing it in full; returns the status of whichever path ran */
 {
 	int src_kind = starpu_node_get_kind(src_node);
-	STARPU_ASSERT(src_kind == STARPU_DISK_RAM);
-
 	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_DISK_RAM); /* this routine only handles the CPU RAM -> disk direction */
 
-	if (dst_kind == STARPU_CPU_RAM)
-	{
-		return _starpu_disk_copy_disk_to_src((void*) src, src_offset, src_node,
-						     (void*) (dst + dst_offset), dst_node,
-						     size, async_channel);
-	}
-	else if (dst_kind == STARPU_DISK_RAM)
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+
+	if (req && !starpu_asynchronous_copy_disabled()) /* asynchronous case: tag the channel as a disk transfer and clear its disk event fields */
 	{
-		return _starpu_disk_copy_disk_to_disk((void*) src, src_offset, src_node,
-						      (void*) dst, dst_offset, dst_node,
-						      size, async_channel);
+		req->async_channel.type = STARPU_DISK_RAM;
+		req->async_channel.event.disk_event.requests = NULL;
+		req->async_channel.event.disk_event.ptr = NULL;
+		req->async_channel.event.disk_event.handle = NULL;
 	}
+
+	if(copy_methods->any_to_any)
+		ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL); /* the channel is passed only when a request exists and asynchronous copies are enabled */
 	else
 	{
+		void *obj = starpu_data_handle_to_pointer(handle, dst_node); /* destination object on the disk node */
+		void * ptr = NULL;
+		starpu_ssize_t size = 0;
+		handle->ops->pack_data(handle, src_node, &ptr, &size); /* no any_to_any method: pack the data into ptr/size first (return value is not checked) */
+		ret = _starpu_disk_full_write(src_node, dst_node, obj, ptr, size, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);
+		if (ret == 0)
+		{
+			/* write is already finished, ptr was allocated in pack_data */
+			_starpu_free_flags_on_node(src_node, ptr, size, 0);
+		}
+		else if (ret == -EAGAIN)
+		{
+			STARPU_ASSERT(req); /* -EAGAIN is only expected when a request (and thus an async channel) exists */
+			req->async_channel.event.disk_event.ptr = ptr; /* write still pending: record the packed buffer in the event (presumably released on completion -- TODO confirm) */
+			req->async_channel.event.disk_event.node = src_node;
+			req->async_channel.event.disk_event.size = size;
+		}
+		STARPU_ASSERT(ret == 0 || ret == -EAGAIN); /* any other status from the full write is treated as a fatal error */
 	}
+
+	return ret;
+}
+
+/* Interface-level copy of `size` bytes from a disk node to a CPU RAM node.
+ * The disk side is addressed as (src, src_offset) with the offset passed
+ * separately, while dst_offset is folded into the destination pointer.
+ * Returns the value of _starpu_disk_copy_disk_to_src(). */
+int _starpu_disk_copy_interface_from_disk_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	const enum starpu_node_kind from_kind = starpu_node_get_kind(src_node);
+	const enum starpu_node_kind to_kind = starpu_node_get_kind(dst_node);
+	void *disk_obj = (void*) src;
+	void *ram_ptr = (void*) (dst + dst_offset);
+
+	STARPU_ASSERT(from_kind == STARPU_DISK_RAM && to_kind == STARPU_CPU_RAM);
+
+	return _starpu_disk_copy_disk_to_src(disk_obj, src_offset, src_node,
+					     ram_ptr, dst_node,
+					     size, async_channel);
+}
+
+/* Interface-level copy of `size` bytes between two disk nodes.
+ * Neither offset is applied to the addresses here: both (src, src_offset)
+ * and (dst, dst_offset) are forwarded as separate arguments.
+ * Returns the value of _starpu_disk_copy_disk_to_disk(). */
+int _starpu_disk_copy_interface_from_disk_to_disk(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	const enum starpu_node_kind from_kind = starpu_node_get_kind(src_node);
+	const enum starpu_node_kind to_kind = starpu_node_get_kind(dst_node);
+	void *src_obj = (void*) src;
+	void *dst_obj = (void*) dst;
+
+	STARPU_ASSERT(from_kind == STARPU_DISK_RAM && to_kind == STARPU_DISK_RAM);
+
+	return _starpu_disk_copy_disk_to_disk(src_obj, src_offset, src_node,
+					      dst_obj, dst_offset, dst_node,
+					      size, async_channel);
+}
+
+/* Interface-level copy of `size` bytes from a CPU RAM node to a disk node.
+ * src_offset is folded into the source pointer, while the disk side is
+ * addressed as (dst, dst_offset) with the offset passed separately.
+ * Returns the value of _starpu_disk_copy_src_to_disk(). */
+int _starpu_disk_copy_interface_from_cpu_to_disk(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	const enum starpu_node_kind from_kind = starpu_node_get_kind(src_node);
+	const enum starpu_node_kind to_kind = starpu_node_get_kind(dst_node);
+	void *ram_ptr = (void*) (src + src_offset);
+	void *disk_obj = (void*) dst;
+
+	STARPU_ASSERT(from_kind == STARPU_CPU_RAM && to_kind == STARPU_DISK_RAM);
+
+	return _starpu_disk_copy_src_to_disk(ram_ptr, src_node,
+					     disk_obj, dst_offset, dst_node,
+					     size, async_channel);
 }
 
 int _starpu_disk_direct_access_supported(unsigned node, unsigned handling_node)
@@ -186,6 +237,7 @@ int _starpu_disk_direct_access_supported(unsigned node, unsigned handling_node)
 
 uintptr_t _starpu_disk_malloc_on_node(unsigned dst_node, size_t size, int flags)
 {
+	(void) flags;
 	uintptr_t addr = 0;
 	addr = (uintptr_t) _starpu_disk_alloc(dst_node, size);
 	return addr;
@@ -193,5 +245,6 @@ uintptr_t _starpu_disk_malloc_on_node(unsigned dst_node, size_t size, int flags)
 
 void _starpu_disk_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags)
 {
+	(void) flags; /* flags is not forwarded to _starpu_disk_free(); the cast marks the parameter as deliberately unused */
 	_starpu_disk_free(dst_node, (void *) addr , size);
 }

+ 9 - 3
src/drivers/disk/driver_disk.h

@@ -28,9 +28,15 @@ int _starpu_disk_copy_disk_to_disk(void * src, size_t src_offset, unsigned src_n
 
 unsigned _starpu_disk_test_request_completion(struct _starpu_async_channel *async_channel);
 void _starpu_disk_wait_request_completion(struct _starpu_async_channel *async_channel);
-int _starpu_disk_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
-int _starpu_disk_copy_data_to_disk(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
-int _starpu_disk_copy_interface(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+
+int _starpu_disk_copy_data_from_disk_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_disk_copy_data_from_disk_to_disk(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_disk_copy_data_from_cpu_to_disk(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+
+int _starpu_disk_copy_interface_from_disk_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_disk_copy_interface_from_disk_to_disk(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_disk_copy_interface_from_cpu_to_disk(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+
 int _starpu_disk_direct_access_supported(unsigned node, unsigned handling_node);
 uintptr_t _starpu_disk_malloc_on_node(unsigned dst_node, size_t size, int flags);
 void _starpu_disk_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags);

+ 63 - 21
src/drivers/mic/driver_mic_source.c

@@ -563,15 +563,13 @@ void *_starpu_mic_src_worker(void *arg)
 
 }
 
-int _starpu_mic_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+int _starpu_mic_copy_data_from_mic_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
 {
 	int src_kind = starpu_node_get_kind(src_node);
 	int dst_kind = starpu_node_get_kind(dst_node);
 	STARPU_ASSERT(src_kind == STARPU_MIC_RAM && dst_kind == STARPU_CPU_RAM);
 
-	int ret = 1;
-
-#ifdef STARPU_USE_MIC
+	int ret = 0;
 	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
 	/* MIC -> RAM */
 	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mic_copy_disabled() || !(copy_methods->mic_to_ram_async || copy_methods->any_to_any))
@@ -579,9 +577,9 @@ int _starpu_mic_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interfac
 		/* this is not associated to a request so it's synchronous */
 		STARPU_ASSERT(copy_methods->mic_to_ram || copy_methods->any_to_any);
 		if (copy_methods->mic_to_ram)
-			copy_methods->mic_to_ram(src_interface, src_node, dst_interface, dst_node);
+			ret = copy_methods->mic_to_ram(src_interface, src_node, dst_interface, dst_node);
 		else
-			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
 	}
 	else
 	{
@@ -595,43 +593,86 @@ int _starpu_mic_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interfac
 		}
 		_starpu_mic_init_event(&(req->async_channel.event.mic_event), src_node);
 	}
-#endif
-	return 1;
+	return ret;
 }
 
-int _starpu_mic_copy_interface(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+int _starpu_mic_copy_data_from_cpu_to_mic(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req) /* Copy the data of `handle` from a CPU RAM node to a MIC node, synchronously or through the request's async channel; returns the status of the copy method invoked */
 {
 	int src_kind = starpu_node_get_kind(src_node);
-	STARPU_ASSERT(src_kind == STARPU_MIC_RAM);
-
 	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_MIC_RAM); /* this routine only handles the CPU RAM -> MIC direction */
 
-	if (dst_kind == STARPU_CPU_RAM)
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+	/* RAM -> MIC */
+	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mic_copy_disabled() || !(copy_methods->ram_to_mic_async || copy_methods->any_to_any))
 	{
-		if (async_channel)
-			return _starpu_mic_copy_mic_to_ram_async((void*) (src + src_offset), src_node,
-								 (void*) (dst + dst_offset), dst_node,
-								 size);
+		/* synchronous path: no request, asynchronous copies disabled, or no asynchronous-capable method available */
+		STARPU_ASSERT(copy_methods->ram_to_mic || copy_methods->any_to_any);
+		if (copy_methods->ram_to_mic)
+			ret = copy_methods->ram_to_mic(src_interface, src_node, dst_interface, dst_node);
 		else
-			return _starpu_mic_copy_mic_to_ram((void*) (src + src_offset), src_node,
-							   (void*) (dst + dst_offset), dst_node,
-							   size);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL); /* NULL channel: no async channel is handed to the generic method */
 	}
 	else
 	{
-		STARPU_ABORT();
-		return -1;
+		req->async_channel.type = STARPU_MIC_RAM; /* asynchronous path: tag the channel as a MIC transfer */
+		if (copy_methods->ram_to_mic_async)
+			ret = copy_methods->ram_to_mic_async(src_interface, src_node, dst_interface, dst_node); /* note: the dedicated async method is not given the channel */
+		else
+		{
+			STARPU_ASSERT(copy_methods->any_to_any);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		}
+		_starpu_mic_init_event(&(req->async_channel.event.mic_event), dst_node); /* the MIC event is initialised for the destination node once the copy has been issued */
 	}
+
+	return ret;
+}
+
+/* Interface-level copy of `size` bytes from a MIC node to a CPU RAM node.
+ * Both offsets are folded into the raw addresses before the transfer.  The
+ * asynchronous primitive is selected when an async channel is supplied (the
+ * channel itself is not forwarded), the synchronous one otherwise. */
+int _starpu_mic_copy_interface_from_mic_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	const enum starpu_node_kind from_kind = starpu_node_get_kind(src_node);
+	const enum starpu_node_kind to_kind = starpu_node_get_kind(dst_node);
+	void *mic_ptr = (void*) (src + src_offset);
+	void *ram_ptr = (void*) (dst + dst_offset);
+
+	STARPU_ASSERT(from_kind == STARPU_MIC_RAM && to_kind == STARPU_CPU_RAM);
+
+	/* Synchronous transfer when no channel was provided */
+	if (!async_channel)
+		return _starpu_mic_copy_mic_to_ram(mic_ptr, src_node, ram_ptr, dst_node, size);
+
+	return _starpu_mic_copy_mic_to_ram_async(mic_ptr, src_node, ram_ptr, dst_node, size);
+}
+
+/* Interface-level copy of `size` bytes from a CPU RAM node to a MIC node.
+ * Both offsets are folded into the raw addresses before the transfer.  The
+ * asynchronous primitive is selected when an async channel is supplied (the
+ * channel itself is not forwarded), the synchronous one otherwise. */
+int _starpu_mic_copy_interface_from_cpu_to_mic(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	const enum starpu_node_kind from_kind = starpu_node_get_kind(src_node);
+	const enum starpu_node_kind to_kind = starpu_node_get_kind(dst_node);
+	void *ram_ptr = (void*) (src + src_offset);
+	void *mic_ptr = (void*) (dst + dst_offset);
+
+	STARPU_ASSERT(from_kind == STARPU_CPU_RAM && to_kind == STARPU_MIC_RAM);
+
+	/* Synchronous transfer when no channel was provided */
+	if (!async_channel)
+		return _starpu_mic_copy_ram_to_mic(ram_ptr, src_node, mic_ptr, dst_node, size);
+
+	return _starpu_mic_copy_ram_to_mic_async(ram_ptr, src_node, mic_ptr, dst_node, size);
 }
 
 int _starpu_mic_direct_access_supported(unsigned node, unsigned handling_node)
 {
+	(void) node; /* not consulted: the function unconditionally returns 0; the cast marks the parameter as deliberately unused */
+	(void) handling_node; /* likewise unused while direct MIC-MIC transfers are not handled */
 	/* TODO: We don't handle direct MIC-MIC transfers yet */
 	return 0;
 }
 
 uintptr_t _starpu_mic_malloc_on_node(unsigned dst_node, size_t size, int flags)
 {
+	(void) flags;
 	uintptr_t addr = 0;
 	if (_starpu_mic_allocate_memory((void **)(&addr), size, dst_node))
 		addr = 0;
@@ -640,5 +681,6 @@ uintptr_t _starpu_mic_malloc_on_node(unsigned dst_node, size_t size, int flags)
 
 void _starpu_mic_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags)
 {
+	(void) flags; /* flags is not forwarded to _starpu_mic_free_memory(); the cast marks the parameter as deliberately unused */
 	_starpu_mic_free_memory((void*) addr, size, dst_node);
 }

+ 7 - 2
src/drivers/mic/driver_mic_source.h

@@ -81,8 +81,13 @@ void *_starpu_mic_src_worker(void *arg);
 
 unsigned _starpu_mic_test_request_completion(struct _starpu_async_channel *async_channel);
 void _starpu_mic_wait_request_completion(struct _starpu_async_channel *async_channel);
-int _starpu_mic_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
-int _starpu_mic_copy_interface(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+
+int _starpu_mic_copy_data_from_mic_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_mic_copy_data_from_cpu_to_mic(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+
+int _starpu_mic_copy_interface_from_mic_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_mic_copy_interface_from_cpu_to_mic(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+
 int _starpu_mic_direct_access_supported(unsigned node, unsigned handling_node);
 uintptr_t _starpu_mic_malloc_on_node(unsigned dst_node, size_t size, int flags);
 void _starpu_mic_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags);

+ 0 - 62
src/drivers/mpi/driver_mpi_common.c

@@ -551,65 +551,3 @@ void _starpu_mpi_common_measure_bandwidth_latency(double timing_dtod[STARPU_MAXM
         free(buf);
 }
 
-int _starpu_mpi_common_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
-{
-	int src_kind = starpu_node_get_kind(src_node);
-	int dst_kind = starpu_node_get_kind(dst_node);
-	STARPU_ASSERT(src_kind == STARPU_MPI_MS_RAM && dst_kind == STARPU_CPU_RAM);
-
-	int ret = 0;
-	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
-	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mpi_ms_copy_disabled() || !(copy_methods->mpi_ms_to_ram_async || copy_methods->any_to_any))
-	{
-		/* this is not associated to a request so it's synchronous */
-		STARPU_ASSERT(copy_methods->mpi_ms_to_ram || copy_methods->any_to_any);
-		if (copy_methods->mpi_ms_to_ram)
-			copy_methods->mpi_ms_to_ram(src_interface, src_node, dst_interface, dst_node);
-		else
-			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-	}
-	else
-	{
-		req->async_channel.type = STARPU_MPI_MS_RAM;
-		if(copy_methods->mpi_ms_to_ram_async)
-			ret = copy_methods->mpi_ms_to_ram_async(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-		else
-		{
-			STARPU_ASSERT(copy_methods->any_to_any);
-			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-		}
-	}
-	return ret;
-}
-
-int _starpu_mpi_common_copy_data_to_mpi(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
-{
-	int src_kind = starpu_node_get_kind(src_node);
-	int dst_kind = starpu_node_get_kind(dst_node);
-	STARPU_ASSERT(src_kind == STARPU_MPI_MS_RAM && dst_kind == STARPU_MPI_MS_RAM);
-
-	int ret = 0;
-	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
-
-	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mpi_ms_copy_disabled() || !(copy_methods->mpi_ms_to_mpi_ms_async || copy_methods->any_to_any))
-	{
-		/* this is not associated to a request so it's synchronous */
-		STARPU_ASSERT(copy_methods->mpi_ms_to_mpi_ms || copy_methods->any_to_any);
-		if (copy_methods->mpi_ms_to_mpi_ms)
-			copy_methods->mpi_ms_to_mpi_ms(src_interface, src_node, dst_interface, dst_node);
-		else
-			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-	}
-	else
-	{
-		req->async_channel.type = STARPU_MPI_MS_RAM;
-		if(copy_methods->mpi_ms_to_mpi_ms_async)
-			ret = copy_methods->mpi_ms_to_mpi_ms_async(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-		else
-		{
-			STARPU_ASSERT(copy_methods->any_to_any);
-			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-		}
-	}
-	return ret;
-}

+ 0 - 3
src/drivers/mpi/driver_mpi_common.h

@@ -54,9 +54,6 @@ void _starpu_mpi_common_barrier(void);
 
 void _starpu_mpi_common_measure_bandwidth_latency(double bandwidth_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS], double latency_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS]);
 
-int _starpu_mpi_common_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
-int _starpu_mpi_common_copy_data_to_mpi(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
-
 #endif  /* STARPU_USE_MPI_MASTER_SLAVE */
 
 #endif	/* __DRIVER_MPI_COMMON_H__ */

+ 133 - 22
src/drivers/mpi/driver_mpi_source.c

@@ -381,50 +381,159 @@ void *_starpu_mpi_src_worker(void *arg)
         return NULL;
 }
 
-int _starpu_mpi_copy_interface(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+/* Copy the data of `handle` from an MPI master-slave node to a CPU RAM node.
+ *
+ * The copy is synchronous when there is no request, when asynchronous copies
+ * are disabled (globally or for MPI master-slave), or when the interface
+ * provides neither mpi_ms_to_ram_async nor any_to_any.  Otherwise the
+ * request's async channel is tagged STARPU_MPI_MS_RAM and handed to the
+ * asynchronous method.
+ *
+ * Returns the status reported by the copy method that was invoked. */
+int _starpu_mpi_copy_data_from_mpi_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
 {
 	int src_kind = starpu_node_get_kind(src_node);
-	STARPU_ASSERT(src_kind == STARPU_MPI_MS_RAM);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_MPI_MS_RAM && dst_kind == STARPU_CPU_RAM);
+
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mpi_ms_copy_disabled() || !(copy_methods->mpi_ms_to_ram_async || copy_methods->any_to_any))
+	{
+		/* synchronous path: propagate the copy status instead of
+		 * unconditionally reporting success */
+		STARPU_ASSERT(copy_methods->mpi_ms_to_ram || copy_methods->any_to_any);
+		if (copy_methods->mpi_ms_to_ram)
+			ret = copy_methods->mpi_ms_to_ram(src_interface, src_node, dst_interface, dst_node);
+		else
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	}
+	else
+	{
+		/* asynchronous path: the channel carries the pending transfer */
+		req->async_channel.type = STARPU_MPI_MS_RAM;
+		if(copy_methods->mpi_ms_to_ram_async)
+			ret = copy_methods->mpi_ms_to_ram_async(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		else
+		{
+			STARPU_ASSERT(copy_methods->any_to_any);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		}
+	}
+	return ret;
+}
 
+/* Copy the data of `handle` between two MPI master-slave nodes.
+ *
+ * The copy is synchronous when there is no request, when asynchronous copies
+ * are disabled (globally or for MPI master-slave), or when the interface
+ * provides neither mpi_ms_to_mpi_ms_async nor any_to_any.  Otherwise the
+ * request's async channel is tagged STARPU_MPI_MS_RAM and handed to the
+ * asynchronous method.
+ *
+ * Returns the status reported by the copy method that was invoked. */
+int _starpu_mpi_copy_data_from_mpi_to_mpi(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
 	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_MPI_MS_RAM && dst_kind == STARPU_MPI_MS_RAM);
 
-	if (dst_kind == STARPU_CPU_RAM)
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+
+	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mpi_ms_copy_disabled() || !(copy_methods->mpi_ms_to_mpi_ms_async || copy_methods->any_to_any))
+	{
+		/* synchronous path: propagate the copy status instead of
+		 * unconditionally reporting success */
+		STARPU_ASSERT(copy_methods->mpi_ms_to_mpi_ms || copy_methods->any_to_any);
+		if (copy_methods->mpi_ms_to_mpi_ms)
+			ret = copy_methods->mpi_ms_to_mpi_ms(src_interface, src_node, dst_interface, dst_node);
+		else
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	}
+	else
 	{
-                if (async_channel)
-                        return _starpu_mpi_copy_mpi_to_ram_async((void*) (src + src_offset), src_node,
-								 (void*) (dst + dst_offset), dst_node,
-								 size, async_channel);
-                else
-                        return _starpu_mpi_copy_mpi_to_ram_sync((void*) (src + src_offset), src_node,
-								(void*) (dst + dst_offset), dst_node,
-								size);
+		/* asynchronous path: the channel carries the pending transfer */
+		req->async_channel.type = STARPU_MPI_MS_RAM;
+		if(copy_methods->mpi_ms_to_mpi_ms_async)
+			ret = copy_methods->mpi_ms_to_mpi_ms_async(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		else
+		{
+			STARPU_ASSERT(copy_methods->any_to_any);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		}
 	}
-	else if (dst_kind == STARPU_MPI_MS_RAM)
+	return ret;
+}
+
+/* Copy the data of `handle` from a CPU RAM node to an MPI master-slave node.
+ *
+ * The copy is synchronous when there is no request, when asynchronous copies
+ * are disabled (globally or for MPI master-slave), or when the interface
+ * provides neither ram_to_mpi_ms_async nor any_to_any.  Otherwise the
+ * request's async channel is tagged STARPU_MPI_MS_RAM and handed to the
+ * asynchronous method.
+ *
+ * Returns the status reported by the copy method that was invoked. */
+int _starpu_mpi_copy_data_from_cpu_to_mpi(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_MPI_MS_RAM);
+
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+
+	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mpi_ms_copy_disabled() || !(copy_methods->ram_to_mpi_ms_async || copy_methods->any_to_any))
 	{
-                if (async_channel)
-                        return _starpu_mpi_copy_sink_to_sink_async((void*) (src + src_offset), src_node,
-								   (void*) (dst + dst_offset), dst_node,
-								   size, async_channel);
-                else
-                        return _starpu_mpi_copy_sink_to_sink_sync((void*) (src + src_offset), src_node,
-								  (void*) (dst + dst_offset), dst_node,
-								  size);
+		/* synchronous path: propagate the copy status instead of
+		 * unconditionally reporting success */
+		STARPU_ASSERT(copy_methods->ram_to_mpi_ms || copy_methods->any_to_any);
+		if (copy_methods->ram_to_mpi_ms)
+			ret = copy_methods->ram_to_mpi_ms(src_interface, src_node, dst_interface, dst_node);
+		else
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
 	}
 	else
 	{
-		STARPU_ABORT();
-		return -1;
+		/* asynchronous path: the channel carries the pending transfer */
+		req->async_channel.type = STARPU_MPI_MS_RAM;
+		if(copy_methods->ram_to_mpi_ms_async)
+			ret = copy_methods->ram_to_mpi_ms_async(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		else
+		{
+			STARPU_ASSERT(copy_methods->any_to_any);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		}
 	}
+	return ret;
+}
+
+/* Interface-level copy of `size` bytes from an MPI master-slave node to a
+ * CPU RAM node.  Both offsets are folded into the raw addresses.  With an
+ * async channel the asynchronous primitive is used and receives the channel
+ * as its event argument; without one the synchronous primitive is used. */
+int _starpu_mpi_copy_interface_from_mpi_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	const enum starpu_node_kind from_kind = starpu_node_get_kind(src_node);
+	const enum starpu_node_kind to_kind = starpu_node_get_kind(dst_node);
+	void *sink_ptr = (void*) (src + src_offset);
+	void *ram_ptr = (void*) (dst + dst_offset);
+
+	STARPU_ASSERT(from_kind == STARPU_MPI_MS_RAM && to_kind == STARPU_CPU_RAM);
+
+	/* Synchronous transfer when no channel was provided */
+	if (!async_channel)
+		return _starpu_mpi_copy_mpi_to_ram_sync(sink_ptr, src_node, ram_ptr, dst_node, size);
+
+	return _starpu_mpi_copy_mpi_to_ram_async(sink_ptr, src_node, ram_ptr, dst_node, size, async_channel);
+}
+
+/* Interface-level copy of `size` bytes between two MPI master-slave nodes.
+ * Both offsets are folded into the raw addresses.  With an async channel the
+ * asynchronous sink-to-sink primitive is used and receives the channel as its
+ * event argument; without one the synchronous primitive is used. */
+int _starpu_mpi_copy_interface_from_mpi_to_mpi(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	const enum starpu_node_kind from_kind = starpu_node_get_kind(src_node);
+	const enum starpu_node_kind to_kind = starpu_node_get_kind(dst_node);
+	void *from_ptr = (void*) (src + src_offset);
+	void *to_ptr = (void*) (dst + dst_offset);
+
+	STARPU_ASSERT(from_kind == STARPU_MPI_MS_RAM && to_kind == STARPU_MPI_MS_RAM);
+
+	/* Synchronous transfer when no channel was provided */
+	if (!async_channel)
+		return _starpu_mpi_copy_sink_to_sink_sync(from_ptr, src_node, to_ptr, dst_node, size);
+
+	return _starpu_mpi_copy_sink_to_sink_async(from_ptr, src_node, to_ptr, dst_node, size, async_channel);
+}
+
+/* Interface-level copy of `size` bytes from a CPU RAM node to an MPI
+ * master-slave node.  Both offsets are folded into the raw addresses.  With
+ * an async channel the asynchronous primitive is used and receives the
+ * channel as its event argument; without one the synchronous primitive is
+ * used. */
+int _starpu_mpi_copy_interface_from_cpu_to_mpi(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	const enum starpu_node_kind from_kind = starpu_node_get_kind(src_node);
+	const enum starpu_node_kind to_kind = starpu_node_get_kind(dst_node);
+	void *ram_ptr = (void*) (src + src_offset);
+	void *sink_ptr = (void*) (dst + dst_offset);
+
+	STARPU_ASSERT(from_kind == STARPU_CPU_RAM && to_kind == STARPU_MPI_MS_RAM);
+
+	/* Synchronous transfer when no channel was provided */
+	if (!async_channel)
+		return _starpu_mpi_copy_ram_to_mpi_sync(ram_ptr, src_node, sink_ptr, dst_node, size);
+
+	return _starpu_mpi_copy_ram_to_mpi_async(ram_ptr, src_node, sink_ptr, dst_node, size, async_channel);
 }
 
 int _starpu_mpi_direct_access_supported(unsigned node, unsigned handling_node)
 {
+	(void) node; /* only the kind of handling_node decides the result; the cast marks node as deliberately unused */
 	enum starpu_node_kind kind = starpu_node_get_kind(handling_node);
 	return kind == STARPU_MPI_MS_RAM;
 }
 
 uintptr_t _starpu_mpi_malloc_on_node(unsigned dst_node, size_t size, int flags)
 {
+	(void) flags;
 	uintptr_t addr = 0;
 	if (_starpu_mpi_src_allocate_memory((void **)(&addr), size, dst_node))
 		addr = 0;
@@ -433,5 +542,7 @@ uintptr_t _starpu_mpi_malloc_on_node(unsigned dst_node, size_t size, int flags)
 
 void _starpu_mpi_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags)
 {
+	/* flags and size are not used by this driver: the release below only
+	 * takes the address and the node. */
+	(void) flags;
+	(void) size;
 	_starpu_mpi_source_free_memory((void*) addr, dst_node);
 }

+ 9 - 1
src/drivers/mpi/driver_mpi_source.h

@@ -47,7 +47,15 @@ int _starpu_mpi_copy_sink_to_sink_sync(void *src, unsigned src_node, void *dst,
 int _starpu_mpi_copy_mpi_to_ram_async(void *src, unsigned src_node, void *dst, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, void * event);
 int _starpu_mpi_copy_ram_to_mpi_async(void *src, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst, unsigned dst_node, size_t size, void * event);
 int _starpu_mpi_copy_sink_to_sink_async(void *src, unsigned src_node, void *dst, unsigned dst_node, size_t size, void * event);
-int _starpu_mpi_copy_interface(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+
+int _starpu_mpi_copy_data_from_mpi_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_mpi_copy_data_from_mpi_to_mpi(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_mpi_copy_data_from_cpu_to_mpi(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+
+int _starpu_mpi_copy_interface_from_mpi_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_mpi_copy_interface_from_mpi_to_mpi(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_mpi_copy_interface_from_cpu_to_mpi(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+
 int _starpu_mpi_direct_access_supported(unsigned node, unsigned handling_node);
 uintptr_t _starpu_mpi_malloc_on_node(unsigned dst_node, size_t size, int flags);
 void _starpu_mpi_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags);

+ 68 - 23
src/drivers/opencl/driver_opencl.c

@@ -1145,16 +1145,13 @@ struct _starpu_driver_ops _starpu_driver_opencl_ops =
 	.deinit = _starpu_opencl_driver_deinit
 };
 
-#endif /* STARPU_USE_OPENCL */
-
-int _starpu_opencl_copy_data_to_opencl(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+int _starpu_opencl_copy_data_from_opencl_to_opencl(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
 {
 	int src_kind = starpu_node_get_kind(src_node);
 	int dst_kind = starpu_node_get_kind(dst_node);
 	STARPU_ASSERT(src_kind == STARPU_OPENCL_RAM && dst_kind == STARPU_OPENCL_RAM);
 
 	int ret = 1;
-#ifdef STARPU_USE_OPENCL
 	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
 	/* STARPU_OPENCL_RAM -> STARPU_OPENCL_RAM */
 	STARPU_ASSERT(starpu_worker_get_local_memory_node() == dst_node || starpu_worker_get_local_memory_node() == src_node);
@@ -1178,18 +1175,16 @@ int _starpu_opencl_copy_data_to_opencl(starpu_data_handle_t handle, void *src_in
 			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
 		}
 	}
-#endif
 	return ret;
 }
 
-int _starpu_opencl_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+int _starpu_opencl_copy_data_from_opencl_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
 {
 	int src_kind = starpu_node_get_kind(src_node);
 	int dst_kind = starpu_node_get_kind(dst_node);
 	STARPU_ASSERT(src_kind == STARPU_OPENCL_RAM && dst_kind == STARPU_CPU_RAM);
 
 	int ret = 1;
-#ifdef STARPU_USE_OPENCL
 	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
 	/* OpenCL -> RAM */
 	STARPU_ASSERT(starpu_worker_get_local_memory_node() == src_node);
@@ -1213,40 +1208,89 @@ int _starpu_opencl_copy_data_to_cpu(starpu_data_handle_t handle, void *src_inter
 			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
 		}
 	}
-#endif
 	return ret;
 }
 
-int _starpu_opencl_copy_interface(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+/* Transfer the data of handle from src_interface on a STARPU_CPU_RAM node to
+ * dst_interface on a STARPU_OPENCL_RAM node, using the copy methods of the
+ * handle's data interface.
+ * Returns 0 after the synchronous path, or the value returned by the
+ * asynchronous copy method otherwise. */
+int _starpu_opencl_copy_data_from_cpu_to_opencl(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
 {
 	int src_kind = starpu_node_get_kind(src_node);
-	STARPU_ASSERT(src_kind == STARPU_OPENCL_RAM);
-
 	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_OPENCL_RAM);
 
-	if (dst_kind == STARPU_OPENCL_RAM || dst_kind == STARPU_CPU_RAM)
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+	/* STARPU_CPU_RAM -> STARPU_OPENCL_RAM: the calling worker's local memory node must be the destination */
+	STARPU_ASSERT(starpu_worker_get_local_memory_node() == dst_node);
+	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() || !(copy_methods->ram_to_opencl_async || copy_methods->any_to_any))
 	{
-		return starpu_opencl_copy_async_sync(src, src_offset, src_node,
-						     dst, dst_offset, dst_node,
-						     size,
-						     &async_channel->event.opencl_event);
+		STARPU_ASSERT(copy_methods->ram_to_opencl || copy_methods->any_to_any);
+		/* synchronous path: no request, asynchronous copies disabled, or no asynchronous-capable method; the methods' return values are not propagated (ret stays 0) */
+		if (copy_methods->ram_to_opencl)
+			copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
 	}
 	else
 	{
-		STARPU_ABORT();
-		return -1;
+		/* asynchronous path: tag the request's channel as OpenCL before starting the copy */
+		req->async_channel.type = STARPU_OPENCL_RAM;
+		if (copy_methods->ram_to_opencl_async)
+			ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
+		else
+		{
+			STARPU_ASSERT(copy_methods->any_to_any);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		}
 	}
+	return ret;
+}
+
+/* Copy size bytes from (src, src_offset) on src_node to (dst, dst_offset) on
+ * dst_node, both of which must be STARPU_OPENCL_RAM nodes, and return the
+ * result of starpu_opencl_copy_async_sync().
+ * async_channel may be NULL: in that case a NULL event pointer is passed
+ * explicitly instead of forming a member address through a null pointer.
+ * NOTE(review): assumes starpu_opencl_copy_async_sync() treats a NULL event
+ * as a request for a synchronous copy -- confirm against its documentation. */
+int _starpu_opencl_copy_interface_from_opencl_to_opencl(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_OPENCL_RAM && dst_kind == STARPU_OPENCL_RAM);
+
+	return starpu_opencl_copy_async_sync(src, src_offset, src_node,
+					     dst, dst_offset, dst_node,
+					     size,
+					     async_channel ? &async_channel->event.opencl_event : NULL);
+}
+
+/* Copy size bytes from (src, src_offset) on a STARPU_OPENCL_RAM node to
+ * (dst, dst_offset) on a STARPU_CPU_RAM node, and return the result of
+ * starpu_opencl_copy_async_sync().
+ * async_channel may be NULL: in that case a NULL event pointer is passed
+ * explicitly instead of forming a member address through a null pointer.
+ * NOTE(review): assumes starpu_opencl_copy_async_sync() treats a NULL event
+ * as a request for a synchronous copy -- confirm against its documentation. */
+int _starpu_opencl_copy_interface_from_opencl_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_OPENCL_RAM && dst_kind == STARPU_CPU_RAM);
+
+	return starpu_opencl_copy_async_sync(src, src_offset, src_node,
+					     dst, dst_offset, dst_node,
+					     size,
+					     async_channel ? &async_channel->event.opencl_event : NULL);
+}
+
+/* Copy size bytes from (src, src_offset) on a STARPU_CPU_RAM node to
+ * (dst, dst_offset) on a STARPU_OPENCL_RAM node, and return the result of
+ * starpu_opencl_copy_async_sync().
+ * async_channel may be NULL: in that case a NULL event pointer is passed
+ * explicitly instead of forming a member address through a null pointer.
+ * NOTE(review): assumes starpu_opencl_copy_async_sync() treats a NULL event
+ * as a request for a synchronous copy -- confirm against its documentation. */
+int _starpu_opencl_copy_interface_from_cpu_to_opencl(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_OPENCL_RAM);
+
+	return starpu_opencl_copy_async_sync(src, src_offset, src_node,
+					     dst, dst_offset, dst_node,
+					     size,
+					     async_channel ? &async_channel->event.opencl_event : NULL);
 }
 
 int _starpu_opencl_direct_access_supported(unsigned node, unsigned handling_node)
 {
+	/* The answer does not depend on either node: both parameters are unused. */
+	(void)node;
+	(void)handling_node;
+	/* Always reports that direct access is not supported. */
 	return 0;
 }
 
 uintptr_t _starpu_opencl_malloc_on_node(unsigned dst_node, size_t size, int flags)
 {
+	(void)flags;
 	uintptr_t addr = 0;
-#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
 #ifdef STARPU_SIMGRID
 	static uintptr_t last[STARPU_MAXNODES];
 	/* Sleep for the allocation */
@@ -1273,14 +1317,14 @@ uintptr_t _starpu_opencl_malloc_on_node(unsigned dst_node, size_t size, int flag
 		addr = (uintptr_t)ptr;
 	}
 #endif
-
-#endif
 	return addr;
 }
 
 void _starpu_opencl_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags)
 {
-#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
+	(void)flags;
+	(void)size;
+	(void)dst_node;
 #ifdef STARPU_SIMGRID
 	STARPU_PTHREAD_MUTEX_LOCK(&opencl_alloc_mutex);
 	/* Sleep for the free */
@@ -1293,5 +1337,6 @@ void _starpu_opencl_free_on_node(unsigned dst_node, uintptr_t addr, size_t size,
 	if (STARPU_UNLIKELY(err != CL_SUCCESS))
 		STARPU_OPENCL_REPORT_ERROR(err);
 #endif
-#endif
 }
+
+#endif /* STARPU_USE_OPENCL */

+ 9 - 3
src/drivers/opencl/driver_opencl.h

@@ -72,9 +72,15 @@ cl_int _starpu_opencl_copy_rect_ram_to_opencl(void *ptr, unsigned src_node, cl_m
 
 unsigned _starpu_opencl_test_request_completion(struct _starpu_async_channel *async_channel);
 void _starpu_opencl_wait_request_completion(struct _starpu_async_channel *async_channel);
-int _starpu_opencl_copy_data_to_opencl(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
-int _starpu_opencl_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
-int _starpu_opencl_copy_interface(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+
+int _starpu_opencl_copy_data_from_opencl_to_opencl(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_opencl_copy_data_from_opencl_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_opencl_copy_data_from_cpu_to_opencl(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+
+int _starpu_opencl_copy_interface_from_opencl_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_opencl_copy_interface_from_opencl_to_opencl(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_opencl_copy_interface_from_cpu_to_opencl(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+
 int _starpu_opencl_direct_access_supported(unsigned node, unsigned handling_node);
 uintptr_t _starpu_opencl_malloc_on_node(unsigned dst_node, size_t size, int flags);
 void _starpu_opencl_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags);

+ 0 - 30
src/drivers/scc/driver_scc_common.c

@@ -191,33 +191,3 @@ int _starpu_scc_common_recv_is_ready(const struct _starpu_mp_node *mp_node)
   ************/
   STARPU_ASSERT(0);
 }
-
-int _starpu_scc_common_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
-{
-	int src_kind = starpu_node_get_kind(src_node);
-	int dst_kind = starpu_node_get_kind(dst_node);
-	STARPU_ASSERT(src_kind == STARPU_SCC_RAM && dst_kind == STARPU_CPU_RAM);
-
-	int ret = 0;
-	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
-	if (copy_methods->scc_sink_to_src)
-		copy_methods->scc_sink_to_src(src_interface, src_node, dst_interface, dst_node);
-	else
-		copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-	return ret;
-}
-
-int _starpu_scc_common_copy_data_to_scc(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
-{
-	int src_kind = starpu_node_get_kind(src_node);
-	int dst_kind = starpu_node_get_kind(dst_node);
-	STARPU_ASSERT(src_kind == STARPU_SCC_RAM && dst_kind == STARPU_SCC_RAM);
-
-	int ret = 0;
-	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
-	if (copy_methods->scc_sink_to_sink)
-		copy_methods->scc_sink_to_sink(src_interface, src_node, dst_interface, dst_node);
-	else
-		copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-	return ret;
-}

+ 0 - 3
src/drivers/scc/driver_scc_common.h

@@ -49,9 +49,6 @@ void _starpu_scc_common_report_rcce_error(const char *func, const char *file, co
 
 int _starpu_scc_common_recv_is_ready(const struct _starpu_mp_node *mp_node);
 
-int _starpu_scc_common_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
-int _starpu_scc_common_copy_data_to_scc(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
-
 #endif /* STARPU_USE_SCC */
 
 

+ 76 - 18
src/drivers/scc/driver_scc_source.c

@@ -326,40 +326,97 @@ void *_starpu_scc_src_worker(void *arg)
 	return NULL;
 }
 
-int _starpu_scc_copy_interface(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+/* Transfer the data of handle from src_interface on a STARPU_SCC_RAM node to
+ * dst_interface on a STARPU_CPU_RAM node.
+ * The interface's scc_sink_to_src method is preferred, with any_to_any
+ * (called with a NULL channel) as fallback; at least one of them must be
+ * provided. req is not used. Returns the value of the copy method. */
+int _starpu_scc_copy_data_from_scc_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
 {
+	(void) req;
 	int src_kind = starpu_node_get_kind(src_node);
-	STARPU_ASSERT(src_kind == STARPU_SCC_RAM);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_SCC_RAM && dst_kind == STARPU_CPU_RAM);
 
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+	/* Fail with a clear assertion rather than calling a NULL function pointer */
+	STARPU_ASSERT(copy_methods->scc_sink_to_src || copy_methods->any_to_any);
+	if (copy_methods->scc_sink_to_src)
+		ret = copy_methods->scc_sink_to_src(src_interface, src_node, dst_interface, dst_node);
+	else
+		ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	return ret;
+}
+
+/* Transfer the data of handle between two STARPU_SCC_RAM nodes.
+ * The interface's scc_sink_to_sink method is preferred, with any_to_any
+ * (called with a NULL channel) as fallback; at least one of them must be
+ * provided. req is not used. Returns the value of the copy method. */
+int _starpu_scc_copy_data_from_scc_to_scc(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	(void) req;
+	int src_kind = starpu_node_get_kind(src_node);
 	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_SCC_RAM && dst_kind == STARPU_SCC_RAM);
 
-	if (dst_kind == STARPU_CPU_RAM)
-	{
-		return _starpu_scc_copy_sink_to_src((void*) (src + src_offset), src_node,
-						    (void*) (dst + dst_offset), dst_node,
-						    size);
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+	/* Fail with a clear assertion rather than calling a NULL function pointer */
+	STARPU_ASSERT(copy_methods->scc_sink_to_sink || copy_methods->any_to_any);
+	if (copy_methods->scc_sink_to_sink)
+		ret = copy_methods->scc_sink_to_sink(src_interface, src_node, dst_interface, dst_node);
+	else
+		ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	return ret;
+}
 
-	}
-	else if (dst_kind == STARPU_SCC_RAM)
-	{
-		return _starpu_scc_copy_sink_to_sink((void*) (src + src_offset), src_node,
-						     (void*) (dst + dst_offset), dst_node,
-						     size);
-	}
+/* Transfer the data of handle from src_interface on a STARPU_CPU_RAM node to
+ * dst_interface on a STARPU_SCC_RAM node.
+ * The interface's scc_src_to_sink method is preferred, with any_to_any
+ * (called with a NULL channel) as fallback; at least one of them must be
+ * provided. req is not used. Returns the value of the copy method. */
+int _starpu_scc_copy_data_from_cpu_to_scc(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	(void) req;
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_SCC_RAM);
+
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+	/* Fail with a clear assertion rather than calling a NULL function pointer */
+	STARPU_ASSERT(copy_methods->scc_src_to_sink || copy_methods->any_to_any);
+	if (copy_methods->scc_src_to_sink)
+		ret = copy_methods->scc_src_to_sink(src_interface, src_node, dst_interface, dst_node);
 	else
-	{
-		STARPU_ABORT();
-		return -1;
-	}
+		ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	return ret;
+}
+
+/* Copy size bytes from (src + src_offset) on a STARPU_SCC_RAM node to
+ * (dst + dst_offset) on a STARPU_CPU_RAM node through
+ * _starpu_scc_copy_sink_to_src(), whose result is returned.
+ * async_channel is not used by this driver. */
+int _starpu_scc_copy_interface_from_scc_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	(void) async_channel;
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_SCC_RAM && dst_kind == STARPU_CPU_RAM);
+
+	return _starpu_scc_copy_sink_to_src((void*) (src + src_offset), src_node,
+					    (void*) (dst + dst_offset), dst_node,
+					    size);
+}
+
+/* Copy size bytes from (src + src_offset) to (dst + dst_offset) between two
+ * STARPU_SCC_RAM nodes through _starpu_scc_copy_sink_to_sink(), whose result
+ * is returned. async_channel is not used by this driver. */
+int _starpu_scc_copy_interface_from_scc_to_scc(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	(void) async_channel;
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_SCC_RAM && dst_kind == STARPU_SCC_RAM);
+
+	return _starpu_scc_copy_sink_to_sink((void*) (src + src_offset), src_node,
+					     (void*) (dst + dst_offset), dst_node,
+					     size);
+}
+
+/* Copy size bytes from (src + src_offset) on a STARPU_CPU_RAM node to
+ * (dst + dst_offset) on a STARPU_SCC_RAM node through
+ * _starpu_scc_copy_src_to_sink(), whose result is returned.
+ * async_channel is not used by this driver. */
+int _starpu_scc_copy_interface_from_cpu_to_scc(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	(void) async_channel;
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_SCC_RAM);
+
+	return _starpu_scc_copy_src_to_sink((void*) (src + src_offset), src_node,
+					    (void*) (dst + dst_offset), dst_node,
+					    size);
 }
 
 int _starpu_scc_direct_access_supported(unsigned node, unsigned handling_node)
 {
+	/* The answer does not depend on either node: both parameters are unused. */
+	(void) node;
+	(void) handling_node;
+	/* Always reports that direct access is supported. */
 	return 1;
 }
 
 uintptr_t _starpu_scc_malloc_on_node(unsigned dst_node, size_t size, int flags)
 {
+	(void) flags;
 	uintptr_t addr = 0;
 	if (_starpu_scc_allocate_memory((void **)(&addr), size, dst_node))
 		addr = 0;
@@ -368,5 +425,6 @@ uintptr_t _starpu_scc_malloc_on_node(unsigned dst_node, size_t size, int flags)
 
 void _starpu_scc_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags)
 {
+	/* flags and size are not used by this driver: the release below only
+	 * takes the address and the node. Both are marked unused to avoid
+	 * unused-parameter warnings. */
+	(void) flags;
+	(void) size;
 	_starpu_scc_free_memory((void *) addr, dst_node);
 }

+ 8 - 1
src/drivers/scc/driver_scc_source.h

@@ -52,7 +52,14 @@ int _starpu_scc_copy_sink_to_sink(void *src, unsigned src_node, void *dst, unsig
 
 void *_starpu_scc_src_worker(void *arg);
 
-int _starpu_scc_copy_interface(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_scc_copy_data_from_scc_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_scc_copy_data_from_scc_to_scc(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_scc_copy_data_from_cpu_to_scc(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+
+int _starpu_scc_copy_interface_from_scc_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_scc_copy_interface_from_scc_to_scc(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_scc_copy_interface_from_cpu_to_scc(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+
 int _starpu_scc_direct_access_supported(unsigned node, unsigned handling_node);
 uintptr_t _starpu_scc_malloc_on_node(unsigned dst_node, size_t size, int flags);
 void _starpu_scc_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags);