
define node operations in a struct

Nathalie Furmento 6 years ago
parent
revision
44bcb87a60
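
Note: the new header datawizard/node_ops.h is added to the build below, but its contents are not part of this diff. Judging from the call sites in the hunks that follow, the per-node operation table presumably looks roughly like the sketch below (prototypes are inferred from usage, not copied from the actual header):

/* Sketch of datawizard/node_ops.h, reconstructed from the call sites in this
 * diff. Prototypes and the array bound of copy_data_to are inferred and may
 * differ from the real header. */
struct _starpu_node_ops
{
	/* copy a whole data interface from this node kind to dst_kind */
	int (*copy_data_to[STARPU_MPI_MS_RAM+1])(starpu_data_handle_t handle,
						 void *src_interface, unsigned src_node,
						 void *dst_interface, unsigned dst_node,
						 struct _starpu_data_request *req);
	/* raw buffer copy used by starpu_interface_copy() */
	int (*copy_interface)(uintptr_t src, size_t src_offset, unsigned src_node,
			      uintptr_t dst, size_t dst_offset, unsigned dst_node,
			      size_t size, struct _starpu_async_channel *async_channel);
	/* asynchronous request management */
	void (*wait_request_completion)(struct _starpu_async_channel *async_channel);
	unsigned (*test_request_completion)(struct _starpu_async_channel *async_channel);
	/* can a worker attached to handling_node access this node directly? */
	int (*direct_access_supported)(unsigned node, unsigned handling_node);
	/* memory allocation on this node kind */
	uintptr_t (*malloc_on_node)(unsigned dst_node, size_t size, int flags);
	void (*free_on_node)(unsigned dst_node, uintptr_t addr, size_t size, int flags);
};

/* one entry per starpu_node_kind, filled by _starpu_node_ops_init() */
extern struct _starpu_node_ops _node_ops[STARPU_MPI_MS_RAM+1];

Each driver registers its own callbacks in _starpu_node_ops_init(); see datawizard/node_ops.c at the end of this diff.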

+ 9 - 10
include/starpu_worker.h

@@ -42,16 +42,15 @@ extern "C"
 
 enum starpu_node_kind
 {
-	STARPU_UNUSED     = 0x00,
-	STARPU_CPU_RAM    = 0x01,
-	STARPU_CUDA_RAM   = 0x02,
-	STARPU_OPENCL_RAM = 0x03,
-	STARPU_DISK_RAM   = 0x04,
-	STARPU_MIC_RAM    = 0x05,
-	STARPU_SCC_RAM    = 0x06,
-	STARPU_SCC_SHM    = 0x07,
-	STARPU_MPI_MS_RAM = 0x08
-
+	STARPU_UNUSED=0,
+	STARPU_CPU_RAM=1,
+	STARPU_CUDA_RAM=2,
+	STARPU_OPENCL_RAM=3,
+	STARPU_DISK_RAM=4,
+	STARPU_MIC_RAM=5,
+	STARPU_SCC_RAM=6,
+	STARPU_SCC_SHM=7,
+	STARPU_MPI_MS_RAM=8
 };
 
 /**

+ 3 - 1
src/Makefile.am

@@ -3,7 +3,7 @@
 # Copyright (C) 2011-2017                                Inria
 # Copyright (C) 2012                                     Benjamin Lorendeau
 # Copyright (C) 2009-2019                                Université de Bordeaux
-# Copyright (C) 2010-2015,2017,2018                      CNRS
+# Copyright (C) 2010-2015,2017,2018,2019                 CNRS
 # Copyright (C) 2013                                     Simon Archipoff
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -98,6 +98,7 @@ noinst_HEADERS = 						\
 	core/detect_combined_workers.h				\
 	sched_policies/helper_mct.h				\
 	sched_policies/fifo_queues.h				\
+	datawizard/node_ops.h					\
 	datawizard/footprint.h					\
 	datawizard/datawizard.h					\
 	datawizard/data_request.h				\
@@ -224,6 +225,7 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 		\
 	sched_policies/graph_test_policy.c			\
 	drivers/driver_common/driver_common.c			\
 	drivers/disk/driver_disk.c				\
+	datawizard/node_ops.c					\
 	datawizard/memory_nodes.c				\
 	datawizard/write_back.c					\
 	datawizard/coherency.c					\

+ 7 - 8
src/core/disk.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2013,2017                                Inria
- * Copyright (C) 2015-2017                                CNRS
+ * Copyright (C) 2015-2017, 2019                          CNRS
  * Copyright (C) 2013-2015,2017,2018-2019                 Université de Bordeaux
  * Copyright (C) 2013                                     Corentin Salingue
  *
@@ -451,13 +451,12 @@ static int add_disk_in_list(unsigned node,  struct starpu_disk_ops *func, void *
 
 int _starpu_disk_can_copy(unsigned node1, unsigned node2)
 {
-	if (starpu_node_get_kind(node1) == STARPU_DISK_RAM && starpu_node_get_kind(node2) == STARPU_DISK_RAM)
-	{
-		if (disk_register_list[node1]->functions == disk_register_list[node2]->functions)
-			/* they must have a copy function */
-			if (disk_register_list[node1]->functions->copy != NULL)
-				return 1;
-	}
+	STARPU_ASSERT(starpu_node_get_kind(node1) == STARPU_DISK_RAM && starpu_node_get_kind(node2) == STARPU_DISK_RAM);
+
+	if (disk_register_list[node1]->functions == disk_register_list[node2]->functions)
+		/* they must have a copy function */
+		if (disk_register_list[node1]->functions->copy != NULL)
+			return 1;
 	return 0;
 }
 

+ 3 - 26
src/core/perfmodel/perfmodel.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2011,2012,2014,2016,2017                 Inria
  * Copyright (C) 2008-2019                                Université de Bordeaux
- * Copyright (C) 2010-2017                                CNRS
+ * Copyright (C) 2010-2017, 2019                          CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2011                                     Télécom-SudParis
  * Copyright (C) 2016                                     Uppsala University
@@ -246,7 +246,6 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 {
 	unsigned i;
 	double sum = 0.0;
-	enum starpu_node_kind node_kind;
 	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 
 #ifdef STARPU_DEVEL
@@ -258,35 +257,13 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 	{
 		starpu_data_handle_t handle;
 		struct starpu_task *conversion_task;
+		enum starpu_node_kind node_kind;
 
 		handle = STARPU_TASK_GET_HANDLE(task, i);
 		if (!_starpu_data_is_multiformat_handle(handle))
 			continue;
 
-		switch(arch->devices[0].type)
-		{
-			case STARPU_CPU_WORKER:
-				node_kind = STARPU_CPU_RAM;
-				break;
-			case STARPU_CUDA_WORKER:
-				node_kind = STARPU_CUDA_RAM;
-				break;
-			case STARPU_OPENCL_WORKER:
-				node_kind = STARPU_OPENCL_RAM;
-				break;
-			case STARPU_MIC_WORKER:
-				node_kind = STARPU_MIC_RAM;
-				break;
-			case STARPU_SCC_WORKER:
-				node_kind = STARPU_SCC_RAM;
-				break;
-			case STARPU_MPI_MS_WORKER:
-				node_kind = STARPU_MPI_MS_RAM;
-				break;
-			default:
-				STARPU_ABORT();
-				break;
-		}
+		node_kind = _starpu_worker_get_node_kind(arch->devices[0].type);
 		if (!_starpu_handle_needs_conversion_task_for_arch(handle, node_kind))
 			continue;
 

+ 3 - 0
src/core/topology.c

@@ -37,6 +37,8 @@
 #include <datawizard/datastats.h>
 #include <datawizard/memory_nodes.h>
 #include <datawizard/memory_manager.h>
+#include <datawizard/node_ops.h>
+
 #include <common/uthash.h>
 
 #ifdef STARPU_HAVE_HWLOC
@@ -2969,6 +2971,7 @@ int _starpu_build_topology(struct _starpu_machine_config *config, int no_mp_conf
 	/* for the data management library */
 	_starpu_memory_nodes_init();
 	_starpu_datastats_init();
+	_starpu_node_ops_init();
 
 	_starpu_init_workers_binding_and_memory(config, no_mp_config);
 

+ 22 - 0
src/core/workers.c

@@ -2597,3 +2597,25 @@ void starpu_worker_set_waking_up_callback(void (*callback)(unsigned workerid))
 	_starpu_config.conf.callback_worker_waking_up = callback;
 }
 #endif
+
+enum starpu_node_kind _starpu_worker_get_node_kind(enum starpu_worker_archtype type)
+{
+	switch(type)
+	{
+		case STARPU_CPU_WORKER:
+			return STARPU_CPU_RAM;
+		case STARPU_CUDA_WORKER:
+			return STARPU_CUDA_RAM;
+		case STARPU_OPENCL_WORKER:
+			return STARPU_OPENCL_RAM;
+			break;
+		case STARPU_MIC_WORKER:
+			return STARPU_MIC_RAM;
+		case STARPU_SCC_WORKER:
+			return STARPU_SCC_RAM;
+		case STARPU_MPI_MS_WORKER:
+			return STARPU_MPI_MS_RAM;
+		default:
+			STARPU_ABORT();
+	}
+}

+ 2 - 0
src/core/workers.h

@@ -660,6 +660,8 @@ static inline unsigned __starpu_worker_get_id_check(const char *f, int l)
 }
 #define _starpu_worker_get_id_check(f,l) __starpu_worker_get_id_check(f,l)
 
+enum starpu_node_kind _starpu_worker_get_node_kind(enum starpu_worker_archtype type);
+
 void _starpu_worker_set_stream_ctx(unsigned workerid, struct _starpu_sched_ctx *sched_ctx);
 
 struct _starpu_sched_ctx* _starpu_worker_get_ctx_stream(unsigned stream_workerid);

+ 16 - 58
src/datawizard/coherency.c

@@ -3,7 +3,7 @@
  * Copyright (C) 2011-2014,2016,2017                      Inria
  * Copyright (C) 2008-2019                                Université de Bordeaux
  * Copyright (C) 2018                                     Federal University of Rio Grande do Sul (UFRGS)
- * Copyright (C) 2010-2018                                CNRS
+ * Copyright (C) 2010-2019                                CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -17,6 +17,9 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include <limits.h>
+#include <math.h>
+
 #include <common/config.h>
 #include <datawizard/coherency.h>
 #include <datawizard/copy_driver.h>
@@ -25,11 +28,10 @@
 #include <core/dependencies/data_concurrency.h>
 #include <core/disk.h>
 #include <profiling/profiling.h>
-#include <math.h>
 #include <core/task.h>
 #include <starpu_scheduler.h>
 #include <core/workers.h>
-#include <limits.h>
+#include <datawizard/node_ops.h>
 
 #ifdef STARPU_SIMGRID
 #include <core/simgrid.h>
@@ -106,7 +108,7 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 		STARPU_ASSERT(handle->per_node[src_node].initialized);
 		return src_node;
 	}
-	
+
 	int i_ram = -1;
 	int i_gpu = -1;
 	int i_disk = -1;
@@ -114,7 +116,6 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 	/* Revert to dumb strategy: take RAM unless only a GPU has it */
 	for (i = 0; i < nnodes; i++)
 	{
-		
 		if (src_node_mask & (1<<i))
 		{
 			int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node) = handle->ops->copy_methods->can_copy;
@@ -146,16 +147,16 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 			    starpu_node_get_kind(i) == STARPU_MIC_RAM)
 				i_gpu = i;
 
-			if (starpu_node_get_kind(i) == STARPU_CPU_RAM || 
+			if (starpu_node_get_kind(i) == STARPU_CPU_RAM ||
 			    starpu_node_get_kind(i) == STARPU_SCC_RAM ||
 			    starpu_node_get_kind(i) == STARPU_SCC_SHM ||
                             starpu_node_get_kind(i) == STARPU_MPI_MS_RAM)
 				i_ram = i;
-			if (starpu_node_get_kind(i) == STARPU_DISK_RAM)			
+			if (starpu_node_get_kind(i) == STARPU_DISK_RAM)
 				i_disk = i;
 		}
 	}
-	
+
 	/* we have to use cpu_ram in first */
 	if (i_ram != -1)
 		src_node = i_ram;
@@ -233,55 +234,12 @@ static int worker_supports_direct_access(unsigned node, unsigned handling_node)
 		return 0;
 
 	int type = starpu_node_get_kind(node);
-	switch (type)
+	if (_node_ops[type].direct_access_supported)
+		return _node_ops[type].direct_access_supported(node, handling_node);
+	else
 	{
-		case STARPU_CUDA_RAM:
-		{
-			/* GPUs not always allow direct remote access: if CUDA4
-			 * is enabled, we allow two CUDA devices to communicate. */
-#ifdef STARPU_SIMGRID
-			if (starpu_node_get_kind(handling_node) == STARPU_CUDA_RAM)
-			{
-				msg_host_t host = _starpu_simgrid_get_memnode_host(handling_node);
-				const char* cuda_memcpy_peer = MSG_host_get_property_value(host, "memcpy_peer");
-				return cuda_memcpy_peer && atoll(cuda_memcpy_peer);
-			}
-			else
-				return 0;
-#elif defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
-			/* simgrid */
-			enum starpu_node_kind kind = starpu_node_get_kind(handling_node);
-			return kind == STARPU_CUDA_RAM;
-#else /* STARPU_HAVE_CUDA_MEMCPY_PEER */
-			/* Direct GPU-GPU transfers are not allowed in general */
-			return 0;
-#endif /* STARPU_HAVE_CUDA_MEMCPY_PEER */
-		}
-		case STARPU_OPENCL_RAM:
-			return 0;
-		case STARPU_MIC_RAM:
-			/* TODO: We don't handle direct MIC-MIC transfers yet */
-			return 0;
-		case STARPU_DISK_RAM:
-			/* Each worker can manage disks but disk <-> disk is not always allowed */
-			switch (starpu_node_get_kind(handling_node))
-			{
-				case STARPU_CPU_RAM:
-					return 1;
-				case STARPU_DISK_RAM:
-					return _starpu_disk_can_copy(node, handling_node);
-				default:
-					return 0;
-			}
-                case STARPU_MPI_MS_RAM:
-                {
-                        enum starpu_node_kind kind = starpu_node_get_kind(handling_node);
-                        return kind == STARPU_MPI_MS_RAM;
-                }
-		case STARPU_SCC_RAM:
-			return 1;
-		default:
-			return 1;
+		STARPU_ABORT_MSG("Node %d does not define the operation 'direct_access_supported'", type);
+		return 1;
 	}
 }
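
The branches of the switch removed above presumably move into the individual drivers. For example, the CUDA callback registered later in this commit as _starpu_cuda_direct_access_supported would look roughly like the following sketch, reconstructed from the deleted case (the real definition is expected to live in the CUDA driver):

/* Sketch of the CUDA callback, reconstructed from the switch case deleted
 * above; not the actual driver code. */
int _starpu_cuda_direct_access_supported(unsigned node, unsigned handling_node)
{
	(void) node;
	/* GPUs do not always allow direct remote access: if CUDA4 is enabled,
	 * two CUDA devices may communicate directly. */
#ifdef STARPU_SIMGRID
	if (starpu_node_get_kind(handling_node) == STARPU_CUDA_RAM)
	{
		msg_host_t host = _starpu_simgrid_get_memnode_host(handling_node);
		const char *cuda_memcpy_peer = MSG_host_get_property_value(host, "memcpy_peer");
		return cuda_memcpy_peer && atoll(cuda_memcpy_peer);
	}
	return 0;
#elif defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
	return starpu_node_get_kind(handling_node) == STARPU_CUDA_RAM;
#else
	/* Direct GPU-GPU transfers are not allowed in general */
	return 0;
#endif
}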
 
@@ -338,7 +296,7 @@ static unsigned chose_best_numa_between_src_and_dest(int src, int dst)
 		}
 	}
 	STARPU_ASSERT(best_numa >= 0);
-	
+
 	return best_numa;
 }
 
@@ -1098,7 +1056,7 @@ static void _starpu_fetch_task_input_cb(void *arg)
 }
 
 
-/* Synchronously or asynchronously fetch data for a given task (if it's not there already) 
+/* Synchronously or asynchronously fetch data for a given task (if it's not there already)
  * Returns the number of data acquired here.  */
 
 /* _starpu_fetch_task_input must be called before

+ 26 - 709
src/datawizard/copy_driver.c

@@ -26,13 +26,15 @@
 #include <drivers/mpi/driver_mpi_sink.h>
 #include <drivers/mpi/driver_mpi_source.h>
 #include <drivers/mpi/driver_mpi_common.h>
+#include <drivers/mic/driver_mic_source.h>
 #include <common/fxt.h>
-#include "copy_driver.h"
-#include "memalloc.h"
+#include <datawizard/copy_driver.h>
+#include <datawizard/memalloc.h>
 #include <starpu_opencl.h>
 #include <starpu_cuda.h>
 #include <profiling/profiling.h>
 #include <core/disk.h>
+#include <datawizard/node_ops.h>
 
 #ifdef STARPU_SIMGRID
 #include <core/simgrid.h>
@@ -146,8 +148,8 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 				    struct _starpu_data_replicate *dst_replicate,
 				    struct _starpu_data_request *req)
 {
-	unsigned src_node = src_replicate->memory_node;
-	unsigned dst_node = dst_replicate->memory_node;
+	unsigned src_node = (unsigned)src_replicate->memory_node;
+	unsigned dst_node = (unsigned)dst_replicate->memory_node;
 
 	STARPU_ASSERT(src_replicate->refcnt);
 	STARPU_ASSERT(dst_replicate->refcnt);
@@ -161,19 +163,8 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
 	return _starpu_simgrid_transfer(handle->ops->get_size(handle), src_node, dst_node, req);
 #else /* !SIMGRID */
-
-	int ret = 0;
-
-	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
-
 	enum starpu_node_kind src_kind = starpu_node_get_kind(src_node);
 	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);
-
-#ifdef STARPU_USE_CUDA
-	cudaError_t cures;
-	cudaStream_t stream;
-#endif
-
 	void *src_interface = src_replicate->data_interface;
 	void *dst_interface = dst_replicate->data_interface;
 
@@ -195,434 +186,14 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 	}
 #endif
 
-	switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind))
+	if (_node_ops[src_kind].copy_data_to[dst_kind])
+	{
+		return _node_ops[src_kind].copy_data_to[dst_kind](handle, src_interface, src_node, dst_interface, dst_node, req);
+	}
+	else
 	{
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
-		/* STARPU_CPU_RAM -> STARPU_CPU_RAM */
-		if (copy_methods->ram_to_ram)
-			copy_methods->ram_to_ram(src_interface, src_node, dst_interface, dst_node);
-		else
-			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req ? &req->async_channel : NULL);
-		break;
-#ifdef STARPU_USE_CUDA
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
-		/* only the proper CUBLAS thread can initiate this directly ! */
-#if !defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
-		STARPU_ASSERT(starpu_worker_get_local_memory_node() == src_node);
-#endif
-		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
-				!(copy_methods->cuda_to_ram_async || copy_methods->any_to_any))
-		{
-			/* this is not associated to a request so it's synchronous */
-			STARPU_ASSERT(copy_methods->cuda_to_ram || copy_methods->any_to_any);
-			if (copy_methods->cuda_to_ram)
-				copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
-			else
-				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-		}
-		else
-		{
-			req->async_channel.type = STARPU_CUDA_RAM;
-			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
-			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
-
-			stream = starpu_cuda_get_out_transfer_stream(src_node);
-			if (copy_methods->cuda_to_ram_async)
-				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
-			else
-			{
-				STARPU_ASSERT(copy_methods->any_to_any);
-				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-			}
-
-			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
-			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
-		}
-		break;
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
-		/* STARPU_CPU_RAM -> CUBLAS_RAM */
-		/* only the proper CUBLAS thread can initiate this ! */
-#if !defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
-		STARPU_ASSERT(starpu_worker_get_local_memory_node() == dst_node);
-#endif
-		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
-				!(copy_methods->ram_to_cuda_async || copy_methods->any_to_any))
-		{
-			/* this is not associated to a request so it's synchronous */
-			STARPU_ASSERT(copy_methods->ram_to_cuda || copy_methods->any_to_any);
-			if (copy_methods->ram_to_cuda)
-				copy_methods->ram_to_cuda(src_interface, src_node, dst_interface, dst_node);
-			else
-				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-		}
-		else
-		{
-			req->async_channel.type = STARPU_CUDA_RAM;
-			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
-			if (STARPU_UNLIKELY(cures != cudaSuccess))
-				STARPU_CUDA_REPORT_ERROR(cures);
-
-			stream = starpu_cuda_get_in_transfer_stream(dst_node);
-			if (copy_methods->ram_to_cuda_async)
-				ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
-			else
-			{
-				STARPU_ASSERT(copy_methods->any_to_any);
-				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-			}
-
-			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
-			if (STARPU_UNLIKELY(cures != cudaSuccess))
-				STARPU_CUDA_REPORT_ERROR(cures);
-		}
-		break;
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
-		/* CUDA - CUDA transfer */
-		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
-				!(copy_methods->cuda_to_cuda_async || copy_methods->any_to_any))
-		{
-			STARPU_ASSERT(copy_methods->cuda_to_cuda || copy_methods->any_to_any);
-			/* this is not associated to a request so it's synchronous */
-			if (copy_methods->cuda_to_cuda)
-				copy_methods->cuda_to_cuda(src_interface, src_node, dst_interface, dst_node);
-			else
-				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-		}
-		else
-		{
-			req->async_channel.type = STARPU_CUDA_RAM;
-			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
-			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
-
-			stream = starpu_cuda_get_peer_transfer_stream(src_node, dst_node);
-			if (copy_methods->cuda_to_cuda_async)
-				ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
-			else
-			{
-				STARPU_ASSERT(copy_methods->any_to_any);
-				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-			}
-
-			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
-			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
-		}
-		break;
-#endif
-#ifdef STARPU_USE_OPENCL
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
-		/* OpenCL -> RAM */
-		STARPU_ASSERT(starpu_worker_get_local_memory_node() == src_node);
-		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
-				!(copy_methods->opencl_to_ram_async || copy_methods->any_to_any))
-		{
-			STARPU_ASSERT(copy_methods->opencl_to_ram || copy_methods->any_to_any);
-			/* this is not associated to a request so it's synchronous */
-			if (copy_methods->opencl_to_ram)
-				copy_methods->opencl_to_ram(src_interface, src_node, dst_interface, dst_node);
-			else
-				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-		}
-		else
-		{
-			req->async_channel.type = STARPU_OPENCL_RAM;
-			if (copy_methods->opencl_to_ram_async)
-				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
-			else
-			{
-				STARPU_ASSERT(copy_methods->any_to_any);
-				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-			}
-		}
-		break;
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
-		/* STARPU_CPU_RAM -> STARPU_OPENCL_RAM */
-		STARPU_ASSERT(starpu_worker_get_local_memory_node() == dst_node);
-		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
-				!(copy_methods->ram_to_opencl_async || copy_methods->any_to_any))
-		{
-			STARPU_ASSERT(copy_methods->ram_to_opencl || copy_methods->any_to_any);
-			/* this is not associated to a request so it's synchronous */
-			if (copy_methods->ram_to_opencl)
-				copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
-			else
-				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-		}
-		else
-		{
-			req->async_channel.type = STARPU_OPENCL_RAM;
-			if (copy_methods->ram_to_opencl_async)
-				ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
-			else
-			{
-				STARPU_ASSERT(copy_methods->any_to_any);
-				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-			}
-		}
-		break;
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_OPENCL_RAM):
-		/* STARPU_OPENCL_RAM -> STARPU_OPENCL_RAM */
-		STARPU_ASSERT(starpu_worker_get_local_memory_node() == dst_node || starpu_worker_get_local_memory_node() == src_node);
-		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
-				!(copy_methods->opencl_to_opencl_async || copy_methods->any_to_any))
-		{
-			STARPU_ASSERT(copy_methods->opencl_to_opencl || copy_methods->any_to_any);
-			/* this is not associated to a request so it's synchronous */
-			if (copy_methods->opencl_to_opencl)
-				copy_methods->opencl_to_opencl(src_interface, src_node, dst_interface, dst_node);
-			else
-				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-		}
-		else
-		{
-			req->async_channel.type = STARPU_OPENCL_RAM;
-			if (copy_methods->opencl_to_opencl_async)
-				ret = copy_methods->opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
-			else
-			{
-				STARPU_ASSERT(copy_methods->any_to_any);
-				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-			}
-		}
-		break;
-#endif
-#ifdef STARPU_USE_MIC
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_MIC_RAM):
-		/* RAM -> MIC */
-		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mic_copy_disabled() ||
-				!(copy_methods->ram_to_mic_async || copy_methods->any_to_any))
-		{
-			/* this is not associated to a request so it's synchronous */
-			STARPU_ASSERT(copy_methods->ram_to_mic || copy_methods->any_to_any);
-			if (copy_methods->ram_to_mic)
-				copy_methods->ram_to_mic(src_interface, src_node, dst_interface, dst_node);
-			else
-				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-		}
-		else
-		{
-			req->async_channel.type = STARPU_MIC_RAM;
-			if (copy_methods->ram_to_mic_async)
-				ret = copy_methods->ram_to_mic_async(src_interface, src_node, dst_interface, dst_node);
-			else
-			{
-				STARPU_ASSERT(copy_methods->any_to_any);
-				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-			}
-			_starpu_mic_init_event(&(req->async_channel.event.mic_event), dst_node);
-		}
-		break;
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_MIC_RAM,STARPU_CPU_RAM):
-		/* MIC -> RAM */
-		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mic_copy_disabled() ||
-				!(copy_methods->mic_to_ram_async || copy_methods->any_to_any))
-		{
-			/* this is not associated to a request so it's synchronous */
-			STARPU_ASSERT(copy_methods->mic_to_ram || copy_methods->any_to_any);
-			if (copy_methods->mic_to_ram)
-				copy_methods->mic_to_ram(src_interface, src_node, dst_interface, dst_node);
-			else
-				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-		}
-		else
-		{
-			req->async_channel.type = STARPU_MIC_RAM;
-			if (copy_methods->mic_to_ram_async)
-				ret = copy_methods->mic_to_ram_async(src_interface, src_node, dst_interface, dst_node);
-			else
-			{
-				STARPU_ASSERT(copy_methods->any_to_any);
-				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-			}
-			_starpu_mic_init_event(&(req->async_channel.event.mic_event), src_node);
-		}
-		break;
-	/* TODO: MIC -> MIC */
-#endif
-#ifdef STARPU_USE_MPI_MASTER_SLAVE
-        case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_MPI_MS_RAM):
-                if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mpi_ms_copy_disabled() ||
-                                !(copy_methods->ram_to_mpi_ms_async || copy_methods->any_to_any))
-                {
-                        /* this is not associated to a request so it's synchronous */
-                        STARPU_ASSERT(copy_methods->ram_to_mpi_ms || copy_methods->any_to_any);
-                        if (copy_methods->ram_to_mpi_ms)
-                                copy_methods->ram_to_mpi_ms(src_interface, src_node, dst_interface, dst_node);
-                        else
-                                copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-                }
-                else
-                {
-                        req->async_channel.type = STARPU_MPI_MS_RAM;
-                        if(copy_methods->ram_to_mpi_ms_async)
-                                ret = copy_methods->ram_to_mpi_ms_async(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-                        else
-                        {
-                                STARPU_ASSERT(copy_methods->any_to_any);
-                                ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-                        }
-                }
-                break;
-
-        case _STARPU_MEMORY_NODE_TUPLE(STARPU_MPI_MS_RAM,STARPU_CPU_RAM):
-                if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mpi_ms_copy_disabled() ||
-                                !(copy_methods->mpi_ms_to_ram_async || copy_methods->any_to_any))
-                {
-                        /* this is not associated to a request so it's synchronous */
-                        STARPU_ASSERT(copy_methods->mpi_ms_to_ram || copy_methods->any_to_any);
-                        if (copy_methods->mpi_ms_to_ram)
-                                copy_methods->mpi_ms_to_ram(src_interface, src_node, dst_interface, dst_node);
-                        else
-                                copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-                }
-                else
-                {
-                        req->async_channel.type = STARPU_MPI_MS_RAM;
-                        if(copy_methods->mpi_ms_to_ram_async)
-                                ret = copy_methods->mpi_ms_to_ram_async(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-                        else
-                        {
-                                STARPU_ASSERT(copy_methods->any_to_any);
-                                ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-                        }
-                }
-                break;
-
-        case _STARPU_MEMORY_NODE_TUPLE(STARPU_MPI_MS_RAM,STARPU_MPI_MS_RAM):
-                if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mpi_ms_copy_disabled() ||
-                                !(copy_methods->mpi_ms_to_mpi_ms_async || copy_methods->any_to_any))
-                {
-                        /* this is not associated to a request so it's synchronous */
-                        STARPU_ASSERT(copy_methods->mpi_ms_to_mpi_ms || copy_methods->any_to_any);
-                        if (copy_methods->mpi_ms_to_mpi_ms)
-                                copy_methods->mpi_ms_to_mpi_ms(src_interface, src_node, dst_interface, dst_node);
-                        else
-                                copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-                }
-                else
-                {
-                        req->async_channel.type = STARPU_MPI_MS_RAM;
-                        if(copy_methods->mpi_ms_to_mpi_ms_async)
-                                ret = copy_methods->mpi_ms_to_mpi_ms_async(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-                        else
-                        {
-                                STARPU_ASSERT(copy_methods->any_to_any);
-                                ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-                        }
-                }
-                break;
-#endif
-#ifdef STARPU_USE_SCC
-		/* SCC RAM associated to the master process is considered as
-		 * the main memory node. */
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_SCC_RAM):
-		/* master private SCC RAM -> slave private SCC RAM */
-		if (copy_methods->scc_src_to_sink)
-			copy_methods->scc_src_to_sink(src_interface, src_node, dst_interface, dst_node);
-		else
-			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-		break;
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_CPU_RAM):
-		/* slave private SCC RAM -> master private SCC RAM */
-		if (copy_methods->scc_sink_to_src)
-			copy_methods->scc_sink_to_src(src_interface, src_node, dst_interface, dst_node);
-		else
-			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-		break;
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_SCC_RAM):
-		/* slave private SCC RAM -> slave private SCC RAM */
-		if (copy_methods->scc_sink_to_sink)
-			copy_methods->scc_sink_to_sink(src_interface, src_node, dst_interface, dst_node);
-		else
-			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-		break;
-#endif
-
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_DISK_RAM):
-                if (req && !starpu_asynchronous_copy_disabled())
-                {
-                        req->async_channel.type = STARPU_DISK_RAM;
-                        req->async_channel.event.disk_event.requests = NULL;
-                        req->async_channel.event.disk_event.ptr = NULL;
-                        req->async_channel.event.disk_event.handle = NULL;
-                }
-		if(copy_methods->any_to_any)
-			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);
-
-		else
-		{
-			void *obj = starpu_data_handle_to_pointer(handle, dst_node);
-			void * ptr = NULL;
-			starpu_ssize_t size = 0;
-			handle->ops->pack_data(handle, src_node, &ptr, &size);
-			ret = _starpu_disk_full_write(src_node, dst_node, obj, ptr, size, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);
-			if (ret == 0)
-			{
-				/* write is already finished, ptr was allocated in pack_data */
-				_starpu_free_flags_on_node(src_node, ptr, size, 0);
-			}
-			else if (ret == -EAGAIN)
-			{
-				STARPU_ASSERT(req);
-				req->async_channel.event.disk_event.ptr = ptr;
-				req->async_channel.event.disk_event.node = src_node;
-				req->async_channel.event.disk_event.size = size;
-			}
-
-			STARPU_ASSERT(ret == 0 || ret == -EAGAIN);
-		}
-		break;
-
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_DISK_RAM,STARPU_CPU_RAM):
-                if (req && !starpu_asynchronous_copy_disabled())
-                {
-                        req->async_channel.type = STARPU_DISK_RAM;
-                        req->async_channel.event.disk_event.requests = NULL;
-                        req->async_channel.event.disk_event.ptr = NULL;
-                        req->async_channel.event.disk_event.handle = NULL;
-                }
-		if(copy_methods->any_to_any)
-			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req && !starpu_asynchronous_copy_disabled()  ? &req->async_channel : NULL);
-		else
-		{
-			void *obj = starpu_data_handle_to_pointer(handle, src_node);
-			void * ptr = NULL;
-			size_t size = 0;
-			ret = _starpu_disk_full_read(src_node, dst_node, obj, &ptr, &size, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);
-			if (ret == 0)
-			{
-				/* read is already finished, we can already unpack */
-				handle->ops->unpack_data(handle, dst_node, ptr, size);
-			}
-			else if (ret == -EAGAIN)
-			{
-				STARPU_ASSERT(req);
-				req->async_channel.event.disk_event.ptr = ptr;
-				req->async_channel.event.disk_event.node = dst_node;
-				req->async_channel.event.disk_event.size = size;
-				req->async_channel.event.disk_event.handle = handle;
-			}
-
-			STARPU_ASSERT(ret == 0 || ret == -EAGAIN);
-		}
-		break;
-
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_DISK_RAM,STARPU_DISK_RAM):
-                if (req && !starpu_asynchronous_copy_disabled())
-                {
-                        req->async_channel.type = STARPU_DISK_RAM;
-                        req->async_channel.event.disk_event.requests = NULL;
-                        req->async_channel.event.disk_event.ptr = NULL;
-                        req->async_channel.event.disk_event.handle = NULL;
-                }
-		ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);
-		break;
-
-	default:
 		STARPU_ABORT();
-		break;
 	}
-
-	return ret;
 #endif /* !SIMGRID */
 }
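
Each tuple case of the deleted switch presumably becomes one entry of the copy_data_to table in the corresponding driver. As an example, the CPU-to-CPU case registered below as _starpu_cpu_copy_data_to_cpu would look roughly like this sketch (signature inferred from the dispatch call above):

/* Sketch of the CPU->CPU callback, reconstructed from the deleted
 * STARPU_CPU_RAM -> STARPU_CPU_RAM case above; not the actual driver code. */
int _starpu_cpu_copy_data_to_cpu(starpu_data_handle_t handle,
				 void *src_interface, unsigned src_node,
				 void *dst_interface, unsigned dst_node,
				 struct _starpu_data_request *req)
{
	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;

	if (copy_methods->ram_to_ram)
		copy_methods->ram_to_ram(src_interface, src_node, dst_interface, dst_node);
	else
		copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node,
					 req ? &req->async_channel : NULL);
	return 0;
}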
 
@@ -728,155 +299,20 @@ void starpu_interface_end_driver_copy_async(unsigned src_node, unsigned dst_node
 int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, void *async_data)
 {
 	struct _starpu_async_channel *async_channel = async_data;
-
 	enum starpu_node_kind src_kind = starpu_node_get_kind(src_node);
-	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);
-
-	switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind))
-	{
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
-		memcpy((void *) (dst + dst_offset), (void *) (src + src_offset), size);
-		return 0;
-
-#ifdef STARPU_USE_CUDA
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
-		return starpu_cuda_copy_async_sync(
-				(void*) (src + src_offset), src_node,
-				(void*) (dst + dst_offset), dst_node,
-				size,
-				async_channel?starpu_cuda_get_out_transfer_stream(src_node):NULL,
-				cudaMemcpyDeviceToHost);
-
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
-		return starpu_cuda_copy_async_sync(
-				(void*) (src + src_offset), src_node,
-				(void*) (dst + dst_offset), dst_node,
-				size,
-				async_channel?starpu_cuda_get_in_transfer_stream(dst_node):NULL,
-				cudaMemcpyHostToDevice);
-
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
-		return starpu_cuda_copy_async_sync(
-				(void*) (src + src_offset), src_node,
-				(void*) (dst + dst_offset), dst_node,
-				size,
-				async_channel?starpu_cuda_get_peer_transfer_stream(src_node, dst_node):NULL,
-				cudaMemcpyDeviceToDevice);
-
-#endif
-#ifdef STARPU_USE_OPENCL
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_OPENCL_RAM):
-		return starpu_opencl_copy_async_sync(
-				src, src_offset, src_node,
-				dst, dst_offset, dst_node,
-				size,
-				&async_channel->event.opencl_event);
-#endif
-#ifdef STARPU_USE_MIC
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_MIC_RAM,STARPU_CPU_RAM):
-		if (async_data)
-			return _starpu_mic_copy_mic_to_ram_async(
-					(void*) (src + src_offset), src_node,
-					(void*) (dst + dst_offset), dst_node,
-					size);
-		else
-			return _starpu_mic_copy_mic_to_ram(
-					(void*) (src + src_offset), src_node,
-					(void*) (dst + dst_offset), dst_node,
-					size);
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_MIC_RAM):
-		if (async_data)
-			return _starpu_mic_copy_ram_to_mic_async(
-					(void*) (src + src_offset), src_node,
-					(void*) (dst + dst_offset), dst_node,
-					size);
-		else
-			return _starpu_mic_copy_ram_to_mic(
-					(void*) (src + src_offset), src_node,
-					(void*) (dst + dst_offset), dst_node,
-					size);
-	/* TODO: MIC->MIC */
-#endif
-#ifdef STARPU_USE_SCC
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_CPU_RAM):
-		return _starpu_scc_copy_sink_to_src(
-				(void*) (src + src_offset), src_node,
-				(void*) (dst + dst_offset), dst_node,
-				size);
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_SCC_RAM):
-		return _starpu_scc_copy_src_to_sink(
-				(void*) (src + src_offset), src_node,
-				(void*) (dst + dst_offset), dst_node,
-				size);
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_SCC_RAM):
-		return _starpu_scc_copy_sink_to_sink(
-				(void*) (src + src_offset), src_node,
-				(void*) (dst + dst_offset), dst_node,
-				size);
-#endif
-#ifdef STARPU_USE_MPI_MASTER_SLAVE
-        case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM, STARPU_MPI_MS_RAM):
-                if (async_data)
-                        return _starpu_mpi_copy_ram_to_mpi_async(
-                                        (void*) (src + src_offset), src_node,
-                                        (void*) (dst + dst_offset), dst_node,
-                                        size, async_data);
-                else
-                        return _starpu_mpi_copy_ram_to_mpi_sync(
-                                        (void*) (src + src_offset), src_node,
-                                        (void*) (dst + dst_offset), dst_node,
-                                        size);
-        case _STARPU_MEMORY_NODE_TUPLE(STARPU_MPI_MS_RAM, STARPU_CPU_RAM):
-                if (async_data)
-                        return _starpu_mpi_copy_mpi_to_ram_async(
-                                        (void*) (src + src_offset), src_node,
-                                        (void*) (dst + dst_offset), dst_node,
-                                        size, async_data);
-                else
-                        return _starpu_mpi_copy_mpi_to_ram_sync(
-                                        (void*) (src + src_offset), src_node,
-                                        (void*) (dst + dst_offset), dst_node,
-                                        size);
-
-        case _STARPU_MEMORY_NODE_TUPLE(STARPU_MPI_MS_RAM, STARPU_MPI_MS_RAM):
-                if (async_data)
-                        return _starpu_mpi_copy_sink_to_sink_async(
-                                        (void*) (src + src_offset), src_node,
-                                        (void*) (dst + dst_offset), dst_node,
-                                        size, async_data);
-                else
-                        return _starpu_mpi_copy_sink_to_sink_sync(
-                                        (void*) (src + src_offset), src_node,
-                                        (void*) (dst + dst_offset), dst_node,
-                                        size);
-#endif
 
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM, STARPU_DISK_RAM):
+	if (_node_ops[src_kind].copy_interface)
 	{
-		return _starpu_disk_copy_src_to_disk(
-			(void*) (src + src_offset), src_node,
-			(void*) dst, dst_offset, dst_node,
-			size, async_channel);
+		return _node_ops[src_kind].copy_interface(src, src_offset, src_node,
+							  dst, dst_offset, dst_node,
+							  size,
+							  async_channel);
 	}
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_DISK_RAM, STARPU_CPU_RAM):
-		return _starpu_disk_copy_disk_to_src(
-			(void*) src, src_offset, src_node,
-			(void*) (dst + dst_offset), dst_node,
-			size, async_channel);
-
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_DISK_RAM, STARPU_DISK_RAM):
-		return _starpu_disk_copy_disk_to_disk(
-			(void*) src, src_offset, src_node,
-			(void*) dst, dst_offset, dst_node,
-			size, async_channel);
-
-	default:
+	else
+	{
 		STARPU_ABORT();
 		return -1;
 	}
-	return 0;
 }
 
 void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel)
@@ -885,70 +321,13 @@ void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_
 	_starpu_simgrid_wait_transfer_event(&async_channel->event);
 #else /* !SIMGRID */
 	enum starpu_node_kind kind = async_channel->type;
-#ifdef STARPU_USE_CUDA
-	cudaEvent_t event;
-	cudaError_t cures;
-#endif
-
-	switch (kind)
-	{
-#ifdef STARPU_USE_CUDA
-	case STARPU_CUDA_RAM:
-		event = (*async_channel).event.cuda_event;
 
-		cures = cudaEventSynchronize(event);
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-
-		cures = cudaEventDestroy(event);
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-
-		break;
-#endif
-#ifdef STARPU_USE_OPENCL
-	case STARPU_OPENCL_RAM:
+	if (_node_ops[kind].wait_request_completion != NULL)
 	{
-		cl_int err;
-		if ((*async_channel).event.opencl_event == NULL)
-			STARPU_ABORT();
-		err = clWaitForEvents(1, &((*async_channel).event.opencl_event));
-		if (STARPU_UNLIKELY(err != CL_SUCCESS))
-			STARPU_OPENCL_REPORT_ERROR(err);
-		err = clReleaseEvent((*async_channel).event.opencl_event);
-		if (STARPU_UNLIKELY(err != CL_SUCCESS))
-			STARPU_OPENCL_REPORT_ERROR(err);
-	      break;
+		_node_ops[kind].wait_request_completion(async_channel);
 	}
-#endif
-#ifdef STARPU_USE_MIC
-	case STARPU_MIC_RAM:
-		_starpu_mic_wait_request_completion(&(async_channel->event.mic_event));
-		break;
-#endif
-#ifdef STARPU_USE_MPI_MASTER_SLAVE
-        case STARPU_MPI_MS_RAM:
-                _starpu_mpi_common_wait_event(async_channel);
-                break;
-#endif
-	case STARPU_DISK_RAM:
-		starpu_disk_wait_request(async_channel);
-		if (async_channel->event.disk_event.ptr != NULL)
-		{
-			if (async_channel->event.disk_event.handle != NULL)
-			{
-				/* read is finished, we can already unpack */
-				async_channel->event.disk_event.handle->ops->unpack_data(async_channel->event.disk_event.handle, async_channel->event.disk_event.node, async_channel->event.disk_event.ptr, async_channel->event.disk_event.size);
-			}
-			else
-			{
-				/* write is finished, ptr was allocated in pack_data */
-				_starpu_free_flags_on_node(async_channel->event.disk_event.node, async_channel->event.disk_event.ptr, async_channel->event.disk_event.size, 0);
-			}
-		}
-		break;
-	case STARPU_CPU_RAM:
-	default:
+	else
+	{
 		STARPU_ABORT();
 	}
 #endif /* !SIMGRID */
@@ -960,76 +339,14 @@ unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *as
 	return _starpu_simgrid_test_transfer_event(&async_channel->event);
 #else /* !SIMGRID */
 	enum starpu_node_kind kind = async_channel->type;
-	unsigned success = 0;
-#ifdef STARPU_USE_CUDA
-	cudaEvent_t event;
-#endif
-
-	switch (kind)
-	{
-#ifdef STARPU_USE_CUDA
-	case STARPU_CUDA_RAM:
-		event = (*async_channel).event.cuda_event;
-		cudaError_t cures = cudaEventQuery(event);
 
-		success = (cures == cudaSuccess);
-		if (success)
-			cudaEventDestroy(event);
-		else if (cures != cudaErrorNotReady)
-			STARPU_CUDA_REPORT_ERROR(cures);
-		break;
-#endif
-#ifdef STARPU_USE_OPENCL
-	case STARPU_OPENCL_RAM:
+	if (_node_ops[kind].test_request_completion != NULL)
 	{
-		cl_int event_status;
-		cl_event opencl_event = (*async_channel).event.opencl_event;
-		if (opencl_event == NULL) STARPU_ABORT();
-		cl_int err = clGetEventInfo(opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
-		if (STARPU_UNLIKELY(err != CL_SUCCESS))
-			STARPU_OPENCL_REPORT_ERROR(err);
-		if (event_status < 0)
-			STARPU_OPENCL_REPORT_ERROR(event_status);
-		if (event_status == CL_COMPLETE)
-		{
-			err = clReleaseEvent(opencl_event);
-			if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
-		}
-		success = (event_status == CL_COMPLETE);
-		break;
+		return _node_ops[kind].test_request_completion(async_channel);
 	}
-#endif
-#ifdef STARPU_USE_MIC
-	case STARPU_MIC_RAM:
-		success = _starpu_mic_request_is_complete(&(async_channel->event.mic_event));
-		break;
-#endif
-#ifdef STARPU_USE_MPI_MASTER_SLAVE
-        case STARPU_MPI_MS_RAM:
-                success = _starpu_mpi_common_test_event(async_channel);
-                break;
-#endif
-	case STARPU_DISK_RAM:
-		success = starpu_disk_test_request(async_channel);
-		if (async_channel->event.disk_event.ptr != NULL && success)
-		{
-			if (async_channel->event.disk_event.handle != NULL)
-			{
-				/* read is finished, we can already unpack */
-				async_channel->event.disk_event.handle->ops->unpack_data(async_channel->event.disk_event.handle, async_channel->event.disk_event.node, async_channel->event.disk_event.ptr, async_channel->event.disk_event.size);
-			}
-			else
-			{
-				/* write is finished, ptr was allocated in pack_data */
-				_starpu_free_flags_on_node(async_channel->event.disk_event.node, async_channel->event.disk_event.ptr, async_channel->event.disk_event.size, 0);
-			}
-		}
-		break;
-	case STARPU_CPU_RAM:
-	default:
+	else
+	{
 		STARPU_ABORT_MSG("Memory is not recognized (kind %d) \n", kind);
 	}
-
-	return success;
 #endif /* !SIMGRID */
 }

+ 17 - 229
src/datawizard/malloc.c

@@ -28,6 +28,7 @@
 #include <datawizard/memory_manager.h>
 #include <datawizard/memory_nodes.h>
 #include <datawizard/malloc.h>
+#include <datawizard/node_ops.h>
 #include <core/simgrid.h>
 #include <core/task.h>
 
@@ -578,15 +579,10 @@ static starpu_pthread_mutex_t cuda_alloc_mutex = STARPU_PTHREAD_MUTEX_INITIALIZE
 static starpu_pthread_mutex_t opencl_alloc_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
 #endif
 
-static uintptr_t
-_starpu_malloc_on_node(unsigned dst_node, size_t size, int flags)
+static uintptr_t _starpu_malloc_on_node(unsigned dst_node, size_t size, int flags)
 {
 	uintptr_t addr = 0;
 
-#if defined(STARPU_USE_CUDA) && !defined(STARPU_SIMGRID)
-	cudaError_t status;
-#endif
-
 	/* Handle count first */
 	if (flags & STARPU_MALLOC_COUNT)
 	{
@@ -596,130 +592,11 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size, int flags)
 		flags &= ~STARPU_MALLOC_COUNT;
 	}
 
-	switch(starpu_node_get_kind(dst_node))
-	{
-		case STARPU_CPU_RAM:
-		{
-			_starpu_malloc_flags_on_node(dst_node, (void**) &addr, size,
-#if defined(STARPU_USE_CUDA) && !defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
-					/* without memcpy_peer, we can not
-					 * allocated pinned memory, since it
-					 * requires waiting for a task, and we
-					 * may be called with a spinlock held
-					 */
-					flags & ~STARPU_MALLOC_PINNED
-#else
-					flags
-#endif
-					);
-			break;
-		}
-#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
-		case STARPU_CUDA_RAM:
-		{
-#ifdef STARPU_SIMGRID
-			static uintptr_t last[STARPU_MAXNODES];
-#ifdef STARPU_DEVEL
-#warning TODO: record used memory, using a simgrid property to know the available memory
-#endif
-			/* Sleep for the allocation */
-			STARPU_PTHREAD_MUTEX_LOCK(&cuda_alloc_mutex);
-			if (_starpu_simgrid_cuda_malloc_cost())
-				MSG_process_sleep(0.000175);
-			if (!last[dst_node])
-				last[dst_node] = 1<<10;
-			addr = last[dst_node];
-			last[dst_node]+=size;
-			STARPU_ASSERT(last[dst_node] >= addr);
-			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
-#else
-			unsigned devid = starpu_memory_node_get_devid(dst_node);
-#if defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
-			starpu_cuda_set_device(devid);
-#else
-			struct _starpu_worker *worker = _starpu_get_local_worker_key();
-			if (!worker || worker->arch != STARPU_CUDA_WORKER || worker->devid != devid)
-				STARPU_ASSERT_MSG(0, "CUDA peer access is not available with this version of CUDA");
-#endif
-			/* Check if there is free memory */
-			size_t cuda_mem_free, cuda_mem_total;
-			status = cudaMemGetInfo(&cuda_mem_free, &cuda_mem_total);
-			if (status == cudaSuccess &&
-					cuda_mem_free < (size*2))
-			{
-					addr = 0;
-					break;
-			}
-			status = cudaMalloc((void **)&addr, size);
-			if (!addr || (status != cudaSuccess))
-			{
-				if (STARPU_UNLIKELY(status != cudaErrorMemoryAllocation))
-					STARPU_CUDA_REPORT_ERROR(status);
-				addr = 0;
-			}
-#endif
-			break;
-		}
-#endif
-#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
-	        case STARPU_OPENCL_RAM:
-		{
-#ifdef STARPU_SIMGRID
-				static uintptr_t last[STARPU_MAXNODES];
-				/* Sleep for the allocation */
-				STARPU_PTHREAD_MUTEX_LOCK(&opencl_alloc_mutex);
-				if (_starpu_simgrid_cuda_malloc_cost())
-					MSG_process_sleep(0.000175);
-				if (!last[dst_node])
-					last[dst_node] = 1<<10;
-				addr = last[dst_node];
-				last[dst_node]+=size;
-				STARPU_ASSERT(last[dst_node] >= addr);
-				STARPU_PTHREAD_MUTEX_UNLOCK(&opencl_alloc_mutex);
-#else
-                                int ret;
-				cl_mem ptr;
-
-				ret = starpu_opencl_allocate_memory(starpu_memory_node_get_devid(dst_node), &ptr, size, CL_MEM_READ_WRITE);
-				if (ret)
-				{
-					addr = 0;
-				}
-				else
-				{
-					addr = (uintptr_t)ptr;
-				}
-#endif
-			break;
-		}
-#endif
-	        case STARPU_DISK_RAM:
-		{
-			addr = (uintptr_t) _starpu_disk_alloc(dst_node, size);
-			break;
-		}
-
-#ifdef STARPU_USE_MIC
-		case STARPU_MIC_RAM:
-			if (_starpu_mic_allocate_memory((void **)(&addr), size, dst_node))
-				addr = 0;
-			break;
-#endif
-#ifdef STARPU_USE_MPI_MASTER_SLAVE
-		case STARPU_MPI_MS_RAM:
-			if (_starpu_mpi_src_allocate_memory((void **)(&addr), size, dst_node))
-				addr = 0;
-			break;
-#endif
-#ifdef STARPU_USE_SCC
-		case STARPU_SCC_RAM:
-			if (_starpu_scc_allocate_memory((void **)(&addr), size, dst_node))
-				addr = 0;
-			break;
-#endif
-		default:
-			STARPU_ABORT();
-	}
+	enum starpu_node_kind node_kind = starpu_node_get_kind(dst_node);
+	if (_node_ops[node_kind].malloc_on_node)
+		return _node_ops[node_kind].malloc_on_node(dst_node, size, flags);
+	else
+		STARPU_ABORT_MSG("No malloc_on_node function defined for node %s\n", _starpu_node_get_prefix(node_kind));
 
 	if (addr == 0)
 	{
@@ -730,108 +607,19 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size, int flags)
 	return addr;
 }
 
-void
-_starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int flags)
+void _starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int flags)
 {
 	int count = flags & STARPU_MALLOC_COUNT;
 	flags &= ~STARPU_MALLOC_COUNT;
-	enum starpu_node_kind kind = starpu_node_get_kind(dst_node);
-	switch(kind)
-	{
-		case STARPU_CPU_RAM:
-			_starpu_free_flags_on_node(dst_node, (void*)addr, size,
-#if defined(STARPU_USE_CUDA) && !defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
-						   flags & ~STARPU_MALLOC_PINNED
-#else
-						   flags
-#endif
-				);
-			break;
-#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
-		case STARPU_CUDA_RAM:
-		{
-#ifdef STARPU_SIMGRID
-			STARPU_PTHREAD_MUTEX_LOCK(&cuda_alloc_mutex);
-			/* Sleep for the free */
-			if (_starpu_simgrid_cuda_malloc_cost())
-				MSG_process_sleep(0.000750);
-			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
-			/* CUDA also synchronizes roughly everything on cudaFree */
-			_starpu_simgrid_sync_gpus();
-#else
-			cudaError_t err;
-			unsigned devid = starpu_memory_node_get_devid(dst_node);
-#if defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
-			starpu_cuda_set_device(devid);
-#else
-			struct _starpu_worker *worker = _starpu_get_local_worker_key();
-			if (!worker || worker->arch != STARPU_CUDA_WORKER || worker->devid != devid)
-				STARPU_ASSERT_MSG(0, "CUDA peer access is not available with this version of CUDA");
-#endif /* STARPU_HAVE_CUDA_MEMCPY_PEER */
-			err = cudaFree((void*)addr);
-#ifdef STARPU_OPENMP
-			/* When StarPU is used as Open Runtime support,
-			 * starpu_omp_shutdown() will usually be called from a
-			 * destructor, in which case cudaThreadExit() reports a
-			 * cudaErrorCudartUnloading here. There should not
-			 * be any remaining tasks running at this point so
-			 * we can probably ignore it without much consequences. */
-			if (STARPU_UNLIKELY(err != cudaSuccess && err != cudaErrorCudartUnloading))
-				STARPU_CUDA_REPORT_ERROR(err);
-#else
-			if (STARPU_UNLIKELY(err != cudaSuccess))
-				STARPU_CUDA_REPORT_ERROR(err);
-#endif /* STARPU_OPENMP */
-#endif /* STARPU_SIMGRID */
-			break;
-		}
-#endif /* STARPU_USE_CUDA || STARPU_SIMGRID */
 
-#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
-                case STARPU_OPENCL_RAM:
-		{
-#ifdef STARPU_SIMGRID
-			STARPU_PTHREAD_MUTEX_LOCK(&opencl_alloc_mutex);
-			/* Sleep for the free */
-			if (_starpu_simgrid_cuda_malloc_cost())
-				MSG_process_sleep(0.000750);
-			STARPU_PTHREAD_MUTEX_UNLOCK(&opencl_alloc_mutex);
-#else
-			cl_int err;
-                        err = clReleaseMemObject((void*)addr);
-			if (STARPU_UNLIKELY(err != CL_SUCCESS))
-				STARPU_OPENCL_REPORT_ERROR(err);
-#endif
-                        break;
-		}
-#endif
-	        case STARPU_DISK_RAM:
-		{
-			_starpu_disk_free (dst_node, (void *) addr , size);
-			break;
-		}
+	enum starpu_node_kind node_kind = starpu_node_get_kind(dst_node);
+	if (_node_ops[node_kind].free_on_node)
+		_node_ops[node_kind].free_on_node(dst_node, addr, size, flags);
+	else
+		STARPU_ABORT_MSG("No free_on_node function defined for node %s\n", _starpu_node_get_prefix(node_kind));
 
-#ifdef STARPU_USE_MIC
-		case STARPU_MIC_RAM:
-			_starpu_mic_free_memory((void*) addr, size, dst_node);
-			break;
-#endif
-#ifdef STARPU_USE_MPI_MASTER_SLAVE
-        case STARPU_MPI_MS_RAM:
-            _starpu_mpi_source_free_memory((void*) addr, dst_node);
-            break;
-#endif
-#ifdef STARPU_USE_SCC
-		case STARPU_SCC_RAM:
-			_starpu_scc_free_memory((void *) addr, dst_node);
-			break;
-#endif
-		default:
-			STARPU_ABORT();
-	}
 	if (count)
 		starpu_memory_deallocate(dst_node, size);
-
 }
 
 int
@@ -983,10 +771,10 @@ static struct _starpu_chunk *_starpu_new_chunk(unsigned dst_node, int flags)
 static int _starpu_malloc_should_suballoc(unsigned dst_node, size_t size, int flags)
 {
 	return size <= CHUNK_ALLOC_MAX &&
-			(   starpu_node_get_kind(dst_node) == STARPU_CUDA_RAM
-			|| (starpu_node_get_kind(dst_node) == STARPU_CPU_RAM
-			    && _starpu_malloc_should_pin(flags))
-			);
+		(starpu_node_get_kind(dst_node) == STARPU_CUDA_RAM
+		 || (starpu_node_get_kind(dst_node) == STARPU_CPU_RAM
+		     && _starpu_malloc_should_pin(flags))
+		 );
 }
 
 uintptr_t

+ 4 - 34
src/datawizard/memory_nodes.c

@@ -23,8 +23,9 @@
 #include <datawizard/memory_nodes.h>
 #include <datawizard/malloc.h>
 #include <common/fxt.h>
-#include "copy_driver.h"
-#include "memalloc.h"
+#include <datawizard/copy_driver.h>
+#include <datawizard/memalloc.h>
+#include <datawizard/node_ops.h>
 
 char _starpu_worker_drives_memory[STARPU_NMAXWORKERS][STARPU_MAXNODES];
 
@@ -75,38 +76,7 @@ unsigned starpu_memory_nodes_get_count(void)
 
 int starpu_memory_node_get_name(unsigned node, char *name, size_t size)
 {
-	const char *prefix;
-	switch (_starpu_descr.nodes[node])
-	{
-	case STARPU_CPU_RAM:
-		prefix = "NUMA";
-		break;
-	case STARPU_CUDA_RAM:
-		prefix = "CUDA";
-		break;
-	case STARPU_OPENCL_RAM:
-		prefix = "OpenCL";
-		break;
-	case STARPU_DISK_RAM:
-		prefix = "Disk";
-		break;
-	case STARPU_MIC_RAM:
-		prefix = "MIC";
-		break;
-	case STARPU_MPI_MS_RAM:
-		prefix = "MPI_MS";
-		break;
-	case STARPU_SCC_RAM:
-		prefix = "SCC_RAM";
-		break;
-	case STARPU_SCC_SHM:
-		prefix = "SCC_shared";
-		break;
-	case STARPU_UNUSED:
-	default:
-		prefix = "unknown";
-		STARPU_ASSERT(0);
-	}
+	const char *prefix = _starpu_node_get_prefix(_starpu_descr.nodes[node]);
 	return snprintf(name, size, "%s %d", prefix, _starpu_descr.devid[node]);
 }
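
_starpu_node_get_prefix(), used here and in malloc.c above, is presumably defined in the new datawizard/node_ops.c (its listing is truncated at the end of this page). Reconstructed from the switch deleted above, a sketch would be:

/* Sketch of _starpu_node_get_prefix(), reconstructed from the deleted switch;
 * the actual definition presumably lives in datawizard/node_ops.c. */
const char *_starpu_node_get_prefix(enum starpu_node_kind kind)
{
	switch (kind)
	{
		case STARPU_CPU_RAM:
			return "NUMA";
		case STARPU_CUDA_RAM:
			return "CUDA";
		case STARPU_OPENCL_RAM:
			return "OpenCL";
		case STARPU_DISK_RAM:
			return "Disk";
		case STARPU_MIC_RAM:
			return "MIC";
		case STARPU_MPI_MS_RAM:
			return "MPI_MS";
		case STARPU_SCC_RAM:
			return "SCC_RAM";
		case STARPU_SCC_SHM:
			return "SCC_shared";
		case STARPU_UNUSED:
		default:
			STARPU_ASSERT(0);
			return "unknown";
	}
}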
 

+ 0 - 5
src/datawizard/memory_nodes.h

@@ -30,10 +30,6 @@
 #include <core/simgrid.h>
 #endif
 
-#define _STARPU_MEMORY_NODE_TUPLE(node1,node2) (node1 | (node2 << 4))
-#define _STARPU_MEMORY_NODE_TUPLE_FIRST(tuple) (tuple & 0x0F)
-#define _STARPU_MEMORY_NODE_TUPLE_SECOND(tuple) (tuple & 0xF0)
-
 extern char _starpu_worker_drives_memory[STARPU_NMAXWORKERS][STARPU_MAXNODES];
 
 struct _starpu_cond_and_worker
@@ -68,7 +64,6 @@ struct _starpu_memory_node_descr
 	/* the number of queues attached to each node */
 	unsigned total_condition_count;
 	unsigned condition_count[STARPU_MAXNODES];
-
 };
 
 extern struct _starpu_memory_node_descr _starpu_descr;

+ 137 - 0
src/datawizard/node_ops.c

@@ -0,0 +1,137 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2019                                     CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <common/config.h>
+
+#include <datawizard/node_ops.h>
+#include <drivers/cpu/driver_cpu.h>
+#include <drivers/cuda/driver_cuda.h>
+#include <drivers/mpi/driver_mpi_sink.h>
+#include <drivers/mpi/driver_mpi_source.h>
+#include <drivers/mpi/driver_mpi_common.h>
+#include <drivers/mic/driver_mic_source.h>
+#include <drivers/disk/driver_disk.h>
+
+struct _starpu_node_ops _node_ops[STARPU_MPI_MS_RAM+1];
+
+void _starpu_node_ops_init()
+{
+	memset(_node_ops, 0, sizeof(_node_ops));
+
+	// CPU
+	// CPU_RAM does not define wait_event operation
+	_node_ops[STARPU_CPU_RAM].copy_data_to[STARPU_CPU_RAM] = _starpu_cpu_copy_data_to_cpu;
+	_node_ops[STARPU_CPU_RAM].copy_data_to[STARPU_CUDA_RAM] = _starpu_cpu_copy_data_to_cuda;
+	_node_ops[STARPU_CPU_RAM].copy_data_to[STARPU_OPENCL_RAM] = _starpu_cpu_copy_data_to_opencl;
+	_node_ops[STARPU_CPU_RAM].copy_data_to[STARPU_DISK_RAM] = _starpu_cpu_copy_data_to_disk;
+	_node_ops[STARPU_CPU_RAM].copy_data_to[STARPU_MPI_MS_RAM] = _starpu_cpu_copy_data_to_mpi_ms;
+	_node_ops[STARPU_CPU_RAM].copy_data_to[STARPU_SCC_RAM] = _starpu_cpu_copy_data_to_scc;
+	_node_ops[STARPU_CPU_RAM].copy_data_to[STARPU_MIC_RAM] = _starpu_cpu_copy_data_to_mic;
+	_node_ops[STARPU_CPU_RAM].copy_interface = _starpu_cpu_copy_interface;
+	_node_ops[STARPU_CPU_RAM].direct_access_supported = _starpu_cpu_direct_access_supported;
+	_node_ops[STARPU_CPU_RAM].malloc_on_node = _starpu_cpu_malloc_on_node;
+	_node_ops[STARPU_CPU_RAM].free_on_node = _starpu_cpu_free_on_node;
+
+#ifdef STARPU_USE_CUDA
+	_node_ops[STARPU_CUDA_RAM].wait_request_completion = _starpu_cuda_wait_request_completion;
+	_node_ops[STARPU_CUDA_RAM].test_request_completion = _starpu_cuda_test_request_completion;
+	_node_ops[STARPU_CUDA_RAM].copy_data_to[STARPU_CUDA_RAM] = _starpu_cuda_copy_data_to_cuda;
+	_node_ops[STARPU_CUDA_RAM].copy_data_to[STARPU_CPU_RAM] = _starpu_cuda_copy_data_to_cpu;
+	_node_ops[STARPU_CUDA_RAM].copy_interface = _starpu_cuda_copy_interface;
+	_node_ops[STARPU_CUDA_RAM].direct_access_supported = _starpu_cuda_direct_access_supported;
+	_node_ops[STARPU_CUDA_RAM].malloc_on_node = _starpu_cuda_malloc_on_node;
+	_node_ops[STARPU_CUDA_RAM].free_on_node = _starpu_cuda_free_on_node;
+#endif
+
+#ifdef STARPU_USE_OPENCL
+	_node_ops[STARPU_OPENCL_RAM].wait_request_completion = _starpu_opencl_wait_request_completion;
+	_node_ops[STARPU_OPENCL_RAM].test_request_completion = _starpu_opencl_test_request_completion;
+	_node_ops[STARPU_OPENCL_RAM].copy_data_to[STARPU_OPENCL_RAM] = _starpu_opencl_copy_data_to_opencl;
+	_node_ops[STARPU_OPENCL_RAM].copy_data_to[STARPU_CPU_RAM] = _starpu_opencl_copy_data_to_cpu;
+	_node_ops[STARPU_OPENCL_RAM].copy_interface = _starpu_opencl_copy_interface;
+	_node_ops[STARPU_OPENCL_RAM].direct_access_supported = _starpu_opencl_direct_access_supported;
+	_node_ops[STARPU_OPENCL_RAM].malloc_on_node = _starpu_opencl_malloc_on_node;
+	_node_ops[STARPU_OPENCL_RAM].free_on_node = _starpu_opencl_free_on_node;
+#endif
+
+#ifdef STARPU_USE_MIC
+	_node_ops[STARPU_MIC_RAM].wait_request_completion = _starpu_mic_wait_request_completion;
+	_node_ops[STARPU_MIC_RAM].test_request_completion = _starpu_mic_test_request_completion;
+	_node_ops[STARPU_MIC_RAM].copy_data_to[STARPU_CPU_RAM] = _starpu_mic_copy_data_to_cpu;
+	/* TODO: MIC -> MIC */
+	_node_ops[STARPU_MIC_RAM].copy_interface = _starpu_mic_copy_interface;
+	_node_ops[STARPU_MIC_RAM].direct_access_supported = _starpu_mic_direct_access_supported;
+	_node_ops[STARPU_MIC_RAM].malloc_on_node = _starpu_mic_malloc_on_node;
+	_node_ops[STARPU_MIC_RAM].free_on_node = _starpu_mic_free_on_node;
+#endif
+
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+	_node_ops[STARPU_MPI_MS_RAM].wait_request_completion = _starpu_mpi_common_wait_request_completion;
+	_node_ops[STARPU_MPI_MS_RAM].test_request_completion = _starpu_mpi_common_test_event;
+	_node_ops[STARPU_MPI_MS_RAM].copy_data_to[STARPU_CPU_RAM] = _starpu_mpi_common_copy_data_to_cpu;
+	_node_ops[STARPU_MPI_MS_RAM].copy_data_to[STARPU_MPI_MS_RAM] = _starpu_mpi_common_copy_data_to_mpi;
+	_node_ops[STARPU_MPI_MS_RAM].copy_interface = _starpu_mpi_copy_interface;
+	_node_ops[STARPU_MPI_MS_RAM].direct_access_supported = _starpu_mpi_direct_access_supported;
+	_node_ops[STARPU_MPI_MS_RAM].malloc_on_node = _starpu_mpi_malloc_on_node;
+	_node_ops[STARPU_MPI_MS_RAM].free_on_node = _starpu_mpi_free_on_node;
+#endif
+
+	_node_ops[STARPU_DISK_RAM].wait_request_completion = _starpu_disk_wait_request_completion;
+	_node_ops[STARPU_DISK_RAM].test_request_completion = _starpu_disk_test_request_completion;
+	_node_ops[STARPU_DISK_RAM].copy_data_to[STARPU_CPU_RAM] = _starpu_disk_copy_data_to_cpu;
+	_node_ops[STARPU_DISK_RAM].copy_data_to[STARPU_DISK_RAM] = _starpu_disk_copy_data_to_disk;
+	_node_ops[STARPU_DISK_RAM].copy_interface = _starpu_disk_copy_interface;
+	_node_ops[STARPU_DISK_RAM].direct_access_supported = _starpu_disk_direct_access_supported;
+	_node_ops[STARPU_DISK_RAM].malloc_on_node = _starpu_disk_malloc_on_node;
+	_node_ops[STARPU_DISK_RAM].free_on_node = _starpu_disk_free_on_node;
+
+#ifdef STARPU_USE_SCC
+	_node_ops[STARPU_SCC_RAM].copy_data_to[STARPU_CPU_RAM] = _starpu_scc_common_copy_data_to_cpu;
+	_node_ops[STARPU_SCC_RAM].copy_data_to[STARPU_SCC_RAM] = _starpu_scc_common_copy_data_to_scc;
+	_node_ops[STARPU_SCC_RAM].copy_interface = _starpu_scc_copy_interface;
+	_node_ops[STARPU_SCC_RAM].direct_access_supported = _starpu_scc_direct_access_supported;
+	_node_ops[STARPU_SCC_RAM].malloc_on_node = _starpu_scc_malloc_on_node;
+	_node_ops[STARPU_SCC_RAM].free_on_node = _starpu_scc_free_on_node;
+#endif
+}
+
+const char* _starpu_node_get_prefix(enum starpu_node_kind kind)
+{
+	switch (kind)
+	{
+		case STARPU_CPU_RAM:
+			return "NUMA";
+		case STARPU_CUDA_RAM:
+			return "CUDA";
+		case STARPU_OPENCL_RAM:
+			return "OpenCL";
+		case STARPU_DISK_RAM:
+			return "Disk";
+		case STARPU_MIC_RAM:
+			return "MIC";
+		case STARPU_MPI_MS_RAM:
+			return "MPI_MS";
+		case STARPU_SCC_RAM:
+			return "SCC_RAM";
+		case STARPU_SCC_SHM:
+			return "SCC_shared";
+		case STARPU_UNUSED:
+		default:
+			STARPU_ASSERT(0);
+			return "unknown";
+	}
+}
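As a reading aid, a minimal sketch of how the table filled above is meant to be consumed on the copy path; the wrapper name is made up, but the lookup and the abort message mirror the dispatch style used earlier in this diff:

/* Illustrative only: dispatch a handle copy through the per-kind table
 * instead of a switch over (src_kind, dst_kind) pairs. */
static int copy_data_example(starpu_data_handle_t handle,
			     void *src_interface, unsigned src_node,
			     void *dst_interface, unsigned dst_node,
			     struct _starpu_data_request *req)
{
	enum starpu_node_kind src_kind = starpu_node_get_kind(src_node);
	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);
	copy_data_func_t copy = _node_ops[src_kind].copy_data_to[dst_kind];

	if (!copy)
		STARPU_ABORT_MSG("No copy function defined from node %s to node %s\n",
				 _starpu_node_get_prefix(src_kind),
				 _starpu_node_get_prefix(dst_kind));
	return copy(handle, src_interface, src_node, dst_interface, dst_node, req);
}
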

+ 50 - 0
src/datawizard/node_ops.h

@@ -0,0 +1,50 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2008-2011,2014,2017                      Université de Bordeaux
+ * Copyright (C) 2016,2017                                Inria
+ * Copyright (C) 2010,2013,2015,2017,2019                 CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __NODE_OPS_H__
+#define __NODE_OPS_H__
+
+#include <starpu.h>
+#include <common/config.h>
+#include <datawizard/copy_driver.h>
+
+typedef int (*copy_data_func_t)(starpu_data_handle_t handle, void *src_interface, unsigned src_node,
+				void *dst_interface, unsigned dst_node,
+				struct _starpu_data_request *req);
+
+typedef int (*copy_interface_t)(uintptr_t src_ptr, size_t src_offset, unsigned src_node,
+				uintptr_t dst_ptr, size_t dst_offset, unsigned dst_node,
+				size_t ssize, struct _starpu_async_channel *async_channel);
+
+struct _starpu_node_ops
+{
+	copy_data_func_t copy_data_to[STARPU_MPI_MS_RAM+1];
+	copy_interface_t copy_interface;
+	void (*wait_request_completion)(struct _starpu_async_channel *async_channel);
+	unsigned (*test_request_completion)(struct _starpu_async_channel *async_channel);
+	int (*direct_access_supported)(unsigned node, unsigned handling_node);
+	uintptr_t (*malloc_on_node)(unsigned dst_node, size_t size, int flags);
+	void (*free_on_node)(unsigned dst_node, uintptr_t addr, size_t size, int flags);
+};
+
+extern struct _starpu_node_ops _node_ops[STARPU_MPI_MS_RAM+1];
+
+void _starpu_node_ops_init();
+const char* _starpu_node_get_prefix(enum starpu_node_kind kind);
+
+#endif // __NODE_OPS_H__
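The allocation side is expected to mirror the free_on_node dispatch shown at the top of this diff; a minimal sketch under that assumption (the wrapper name is made up):

/* Illustrative only: per-kind allocation through the ops table. */
static uintptr_t malloc_on_node_example(unsigned dst_node, size_t size, int flags)
{
	enum starpu_node_kind kind = starpu_node_get_kind(dst_node);

	if (!_node_ops[kind].malloc_on_node)
		STARPU_ABORT_MSG("No malloc_on_node function defined for node %s\n",
				 _starpu_node_get_prefix(kind));
	return _node_ops[kind].malloc_on_node(dst_node, size, flags);
}
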

+ 347 - 2
src/drivers/cpu/driver_cpu.c

@@ -3,7 +3,7 @@
  * Copyright (C) 2011,2012,2014-2017                      Inria
  * Copyright (C) 2008-2018                                Université de Bordeaux
  * Copyright (C) 2010                                     Mehdi Juhoor
- * Copyright (C) 2010-2017                                CNRS
+ * Copyright (C) 2010-2017,2019                           CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2011                                     Télécom-SudParis
  *
@@ -31,11 +31,14 @@
 #include <core/drivers.h>
 #include <core/idle_hook.h>
 #include <drivers/cpu/driver_cpu.h>
+#include <drivers/disk/driver_disk.h>
 #include <core/sched_policy.h>
 #include <datawizard/memory_manager.h>
+#include <datawizard/memory_nodes.h>
 #include <datawizard/malloc.h>
 #include <core/simgrid.h>
 #include <core/task.h>
+#include <core/disk.h>
 
 #ifdef STARPU_HAVE_HWLOC
 #include <hwloc.h>
@@ -206,7 +209,7 @@ static int _starpu_cpu_driver_execute_task(struct _starpu_worker *cpu_worker, st
 		{
 			struct _starpu_combined_worker *combined_worker;
 			combined_worker = _starpu_get_combined_worker_struct(j->combined_workerid);
-			
+
 			cpu_worker->combined_workerid = j->combined_workerid;
 			cpu_worker->worker_size = combined_worker->worker_size;
 			perf_arch = &combined_worker->perf_arch;
@@ -426,6 +429,348 @@ int _starpu_cpu_driver_run(struct _starpu_worker *worker)
 	return 0;
 }
 
+int _starpu_cpu_copy_data_to_opencl(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_OPENCL_RAM);
+
+	int ret = 1;
+
+#ifdef STARPU_USE_OPENCL
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+	/* STARPU_CPU_RAM -> STARPU_OPENCL_RAM */
+	STARPU_ASSERT(starpu_worker_get_local_memory_node() == dst_node);
+	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() || !(copy_methods->ram_to_opencl_async || copy_methods->any_to_any))
+	{
+		STARPU_ASSERT(copy_methods->ram_to_opencl || copy_methods->any_to_any);
+		/* this is not associated to a request so it's synchronous */
+		if (copy_methods->ram_to_opencl)
+			copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	}
+	else
+	{
+		req->async_channel.type = STARPU_OPENCL_RAM;
+		if (copy_methods->ram_to_opencl_async)
+			ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
+		else
+		{
+			STARPU_ASSERT(copy_methods->any_to_any);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		}
+	}
+#endif
+	return ret;
+}
+
+int _starpu_cpu_copy_data_to_cuda(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_CUDA_RAM);
+
+	int ret = 1;
+
+#ifdef STARPU_USE_CUDA
+	cudaError_t cures;
+	cudaStream_t stream;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+
+	/* STARPU_CPU_RAM -> STARPU_CUDA_RAM */
+	/* only the proper CUDA driver thread can initiate this! */
+#if !defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
+	STARPU_ASSERT(starpu_worker_get_local_memory_node() == dst_node);
+#endif
+	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
+	    !(copy_methods->ram_to_cuda_async || copy_methods->any_to_any))
+	{
+		/* this is not associated to a request so it's synchronous */
+		STARPU_ASSERT(copy_methods->ram_to_cuda || copy_methods->any_to_any);
+		if (copy_methods->ram_to_cuda)
+			copy_methods->ram_to_cuda(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	}
+	else
+	{
+		req->async_channel.type = STARPU_CUDA_RAM;
+		cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
+		if (STARPU_UNLIKELY(cures != cudaSuccess))
+			STARPU_CUDA_REPORT_ERROR(cures);
+
+		stream = starpu_cuda_get_in_transfer_stream(dst_node);
+		if (copy_methods->ram_to_cuda_async)
+			ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
+		else
+		{
+			STARPU_ASSERT(copy_methods->any_to_any);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		}
+
+		cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
+		if (STARPU_UNLIKELY(cures != cudaSuccess))
+			STARPU_CUDA_REPORT_ERROR(cures);
+	}
+#endif
+	return ret;
+}
+
+int _starpu_cpu_copy_data_to_mic(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_MIC_RAM);
+
+	int ret = 1;
+
+#ifdef STARPU_USE_MIC
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+	/* RAM -> MIC */
+	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mic_copy_disabled() || !(copy_methods->ram_to_mic_async || copy_methods->any_to_any))
+	{
+		/* this is not associated to a request so it's synchronous */
+		STARPU_ASSERT(copy_methods->ram_to_mic || copy_methods->any_to_any);
+		if (copy_methods->ram_to_mic)
+			copy_methods->ram_to_mic(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	}
+	else
+	{
+		req->async_channel.type = STARPU_MIC_RAM;
+		if (copy_methods->ram_to_mic_async)
+			ret = copy_methods->ram_to_mic_async(src_interface, src_node, dst_interface, dst_node);
+		else
+		{
+			STARPU_ASSERT(copy_methods->any_to_any);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		}
+		_starpu_mic_init_event(&(req->async_channel.event.mic_event), dst_node);
+	}
+#endif
+	return ret;
+}
+
+int _starpu_cpu_copy_data_to_disk(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_DISK_RAM);
+
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+
+	if (req && !starpu_asynchronous_copy_disabled())
+	{
+		req->async_channel.type = STARPU_DISK_RAM;
+		req->async_channel.event.disk_event.requests = NULL;
+		req->async_channel.event.disk_event.ptr = NULL;
+		req->async_channel.event.disk_event.handle = NULL;
+	}
+	if(copy_methods->any_to_any)
+		ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);
+	else
+	{
+		void *obj = starpu_data_handle_to_pointer(handle, dst_node);
+		void * ptr = NULL;
+		starpu_ssize_t size = 0;
+		handle->ops->pack_data(handle, src_node, &ptr, &size);
+		ret = _starpu_disk_full_write(src_node, dst_node, obj, ptr, size, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);
+		if (ret == 0)
+		{
+			/* write is already finished, ptr was allocated in pack_data */
+			_starpu_free_flags_on_node(src_node, ptr, size, 0);
+		}
+		else if (ret == -EAGAIN)
+		{
+			STARPU_ASSERT(req);
+			req->async_channel.event.disk_event.ptr = ptr;
+			req->async_channel.event.disk_event.node = src_node;
+			req->async_channel.event.disk_event.size = size;
+		}
+		STARPU_ASSERT(ret == 0 || ret == -EAGAIN);
+	}
+	return ret;
+}
+
+int _starpu_cpu_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_CPU_RAM);
+
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+	if (copy_methods->ram_to_ram)
+		copy_methods->ram_to_ram(src_interface, src_node, dst_interface, dst_node);
+	else
+		copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req ? &req->async_channel : NULL);
+	return ret;
+}
+
+int _starpu_cpu_copy_data_to_mpi_ms(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_MPI_MS_RAM);
+
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+
+	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mpi_ms_copy_disabled() || !(copy_methods->ram_to_mpi_ms_async || copy_methods->any_to_any))
+	{
+		/* this is not associated to a request so it's synchronous */
+		STARPU_ASSERT(copy_methods->ram_to_mpi_ms || copy_methods->any_to_any);
+		if (copy_methods->ram_to_mpi_ms)
+			copy_methods->ram_to_mpi_ms(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	}
+	else
+	{
+		req->async_channel.type = STARPU_MPI_MS_RAM;
+		if(copy_methods->ram_to_mpi_ms_async)
+			ret = copy_methods->ram_to_mpi_ms_async(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		else
+		{
+			STARPU_ASSERT(copy_methods->any_to_any);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		}
+	}
+	return ret;
+}
+
+int _starpu_cpu_copy_data_to_scc(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_SCC_RAM);
+
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+	if (copy_methods->scc_src_to_sink)
+		copy_methods->scc_src_to_sink(src_interface, src_node, dst_interface, dst_node);
+	else
+		copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	return ret;
+}
+
+int _starpu_cpu_copy_interface(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM);
+
+	int dst_kind = starpu_node_get_kind(dst_node);
+
+	if (dst_kind == STARPU_CPU_RAM)
+	{
+		memcpy((void *) (dst + dst_offset), (void *) (src + src_offset), size);
+		return 0;
+	}
+#ifdef STARPU_USE_CUDA
+	else if (dst_kind == STARPU_CUDA_RAM)
+	{
+		return starpu_cuda_copy_async_sync((void*) (src + src_offset), src_node,
+						   (void*) (dst + dst_offset), dst_node,
+						   size,
+						   async_channel?starpu_cuda_get_in_transfer_stream(dst_node):NULL,
+						   cudaMemcpyHostToDevice);
+	}
+#endif
+#ifdef STARPU_USE_OPENCL
+	else if (dst_kind == STARPU_OPENCL_RAM)
+	{
+		return starpu_opencl_copy_async_sync(src, src_offset, src_node,
+						     dst, dst_offset, dst_node,
+						     size,
+						     &async_channel->event.opencl_event);
+
+	}
+#endif
+#ifdef STARPU_USE_MIC
+	else if (dst_kind == STARPU_MIC_RAM)
+	{
+		if (async_channel)
+			return _starpu_mic_copy_ram_to_mic_async((void*) (src + src_offset), src_node,
+								 (void*) (dst + dst_offset), dst_node,
+								 size);
+		else
+			return _starpu_mic_copy_ram_to_mic((void*) (src + src_offset), src_node,
+							   (void*) (dst + dst_offset), dst_node,
+							   size);
+
+	}
+#endif
+#ifdef STARPU_USE_SCC
+	else if (dst_kind == STARPU_SCC_RAM)
+	{
+		return _starpu_scc_copy_src_to_sink((void*) (src + src_offset), src_node,
+						    (void*) (dst + dst_offset), dst_node,
+						    size);
+	}
+#endif
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+	else if (dst_kind == STARPU_MPI_MS_RAM)
+	{
+                if (async_channel)
+                        return _starpu_mpi_copy_ram_to_mpi_async((void*) (src + src_offset), src_node,
+								 (void*) (dst + dst_offset), dst_node,
+								 size, async_channel);
+                else
+                        return _starpu_mpi_copy_ram_to_mpi_sync((void*) (src + src_offset), src_node,
+								(void*) (dst + dst_offset), dst_node,
+								size);
+	}
+#endif
+	else if (dst_kind == STARPU_DISK_RAM)
+	{
+		return _starpu_disk_copy_src_to_disk((void*) (src + src_offset), src_node,
+						     (void*) dst, dst_offset, dst_node,
+						     size, async_channel);
+	}
+	else
+	{
+		STARPU_ABORT();
+		return -1;
+	}
+}
+
+int _starpu_cpu_direct_access_supported(unsigned node, unsigned handling_node)
+{
+	return 1;
+}
+
+uintptr_t _starpu_cpu_malloc_on_node(unsigned dst_node, size_t size, int flags)
+{
+	uintptr_t addr = 0;
+	_starpu_malloc_flags_on_node(dst_node, (void**) &addr, size,
+#if defined(STARPU_USE_CUDA) && !defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
+				     /* without memcpy_peer, we can not
+				      * allocated pinned memory, since it
+				      * requires waiting for a task, and we
+				      * may be called with a spinlock held
+				      */
+				     flags & ~STARPU_MALLOC_PINNED
+#else
+				     flags
+#endif
+				     );
+	return addr;
+}
+
+void _starpu_cpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags)
+{
+	_starpu_free_flags_on_node(dst_node, (void*)addr, size,
+#if defined(STARPU_USE_CUDA) && !defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
+				   flags & ~STARPU_MALLOC_PINNED
+#else
+				   flags
+#endif
+				   );
+}
+
 struct _starpu_driver_ops _starpu_driver_cpu_ops =
 {
 	.init = _starpu_cpu_driver_init,

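The raw pointer copies above are reached through the copy_interface slot; a minimal sketch of the expected generic caller, assuming dispatch on the source node's kind as the CPU and CUDA implementations do (the wrapper name is made up):

/* Illustrative only: interface-level copy dispatched on the source kind. */
static int copy_interface_example(uintptr_t src, size_t src_offset, unsigned src_node,
				  uintptr_t dst, size_t dst_offset, unsigned dst_node,
				  size_t size, struct _starpu_async_channel *channel)
{
	enum starpu_node_kind src_kind = starpu_node_get_kind(src_node);

	STARPU_ASSERT(_node_ops[src_kind].copy_interface);
	return _node_ops[src_kind].copy_interface(src, src_offset, src_node,
						  dst, dst_offset, dst_node,
						  size, channel);
}
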
+ 32 - 1
src/drivers/cpu/driver_cpu.h

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2012,2014                                Inria
  * Copyright (C) 2008-2011,2014,2018                      Université de Bordeaux
- * Copyright (C) 2010,2012,2013,2015,2017                 CNRS
+ * Copyright (C) 2010,2012,2013,2015,2017,2019            CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -26,6 +26,37 @@
 extern struct _starpu_driver_ops _starpu_driver_cpu_ops;
 void *_starpu_cpu_worker(void *);
 
+int _starpu_cpu_copy_data_to_cuda(starpu_data_handle_t handle, void *src_interface, unsigned src_node,
+				  void *dst_interface, unsigned dst_node,
+				  struct _starpu_data_request *req);
+int _starpu_cpu_copy_data_to_opencl(starpu_data_handle_t handle, void *src_interface, unsigned src_node,
+				    void *dst_interface, unsigned dst_node,
+				    struct _starpu_data_request *req);
+int _starpu_cpu_copy_data_to_mic(starpu_data_handle_t handle, void *src_interface, unsigned src_node,
+				 void *dst_interface, unsigned dst_node,
+				 struct _starpu_data_request *req);
+int _starpu_cpu_copy_data_to_disk(starpu_data_handle_t handle, void *src_interface, unsigned src_node,
+				  void *dst_interface, unsigned dst_node,
+				  struct _starpu_data_request *req);
+int _starpu_cpu_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node,
+				 void *dst_interface, unsigned dst_node,
+				 struct _starpu_data_request *req);
+int _starpu_cpu_copy_data_to_mpi_ms(starpu_data_handle_t handle, void *src_interface, unsigned src_node,
+				    void *dst_interface, unsigned dst_node,
+				    struct _starpu_data_request *req);
+int _starpu_cpu_copy_data_to_scc(starpu_data_handle_t handle, void *src_interface, unsigned src_node,
+				 void *dst_interface, unsigned dst_node,
+				 struct _starpu_data_request *req);
+
+int _starpu_cpu_copy_interface(uintptr_t src_ptr, size_t src_offset, unsigned src_node,
+			       uintptr_t dst_ptr, size_t dst_offset, unsigned dst_node,
+			       size_t ssize, struct _starpu_async_channel *async_channel);
+
+int _starpu_cpu_direct_access_supported(unsigned node, unsigned handling_node);
+
+uintptr_t _starpu_cpu_malloc_on_node(unsigned dst_node, size_t size, int flags);
+void _starpu_cpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags);
+
 #endif /* !STARPU_USE_CPU */
 
 #endif //  __DRIVER_CPU_H__

+ 268 - 0
src/drivers/cuda/driver_cuda.c

@@ -1214,6 +1214,274 @@ int _starpu_cuda_driver_deinit_from_worker(struct _starpu_worker *worker)
 	return _starpu_cuda_driver_deinit(worker->set);
 }
 
+unsigned _starpu_cuda_test_request_completion(struct _starpu_async_channel *async_channel)
+{
+	cudaEvent_t event;
+	cudaError_t cures;
+	unsigned success;
+
+	event = (*async_channel).event.cuda_event;
+	cures = cudaEventQuery(event);
+	success = (cures == cudaSuccess);
+
+	if (success)
+		cudaEventDestroy(event);
+	else if (cures != cudaErrorNotReady)
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	return success;
+}
+
+void _starpu_cuda_wait_request_completion(struct _starpu_async_channel *async_channel)
+{
+	cudaEvent_t event;
+	cudaError_t cures;
+
+	event = (*async_channel).event.cuda_event;
+
+	cures = cudaEventSynchronize(event);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	cures = cudaEventDestroy(event);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+}
+
+int _starpu_cuda_copy_data_to_cuda(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CUDA_RAM && dst_kind == STARPU_CUDA_RAM);
+
+	int ret = 1;
+#ifdef STARPU_USE_CUDA
+	cudaError_t cures;
+	cudaStream_t stream;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+	/* CUDA -> CUDA transfer */
+	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() || !(copy_methods->cuda_to_cuda_async || copy_methods->any_to_any))
+	{
+		STARPU_ASSERT(copy_methods->cuda_to_cuda || copy_methods->any_to_any);
+		/* this is not associated to a request so it's synchronous */
+		if (copy_methods->cuda_to_cuda)
+			copy_methods->cuda_to_cuda(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	}
+	else
+	{
+		req->async_channel.type = STARPU_CUDA_RAM;
+		cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
+		if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
+
+		stream = starpu_cuda_get_peer_transfer_stream(src_node, dst_node);
+		if (copy_methods->cuda_to_cuda_async)
+			ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
+		else
+		{
+			STARPU_ASSERT(copy_methods->any_to_any);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		}
+
+		cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
+		if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
+	}
+#endif
+	return ret;
+}
+
+int _starpu_cuda_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CUDA_RAM && dst_kind == STARPU_CPU_RAM);
+
+	int ret = 1;
+#ifdef STARPU_USE_CUDA
+	cudaError_t cures;
+	cudaStream_t stream;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+
+	/* only the proper CUDA driver thread can initiate this directly! */
+#if !defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
+	STARPU_ASSERT(starpu_worker_get_local_memory_node() == src_node);
+#endif
+	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() || !(copy_methods->cuda_to_ram_async || copy_methods->any_to_any))
+	{
+		/* this is not associated to a request so it's synchronous */
+		STARPU_ASSERT(copy_methods->cuda_to_ram || copy_methods->any_to_any);
+		if (copy_methods->cuda_to_ram)
+			copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	}
+	else
+	{
+		req->async_channel.type = STARPU_CUDA_RAM;
+		cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
+		if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
+
+		stream = starpu_cuda_get_out_transfer_stream(src_node);
+		if (copy_methods->cuda_to_ram_async)
+			ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
+		else
+		{
+			STARPU_ASSERT(copy_methods->any_to_any);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		}
+
+		cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
+		if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
+	}
+#endif
+	return ret;
+}
+
+int _starpu_cuda_copy_interface(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	STARPU_ASSERT(src_kind == STARPU_CUDA_RAM);
+
+	int dst_kind = starpu_node_get_kind(dst_node);
+
+	if (dst_kind == STARPU_CPU_RAM)
+	{
+		return starpu_cuda_copy_async_sync((void*) (src + src_offset), src_node,
+						   (void*) (dst + dst_offset), dst_node,
+						   size,
+						   async_channel?starpu_cuda_get_out_transfer_stream(src_node):NULL,
+						   cudaMemcpyDeviceToHost);
+	}
+	else if (dst_kind == STARPU_CUDA_RAM)
+	{
+		return starpu_cuda_copy_async_sync((void*) (src + src_offset), src_node,
+						   (void*) (dst + dst_offset), dst_node,
+						   size,
+						   async_channel?starpu_cuda_get_peer_transfer_stream(src_node, dst_node):NULL,
+						   cudaMemcpyDeviceToDevice);
+	}
+	else
+	{
+		STARPU_ABORT();
+		return -1;
+	}
+}
+
+int _starpu_cuda_direct_access_supported(unsigned node, unsigned handling_node)
+{
+	/* GPUs do not always allow direct remote access: if CUDA 4+
+	 * peer access is enabled, we allow two CUDA devices to communicate. */
+#ifdef STARPU_SIMGRID
+	if (starpu_node_get_kind(handling_node) == STARPU_CUDA_RAM)
+	{
+		msg_host_t host = _starpu_simgrid_get_memnode_host(handling_node);
+		const char* cuda_memcpy_peer = MSG_host_get_property_value(host, "memcpy_peer");
+		return cuda_memcpy_peer && atoll(cuda_memcpy_peer);
+	}
+	else
+		return 0;
+#elif defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
+	enum starpu_node_kind kind = starpu_node_get_kind(handling_node);
+	return kind == STARPU_CUDA_RAM;
+#else /* STARPU_HAVE_CUDA_MEMCPY_PEER */
+	/* Direct GPU-GPU transfers are not allowed in general */
+	return 0;
+#endif /* STARPU_HAVE_CUDA_MEMCPY_PEER */
+}
+
+uintptr_t _starpu_cuda_malloc_on_node(unsigned dst_node, size_t size, int flags)
+{
+	uintptr_t addr = 0;
+
+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
+
+#ifdef STARPU_SIMGRID
+	static uintptr_t last[STARPU_MAXNODES];
+#ifdef STARPU_DEVEL
+#warning TODO: record used memory, using a simgrid property to know the available memory
+#endif
+	/* Sleep for the allocation */
+	STARPU_PTHREAD_MUTEX_LOCK(&cuda_alloc_mutex);
+	if (_starpu_simgrid_cuda_malloc_cost())
+		MSG_process_sleep(0.000175);
+	if (!last[dst_node])
+		last[dst_node] = 1<<10;
+	addr = last[dst_node];
+	last[dst_node]+=size;
+	STARPU_ASSERT(last[dst_node] >= addr);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
+#else
+	unsigned devid = starpu_memory_node_get_devid(dst_node);
+#if defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
+	starpu_cuda_set_device(devid);
+#else
+	struct _starpu_worker *worker = _starpu_get_local_worker_key();
+	if (!worker || worker->arch != STARPU_CUDA_WORKER || worker->devid != devid)
+		STARPU_ASSERT_MSG(0, "CUDA peer access is not available with this version of CUDA");
+#endif
+	/* Check if there is free memory */
+	size_t cuda_mem_free, cuda_mem_total;
+	cudaError_t status;
+	status = cudaMemGetInfo(&cuda_mem_free, &cuda_mem_total);
+	if (status == cudaSuccess && cuda_mem_free < (size*2))
+	{
+		addr = 0;
+	}
+	else
+	{
+		status = cudaMalloc((void **)&addr, size);
+		if (!addr || (status != cudaSuccess))
+		{
+			if (STARPU_UNLIKELY(status != cudaErrorMemoryAllocation))
+				STARPU_CUDA_REPORT_ERROR(status);
+			addr = 0;
+		}
+	}
+#endif
+#endif
+	return addr;
+}
+
+void _starpu_cuda_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags)
+{
+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
+#ifdef STARPU_SIMGRID
+	STARPU_PTHREAD_MUTEX_LOCK(&cuda_alloc_mutex);
+	/* Sleep for the free */
+	if (_starpu_simgrid_cuda_malloc_cost())
+		MSG_process_sleep(0.000750);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
+	/* CUDA also synchronizes roughly everything on cudaFree */
+	_starpu_simgrid_sync_gpus();
+#else
+	cudaError_t err;
+	unsigned devid = starpu_memory_node_get_devid(dst_node);
+#if defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
+	starpu_cuda_set_device(devid);
+#else
+	struct _starpu_worker *worker = _starpu_get_local_worker_key();
+	if (!worker || worker->arch != STARPU_CUDA_WORKER || worker->devid != devid)
+		STARPU_ASSERT_MSG(0, "CUDA peer access is not available with this version of CUDA");
+#endif /* STARPU_HAVE_CUDA_MEMCPY_PEER */
+	err = cudaFree((void*)addr);
+#ifdef STARPU_OPENMP
+	/* When StarPU is used as Open Runtime support,
+	 * starpu_omp_shutdown() will usually be called from a
+	 * destructor, in which case cudaThreadExit() reports a
+	 * cudaErrorCudartUnloading here. There should not
+	 * be any remaining tasks running at this point so
+	 * we can probably ignore it without much consequences. */
+	if (STARPU_UNLIKELY(err != cudaSuccess && err != cudaErrorCudartUnloading))
+		STARPU_CUDA_REPORT_ERROR(err);
+#else
+	if (STARPU_UNLIKELY(err != cudaSuccess))
+		STARPU_CUDA_REPORT_ERROR(err);
+#endif /* STARPU_OPENMP */
+#endif /* STARPU_SIMGRID */
+#endif
+}
+
 struct _starpu_driver_ops _starpu_driver_cuda_ops =
 {
 	.init = _starpu_cuda_driver_init_from_worker,

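The two event helpers above fill the wait_request_completion/test_request_completion slots; a minimal sketch of the expected generic caller, assuming async_channel.type holds the handling node kind (as the copy functions in this diff set it):

/* Illustrative only: progress an asynchronous transfer through the ops table. */
static unsigned request_done_example(struct _starpu_async_channel *channel, int blocking)
{
	const struct _starpu_node_ops *ops = &_node_ops[channel->type];

	if (blocking)
	{
		ops->wait_request_completion(channel);
		return 1;
	}
	return ops->test_request_completion(channel);
}
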
+ 11 - 1
src/drivers/cuda/driver_cuda.h

@@ -3,7 +3,7 @@
  * Copyright (C) 2008-2014,2016,2017                      Université de Bordeaux
  * Copyright (C) 2012,2017                                Inria
  * Copyright (C) 2015                                     Mathieu Lirzin
- * Copyright (C) 2010,2012,2015,2017                      CNRS
+ * Copyright (C) 2010,2012,2015,2017,2019                 CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -29,6 +29,7 @@
 #endif
 
 #include <starpu.h>
+#include <core/workers.h>
 
 extern struct _starpu_driver_ops _starpu_driver_cuda_ops;
 
@@ -52,5 +53,14 @@ cudaStream_t starpu_cuda_get_out_transfer_stream(unsigned src_node);
 cudaStream_t starpu_cuda_get_peer_transfer_stream(unsigned src_node, unsigned dst_node);
 #endif
 
+unsigned _starpu_cuda_test_request_completion(struct _starpu_async_channel *async_channel);
+void _starpu_cuda_wait_request_completion(struct _starpu_async_channel *async_channel);
+int _starpu_cuda_copy_data_to_cuda(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_cuda_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_cuda_copy_interface(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_cuda_direct_access_supported(unsigned node, unsigned handling_node);
+uintptr_t _starpu_cuda_malloc_on_node(unsigned dst_node, size_t size, int flags);
+void _starpu_cuda_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags);
+
 #endif //  __DRIVER_CUDA_H__
 

+ 155 - 8
src/drivers/disk/driver_disk.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2013,2017                                CNRS
+ * Copyright (C) 2013,2017,2019                           CNRS
  * Copyright (C) 2013                                     Inria
  * Copyright (C) 2013                                     Université de Bordeaux
  * Copyright (C) 2013                                     Corentin Salingue
@@ -21,16 +21,15 @@
 #include <core/disk.h>
 #include <starpu_profiling.h>
 #include <drivers/disk/driver_disk.h>
+#include <datawizard/coherency.h>
 
 int _starpu_disk_copy_src_to_disk(void * src, unsigned src_node, void * dst, size_t dst_offset, unsigned dst_node, size_t size, void * async_channel)
 {
 	STARPU_ASSERT(starpu_node_get_kind(src_node) == STARPU_CPU_RAM);
-	
+
 	return _starpu_disk_write(src_node, dst_node, dst, src, dst_offset, size, async_channel);
-	
 }
 
-
 int _starpu_disk_copy_disk_to_src(void * src, size_t src_offset, unsigned src_node, void * dst, unsigned dst_node, size_t size, void * async_channel)
 {
 	STARPU_ASSERT(starpu_node_get_kind(dst_node) == STARPU_CPU_RAM);
@@ -38,13 +37,161 @@ int _starpu_disk_copy_disk_to_src(void * src, size_t src_offset, unsigned src_no
 	return _starpu_disk_read(src_node, dst_node, src, dst, src_offset, size, async_channel);
 }
 
-
 int _starpu_disk_copy_disk_to_disk(void * src, size_t src_offset, unsigned src_node, void * dst, size_t dst_offset, unsigned dst_node, size_t size, void * async_channel)
 {
 	STARPU_ASSERT(starpu_node_get_kind(src_node) == STARPU_DISK_RAM && starpu_node_get_kind(dst_node) == STARPU_DISK_RAM);
 
-       return _starpu_disk_copy(src_node, src, src_offset, 
-			       dst_node, dst, dst_offset,
-			       size, async_channel); 
+	return _starpu_disk_copy(src_node, src, src_offset, dst_node, dst, dst_offset, size, async_channel);
+}
 
+unsigned _starpu_disk_test_request_completion(struct _starpu_async_channel *async_channel)
+{
+	unsigned success = starpu_disk_test_request(async_channel);
+	if (async_channel->event.disk_event.ptr != NULL && success)
+	{
+		if (async_channel->event.disk_event.handle != NULL)
+		{
+			/* read is finished, we can already unpack */
+			async_channel->event.disk_event.handle->ops->unpack_data(async_channel->event.disk_event.handle, async_channel->event.disk_event.node, async_channel->event.disk_event.ptr, async_channel->event.disk_event.size);
+		}
+		else
+		{
+			/* write is finished, ptr was allocated in pack_data */
+			_starpu_free_flags_on_node(async_channel->event.disk_event.node, async_channel->event.disk_event.ptr, async_channel->event.disk_event.size, 0);
+		}
+	}
+	return success;
+}
+
+void _starpu_disk_wait_request_completion(struct _starpu_async_channel *async_channel)
+{
+	starpu_disk_wait_request(async_channel);
+	if (async_channel->event.disk_event.ptr != NULL)
+	{
+		if (async_channel->event.disk_event.handle != NULL)
+		{
+			/* read is finished, we can already unpack */
+			async_channel->event.disk_event.handle->ops->unpack_data(async_channel->event.disk_event.handle, async_channel->event.disk_event.node, async_channel->event.disk_event.ptr, async_channel->event.disk_event.size);
+		}
+		else
+		{
+			/* write is finished, ptr was allocated in pack_data */
+			_starpu_free_flags_on_node(async_channel->event.disk_event.node, async_channel->event.disk_event.ptr, async_channel->event.disk_event.size, 0);
+		}
+	}
+}
+
+int _starpu_disk_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_DISK_RAM && dst_kind == STARPU_CPU_RAM);
+
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+
+	if (req && !starpu_asynchronous_copy_disabled())
+	{
+		req->async_channel.type = STARPU_DISK_RAM;
+		req->async_channel.event.disk_event.requests = NULL;
+		req->async_channel.event.disk_event.ptr = NULL;
+		req->async_channel.event.disk_event.handle = NULL;
+	}
+	if(copy_methods->any_to_any)
+		ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req && !starpu_asynchronous_copy_disabled()  ? &req->async_channel : NULL);
+	else
+	{
+		void *obj = starpu_data_handle_to_pointer(handle, src_node);
+		void * ptr = NULL;
+		size_t size = 0;
+		ret = _starpu_disk_full_read(src_node, dst_node, obj, &ptr, &size, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);
+		if (ret == 0)
+		{
+			/* read is already finished, we can already unpack */
+			handle->ops->unpack_data(handle, dst_node, ptr, size);
+		}
+		else if (ret == -EAGAIN)
+		{
+			STARPU_ASSERT(req);
+			req->async_channel.event.disk_event.ptr = ptr;
+			req->async_channel.event.disk_event.node = dst_node;
+			req->async_channel.event.disk_event.size = size;
+			req->async_channel.event.disk_event.handle = handle;
+		}
+		STARPU_ASSERT(ret == 0 || ret == -EAGAIN);
+	}
+
+	return ret;
+}
+
+int _starpu_disk_copy_data_to_disk(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_DISK_RAM && dst_kind == STARPU_DISK_RAM);
+
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+
+	if (req && !starpu_asynchronous_copy_disabled())
+	{
+		req->async_channel.type = STARPU_DISK_RAM;
+		req->async_channel.event.disk_event.requests = NULL;
+		req->async_channel.event.disk_event.ptr = NULL;
+		req->async_channel.event.disk_event.handle = NULL;
+	}
+	ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);
+	return ret;
+}
+
+int _starpu_disk_copy_interface(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	STARPU_ASSERT(src_kind == STARPU_DISK_RAM);
+
+	int dst_kind = starpu_node_get_kind(dst_node);
+
+	if (dst_kind == STARPU_CPU_RAM)
+	{
+		return _starpu_disk_copy_disk_to_src((void*) src, src_offset, src_node,
+						     (void*) (dst + dst_offset), dst_node,
+						     size, async_channel);
+	}
+	else if (dst_kind == STARPU_DISK_RAM)
+	{
+		return _starpu_disk_copy_disk_to_disk((void*) src, src_offset, src_node,
+						      (void*) dst, dst_offset, dst_node,
+						      size, async_channel);
+	}
+	else
+	{
+		STARPU_ABORT();
+		return -1;
+	}
+}
+
+int _starpu_disk_direct_access_supported(unsigned node, unsigned handling_node)
+{
+	/* Each worker can manage disks but disk <-> disk is not always allowed */
+	switch (starpu_node_get_kind(handling_node))
+	{
+		case STARPU_CPU_RAM:
+			return 1;
+		case STARPU_DISK_RAM:
+			return _starpu_disk_can_copy(node, handling_node);
+		default:
+			return 0;
+	}
+}
+
+uintptr_t _starpu_disk_malloc_on_node(unsigned dst_node, size_t size, int flags)
+{
+	uintptr_t addr = 0;
+	addr = (uintptr_t) _starpu_disk_alloc(dst_node, size);
+	return addr;
+}
+
+void _starpu_disk_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags)
+{
+	_starpu_disk_free(dst_node, (void *) addr, size);
 }

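The direct_access_supported slot lets the generic layer pick which worker drives a transfer; a minimal sketch of a possible call site — dispatching on the kind of the accessed node is an assumption here, and the wrapper name is made up:

/* Illustrative only: can a worker attached to handling_node access data
 * living on node without staging it through an intermediate copy? */
static int direct_access_example(unsigned node, unsigned handling_node)
{
	enum starpu_node_kind kind = starpu_node_get_kind(node);

	if (!_node_ops[kind].direct_access_supported)
		return 0;
	return _node_ops[kind].direct_access_supported(node, handling_node);
}
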
+ 10 - 1
src/drivers/disk/driver_disk.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2017                                     CNRS
+ * Copyright (C) 2017, 2019                               CNRS
  * Copyright (C) 2013                                     Inria
  * Copyright (C) 2013                                     Université de Bordeaux
  * Copyright (C) 2013                                     Corentin Salingue
@@ -26,4 +26,13 @@ int _starpu_disk_copy_disk_to_src(void * src, size_t src_offset, unsigned src_no
 
 int _starpu_disk_copy_disk_to_disk(void * src, size_t src_offset, unsigned src_node, void * dst, size_t dst_offset, unsigned dst_node, size_t size, void * async_channel);
 
+unsigned _starpu_disk_test_request_completion(struct _starpu_async_channel *async_channel);
+void _starpu_disk_wait_request_completion(struct _starpu_async_channel *async_channel);
+int _starpu_disk_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_disk_copy_data_to_disk(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_disk_copy_interface(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_disk_direct_access_supported(unsigned node, unsigned handling_node);
+uintptr_t _starpu_disk_malloc_on_node(unsigned dst_node, size_t size, int flags);
+void _starpu_disk_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags);
+
 #endif

+ 100 - 20
src/drivers/mic/driver_mic_source.c

@@ -471,29 +471,11 @@ int _starpu_mic_init_event(struct _starpu_mic_async_event *event, unsigned memor
 	return 0;
 }
 
-/* Wait the end of the asynchronous request */
-void _starpu_mic_wait_request_completion(struct _starpu_mic_async_event *event)
-{
-	if (event->signal != NULL)
-	{
-		const struct _starpu_mp_node *mp_node = _starpu_mic_src_get_mp_node_from_memory_node(event->memory_node);
-		scif_epd_t epd = mp_node->host_sink_dt_connection.mic_endpoint;
-
-		if (scif_fence_wait(epd, event->mark) < 0)
-			STARPU_MIC_SRC_REPORT_SCIF_ERROR(errno);
-
-		if (scif_unregister(epd, (off_t)(event->signal), 0x1000) < 0)
-			STARPU_MIC_SRC_REPORT_SCIF_ERROR(errno);
-
-		free(event->signal);
-		event->signal = NULL;
-	}
-}
-
 /* Test whether an asynchronous request has completed.
  * Return 1 if it has, 0 otherwise. */
-int _starpu_mic_request_is_complete(struct _starpu_mic_async_event *event)
+unsigned _starpu_mic_test_request_completion(struct _starpu_async_channel *async_channel)
 {
+	struct _starpu_mic_async_event *event = &async_channel->event.mic_event;
 	if (event->signal != NULL && *(event->signal) != STARPU_MIC_REQUEST_COMPLETE)
 		return 0;
 
@@ -508,7 +490,25 @@ int _starpu_mic_request_is_complete(struct _starpu_mic_async_event *event)
 	return 1;
 }
 
+/* Wait for the end of the asynchronous request */
+void _starpu_mic_wait_request_completion(struct _starpu_async_channel *async_channel)
+{
+	struct _starpu_mic_async_event *event = &async_channel->event.mic_event;
+	if (event->signal != NULL)
+	{
+		const struct _starpu_mp_node *mp_node = _starpu_mic_src_get_mp_node_from_memory_node(event->memory_node);
+		scif_epd_t epd = mp_node->host_sink_dt_connection.mic_endpoint;
 
+		if (scif_fence_wait(epd, event->mark) < 0)
+			STARPU_MIC_SRC_REPORT_SCIF_ERROR(errno);
+
+		if (scif_unregister(epd, (off_t)(event->signal), 0x1000) < 0)
+			STARPU_MIC_SRC_REPORT_SCIF_ERROR(errno);
+
+		free(event->signal);
+		event->signal = NULL;
+	}
+}
 
 void *_starpu_mic_src_worker(void *arg)
 {
@@ -562,3 +562,83 @@ void *_starpu_mic_src_worker(void *arg)
 	return NULL;
 
 }
+
+int _starpu_mic_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_MIC_RAM && dst_kind == STARPU_CPU_RAM);
+
+	int ret = 1;
+
+#ifdef STARPU_USE_MIC
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+	/* MIC -> RAM */
+	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mic_copy_disabled() || !(copy_methods->mic_to_ram_async || copy_methods->any_to_any))
+	{
+		/* this is not associated to a request so it's synchronous */
+		STARPU_ASSERT(copy_methods->mic_to_ram || copy_methods->any_to_any);
+		if (copy_methods->mic_to_ram)
+			copy_methods->mic_to_ram(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	}
+	else
+	{
+		req->async_channel.type = STARPU_MIC_RAM;
+		if (copy_methods->mic_to_ram_async)
+			ret = copy_methods->mic_to_ram_async(src_interface, src_node, dst_interface, dst_node);
+		else
+		{
+			STARPU_ASSERT(copy_methods->any_to_any);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		}
+		_starpu_mic_init_event(&(req->async_channel.event.mic_event), src_node);
+	}
+#endif
+	return ret;
+}
+
+int _starpu_mic_copy_interface(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	STARPU_ASSERT(src_kind == STARPU_MIC_RAM);
+
+	int dst_kind = starpu_node_get_kind(dst_node);
+
+	if (dst_kind == STARPU_CPU_RAM)
+	{
+		if (async_channel)
+			return _starpu_mic_copy_mic_to_ram_async((void*) (src + src_offset), src_node,
+								 (void*) (dst + dst_offset), dst_node,
+								 size);
+		else
+			return _starpu_mic_copy_mic_to_ram((void*) (src + src_offset), src_node,
+							   (void*) (dst + dst_offset), dst_node,
+							   size);
+	}
+	else
+	{
+		STARPU_ABORT();
+		return -1;
+	}
+}
+
+int _starpu_mic_direct_access_supported(unsigned node, unsigned handling_node)
+{
+	/* TODO: We don't handle direct MIC-MIC transfers yet */
+	return 0;
+}
+
+uintptr_t _starpu_mic_malloc_on_node(unsigned dst_node, size_t size, int flags)
+{
+	uintptr_t addr = 0;
+	if (_starpu_mic_allocate_memory((void **)(&addr), size, dst_node))
+		addr = 0;
+	return addr;
+}
+
+void _starpu_mic_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags)
+{
+	_starpu_mic_free_memory((void*) addr, size, dst_node);
+}

+ 8 - 3
src/drivers/mic/driver_mic_source.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2015,2017                                CNRS
+ * Copyright (C) 2015,2017,2019                           CNRS
  * Copyright (C) 2012,2017                                Inria
  * Copyright (C) 2013,2014,2017                           Université de Bordeaux
  * Copyright (C) 2013                                     Thibaut Lambert
@@ -74,12 +74,17 @@ int _starpu_mic_copy_ram_to_mic_async(void *src, unsigned src_node STARPU_ATTRIB
 int _starpu_mic_copy_mic_to_ram_async(void *src, unsigned src_node, void *dst, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size);
 
 int _starpu_mic_init_event(struct _starpu_mic_async_event *event, unsigned memory_node);
-void _starpu_mic_wait_request_completion(struct _starpu_mic_async_event *event);
-int _starpu_mic_request_is_complete(struct _starpu_mic_async_event *event);
 
 void *_starpu_mic_src_worker(void *arg);
 
 #endif /* STARPU_USE_MIC */
 
+unsigned _starpu_mic_test_request_completion(struct _starpu_async_channel *async_channel);
+void _starpu_mic_wait_request_completion(struct _starpu_async_channel *async_channel);
+int _starpu_mic_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_mic_copy_interface(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_mic_direct_access_supported(unsigned node, unsigned handling_node);
+uintptr_t _starpu_mic_malloc_on_node(unsigned dst_node, size_t size, int flags);
+void _starpu_mic_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags);
 
 #endif /* __DRIVER_MIC_SOURCE_H__ */

+ 66 - 3
src/drivers/mpi/driver_mpi_common.c

@@ -21,7 +21,7 @@
 #include <core/workers.h>
 #include <core/perfmodel/perfmodel.h>
 #include <drivers/mp_common/source_common.h>
-#include "driver_mpi_common.h"
+#include <drivers/mpi/driver_mpi_common.h>
 
 #define NITER 32
 #define SIZE_BANDWIDTH (1024*1024)
@@ -366,7 +366,7 @@ static void _starpu_mpi_common_polling_node(struct _starpu_mp_node * node)
 /* - In device to device communications, the first ack received by host
  * is considered as the sender (but it cannot be, in fact, the sender)
  */
-int _starpu_mpi_common_test_event(struct _starpu_async_channel * event)
+unsigned _starpu_mpi_common_test_event(struct _starpu_async_channel * event)
 {
         if (event->event.mpi_ms_event.requests != NULL && !_starpu_mpi_ms_event_request_list_empty(event->event.mpi_ms_event.requests))
         {
@@ -411,7 +411,7 @@ int _starpu_mpi_common_test_event(struct _starpu_async_channel * event)
 /* - In device to device communications, the first ack received by host
  * is considered as the sender (but it cannot be, in fact, the sender)
  */
-void _starpu_mpi_common_wait_event(struct _starpu_async_channel * event)
+void _starpu_mpi_common_wait_request_completion(struct _starpu_async_channel * event)
 {
         if (event->event.mpi_ms_event.requests != NULL && !_starpu_mpi_ms_event_request_list_empty(event->event.mpi_ms_event.requests))
         {
@@ -550,3 +550,66 @@ void _starpu_mpi_common_measure_bandwidth_latency(double timing_dtod[STARPU_MAXM
         }
         free(buf);
 }
+
+int _starpu_mpi_common_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_MPI_MS_RAM && dst_kind == STARPU_CPU_RAM);
+
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mpi_ms_copy_disabled() || !(copy_methods->mpi_ms_to_ram_async || copy_methods->any_to_any))
+	{
+		/* this is not associated to a request so it's synchronous */
+		STARPU_ASSERT(copy_methods->mpi_ms_to_ram || copy_methods->any_to_any);
+		if (copy_methods->mpi_ms_to_ram)
+			copy_methods->mpi_ms_to_ram(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	}
+	else
+	{
+		req->async_channel.type = STARPU_MPI_MS_RAM;
+		if(copy_methods->mpi_ms_to_ram_async)
+			ret = copy_methods->mpi_ms_to_ram_async(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		else
+		{
+			STARPU_ASSERT(copy_methods->any_to_any);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		}
+	}
+	return ret;
+}
+
+int _starpu_mpi_common_copy_data_to_mpi(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_MPI_MS_RAM && dst_kind == STARPU_MPI_MS_RAM);
+
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+
+	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mpi_ms_copy_disabled() || !(copy_methods->mpi_ms_to_mpi_ms_async || copy_methods->any_to_any))
+	{
+		/* this is not associated with a request, so it's synchronous */
+		STARPU_ASSERT(copy_methods->mpi_ms_to_mpi_ms || copy_methods->any_to_any);
+		if (copy_methods->mpi_ms_to_mpi_ms)
+			copy_methods->mpi_ms_to_mpi_ms(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	}
+	else
+	{
+		req->async_channel.type = STARPU_MPI_MS_RAM;
+		if(copy_methods->mpi_ms_to_mpi_ms_async)
+			ret = copy_methods->mpi_ms_to_mpi_ms_async(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		else
+		{
+			STARPU_ASSERT(copy_methods->any_to_any);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		}
+	}
+	return ret;
+}
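
Both helpers above follow the same decision: when there is no request, when asynchronous copies are disabled, or when the data interface provides no asynchronous method, the copy is performed synchronously; otherwise the request's channel is tagged with STARPU_MPI_MS_RAM and the asynchronous variant is launched, a non-zero return meaning the transfer is still in flight. Below is a condensed sketch of that control flow; sync_copy and async_copy are hypothetical placeholders standing in for the per-interface copy methods.

/* Condensed sketch of the sync/async decision used by the helpers above. */
static int copy_decision_sketch(struct _starpu_data_request *req,
				void (*sync_copy)(void),
				int (*async_copy)(struct _starpu_async_channel *))
{
	if (!req || starpu_asynchronous_copy_disabled()
	    || starpu_asynchronous_mpi_ms_copy_disabled() || !async_copy)
	{
		/* No request to attach the transfer to: copy synchronously. */
		sync_copy();
		return 0;
	}
	/* Tag the channel so the completion code knows which driver owns it,
	 * then start the asynchronous transfer. */
	req->async_channel.type = STARPU_MPI_MS_RAM;
	return async_copy(&req->async_channel);
}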

+ 5 - 3
src/drivers/mpi/driver_mpi_common.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2016,2017                                Inria
- * Copyright (C) 2017                                     CNRS
+ * Copyright (C) 2017,2019                                CNRS
  * Copyright (C) 2015                                     Mathieu Lirzin
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -47,13 +47,15 @@ void _starpu_mpi_common_mp_recv(const struct _starpu_mp_node *node, void *msg, i
 void _starpu_mpi_common_recv_from_device(const struct _starpu_mp_node *node, int src_devid, void *msg, int len, void * event);
 void _starpu_mpi_common_send_to_device(const struct _starpu_mp_node *node, int dst_devid, void *msg, int len, void * event);
 
-int _starpu_mpi_common_test_event(struct _starpu_async_channel * event);
-void _starpu_mpi_common_wait_event(struct _starpu_async_channel * event);
+unsigned _starpu_mpi_common_test_event(struct _starpu_async_channel * event);
+void _starpu_mpi_common_wait_request_completion(struct _starpu_async_channel * event);
 
 void _starpu_mpi_common_barrier(void);
 
 void _starpu_mpi_common_measure_bandwidth_latency(double bandwidth_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS], double latency_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS]);
 
+int _starpu_mpi_common_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_mpi_common_copy_data_to_mpi(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
 
 #endif  /* STARPU_USE_MPI_MASTER_SLAVE */
 

+ 55 - 0
src/drivers/mpi/driver_mpi_source.c

@@ -380,3 +380,58 @@ void *_starpu_mpi_src_worker(void *arg)
 
         return NULL;
 }
+
+int _starpu_mpi_copy_interface(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	STARPU_ASSERT(src_kind == STARPU_MPI_MS_RAM);
+
+	int dst_kind = starpu_node_get_kind(dst_node);
+
+	if (dst_kind == STARPU_CPU_RAM)
+	{
+		if (async_channel)
+			return _starpu_mpi_copy_mpi_to_ram_async((void*) (src + src_offset), src_node,
+								 (void*) (dst + dst_offset), dst_node,
+								 size, async_channel);
+		else
+			return _starpu_mpi_copy_mpi_to_ram_sync((void*) (src + src_offset), src_node,
+								(void*) (dst + dst_offset), dst_node,
+								size);
+	}
+	else if (dst_kind == STARPU_MPI_MS_RAM)
+	{
+		if (async_channel)
+			return _starpu_mpi_copy_sink_to_sink_async((void*) (src + src_offset), src_node,
+								   (void*) (dst + dst_offset), dst_node,
+								   size, async_channel);
+		else
+			return _starpu_mpi_copy_sink_to_sink_sync((void*) (src + src_offset), src_node,
+								  (void*) (dst + dst_offset), dst_node,
+								  size);
+	}
+	else
+	{
+		STARPU_ABORT();
+		return -1;
+	}
+}
+
+int _starpu_mpi_direct_access_supported(unsigned node, unsigned handling_node)
+{
+	enum starpu_node_kind kind = starpu_node_get_kind(handling_node);
+	return kind == STARPU_MPI_MS_RAM;
+}
+
+uintptr_t _starpu_mpi_malloc_on_node(unsigned dst_node, size_t size, int flags)
+{
+	uintptr_t addr = 0;
+	if (_starpu_mpi_src_allocate_memory((void **)(&addr), size, dst_node))
+		addr = 0;
+	return addr;
+}
+
+void _starpu_mpi_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags)
+{
+	_starpu_mpi_source_free_memory((void*) addr, dst_node);
+}
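
_starpu_mpi_copy_interface dispatches on the destination node kind and falls back to the synchronous sink-to-host or sink-to-sink transfer whenever no async channel is supplied. An illustrative caller is sketched below; the node numbers and buffers are hypothetical placeholders.

/* Hypothetical usage: with a NULL channel the helper above performs the
 * MPI-MS -> host transfer synchronously. */
static void fetch_back_to_host_sketch(uintptr_t src, unsigned mpi_ms_node,
				      uintptr_t dst, unsigned cpu_ram_node, size_t size)
{
	int ret = _starpu_mpi_copy_interface(src, 0 /* src_offset */, mpi_ms_node,
					     dst, 0 /* dst_offset */, cpu_ram_node,
					     size, NULL /* no channel: synchronous */);
	if (ret)
	{
		/* the underlying MPI transfer reported an error */
	}
}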

+ 5 - 2
src/drivers/mpi/driver_mpi_source.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2016,2017                                Inria
- * Copyright (C) 2017                                     CNRS
+ * Copyright (C) 2017,2019                                CNRS
  * Copyright (C) 2017                                     Université de Bordeaux
  * Copyright (C) 2015                                     Mathieu Lirzin
  *
@@ -47,7 +47,10 @@ int _starpu_mpi_copy_sink_to_sink_sync(void *src, unsigned src_node, void *dst,
 int _starpu_mpi_copy_mpi_to_ram_async(void *src, unsigned src_node, void *dst, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size, void * event);
 int _starpu_mpi_copy_ram_to_mpi_async(void *src, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst, unsigned dst_node, size_t size, void * event);
 int _starpu_mpi_copy_sink_to_sink_async(void *src, unsigned src_node, void *dst, unsigned dst_node, size_t size, void * event);
-
+int _starpu_mpi_copy_interface(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_mpi_direct_access_supported(unsigned node, unsigned handling_node);
+uintptr_t _starpu_mpi_malloc_on_node(unsigned dst_node, size_t size, int flags);
+void _starpu_mpi_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags);
 
 starpu_mpi_ms_kernel_t _starpu_mpi_ms_src_get_kernel_from_codelet(struct starpu_codelet *cl, unsigned nimpl);
 void(* _starpu_mpi_ms_src_get_kernel_from_job(const struct _starpu_mp_node *node STARPU_ATTRIBUTE_UNUSED, struct _starpu_job *j))(void);

+ 199 - 21
src/drivers/opencl/driver_opencl.c

@@ -3,7 +3,7 @@
  * Copyright (C) 2011,2012,2014-2017                      Inria
  * Copyright (C) 2010-2018                                Université de Bordeaux
  * Copyright (C) 2010                                     Mehdi Juhoor
- * Copyright (C) 2010-2017, 2019                                CNRS
+ * Copyright (C) 2010-2017,2019                           CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2011                                     Télécom-SudParis
  *
@@ -415,36 +415,34 @@ cl_int starpu_opencl_copy_async_sync(uintptr_t src, size_t src_offset, unsigned
 	cl_int err;
 	int ret;
 
-	switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind))
+	if (src_kind == STARPU_OPENCL_RAM && dst_kind == STARPU_CPU_RAM)
 	{
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
-		err = starpu_opencl_copy_opencl_to_ram(
-				(cl_mem) src, src_node,
-				(void*) (dst + dst_offset), dst_node,
-				size, src_offset, event, &ret);
+		err = starpu_opencl_copy_opencl_to_ram((cl_mem) src, src_node,
+						       (void*) (dst + dst_offset), dst_node,
+						       size, src_offset, event, &ret);
 		_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
 		return ret;
+	}
 
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
-		err = starpu_opencl_copy_ram_to_opencl(
-				(void*) (src + src_offset), src_node,
-				(cl_mem) dst, dst_node,
-				size, dst_offset, event, &ret);
+	if (src_kind == STARPU_CPU_RAM && dst_kind == STARPU_OPENCL_RAM)
+	{
+		err = starpu_opencl_copy_ram_to_opencl((void*) (src + src_offset), src_node,
+						       (cl_mem) dst, dst_node,
+						       size, dst_offset, event, &ret);
 		_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
 		return ret;
+	}
 
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_OPENCL_RAM):
-		err = starpu_opencl_copy_opencl_to_opencl(
-				(cl_mem) src, src_node, src_offset,
-				(cl_mem) dst, dst_node, dst_offset,
-				size, event, &ret);
+	if (src_kind == STARPU_OPENCL_RAM && (dst_kind == STARPU_CPU_RAM || dst_kind == STARPU_OPENCL_RAM))
+	{
+		err = starpu_opencl_copy_opencl_to_opencl((cl_mem) src, src_node, src_offset,
+							  (cl_mem) dst, dst_node, dst_offset,
+							  size, event, &ret);
 		_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
 		return ret;
-
-	default:
-		STARPU_ABORT();
-		break;
 	}
+
+	STARPU_ABORT();
 }
 
 #if 0
@@ -1095,6 +1093,37 @@ static void _starpu_opencl_execute_job(struct starpu_task *task, struct _starpu_
 }
 
 #ifdef STARPU_USE_OPENCL
+unsigned _starpu_opencl_test_request_completion(struct _starpu_async_channel *async_channel)
+{
+	cl_int event_status;
+	cl_event opencl_event = (*async_channel).event.opencl_event;
+	if (opencl_event == NULL) STARPU_ABORT();
+	cl_int err = clGetEventInfo(opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
+	if (STARPU_UNLIKELY(err != CL_SUCCESS))
+		STARPU_OPENCL_REPORT_ERROR(err);
+	if (event_status < 0)
+		STARPU_OPENCL_REPORT_ERROR(event_status);
+	if (event_status == CL_COMPLETE)
+	{
+		err = clReleaseEvent(opencl_event);
+		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
+	}
+	return (event_status == CL_COMPLETE);
+}
+
+void _starpu_opencl_wait_request_completion(struct _starpu_async_channel *async_channel)
+{
+	cl_int err;
+	if ((*async_channel).event.opencl_event == NULL)
+		STARPU_ABORT();
+	err = clWaitForEvents(1, &((*async_channel).event.opencl_event));
+	if (STARPU_UNLIKELY(err != CL_SUCCESS))
+		STARPU_OPENCL_REPORT_ERROR(err);
+	err = clReleaseEvent((*async_channel).event.opencl_event);
+	if (STARPU_UNLIKELY(err != CL_SUCCESS))
+		STARPU_OPENCL_REPORT_ERROR(err);
+}
+
 int _starpu_run_opencl(struct _starpu_worker *workerarg)
 {
 	_STARPU_DEBUG("Running OpenCL %u from the application\n", workerarg->devid);
@@ -1117,3 +1146,152 @@ struct _starpu_driver_ops _starpu_driver_opencl_ops =
 };
 
 #endif /* STARPU_USE_OPENCL */
+
+int _starpu_opencl_copy_data_to_opencl(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_OPENCL_RAM && dst_kind == STARPU_OPENCL_RAM);
+
+	int ret = 1;
+#ifdef STARPU_USE_OPENCL
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+	/* STARPU_OPENCL_RAM -> STARPU_OPENCL_RAM */
+	STARPU_ASSERT(starpu_worker_get_local_memory_node() == dst_node || starpu_worker_get_local_memory_node() == src_node);
+	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() || !(copy_methods->opencl_to_opencl_async || copy_methods->any_to_any))
+	{
+		STARPU_ASSERT(copy_methods->opencl_to_opencl || copy_methods->any_to_any);
+		/* this is not associated with a request, so it's synchronous */
+		if (copy_methods->opencl_to_opencl)
+			copy_methods->opencl_to_opencl(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	}
+	else
+	{
+		req->async_channel.type = STARPU_OPENCL_RAM;
+		if (copy_methods->opencl_to_opencl_async)
+			ret = copy_methods->opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
+		else
+		{
+			STARPU_ASSERT(copy_methods->any_to_any);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		}
+	}
+#endif
+	return ret;
+}
+
+int _starpu_opencl_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_OPENCL_RAM && dst_kind == STARPU_CPU_RAM);
+
+	int ret = 1;
+#ifdef STARPU_USE_OPENCL
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+	/* OpenCL -> RAM */
+	STARPU_ASSERT(starpu_worker_get_local_memory_node() == src_node);
+	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() || !(copy_methods->opencl_to_ram_async || copy_methods->any_to_any))
+	{
+		STARPU_ASSERT(copy_methods->opencl_to_ram || copy_methods->any_to_any);
+		/* this is not associated with a request, so it's synchronous */
+		if (copy_methods->opencl_to_ram)
+			copy_methods->opencl_to_ram(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	}
+	else
+	{
+		req->async_channel.type = STARPU_OPENCL_RAM;
+		if (copy_methods->opencl_to_ram_async)
+			ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
+		else
+		{
+			STARPU_ASSERT(copy_methods->any_to_any);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		}
+	}
+#endif
+	return ret;
+}
+
+int _starpu_opencl_copy_interface(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	STARPU_ASSERT(src_kind == STARPU_OPENCL_RAM);
+
+	int dst_kind = starpu_node_get_kind(dst_node);
+
+	if (dst_kind == STARPU_OPENCL_RAM || dst_kind == STARPU_CPU_RAM)
+	{
+		return starpu_opencl_copy_async_sync(src, src_offset, src_node,
+						     dst, dst_offset, dst_node,
+						     size,
+						     &async_channel->event.opencl_event);
+	}
+	else
+	{
+		STARPU_ABORT();
+		return -1;
+	}
+}
+
+int _starpu_opencl_direct_access_supported(unsigned node, unsigned handling_node)
+{
+	return 0;
+}
+
+uintptr_t _starpu_opencl_malloc_on_node(unsigned dst_node, size_t size, int flags)
+{
+	uintptr_t addr = 0;
+#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
+#ifdef STARPU_SIMGRID
+	static uintptr_t last[STARPU_MAXNODES];
+	/* Sleep for the allocation */
+	STARPU_PTHREAD_MUTEX_LOCK(&opencl_alloc_mutex);
+	if (_starpu_simgrid_cuda_malloc_cost())
+		MSG_process_sleep(0.000175);
+	if (!last[dst_node])
+		last[dst_node] = 1<<10;
+	addr = last[dst_node];
+	last[dst_node]+=size;
+	STARPU_ASSERT(last[dst_node] >= addr);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&opencl_alloc_mutex);
+#else
+	int ret;
+	cl_mem ptr;
+
+	ret = starpu_opencl_allocate_memory(starpu_memory_node_get_devid(dst_node), &ptr, size, CL_MEM_READ_WRITE);
+	if (ret)
+	{
+		addr = 0;
+	}
+	else
+	{
+		addr = (uintptr_t)ptr;
+	}
+#endif
+
+#endif
+	return addr;
+}
+
+void _starpu_opencl_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags)
+{
+#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
+#ifdef STARPU_SIMGRID
+	STARPU_PTHREAD_MUTEX_LOCK(&opencl_alloc_mutex);
+	/* Sleep for the free */
+	if (_starpu_simgrid_cuda_malloc_cost())
+		MSG_process_sleep(0.000750);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&opencl_alloc_mutex);
+#else
+	cl_int err;
+	err = clReleaseMemObject((void*)addr);
+	if (STARPU_UNLIKELY(err != CL_SUCCESS))
+		STARPU_OPENCL_REPORT_ERROR(err);
+#endif
+#endif
+}
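
The new test/wait pair gives the data-request code a uniform way to either poll an OpenCL transfer or block on it: the test variant releases the event as soon as it reports completion, the wait variant blocks and then releases it. The sketch below shows a hypothetical polling loop built on top of them, assuming STARPU_USE_OPENCL is enabled; the spin bound is arbitrary.

/* Hypothetical sketch: poll the channel a bounded number of times,
 * then block until the OpenCL event completes. */
static void drain_opencl_channel_sketch(struct _starpu_async_channel *channel)
{
	int spins;
	for (spins = 0; spins < 1000; spins++)
		if (_starpu_opencl_test_request_completion(channel))
			return; /* completed; the event was released by the test */
	/* Still pending: block, which also releases the event. */
	_starpu_opencl_wait_request_completion(channel);
}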

+ 9 - 0
src/drivers/opencl/driver_opencl.h

@@ -70,4 +70,13 @@ cl_int _starpu_opencl_copy_rect_ram_to_opencl(void *ptr, unsigned src_node, cl_m
                                               size_t host_row_pitch, size_t host_slice_pitch, cl_event *event);
 #endif
 
+unsigned _starpu_opencl_test_request_completion(struct _starpu_async_channel *async_channel);
+void _starpu_opencl_wait_request_completion(struct _starpu_async_channel *async_channel);
+int _starpu_opencl_copy_data_to_opencl(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_opencl_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_opencl_copy_interface(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_opencl_direct_access_supported(unsigned node, unsigned handling_node);
+uintptr_t _starpu_opencl_malloc_on_node(unsigned dst_node, size_t size, int flags);
+void _starpu_opencl_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags);
+
 #endif //  __DRIVER_OPENCL_H__

+ 31 - 1
src/drivers/scc/driver_scc_common.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2012,2016,2017                           Inria
- * Copyright (C) 2015,2017                                CNRS
+ * Copyright (C) 2015,2017,2019                           CNRS
  * Copyright (C) 2013,2017                                Université de Bordeaux
  * Copyright (C) 2013                                     Thibaut Lambert
  *
@@ -191,3 +191,33 @@ int _starpu_scc_common_recv_is_ready(const struct _starpu_mp_node *mp_node)
   ************/
   STARPU_ASSERT(0);
 }
+
+int _starpu_scc_common_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_SCC_RAM && dst_kind == STARPU_CPU_RAM);
+
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+	if (copy_methods->scc_sink_to_src)
+		copy_methods->scc_sink_to_src(src_interface, src_node, dst_interface, dst_node);
+	else
+		copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	return ret;
+}
+
+int _starpu_scc_common_copy_data_to_scc(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_SCC_RAM && dst_kind == STARPU_SCC_RAM);
+
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+	if (copy_methods->scc_sink_to_sink)
+		copy_methods->scc_sink_to_sink(src_interface, src_node, dst_interface, dst_node);
+	else
+		copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	return ret;
+}

+ 4 - 1
src/drivers/scc/driver_scc_common.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2012,2016,2017                           Inria
- * Copyright (C) 2015,2017                                CNRS
+ * Copyright (C) 2015,2017,2019                           CNRS
  * Copyright (C) 2013                                     Université de Bordeaux
  * Copyright (C) 2013                                     Thibaut Lambert
  *
@@ -49,6 +49,9 @@ void _starpu_scc_common_report_rcce_error(const char *func, const char *file, co
 
 int _starpu_scc_common_recv_is_ready(const struct _starpu_mp_node *mp_node);
 
+int _starpu_scc_common_copy_data_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_scc_common_copy_data_to_scc(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+
 #endif /* STARPU_USE_SCC */
 
 

+ 45 - 0
src/drivers/scc/driver_scc_source.c

@@ -325,3 +325,48 @@ void *_starpu_scc_src_worker(void *arg)
 
 	return NULL;
 }
+
+int _starpu_scc_copy_interface(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	STARPU_ASSERT(src_kind == STARPU_SCC_RAM);
+
+	int dst_kind = starpu_node_get_kind(dst_node);
+
+	if (dst_kind == STARPU_CPU_RAM)
+	{
+		return _starpu_scc_copy_sink_to_src((void*) (src + src_offset), src_node,
+						    (void*) (dst + dst_offset), dst_node,
+						    size);
+
+	}
+	else if (dst_kind == STARPU_SCC_RAM)
+	{
+		return _starpu_scc_copy_sink_to_sink((void*) (src + src_offset), src_node,
+						     (void*) (dst + dst_offset), dst_node,
+						     size);
+	}
+	else
+	{
+		STARPU_ABORT();
+		return -1;
+	}
+}
+
+int _starpu_scc_direct_access_supported(unsigned node, unsigned handling_node)
+{
+	return 1;
+}
+
+uintptr_t _starpu_scc_malloc_on_node(unsigned dst_node, size_t size, int flags)
+{
+	uintptr_t addr = 0;
+	if (_starpu_scc_allocate_memory((void **)(&addr), size, dst_node))
+		addr = 0;
+	return addr;
+}
+
+void _starpu_scc_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags)
+{
+	_starpu_scc_free_memory((void *) addr, dst_node);
+}
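
These SCC entry points wrap _starpu_scc_allocate_memory and _starpu_scc_free_memory behind the common malloc_on_node/free_on_node signature, with a returned address of 0 signalling an allocation failure. A hypothetical caller:

/* Hypothetical usage: a zero address means the allocation failed on that node. */
static int scratch_buffer_sketch(unsigned scc_node, size_t size)
{
	uintptr_t buf = _starpu_scc_malloc_on_node(scc_node, size, 0 /* flags */);
	if (buf == 0)
		return -1; /* allocation failed on the SCC node */
	/* ... use the buffer ... */
	_starpu_scc_free_on_node(scc_node, buf, size, 0 /* flags */);
	return 0;
}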

+ 5 - 0
src/drivers/scc/driver_scc_source.h

@@ -52,6 +52,11 @@ int _starpu_scc_copy_sink_to_sink(void *src, unsigned src_node, void *dst, unsig
 
 void *_starpu_scc_src_worker(void *arg);
 
+int _starpu_scc_copy_interface(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_scc_direct_access_supported(unsigned node, unsigned handling_node);
+uintptr_t _starpu_scc_malloc_on_node(unsigned dst_node, size_t size, int flags);
+void _starpu_scc_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags);
+
 #endif /* STARPU_USE_SCC */