Browse Source

Merge branch 'master' into knobs

Conflicts:
	src/drivers/cpu/driver_cpu.c
	src/drivers/opencl/driver_opencl.c
Olivier Aumage 6 years ago
parent
commit
b5b0b53ac9
39 changed files with 1722 additions and 1176 deletions
  1. 1 1
      include/starpu_data_filters.h
  2. 9 10
      include/starpu_worker.h
  3. 5 1
      mpi/src/nmad/starpu_mpi_nmad.c
  4. 3 3
      src/Makefile.am
  5. 5 3
      src/common/thread.c
  6. 7 8
      src/core/disk.c
  7. 1 1
      src/core/perfmodel/multiple_regression.c
  8. 3 26
      src/core/perfmodel/perfmodel.c
  9. 7 1
      src/core/simgrid.c
  10. 3 0
      src/core/topology.c
  11. 22 0
      src/core/workers.c
  12. 3 1
      src/core/workers.h
  13. 16 58
      src/datawizard/coherency.c
  14. 26 708
      src/datawizard/copy_driver.c
  15. 2 2
      src/datawizard/interfaces/bcsr_filters.c
  16. 17 234
      src/datawizard/malloc.c
  17. 4 34
      src/datawizard/memory_nodes.c
  18. 0 5
      src/datawizard/memory_nodes.h
  19. 153 0
      src/datawizard/node_ops.c
  20. 50 0
      src/datawizard/node_ops.h
  21. 74 3
      src/drivers/cpu/driver_cpu.c
  22. 8 5
      src/drivers/cpu/driver_cpu.h
  23. 340 3
      src/drivers/cuda/driver_cuda.c
  24. 17 1
      src/drivers/cuda/driver_cuda.h
  25. 208 8
      src/drivers/disk/driver_disk.c
  26. 16 1
      src/drivers/disk/driver_disk.h
  27. 142 20
      src/drivers/mic/driver_mic_source.c
  28. 13 3
      src/drivers/mic/driver_mic_source.h
  29. 4 3
      src/drivers/mpi/driver_mpi_common.c
  30. 3 4
      src/drivers/mpi/driver_mpi_common.h
  31. 166 0
      src/drivers/mpi/driver_mpi_source.c
  32. 12 1
      src/drivers/mpi/driver_mpi_source.h
  33. 248 24
      src/drivers/opencl/driver_opencl.c
  34. 15 0
      src/drivers/opencl/driver_opencl.h
  35. 2 2
      src/drivers/opencl/driver_opencl_utils.c
  36. 1 1
      src/drivers/scc/driver_scc_common.c
  37. 1 1
      src/drivers/scc/driver_scc_common.h
  38. 103 0
      src/drivers/scc/driver_scc_source.c
  39. 12 0
      src/drivers/scc/driver_scc_source.h

+ 1 - 1
include/starpu_data_filters.h

@@ -322,7 +322,7 @@ void starpu_bcsr_filter_canonical_block(void *father_interface, void *child_inte
 /**
    Return the child_ops of the partition obtained with starpu_bcsr_filter_canonical_block().
 */
-void starpu_bcsr_filter_canonical_block_child_ops(struct starpu_data_filter *f, unsigned child);
+struct starpu_data_interface_ops *starpu_bcsr_filter_canonical_block_child_ops(struct starpu_data_filter *f, unsigned child);
 
 /** @} */
 

+ 9 - 10
include/starpu_worker.h

@@ -42,16 +42,15 @@ extern "C"
 
 enum starpu_node_kind
 {
-	STARPU_UNUSED     = 0x00,
-	STARPU_CPU_RAM    = 0x01,
-	STARPU_CUDA_RAM   = 0x02,
-	STARPU_OPENCL_RAM = 0x03,
-	STARPU_DISK_RAM   = 0x04,
-	STARPU_MIC_RAM    = 0x05,
-	STARPU_SCC_RAM    = 0x06,
-	STARPU_SCC_SHM    = 0x07,
-	STARPU_MPI_MS_RAM = 0x08
-
+	STARPU_UNUSED=0,
+	STARPU_CPU_RAM=1,
+	STARPU_CUDA_RAM=2,
+	STARPU_OPENCL_RAM=3,
+	STARPU_DISK_RAM=4,
+	STARPU_MIC_RAM=5,
+	STARPU_SCC_RAM=6,
+	STARPU_SCC_SHM=7,
+	STARPU_MPI_MS_RAM=8
 };
 
 /**

+ 5 - 1
mpi/src/nmad/starpu_mpi_nmad.c

@@ -649,7 +649,11 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
 	/* Tell pioman to use a bound thread for communication progression */
 	unsigned piom_bindid = starpu_get_next_bindid(STARPU_THREAD_ACTIVE, NULL, 0);
 	int indexes[1] = {piom_bindid};
-	piom_ltask_set_bound_thread_indexes(HWLOC_OBJ_CORE,indexes,1);
+	piom_ltask_set_bound_thread_indexes(HWLOC_OBJ_PU,indexes,1);
+
+	/* We force the "MPI" thread to share the same core as the pioman thread
+	   to avoid binding it on the same core as a worker */
+	_starpu_mpi_thread_cpuid = piom_bindid;
 
 	/* Register some hooks for communication progress if needed */
 	int polling_point_prog, polling_point_idle;

+ 3 - 3
src/Makefile.am

@@ -3,7 +3,7 @@
 # Copyright (C) 2011-2017,2019                           Inria
 # Copyright (C) 2012                                     Benjamin Lorendeau
 # Copyright (C) 2009-2019                                Université de Bordeaux
-# Copyright (C) 2010-2015,2017,2018                      CNRS
+# Copyright (C) 2010-2015,2017,2018,2019                 CNRS
 # Copyright (C) 2013                                     Simon Archipoff
 #
 # StarPU is free software; you can redistribute it and/or modify
@@ -98,6 +98,7 @@ noinst_HEADERS = 						\
 	core/detect_combined_workers.h				\
 	sched_policies/helper_mct.h				\
 	sched_policies/fifo_queues.h				\
+	datawizard/node_ops.h					\
 	datawizard/footprint.h					\
 	datawizard/datawizard.h					\
 	datawizard/data_request.h				\
@@ -226,6 +227,7 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES = 		\
 	sched_policies/graph_test_policy.c			\
 	drivers/driver_common/driver_common.c			\
 	drivers/disk/driver_disk.c				\
+	datawizard/node_ops.c					\
 	datawizard/memory_nodes.c				\
 	datawizard/write_back.c					\
 	datawizard/coherency.c					\
@@ -323,9 +325,7 @@ if STARPU_HAVE_HDF5
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += core/disk_ops/disk_hdf5.c
 endif
 
-if STARPU_USE_CPU
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cpu/driver_cpu.c
-endif
 
 if STARPU_USE_CUDA
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/cuda/driver_cuda.c

+ 5 - 3
src/common/thread.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2013,2015,2017                           Inria
  * Copyright (C) 2010-2017                                CNRS
- * Copyright (C) 2010,2012-2018                           Université de Bordeaux
+ * Copyright (C) 2010,2012-2019                           Université de Bordeaux
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -259,7 +259,8 @@ int starpu_pthread_setspecific(starpu_pthread_key_t key, const void *pointer)
 	char *end;
 	/* Test whether it is an MPI rank */
 	strtol(process_name, &end, 10);
-	if (!*end || !strcmp(process_name, "wait for mpi transfer"))
+	if (!*end || !strcmp(process_name, "wait for mpi transfer") ||
+			(!strcmp(process_name, "main") && _starpu_simgrid_running_smpi()))
 		/* Special-case the SMPI process */
 		array = smpi_process_get_user_data();
 	else
@@ -281,7 +282,8 @@ void* starpu_pthread_getspecific(starpu_pthread_key_t key)
 	char *end;
 	/* Test whether it is an MPI rank */
 	strtol(process_name, &end, 10);
-	if (!*end || !strcmp(process_name, "wait for mpi transfer"))
+	if (!*end || !strcmp(process_name, "wait for mpi transfer") ||
+			(!strcmp(process_name, "main") && _starpu_simgrid_running_smpi()))
 		/* Special-case the SMPI processes */
 		array = smpi_process_get_user_data();
 	else

+ 7 - 8
src/core/disk.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2013,2017                                Inria
- * Copyright (C) 2015-2017                                CNRS
+ * Copyright (C) 2015-2017, 2019                          CNRS
  * Copyright (C) 2013-2015,2017,2018-2019                 Université de Bordeaux
  * Copyright (C) 2013                                     Corentin Salingue
  *
@@ -451,13 +451,12 @@ static int add_disk_in_list(unsigned node,  struct starpu_disk_ops *func, void *
 
 int _starpu_disk_can_copy(unsigned node1, unsigned node2)
 {
-	if (starpu_node_get_kind(node1) == STARPU_DISK_RAM && starpu_node_get_kind(node2) == STARPU_DISK_RAM)
-	{
-		if (disk_register_list[node1]->functions == disk_register_list[node2]->functions)
-			/* they must have a copy function */
-			if (disk_register_list[node1]->functions->copy != NULL)
-				return 1;
-	}
+	STARPU_ASSERT(starpu_node_get_kind(node1) == STARPU_DISK_RAM && starpu_node_get_kind(node2) == STARPU_DISK_RAM);
+
+	if (disk_register_list[node1]->functions == disk_register_list[node2]->functions)
+		/* they must have a copy function */
+		if (disk_register_list[node1]->functions->copy != NULL)
+			return 1;
 	return 0;
 }
 

+ 1 - 1
src/core/perfmodel/multiple_regression.c

@@ -283,7 +283,7 @@ int _starpu_multiple_regression(struct starpu_perfmodel_history_list *ptr, doubl
 	snprintf(directory, sizeof(directory), "%s/.starpu/sampling/codelets/tmp", _starpu_get_home_path());
 	_starpu_mkpath_and_check(directory, S_IRWXU);
 
-	char filepath[300];
+	char filepath[400];
 	snprintf(filepath, sizeof(filepath), "%s/%s.out", directory,codelet_name);
 
 	unsigned long old_lines=0;

+ 3 - 26
src/core/perfmodel/perfmodel.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2011,2012,2014,2016,2017                 Inria
  * Copyright (C) 2008-2019                                Université de Bordeaux
- * Copyright (C) 2010-2017                                CNRS
+ * Copyright (C) 2010-2017, 2019                          CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2011                                     Télécom-SudParis
  * Copyright (C) 2016                                     Uppsala University
@@ -246,7 +246,6 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 {
 	unsigned i;
 	double sum = 0.0;
-	enum starpu_node_kind node_kind;
 	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 
 #ifdef STARPU_DEVEL
@@ -258,35 +257,13 @@ double starpu_task_expected_conversion_time(struct starpu_task *task,
 	{
 		starpu_data_handle_t handle;
 		struct starpu_task *conversion_task;
+		enum starpu_node_kind node_kind;
 
 		handle = STARPU_TASK_GET_HANDLE(task, i);
 		if (!_starpu_data_is_multiformat_handle(handle))
 			continue;
 
-		switch(arch->devices[0].type)
-		{
-			case STARPU_CPU_WORKER:
-				node_kind = STARPU_CPU_RAM;
-				break;
-			case STARPU_CUDA_WORKER:
-				node_kind = STARPU_CUDA_RAM;
-				break;
-			case STARPU_OPENCL_WORKER:
-				node_kind = STARPU_OPENCL_RAM;
-				break;
-			case STARPU_MIC_WORKER:
-				node_kind = STARPU_MIC_RAM;
-				break;
-			case STARPU_SCC_WORKER:
-				node_kind = STARPU_SCC_RAM;
-				break;
-			case STARPU_MPI_MS_WORKER:
-				node_kind = STARPU_MPI_MS_RAM;
-				break;
-			default:
-				STARPU_ABORT();
-				break;
-		}
+		node_kind = _starpu_worker_get_node_kind(arch->devices[0].type);
 		if (!_starpu_handle_needs_conversion_task_for_arch(handle, node_kind))
 			continue;
 

+ 7 - 1
src/core/simgrid.c

@@ -38,6 +38,7 @@
 #ifdef STARPU_HAVE_SIMGRID_HOST_H
 #include <simgrid/host.h>
 #endif
+#include <smpi/smpi.h>
 
 #pragma weak starpu_main
 extern int starpu_main(int argc, char *argv[]);
@@ -48,6 +49,11 @@ extern int smpi_main(int (*realmain) (int argc, char *argv[]), int argc, char *a
 #pragma weak _starpu_mpi_simgrid_init
 extern int _starpu_mpi_simgrid_init(int argc, char *argv[]);
 
+#pragma weak smpi_process_set_user_data
+#if !HAVE_DECL_SMPI_PROCESS_SET_USER_DATA && !defined(smpi_process_set_user_data)
+extern void smpi_process_set_user_data(void *);
+#endif
+
 /* 1 when MSG_init was done, 2 when initialized through redirected main, 3 when
  * initialized through MSG_process_attach */
 static int simgrid_started;
@@ -396,7 +402,7 @@ void _starpu_simgrid_init_early(int *argc STARPU_ATTRIBUTE_UNUSED, char ***argv
 #endif
 		void **tsd;
 		_STARPU_CALLOC(tsd, MAX_TSD+1, sizeof(void*));
-		MSG_process_set_data(MSG_process_self(), tsd);
+		smpi_process_set_user_data(tsd);
 	}
 	unsigned i;
 	for (i = 0; i < STARPU_MAXNODES; i++)

+ 3 - 0
src/core/topology.c

@@ -37,6 +37,8 @@
 #include <datawizard/datastats.h>
 #include <datawizard/memory_nodes.h>
 #include <datawizard/memory_manager.h>
+#include <datawizard/node_ops.h>
+
 #include <common/uthash.h>
 
 #ifdef STARPU_HAVE_HWLOC
@@ -2969,6 +2971,7 @@ int _starpu_build_topology(struct _starpu_machine_config *config, int no_mp_conf
 	/* for the data management library */
 	_starpu_memory_nodes_init();
 	_starpu_datastats_init();
+	_starpu_node_ops_init();
 
 	_starpu_init_workers_binding_and_memory(config, no_mp_config);
 

+ 22 - 0
src/core/workers.c

@@ -2601,3 +2601,25 @@ void starpu_worker_set_waking_up_callback(void (*callback)(unsigned workerid))
 	_starpu_config.conf.callback_worker_waking_up = callback;
 }
 #endif
+
+enum starpu_node_kind _starpu_worker_get_node_kind(enum starpu_worker_archtype type)
+{
+	switch(type)
+	{
+		case STARPU_CPU_WORKER:
+			return STARPU_CPU_RAM;
+		case STARPU_CUDA_WORKER:
+			return STARPU_CUDA_RAM;
+		case STARPU_OPENCL_WORKER:
+			return STARPU_OPENCL_RAM;
+			break;
+		case STARPU_MIC_WORKER:
+			return STARPU_MIC_RAM;
+		case STARPU_SCC_WORKER:
+			return STARPU_SCC_RAM;
+		case STARPU_MPI_MS_WORKER:
+			return STARPU_MPI_MS_RAM;
+		default:
+			STARPU_ABORT();
+	}
+}

+ 3 - 1
src/core/workers.h

@@ -154,7 +154,7 @@ LIST_TYPE(_starpu_worker,
 	unsigned worker_is_initialized;
 	enum _starpu_worker_status status; /**< what is the worker doing now ? (eg. CALLBACK) */
 	unsigned state_keep_awake; /**< !0 if a task has been pushed to the worker and the task has not yet been seen by the worker, the worker should no go to sleep before processing this task*/
-	char name[64];
+	char name[128];
 	char short_name[32];
 	unsigned run_by_starpu; /**< Is this run by StarPU or directly by the application ? */
 	struct _starpu_driver_ops *driver_ops;
@@ -667,6 +667,8 @@ static inline unsigned __starpu_worker_get_id_check(const char *f, int l)
 }
 #define _starpu_worker_get_id_check(f,l) __starpu_worker_get_id_check(f,l)
 
+enum starpu_node_kind _starpu_worker_get_node_kind(enum starpu_worker_archtype type);
+
 void _starpu_worker_set_stream_ctx(unsigned workerid, struct _starpu_sched_ctx *sched_ctx);
 
 struct _starpu_sched_ctx* _starpu_worker_get_ctx_stream(unsigned stream_workerid);

+ 16 - 58
src/datawizard/coherency.c

@@ -3,7 +3,7 @@
  * Copyright (C) 2011-2014,2016,2017                      Inria
  * Copyright (C) 2008-2019                                Université de Bordeaux
  * Copyright (C) 2018                                     Federal University of Rio Grande do Sul (UFRGS)
- * Copyright (C) 2010-2018                                CNRS
+ * Copyright (C) 2010-2019                                CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -17,6 +17,9 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#include <limits.h>
+#include <math.h>
+
 #include <common/config.h>
 #include <datawizard/coherency.h>
 #include <datawizard/copy_driver.h>
@@ -25,11 +28,10 @@
 #include <core/dependencies/data_concurrency.h>
 #include <core/disk.h>
 #include <profiling/profiling.h>
-#include <math.h>
 #include <core/task.h>
 #include <starpu_scheduler.h>
 #include <core/workers.h>
-#include <limits.h>
+#include <datawizard/node_ops.h>
 
 #ifdef STARPU_SIMGRID
 #include <core/simgrid.h>
@@ -106,7 +108,7 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 		STARPU_ASSERT(handle->per_node[src_node].initialized);
 		return src_node;
 	}
-	
+
 	int i_ram = -1;
 	int i_gpu = -1;
 	int i_disk = -1;
@@ -114,7 +116,6 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 	/* Revert to dumb strategy: take RAM unless only a GPU has it */
 	for (i = 0; i < nnodes; i++)
 	{
-		
 		if (src_node_mask & (1<<i))
 		{
 			int (*can_copy)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, unsigned handling_node) = handle->ops->copy_methods->can_copy;
@@ -146,16 +147,16 @@ int _starpu_select_src_node(starpu_data_handle_t handle, unsigned destination)
 			    starpu_node_get_kind(i) == STARPU_MIC_RAM)
 				i_gpu = i;
 
-			if (starpu_node_get_kind(i) == STARPU_CPU_RAM || 
+			if (starpu_node_get_kind(i) == STARPU_CPU_RAM ||
 			    starpu_node_get_kind(i) == STARPU_SCC_RAM ||
 			    starpu_node_get_kind(i) == STARPU_SCC_SHM ||
                             starpu_node_get_kind(i) == STARPU_MPI_MS_RAM)
 				i_ram = i;
-			if (starpu_node_get_kind(i) == STARPU_DISK_RAM)			
+			if (starpu_node_get_kind(i) == STARPU_DISK_RAM)
 				i_disk = i;
 		}
 	}
-	
+
 	/* we have to use cpu_ram in first */
 	if (i_ram != -1)
 		src_node = i_ram;
@@ -233,55 +234,12 @@ static int worker_supports_direct_access(unsigned node, unsigned handling_node)
 		return 0;
 
 	int type = starpu_node_get_kind(node);
-	switch (type)
+	if (_node_ops[type].direct_access_supported)
+		return _node_ops[type].direct_access_supported(node, handling_node);
+	else
 	{
-		case STARPU_CUDA_RAM:
-		{
-			/* GPUs not always allow direct remote access: if CUDA4
-			 * is enabled, we allow two CUDA devices to communicate. */
-#ifdef STARPU_SIMGRID
-			if (starpu_node_get_kind(handling_node) == STARPU_CUDA_RAM)
-			{
-				msg_host_t host = _starpu_simgrid_get_memnode_host(handling_node);
-				const char* cuda_memcpy_peer = MSG_host_get_property_value(host, "memcpy_peer");
-				return cuda_memcpy_peer && atoll(cuda_memcpy_peer);
-			}
-			else
-				return 0;
-#elif defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
-			/* simgrid */
-			enum starpu_node_kind kind = starpu_node_get_kind(handling_node);
-			return kind == STARPU_CUDA_RAM;
-#else /* STARPU_HAVE_CUDA_MEMCPY_PEER */
-			/* Direct GPU-GPU transfers are not allowed in general */
-			return 0;
-#endif /* STARPU_HAVE_CUDA_MEMCPY_PEER */
-		}
-		case STARPU_OPENCL_RAM:
-			return 0;
-		case STARPU_MIC_RAM:
-			/* TODO: We don't handle direct MIC-MIC transfers yet */
-			return 0;
-		case STARPU_DISK_RAM:
-			/* Each worker can manage disks but disk <-> disk is not always allowed */
-			switch (starpu_node_get_kind(handling_node))
-			{
-				case STARPU_CPU_RAM:
-					return 1;
-				case STARPU_DISK_RAM:
-					return _starpu_disk_can_copy(node, handling_node);
-				default:
-					return 0;
-			}
-                case STARPU_MPI_MS_RAM:
-                {
-                        enum starpu_node_kind kind = starpu_node_get_kind(handling_node);
-                        return kind == STARPU_MPI_MS_RAM;
-                }
-		case STARPU_SCC_RAM:
-			return 1;
-		default:
-			return 1;
+		STARPU_ABORT_MSG("Node %d does not define the operation 'direct_access_supported'", type);
+		return 1;
 	}
 }
 
@@ -338,7 +296,7 @@ static unsigned chose_best_numa_between_src_and_dest(int src, int dst)
 		}
 	}
 	STARPU_ASSERT(best_numa >= 0);
-	
+
 	return best_numa;
 }
 
@@ -1098,7 +1056,7 @@ static void _starpu_fetch_task_input_cb(void *arg)
 }
 
 
-/* Synchronously or asynchronously fetch data for a given task (if it's not there already) 
+/* Synchronously or asynchronously fetch data for a given task (if it's not there already)
  * Returns the number of data acquired here.  */
 
 /* _starpu_fetch_task_input must be called before

+ 26 - 708
src/datawizard/copy_driver.c

@@ -26,13 +26,15 @@
 #include <drivers/mpi/driver_mpi_sink.h>
 #include <drivers/mpi/driver_mpi_source.h>
 #include <drivers/mpi/driver_mpi_common.h>
+#include <drivers/mic/driver_mic_source.h>
 #include <common/fxt.h>
-#include "copy_driver.h"
-#include "memalloc.h"
+#include <datawizard/copy_driver.h>
+#include <datawizard/memalloc.h>
 #include <starpu_opencl.h>
 #include <starpu_cuda.h>
 #include <profiling/profiling.h>
 #include <core/disk.h>
+#include <datawizard/node_ops.h>
 
 #ifdef STARPU_SIMGRID
 #include <core/simgrid.h>
@@ -146,8 +148,8 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 				    struct _starpu_data_replicate *dst_replicate,
 				    struct _starpu_data_request *req)
 {
-	unsigned src_node = src_replicate->memory_node;
-	unsigned dst_node = dst_replicate->memory_node;
+	unsigned src_node = (unsigned)src_replicate->memory_node;
+	unsigned dst_node = (unsigned)dst_replicate->memory_node;
 
 	STARPU_ASSERT(src_replicate->refcnt);
 	STARPU_ASSERT(dst_replicate->refcnt);
@@ -161,19 +163,8 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 
 	return _starpu_simgrid_transfer(handle->ops->get_size(handle), src_node, dst_node, req);
 #else /* !SIMGRID */
-
-	int ret = 0;
-
-	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
-
 	enum starpu_node_kind src_kind = starpu_node_get_kind(src_node);
 	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);
-
-#ifdef STARPU_USE_CUDA
-	cudaError_t cures;
-	cudaStream_t stream;
-#endif
-
 	void *src_interface = src_replicate->data_interface;
 	void *dst_interface = dst_replicate->data_interface;
 
@@ -195,434 +186,14 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 	}
 #endif
 
-	switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind))
+	if (_node_ops[src_kind].copy_data_to[dst_kind])
+	{
+		return _node_ops[src_kind].copy_data_to[dst_kind](handle, src_interface, src_node, dst_interface, dst_node, req);
+	}
+	else
 	{
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
-		/* STARPU_CPU_RAM -> STARPU_CPU_RAM */
-		if (copy_methods->ram_to_ram)
-			copy_methods->ram_to_ram(src_interface, src_node, dst_interface, dst_node);
-		else
-			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req ? &req->async_channel : NULL);
-		break;
-#ifdef STARPU_USE_CUDA
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
-		/* only the proper CUBLAS thread can initiate this directly ! */
-#if !defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
-		STARPU_ASSERT(starpu_worker_get_local_memory_node() == src_node);
-#endif
-		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
-				!(copy_methods->cuda_to_ram_async || copy_methods->any_to_any))
-		{
-			/* this is not associated to a request so it's synchronous */
-			STARPU_ASSERT(copy_methods->cuda_to_ram || copy_methods->any_to_any);
-			if (copy_methods->cuda_to_ram)
-				copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
-			else
-				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-		}
-		else
-		{
-			req->async_channel.type = STARPU_CUDA_RAM;
-			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
-			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
-
-			stream = starpu_cuda_get_out_transfer_stream(src_node);
-			if (copy_methods->cuda_to_ram_async)
-				ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
-			else
-			{
-				STARPU_ASSERT(copy_methods->any_to_any);
-				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-			}
-
-			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
-			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
-		}
-		break;
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
-		/* STARPU_CPU_RAM -> CUBLAS_RAM */
-		/* only the proper CUBLAS thread can initiate this ! */
-#if !defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
-		STARPU_ASSERT(starpu_worker_get_local_memory_node() == dst_node);
-#endif
-		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
-				!(copy_methods->ram_to_cuda_async || copy_methods->any_to_any))
-		{
-			/* this is not associated to a request so it's synchronous */
-			STARPU_ASSERT(copy_methods->ram_to_cuda || copy_methods->any_to_any);
-			if (copy_methods->ram_to_cuda)
-				copy_methods->ram_to_cuda(src_interface, src_node, dst_interface, dst_node);
-			else
-				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-		}
-		else
-		{
-			req->async_channel.type = STARPU_CUDA_RAM;
-			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
-			if (STARPU_UNLIKELY(cures != cudaSuccess))
-				STARPU_CUDA_REPORT_ERROR(cures);
-
-			stream = starpu_cuda_get_in_transfer_stream(dst_node);
-			if (copy_methods->ram_to_cuda_async)
-				ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
-			else
-			{
-				STARPU_ASSERT(copy_methods->any_to_any);
-				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-			}
-
-			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
-			if (STARPU_UNLIKELY(cures != cudaSuccess))
-				STARPU_CUDA_REPORT_ERROR(cures);
-		}
-		break;
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
-		/* CUDA - CUDA transfer */
-		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
-				!(copy_methods->cuda_to_cuda_async || copy_methods->any_to_any))
-		{
-			STARPU_ASSERT(copy_methods->cuda_to_cuda || copy_methods->any_to_any);
-			/* this is not associated to a request so it's synchronous */
-			if (copy_methods->cuda_to_cuda)
-				copy_methods->cuda_to_cuda(src_interface, src_node, dst_interface, dst_node);
-			else
-				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-		}
-		else
-		{
-			req->async_channel.type = STARPU_CUDA_RAM;
-			cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
-			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
-
-			stream = starpu_cuda_get_peer_transfer_stream(src_node, dst_node);
-			if (copy_methods->cuda_to_cuda_async)
-				ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
-			else
-			{
-				STARPU_ASSERT(copy_methods->any_to_any);
-				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-			}
-
-			cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
-			if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
-		}
-		break;
-#endif
-#ifdef STARPU_USE_OPENCL
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
-		/* OpenCL -> RAM */
-		STARPU_ASSERT(starpu_worker_get_local_memory_node() == src_node);
-		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
-				!(copy_methods->opencl_to_ram_async || copy_methods->any_to_any))
-		{
-			STARPU_ASSERT(copy_methods->opencl_to_ram || copy_methods->any_to_any);
-			/* this is not associated to a request so it's synchronous */
-			if (copy_methods->opencl_to_ram)
-				copy_methods->opencl_to_ram(src_interface, src_node, dst_interface, dst_node);
-			else
-				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-		}
-		else
-		{
-			req->async_channel.type = STARPU_OPENCL_RAM;
-			if (copy_methods->opencl_to_ram_async)
-				ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
-			else
-			{
-				STARPU_ASSERT(copy_methods->any_to_any);
-				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-			}
-		}
-		break;
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
-		/* STARPU_CPU_RAM -> STARPU_OPENCL_RAM */
-		STARPU_ASSERT(starpu_worker_get_local_memory_node() == dst_node);
-		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
-				!(copy_methods->ram_to_opencl_async || copy_methods->any_to_any))
-		{
-			STARPU_ASSERT(copy_methods->ram_to_opencl || copy_methods->any_to_any);
-			/* this is not associated to a request so it's synchronous */
-			if (copy_methods->ram_to_opencl)
-				copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
-			else
-				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-		}
-		else
-		{
-			req->async_channel.type = STARPU_OPENCL_RAM;
-			if (copy_methods->ram_to_opencl_async)
-				ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
-			else
-			{
-				STARPU_ASSERT(copy_methods->any_to_any);
-				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-			}
-		}
-		break;
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_OPENCL_RAM):
-		/* STARPU_OPENCL_RAM -> STARPU_OPENCL_RAM */
-		STARPU_ASSERT(starpu_worker_get_local_memory_node() == dst_node || starpu_worker_get_local_memory_node() == src_node);
-		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() ||
-				!(copy_methods->opencl_to_opencl_async || copy_methods->any_to_any))
-		{
-			STARPU_ASSERT(copy_methods->opencl_to_opencl || copy_methods->any_to_any);
-			/* this is not associated to a request so it's synchronous */
-			if (copy_methods->opencl_to_opencl)
-				copy_methods->opencl_to_opencl(src_interface, src_node, dst_interface, dst_node);
-			else
-				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-		}
-		else
-		{
-			req->async_channel.type = STARPU_OPENCL_RAM;
-			if (copy_methods->opencl_to_opencl_async)
-				ret = copy_methods->opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
-			else
-			{
-				STARPU_ASSERT(copy_methods->any_to_any);
-				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-			}
-		}
-		break;
-#endif
-#ifdef STARPU_USE_MIC
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_MIC_RAM):
-		/* RAM -> MIC */
-		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mic_copy_disabled() ||
-				!(copy_methods->ram_to_mic_async || copy_methods->any_to_any))
-		{
-			/* this is not associated to a request so it's synchronous */
-			STARPU_ASSERT(copy_methods->ram_to_mic || copy_methods->any_to_any);
-			if (copy_methods->ram_to_mic)
-				copy_methods->ram_to_mic(src_interface, src_node, dst_interface, dst_node);
-			else
-				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-		}
-		else
-		{
-			req->async_channel.type = STARPU_MIC_RAM;
-			if (copy_methods->ram_to_mic_async)
-				ret = copy_methods->ram_to_mic_async(src_interface, src_node, dst_interface, dst_node);
-			else
-			{
-				STARPU_ASSERT(copy_methods->any_to_any);
-				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-			}
-			_starpu_mic_init_event(&(req->async_channel.event.mic_event), dst_node);
-		}
-		break;
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_MIC_RAM,STARPU_CPU_RAM):
-		/* MIC -> RAM */
-		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mic_copy_disabled() ||
-				!(copy_methods->mic_to_ram_async || copy_methods->any_to_any))
-		{
-			/* this is not associated to a request so it's synchronous */
-			STARPU_ASSERT(copy_methods->mic_to_ram || copy_methods->any_to_any);
-			if (copy_methods->mic_to_ram)
-				copy_methods->mic_to_ram(src_interface, src_node, dst_interface, dst_node);
-			else
-				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-		}
-		else
-		{
-			req->async_channel.type = STARPU_MIC_RAM;
-			if (copy_methods->mic_to_ram_async)
-				ret = copy_methods->mic_to_ram_async(src_interface, src_node, dst_interface, dst_node);
-			else
-			{
-				STARPU_ASSERT(copy_methods->any_to_any);
-				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-			}
-			_starpu_mic_init_event(&(req->async_channel.event.mic_event), src_node);
-		}
-		break;
-	/* TODO: MIC -> MIC */
-#endif
-#ifdef STARPU_USE_MPI_MASTER_SLAVE
-        case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_MPI_MS_RAM):
-                if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mpi_ms_copy_disabled() ||
-                                !(copy_methods->ram_to_mpi_ms_async || copy_methods->any_to_any))
-                {
-                        /* this is not associated to a request so it's synchronous */
-                        STARPU_ASSERT(copy_methods->ram_to_mpi_ms || copy_methods->any_to_any);
-                        if (copy_methods->ram_to_mpi_ms)
-                                copy_methods->ram_to_mpi_ms(src_interface, src_node, dst_interface, dst_node);
-                        else
-                                copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-                }
-                else
-                {
-                        req->async_channel.type = STARPU_MPI_MS_RAM;
-                        if(copy_methods->ram_to_mpi_ms_async)
-                                ret = copy_methods->ram_to_mpi_ms_async(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-                        else
-                        {
-                                STARPU_ASSERT(copy_methods->any_to_any);
-                                ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-                        }
-                }
-                break;
-
-        case _STARPU_MEMORY_NODE_TUPLE(STARPU_MPI_MS_RAM,STARPU_CPU_RAM):
-                if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mpi_ms_copy_disabled() ||
-                                !(copy_methods->mpi_ms_to_ram_async || copy_methods->any_to_any))
-                {
-                        /* this is not associated to a request so it's synchronous */
-                        STARPU_ASSERT(copy_methods->mpi_ms_to_ram || copy_methods->any_to_any);
-                        if (copy_methods->mpi_ms_to_ram)
-                                copy_methods->mpi_ms_to_ram(src_interface, src_node, dst_interface, dst_node);
-                        else
-                                copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-                }
-                else
-                {
-                        req->async_channel.type = STARPU_MPI_MS_RAM;
-                        if(copy_methods->mpi_ms_to_ram_async)
-                                ret = copy_methods->mpi_ms_to_ram_async(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-                        else
-                        {
-                                STARPU_ASSERT(copy_methods->any_to_any);
-                                ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-                        }
-                }
-                break;
-
-        case _STARPU_MEMORY_NODE_TUPLE(STARPU_MPI_MS_RAM,STARPU_MPI_MS_RAM):
-                if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mpi_ms_copy_disabled() ||
-                                !(copy_methods->mpi_ms_to_mpi_ms_async || copy_methods->any_to_any))
-                {
-                        /* this is not associated to a request so it's synchronous */
-                        STARPU_ASSERT(copy_methods->mpi_ms_to_mpi_ms || copy_methods->any_to_any);
-                        if (copy_methods->mpi_ms_to_mpi_ms)
-                                copy_methods->mpi_ms_to_mpi_ms(src_interface, src_node, dst_interface, dst_node);
-                        else
-                                copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-                }
-                else
-                {
-                        req->async_channel.type = STARPU_MPI_MS_RAM;
-                        if(copy_methods->mpi_ms_to_mpi_ms_async)
-                                ret = copy_methods->mpi_ms_to_mpi_ms_async(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-                        else
-                        {
-                                STARPU_ASSERT(copy_methods->any_to_any);
-                                ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
-                        }
-                }
-                break;
-#endif
-#ifdef STARPU_USE_SCC
-		/* SCC RAM associated to the master process is considered as
-		 * the main memory node. */
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_SCC_RAM):
-		/* master private SCC RAM -> slave private SCC RAM */
-		if (copy_methods->scc_src_to_sink)
-			copy_methods->scc_src_to_sink(src_interface, src_node, dst_interface, dst_node);
-		else
-			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-		break;
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_CPU_RAM):
-		/* slave private SCC RAM -> master private SCC RAM */
-		if (copy_methods->scc_sink_to_src)
-			copy_methods->scc_sink_to_src(src_interface, src_node, dst_interface, dst_node);
-		else
-			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-		break;
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_SCC_RAM):
-		/* slave private SCC RAM -> slave private SCC RAM */
-		if (copy_methods->scc_sink_to_sink)
-			copy_methods->scc_sink_to_sink(src_interface, src_node, dst_interface, dst_node);
-		else
-			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
-		break;
-#endif
-
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_DISK_RAM):
-                if (req && !starpu_asynchronous_copy_disabled())
-                {
-                        req->async_channel.type = STARPU_DISK_RAM;
-                        req->async_channel.event.disk_event.requests = NULL;
-                        req->async_channel.event.disk_event.ptr = NULL;
-                        req->async_channel.event.disk_event.handle = NULL;
-                }
-		if(copy_methods->any_to_any)
-			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);
-
-		else
-		{
-			void *obj = starpu_data_handle_to_pointer(handle, dst_node);
-			void * ptr = NULL;
-			starpu_ssize_t size = 0;
-			handle->ops->pack_data(handle, src_node, &ptr, &size);
-			ret = _starpu_disk_full_write(src_node, dst_node, obj, ptr, size, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);
-			if (ret == 0)
-			{
-				/* write is already finished, ptr was allocated in pack_data */
-				_starpu_free_flags_on_node(src_node, ptr, size, 0);
-			}
-			else if (ret == -EAGAIN)
-			{
-				STARPU_ASSERT(req);
-				req->async_channel.event.disk_event.ptr = ptr;
-				req->async_channel.event.disk_event.node = src_node;
-				req->async_channel.event.disk_event.size = size;
-			}
-
-			STARPU_ASSERT(ret == 0 || ret == -EAGAIN);
-		}
-		break;
-
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_DISK_RAM,STARPU_CPU_RAM):
-                if (req && !starpu_asynchronous_copy_disabled())
-                {
-                        req->async_channel.type = STARPU_DISK_RAM;
-                        req->async_channel.event.disk_event.requests = NULL;
-                        req->async_channel.event.disk_event.ptr = NULL;
-                        req->async_channel.event.disk_event.handle = NULL;
-                }
-		if(copy_methods->any_to_any)
-			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req && !starpu_asynchronous_copy_disabled()  ? &req->async_channel : NULL);
-		else
-		{
-			void *obj = starpu_data_handle_to_pointer(handle, src_node);
-			void * ptr = NULL;
-			size_t size = 0;
-			ret = _starpu_disk_full_read(src_node, dst_node, obj, &ptr, &size, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);
-			if (ret == 0)
-			{
-				/* read is already finished, we can already unpack */
-				handle->ops->unpack_data(handle, dst_node, ptr, size);
-			}
-			else if (ret == -EAGAIN)
-			{
-				STARPU_ASSERT(req);
-				req->async_channel.event.disk_event.ptr = ptr;
-				req->async_channel.event.disk_event.node = dst_node;
-				req->async_channel.event.disk_event.size = size;
-				req->async_channel.event.disk_event.handle = handle;
-			}
-
-			STARPU_ASSERT(ret == 0 || ret == -EAGAIN);
-		}
-		break;
-
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_DISK_RAM,STARPU_DISK_RAM):
-                if (req && !starpu_asynchronous_copy_disabled())
-                {
-                        req->async_channel.type = STARPU_DISK_RAM;
-                        req->async_channel.event.disk_event.requests = NULL;
-                        req->async_channel.event.disk_event.ptr = NULL;
-                        req->async_channel.event.disk_event.handle = NULL;
-                }
-		ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);
-		break;
-
-	default:
 		STARPU_ABORT();
-		break;
 	}
-
-	return ret;
 #endif /* !SIMGRID */
 }
 
@@ -728,155 +299,21 @@ void starpu_interface_end_driver_copy_async(unsigned src_node, unsigned dst_node
 int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, void *async_data)
 {
 	struct _starpu_async_channel *async_channel = async_data;
-
 	enum starpu_node_kind src_kind = starpu_node_get_kind(src_node);
 	enum starpu_node_kind dst_kind = starpu_node_get_kind(dst_node);
 
-	switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind))
+	if (_node_ops[src_kind].copy_interface_to[dst_kind])
 	{
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CPU_RAM):
-		memcpy((void *) (dst + dst_offset), (void *) (src + src_offset), size);
-		return 0;
-
-#ifdef STARPU_USE_CUDA
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CPU_RAM):
-		return starpu_cuda_copy_async_sync(
-				(void*) (src + src_offset), src_node,
-				(void*) (dst + dst_offset), dst_node,
-				size,
-				async_channel?starpu_cuda_get_out_transfer_stream(src_node):NULL,
-				cudaMemcpyDeviceToHost);
-
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_CUDA_RAM):
-		return starpu_cuda_copy_async_sync(
-				(void*) (src + src_offset), src_node,
-				(void*) (dst + dst_offset), dst_node,
-				size,
-				async_channel?starpu_cuda_get_in_transfer_stream(dst_node):NULL,
-				cudaMemcpyHostToDevice);
-
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CUDA_RAM,STARPU_CUDA_RAM):
-		return starpu_cuda_copy_async_sync(
-				(void*) (src + src_offset), src_node,
-				(void*) (dst + dst_offset), dst_node,
-				size,
-				async_channel?starpu_cuda_get_peer_transfer_stream(src_node, dst_node):NULL,
-				cudaMemcpyDeviceToDevice);
-
-#endif
-#ifdef STARPU_USE_OPENCL
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_OPENCL_RAM):
-		return starpu_opencl_copy_async_sync(
-				src, src_offset, src_node,
-				dst, dst_offset, dst_node,
-				size,
-				&async_channel->event.opencl_event);
-#endif
-#ifdef STARPU_USE_MIC
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_MIC_RAM,STARPU_CPU_RAM):
-		if (async_data)
-			return _starpu_mic_copy_mic_to_ram_async(
-					(void*) (src + src_offset), src_node,
-					(void*) (dst + dst_offset), dst_node,
-					size);
-		else
-			return _starpu_mic_copy_mic_to_ram(
-					(void*) (src + src_offset), src_node,
-					(void*) (dst + dst_offset), dst_node,
-					size);
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_MIC_RAM):
-		if (async_data)
-			return _starpu_mic_copy_ram_to_mic_async(
-					(void*) (src + src_offset), src_node,
-					(void*) (dst + dst_offset), dst_node,
-					size);
-		else
-			return _starpu_mic_copy_ram_to_mic(
-					(void*) (src + src_offset), src_node,
-					(void*) (dst + dst_offset), dst_node,
-					size);
-	/* TODO: MIC->MIC */
-#endif
-#ifdef STARPU_USE_SCC
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_CPU_RAM):
-		return _starpu_scc_copy_sink_to_src(
-				(void*) (src + src_offset), src_node,
-				(void*) (dst + dst_offset), dst_node,
-				size);
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_SCC_RAM):
-		return _starpu_scc_copy_src_to_sink(
-				(void*) (src + src_offset), src_node,
-				(void*) (dst + dst_offset), dst_node,
-				size);
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_SCC_RAM):
-		return _starpu_scc_copy_sink_to_sink(
-				(void*) (src + src_offset), src_node,
-				(void*) (dst + dst_offset), dst_node,
-				size);
-#endif
-#ifdef STARPU_USE_MPI_MASTER_SLAVE
-        case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM, STARPU_MPI_MS_RAM):
-                if (async_data)
-                        return _starpu_mpi_copy_ram_to_mpi_async(
-                                        (void*) (src + src_offset), src_node,
-                                        (void*) (dst + dst_offset), dst_node,
-                                        size, async_data);
-                else
-                        return _starpu_mpi_copy_ram_to_mpi_sync(
-                                        (void*) (src + src_offset), src_node,
-                                        (void*) (dst + dst_offset), dst_node,
-                                        size);
-        case _STARPU_MEMORY_NODE_TUPLE(STARPU_MPI_MS_RAM, STARPU_CPU_RAM):
-                if (async_data)
-                        return _starpu_mpi_copy_mpi_to_ram_async(
-                                        (void*) (src + src_offset), src_node,
-                                        (void*) (dst + dst_offset), dst_node,
-                                        size, async_data);
-                else
-                        return _starpu_mpi_copy_mpi_to_ram_sync(
-                                        (void*) (src + src_offset), src_node,
-                                        (void*) (dst + dst_offset), dst_node,
-                                        size);
-
-        case _STARPU_MEMORY_NODE_TUPLE(STARPU_MPI_MS_RAM, STARPU_MPI_MS_RAM):
-                if (async_data)
-                        return _starpu_mpi_copy_sink_to_sink_async(
-                                        (void*) (src + src_offset), src_node,
-                                        (void*) (dst + dst_offset), dst_node,
-                                        size, async_data);
-                else
-                        return _starpu_mpi_copy_sink_to_sink_sync(
-                                        (void*) (src + src_offset), src_node,
-                                        (void*) (dst + dst_offset), dst_node,
-                                        size);
-#endif
-
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM, STARPU_DISK_RAM):
-	{
-		return _starpu_disk_copy_src_to_disk(
-			(void*) (src + src_offset), src_node,
-			(void*) dst, dst_offset, dst_node,
-			size, async_channel);
+		return _node_ops[src_kind].copy_interface_to[dst_kind](src, src_offset, src_node,
+								       dst, dst_offset, dst_node,
+								       size,
+								       async_channel);
 	}
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_DISK_RAM, STARPU_CPU_RAM):
-		return _starpu_disk_copy_disk_to_src(
-			(void*) src, src_offset, src_node,
-			(void*) (dst + dst_offset), dst_node,
-			size, async_channel);
-
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_DISK_RAM, STARPU_DISK_RAM):
-		return _starpu_disk_copy_disk_to_disk(
-			(void*) src, src_offset, src_node,
-			(void*) dst, dst_offset, dst_node,
-			size, async_channel);
-
-	default:
+	else
+	{
 		STARPU_ABORT();
 		return -1;
 	}
-	return 0;
 }
 
 void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel)
@@ -885,70 +322,13 @@ void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_
 	_starpu_simgrid_wait_transfer_event(&async_channel->event);
 #else /* !SIMGRID */
 	enum starpu_node_kind kind = async_channel->type;
-#ifdef STARPU_USE_CUDA
-	cudaEvent_t event;
-	cudaError_t cures;
-#endif
-
-	switch (kind)
-	{
-#ifdef STARPU_USE_CUDA
-	case STARPU_CUDA_RAM:
-		event = (*async_channel).event.cuda_event;
-
-		cures = cudaEventSynchronize(event);
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
 
-		cures = cudaEventDestroy(event);
-		if (STARPU_UNLIKELY(cures))
-			STARPU_CUDA_REPORT_ERROR(cures);
-
-		break;
-#endif
-#ifdef STARPU_USE_OPENCL
-	case STARPU_OPENCL_RAM:
+	if (_node_ops[kind].wait_request_completion != NULL)
 	{
-		cl_int err;
-		if ((*async_channel).event.opencl_event == NULL)
-			STARPU_ABORT();
-		err = clWaitForEvents(1, &((*async_channel).event.opencl_event));
-		if (STARPU_UNLIKELY(err != CL_SUCCESS))
-			STARPU_OPENCL_REPORT_ERROR(err);
-		err = clReleaseEvent((*async_channel).event.opencl_event);
-		if (STARPU_UNLIKELY(err != CL_SUCCESS))
-			STARPU_OPENCL_REPORT_ERROR(err);
-	      break;
+		_node_ops[kind].wait_request_completion(async_channel);
 	}
-#endif
-#ifdef STARPU_USE_MIC
-	case STARPU_MIC_RAM:
-		_starpu_mic_wait_request_completion(&(async_channel->event.mic_event));
-		break;
-#endif
-#ifdef STARPU_USE_MPI_MASTER_SLAVE
-        case STARPU_MPI_MS_RAM:
-                _starpu_mpi_common_wait_event(async_channel);
-                break;
-#endif
-	case STARPU_DISK_RAM:
-		starpu_disk_wait_request(async_channel);
-		if (async_channel->event.disk_event.ptr != NULL)
-		{
-			if (async_channel->event.disk_event.handle != NULL)
-			{
-				/* read is finished, we can already unpack */
-				async_channel->event.disk_event.handle->ops->unpack_data(async_channel->event.disk_event.handle, async_channel->event.disk_event.node, async_channel->event.disk_event.ptr, async_channel->event.disk_event.size);
-			}
-			else
-			{
-				/* write is finished, ptr was allocated in pack_data */
-				_starpu_free_flags_on_node(async_channel->event.disk_event.node, async_channel->event.disk_event.ptr, async_channel->event.disk_event.size, 0);
-			}
-		}
-		break;
-	case STARPU_CPU_RAM:
-	default:
+	else
+	{
 		STARPU_ABORT();
 	}
 #endif /* !SIMGRID */
@@ -960,76 +340,14 @@ unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *as
 	return _starpu_simgrid_test_transfer_event(&async_channel->event);
 #else /* !SIMGRID */
 	enum starpu_node_kind kind = async_channel->type;
-	unsigned success = 0;
-#ifdef STARPU_USE_CUDA
-	cudaEvent_t event;
-#endif
-
-	switch (kind)
-	{
-#ifdef STARPU_USE_CUDA
-	case STARPU_CUDA_RAM:
-		event = (*async_channel).event.cuda_event;
-		cudaError_t cures = cudaEventQuery(event);
 
-		success = (cures == cudaSuccess);
-		if (success)
-			cudaEventDestroy(event);
-		else if (cures != cudaErrorNotReady)
-			STARPU_CUDA_REPORT_ERROR(cures);
-		break;
-#endif
-#ifdef STARPU_USE_OPENCL
-	case STARPU_OPENCL_RAM:
+	if (_node_ops[kind].test_request_completion != NULL)
 	{
-		cl_int event_status;
-		cl_event opencl_event = (*async_channel).event.opencl_event;
-		if (opencl_event == NULL) STARPU_ABORT();
-		cl_int err = clGetEventInfo(opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
-		if (STARPU_UNLIKELY(err != CL_SUCCESS))
-			STARPU_OPENCL_REPORT_ERROR(err);
-		if (event_status < 0)
-			STARPU_OPENCL_REPORT_ERROR(event_status);
-		if (event_status == CL_COMPLETE)
-		{
-			err = clReleaseEvent(opencl_event);
-			if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
-		}
-		success = (event_status == CL_COMPLETE);
-		break;
+		return _node_ops[kind].test_request_completion(async_channel);
 	}
-#endif
-#ifdef STARPU_USE_MIC
-	case STARPU_MIC_RAM:
-		success = _starpu_mic_request_is_complete(&(async_channel->event.mic_event));
-		break;
-#endif
-#ifdef STARPU_USE_MPI_MASTER_SLAVE
-        case STARPU_MPI_MS_RAM:
-                success = _starpu_mpi_common_test_event(async_channel);
-                break;
-#endif
-	case STARPU_DISK_RAM:
-		success = starpu_disk_test_request(async_channel);
-		if (async_channel->event.disk_event.ptr != NULL && success)
-		{
-			if (async_channel->event.disk_event.handle != NULL)
-			{
-				/* read is finished, we can already unpack */
-				async_channel->event.disk_event.handle->ops->unpack_data(async_channel->event.disk_event.handle, async_channel->event.disk_event.node, async_channel->event.disk_event.ptr, async_channel->event.disk_event.size);
-			}
-			else
-			{
-				/* write is finished, ptr was allocated in pack_data */
-				_starpu_free_flags_on_node(async_channel->event.disk_event.node, async_channel->event.disk_event.ptr, async_channel->event.disk_event.size, 0);
-			}
-		}
-		break;
-	case STARPU_CPU_RAM:
-	default:
+	else
+	{
 		STARPU_ABORT_MSG("Memory is not recognized (kind %d) \n", kind);
 	}
-
-	return success;
 #endif /* !SIMGRID */
 }

+ 2 - 2
src/datawizard/interfaces/bcsr_filters.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2008-2011,2013,2014,2016,2019            Université de Bordeaux
  * Copyright (C) 2010                                     Mehdi Juhoor
- * Copyright (C) 2010,2011,2013,2015,2017                 CNRS
+ * Copyright (C) 2010,2011,2013,2015,2017,2019            CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -51,7 +51,7 @@ void starpu_bcsr_filter_canonical_block(void *father_interface, void *child_inte
 	}
 }
 
-void starpu_bcsr_filter_canonical_block_child_ops(STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned child)
+struct starpu_data_interface_ops *starpu_bcsr_filter_canonical_block_child_ops(STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, STARPU_ATTRIBUTE_UNUSED unsigned child)
 {
 	return &starpu_interface_matrix_ops;
 }

+ 17 - 234
src/datawizard/malloc.c

@@ -28,6 +28,7 @@
 #include <datawizard/memory_manager.h>
 #include <datawizard/memory_nodes.h>
 #include <datawizard/malloc.h>
+#include <datawizard/node_ops.h>
 #include <core/simgrid.h>
 #include <core/task.h>
 
@@ -573,20 +574,10 @@ int starpu_free(void *A)
 	return starpu_free_flags(A, 0, STARPU_MALLOC_PINNED);
 }
 
-#ifdef STARPU_SIMGRID
-static starpu_pthread_mutex_t cuda_alloc_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
-static starpu_pthread_mutex_t opencl_alloc_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
-#endif
-
-static uintptr_t
-_starpu_malloc_on_node(unsigned dst_node, size_t size, int flags)
+static uintptr_t _starpu_malloc_on_node(unsigned dst_node, size_t size, int flags)
 {
 	uintptr_t addr = 0;
 
-#if defined(STARPU_USE_CUDA) && !defined(STARPU_SIMGRID)
-	cudaError_t status;
-#endif
-
 	/* Handle count first */
 	if (flags & STARPU_MALLOC_COUNT)
 	{
@@ -596,130 +587,11 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size, int flags)
 		flags &= ~STARPU_MALLOC_COUNT;
 	}
 
-	switch(starpu_node_get_kind(dst_node))
-	{
-		case STARPU_CPU_RAM:
-		{
-			_starpu_malloc_flags_on_node(dst_node, (void**) &addr, size,
-#if defined(STARPU_USE_CUDA) && !defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
-					/* without memcpy_peer, we can not
-					 * allocated pinned memory, since it
-					 * requires waiting for a task, and we
-					 * may be called with a spinlock held
-					 */
-					flags & ~STARPU_MALLOC_PINNED
-#else
-					flags
-#endif
-					);
-			break;
-		}
-#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
-		case STARPU_CUDA_RAM:
-		{
-#ifdef STARPU_SIMGRID
-			static uintptr_t last[STARPU_MAXNODES];
-#ifdef STARPU_DEVEL
-#warning TODO: record used memory, using a simgrid property to know the available memory
-#endif
-			/* Sleep for the allocation */
-			STARPU_PTHREAD_MUTEX_LOCK(&cuda_alloc_mutex);
-			if (_starpu_simgrid_cuda_malloc_cost())
-				MSG_process_sleep(0.000175);
-			if (!last[dst_node])
-				last[dst_node] = 1<<10;
-			addr = last[dst_node];
-			last[dst_node]+=size;
-			STARPU_ASSERT(last[dst_node] >= addr);
-			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
-#else
-			unsigned devid = starpu_memory_node_get_devid(dst_node);
-#if defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
-			starpu_cuda_set_device(devid);
-#else
-			struct _starpu_worker *worker = _starpu_get_local_worker_key();
-			if (!worker || worker->arch != STARPU_CUDA_WORKER || worker->devid != devid)
-				STARPU_ASSERT_MSG(0, "CUDA peer access is not available with this version of CUDA");
-#endif
-			/* Check if there is free memory */
-			size_t cuda_mem_free, cuda_mem_total;
-			status = cudaMemGetInfo(&cuda_mem_free, &cuda_mem_total);
-			if (status == cudaSuccess &&
-					cuda_mem_free < (size*2))
-			{
-					addr = 0;
-					break;
-			}
-			status = cudaMalloc((void **)&addr, size);
-			if (!addr || (status != cudaSuccess))
-			{
-				if (STARPU_UNLIKELY(status != cudaErrorMemoryAllocation))
-					STARPU_CUDA_REPORT_ERROR(status);
-				addr = 0;
-			}
-#endif
-			break;
-		}
-#endif
-#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
-	        case STARPU_OPENCL_RAM:
-		{
-#ifdef STARPU_SIMGRID
-				static uintptr_t last[STARPU_MAXNODES];
-				/* Sleep for the allocation */
-				STARPU_PTHREAD_MUTEX_LOCK(&opencl_alloc_mutex);
-				if (_starpu_simgrid_cuda_malloc_cost())
-					MSG_process_sleep(0.000175);
-				if (!last[dst_node])
-					last[dst_node] = 1<<10;
-				addr = last[dst_node];
-				last[dst_node]+=size;
-				STARPU_ASSERT(last[dst_node] >= addr);
-				STARPU_PTHREAD_MUTEX_UNLOCK(&opencl_alloc_mutex);
-#else
-                                int ret;
-				cl_mem ptr;
-
-				ret = starpu_opencl_allocate_memory(starpu_memory_node_get_devid(dst_node), &ptr, size, CL_MEM_READ_WRITE);
-				if (ret)
-				{
-					addr = 0;
-				}
-				else
-				{
-					addr = (uintptr_t)ptr;
-				}
-#endif
-			break;
-		}
-#endif
-	        case STARPU_DISK_RAM:
-		{
-			addr = (uintptr_t) _starpu_disk_alloc(dst_node, size);
-			break;
-		}
-
-#ifdef STARPU_USE_MIC
-		case STARPU_MIC_RAM:
-			if (_starpu_mic_allocate_memory((void **)(&addr), size, dst_node))
-				addr = 0;
-			break;
-#endif
-#ifdef STARPU_USE_MPI_MASTER_SLAVE
-		case STARPU_MPI_MS_RAM:
-			if (_starpu_mpi_src_allocate_memory((void **)(&addr), size, dst_node))
-				addr = 0;
-			break;
-#endif
-#ifdef STARPU_USE_SCC
-		case STARPU_SCC_RAM:
-			if (_starpu_scc_allocate_memory((void **)(&addr), size, dst_node))
-				addr = 0;
-			break;
-#endif
-		default:
-			STARPU_ABORT();
-	}
+	enum starpu_node_kind node_kind = starpu_node_get_kind(dst_node);
+	if (_node_ops[node_kind].malloc_on_node)
+		return _node_ops[node_kind].malloc_on_node(dst_node, size, flags);
+	else
+		STARPU_ABORT_MSG("No malloc_on_node function defined for node %s\n", _starpu_node_get_prefix(node_kind));
 
 	if (addr == 0)
 	{
@@ -730,108 +602,19 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size, int flags)
 	return addr;
 }
 
-void
-_starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int flags)
+void _starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int flags)
 {
 	int count = flags & STARPU_MALLOC_COUNT;
 	flags &= ~STARPU_MALLOC_COUNT;
-	enum starpu_node_kind kind = starpu_node_get_kind(dst_node);
-	switch(kind)
-	{
-		case STARPU_CPU_RAM:
-			_starpu_free_flags_on_node(dst_node, (void*)addr, size,
-#if defined(STARPU_USE_CUDA) && !defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
-						   flags & ~STARPU_MALLOC_PINNED
-#else
-						   flags
-#endif
-				);
-			break;
-#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
-		case STARPU_CUDA_RAM:
-		{
-#ifdef STARPU_SIMGRID
-			STARPU_PTHREAD_MUTEX_LOCK(&cuda_alloc_mutex);
-			/* Sleep for the free */
-			if (_starpu_simgrid_cuda_malloc_cost())
-				MSG_process_sleep(0.000750);
-			STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
-			/* CUDA also synchronizes roughly everything on cudaFree */
-			_starpu_simgrid_sync_gpus();
-#else
-			cudaError_t err;
-			unsigned devid = starpu_memory_node_get_devid(dst_node);
-#if defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
-			starpu_cuda_set_device(devid);
-#else
-			struct _starpu_worker *worker = _starpu_get_local_worker_key();
-			if (!worker || worker->arch != STARPU_CUDA_WORKER || worker->devid != devid)
-				STARPU_ASSERT_MSG(0, "CUDA peer access is not available with this version of CUDA");
-#endif /* STARPU_HAVE_CUDA_MEMCPY_PEER */
-			err = cudaFree((void*)addr);
-#ifdef STARPU_OPENMP
-			/* When StarPU is used as Open Runtime support,
-			 * starpu_omp_shutdown() will usually be called from a
-			 * destructor, in which case cudaThreadExit() reports a
-			 * cudaErrorCudartUnloading here. There should not
-			 * be any remaining tasks running at this point so
-			 * we can probably ignore it without much consequences. */
-			if (STARPU_UNLIKELY(err != cudaSuccess && err != cudaErrorCudartUnloading))
-				STARPU_CUDA_REPORT_ERROR(err);
-#else
-			if (STARPU_UNLIKELY(err != cudaSuccess))
-				STARPU_CUDA_REPORT_ERROR(err);
-#endif /* STARPU_OPENMP */
-#endif /* STARPU_SIMGRID */
-			break;
-		}
-#endif /* STARPU_USE_CUDA || STARPU_SIMGRID */
 
-#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
-                case STARPU_OPENCL_RAM:
-		{
-#ifdef STARPU_SIMGRID
-			STARPU_PTHREAD_MUTEX_LOCK(&opencl_alloc_mutex);
-			/* Sleep for the free */
-			if (_starpu_simgrid_cuda_malloc_cost())
-				MSG_process_sleep(0.000750);
-			STARPU_PTHREAD_MUTEX_UNLOCK(&opencl_alloc_mutex);
-#else
-			cl_int err;
-                        err = clReleaseMemObject((void*)addr);
-			if (STARPU_UNLIKELY(err != CL_SUCCESS))
-				STARPU_OPENCL_REPORT_ERROR(err);
-#endif
-                        break;
-		}
-#endif
-	        case STARPU_DISK_RAM:
-		{
-			_starpu_disk_free (dst_node, (void *) addr , size);
-			break;
-		}
+	enum starpu_node_kind node_kind = starpu_node_get_kind(dst_node);
+	if (_node_ops[node_kind].free_on_node)
+		_node_ops[node_kind].free_on_node(dst_node, addr, size, flags);
+	else
+		STARPU_ABORT_MSG("No free_on_node function defined for node %s\n", _starpu_node_get_prefix(node_kind));
 
-#ifdef STARPU_USE_MIC
-		case STARPU_MIC_RAM:
-			_starpu_mic_free_memory((void*) addr, size, dst_node);
-			break;
-#endif
-#ifdef STARPU_USE_MPI_MASTER_SLAVE
-        case STARPU_MPI_MS_RAM:
-            _starpu_mpi_source_free_memory((void*) addr, dst_node);
-            break;
-#endif
-#ifdef STARPU_USE_SCC
-		case STARPU_SCC_RAM:
-			_starpu_scc_free_memory((void *) addr, dst_node);
-			break;
-#endif
-		default:
-			STARPU_ABORT();
-	}
 	if (count)
 		starpu_memory_deallocate(dst_node, size);
-
 }
 
 int
@@ -983,10 +766,10 @@ static struct _starpu_chunk *_starpu_new_chunk(unsigned dst_node, int flags)
 static int _starpu_malloc_should_suballoc(unsigned dst_node, size_t size, int flags)
 {
 	return size <= CHUNK_ALLOC_MAX &&
-			(   starpu_node_get_kind(dst_node) == STARPU_CUDA_RAM
-			|| (starpu_node_get_kind(dst_node) == STARPU_CPU_RAM
-			    && _starpu_malloc_should_pin(flags))
-			);
+		(starpu_node_get_kind(dst_node) == STARPU_CUDA_RAM
+		 || (starpu_node_get_kind(dst_node) == STARPU_CPU_RAM
+		     && _starpu_malloc_should_pin(flags))
+		 );
 }
 
 uintptr_t

+ 4 - 34
src/datawizard/memory_nodes.c

@@ -23,8 +23,9 @@
 #include <datawizard/memory_nodes.h>
 #include <datawizard/malloc.h>
 #include <common/fxt.h>
-#include "copy_driver.h"
-#include "memalloc.h"
+#include <datawizard/copy_driver.h>
+#include <datawizard/memalloc.h>
+#include <datawizard/node_ops.h>
 
 char _starpu_worker_drives_memory[STARPU_NMAXWORKERS][STARPU_MAXNODES];
 
@@ -75,38 +76,7 @@ unsigned starpu_memory_nodes_get_count(void)
 
 int starpu_memory_node_get_name(unsigned node, char *name, size_t size)
 {
-	const char *prefix;
-	switch (_starpu_descr.nodes[node])
-	{
-	case STARPU_CPU_RAM:
-		prefix = "NUMA";
-		break;
-	case STARPU_CUDA_RAM:
-		prefix = "CUDA";
-		break;
-	case STARPU_OPENCL_RAM:
-		prefix = "OpenCL";
-		break;
-	case STARPU_DISK_RAM:
-		prefix = "Disk";
-		break;
-	case STARPU_MIC_RAM:
-		prefix = "MIC";
-		break;
-	case STARPU_MPI_MS_RAM:
-		prefix = "MPI_MS";
-		break;
-	case STARPU_SCC_RAM:
-		prefix = "SCC_RAM";
-		break;
-	case STARPU_SCC_SHM:
-		prefix = "SCC_shared";
-		break;
-	case STARPU_UNUSED:
-	default:
-		prefix = "unknown";
-		STARPU_ASSERT(0);
-	}
+	const char *prefix = _starpu_node_get_prefix(_starpu_descr.nodes[node]);
 	return snprintf(name, size, "%s %d", prefix, _starpu_descr.devid[node]);
 }
 

+ 0 - 5
src/datawizard/memory_nodes.h

@@ -30,10 +30,6 @@
 #include <core/simgrid.h>
 #endif
 
-#define _STARPU_MEMORY_NODE_TUPLE(node1,node2) (node1 | (node2 << 4))
-#define _STARPU_MEMORY_NODE_TUPLE_FIRST(tuple) (tuple & 0x0F)
-#define _STARPU_MEMORY_NODE_TUPLE_SECOND(tuple) (tuple & 0xF0)
-
 extern char _starpu_worker_drives_memory[STARPU_NMAXWORKERS][STARPU_MAXNODES];
 
 struct _starpu_cond_and_worker
@@ -68,7 +64,6 @@ struct _starpu_memory_node_descr
 	/* the number of queues attached to each node */
 	unsigned total_condition_count;
 	unsigned condition_count[STARPU_MAXNODES];
-
 };
 
 extern struct _starpu_memory_node_descr _starpu_descr;

+ 153 - 0
src/datawizard/node_ops.c

@@ -0,0 +1,153 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2019                                     CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+#include <common/config.h>
+
+#include <datawizard/node_ops.h>
+#include <drivers/cpu/driver_cpu.h>
+#include <drivers/cuda/driver_cuda.h>
+#include <drivers/opencl/driver_opencl.h>
+#include <drivers/mpi/driver_mpi_common.h>
+#include <drivers/mpi/driver_mpi_source.h>
+#include <drivers/mic/driver_mic_source.h>
+#include <drivers/scc/driver_scc_source.h>
+#include <drivers/disk/driver_disk.h>
+
+struct _starpu_node_ops _node_ops[STARPU_MPI_MS_RAM+1];
+
+void _starpu_node_ops_init()
+{
+	memset(_node_ops, 0, STARPU_MPI_MS_RAM*sizeof(struct _starpu_node_ops));
+
+	// CPU
+	// CPU_RAM does not define wait_event operation
+	_node_ops[STARPU_CPU_RAM].copy_data_to[STARPU_CPU_RAM] = _starpu_cpu_copy_data;
+	_node_ops[STARPU_CPU_RAM].copy_interface_to[STARPU_CPU_RAM] = _starpu_cpu_copy_interface;
+	_node_ops[STARPU_CPU_RAM].direct_access_supported = _starpu_cpu_direct_access_supported;
+	_node_ops[STARPU_CPU_RAM].malloc_on_node = _starpu_cpu_malloc_on_node;
+	_node_ops[STARPU_CPU_RAM].free_on_node = _starpu_cpu_free_on_node;
+
+#ifdef STARPU_USE_CUDA
+	_node_ops[STARPU_CUDA_RAM].wait_request_completion = _starpu_cuda_wait_request_completion;
+	_node_ops[STARPU_CUDA_RAM].test_request_completion = _starpu_cuda_test_request_completion;
+	_node_ops[STARPU_CUDA_RAM].copy_data_to[STARPU_CUDA_RAM] = _starpu_cuda_copy_data_from_cuda_to_cuda;
+	_node_ops[STARPU_CUDA_RAM].copy_data_to[STARPU_CPU_RAM] = _starpu_cuda_copy_data_from_cuda_to_cpu;
+	_node_ops[STARPU_CPU_RAM].copy_data_to[STARPU_CUDA_RAM] = _starpu_cuda_copy_data_from_cpu_to_cuda;
+	_node_ops[STARPU_CUDA_RAM].copy_interface_to[STARPU_CUDA_RAM] = _starpu_cuda_copy_interface_from_cuda_to_cuda;
+	_node_ops[STARPU_CUDA_RAM].copy_interface_to[STARPU_CPU_RAM] = _starpu_cuda_copy_interface_from_cuda_to_cpu;
+	_node_ops[STARPU_CPU_RAM].copy_interface_to[STARPU_CUDA_RAM] = _starpu_cuda_copy_interface_from_cpu_to_cuda;
+#endif
+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
+	_node_ops[STARPU_CUDA_RAM].direct_access_supported = _starpu_cuda_direct_access_supported;
+	_node_ops[STARPU_CUDA_RAM].malloc_on_node = _starpu_cuda_malloc_on_node;
+	_node_ops[STARPU_CUDA_RAM].free_on_node = _starpu_cuda_free_on_node;
+#endif
+
+#ifdef STARPU_USE_OPENCL
+	_node_ops[STARPU_OPENCL_RAM].wait_request_completion = _starpu_opencl_wait_request_completion;
+	_node_ops[STARPU_OPENCL_RAM].test_request_completion = _starpu_opencl_test_request_completion;
+	_node_ops[STARPU_OPENCL_RAM].copy_data_to[STARPU_OPENCL_RAM] = _starpu_opencl_copy_data_from_opencl_to_opencl;
+	_node_ops[STARPU_OPENCL_RAM].copy_data_to[STARPU_CPU_RAM] = _starpu_opencl_copy_data_from_opencl_to_cpu;
+	_node_ops[STARPU_CPU_RAM].copy_data_to[STARPU_OPENCL_RAM] = _starpu_opencl_copy_data_from_cpu_to_opencl;
+	_node_ops[STARPU_OPENCL_RAM].copy_interface_to[STARPU_OPENCL_RAM] = _starpu_opencl_copy_interface_from_opencl_to_opencl;
+	_node_ops[STARPU_OPENCL_RAM].copy_interface_to[STARPU_CPU_RAM] = _starpu_opencl_copy_interface_from_opencl_to_cpu;
+	_node_ops[STARPU_CPU_RAM].copy_interface_to[STARPU_OPENCL_RAM] = _starpu_opencl_copy_interface_from_cpu_to_opencl;
+#endif
+#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
+	_node_ops[STARPU_OPENCL_RAM].direct_access_supported = _starpu_opencl_direct_access_supported;
+	_node_ops[STARPU_OPENCL_RAM].malloc_on_node = _starpu_opencl_malloc_on_node;
+	_node_ops[STARPU_OPENCL_RAM].free_on_node = _starpu_opencl_free_on_node;
+#endif
+
+#ifdef STARPU_USE_MIC
+	_node_ops[STARPU_MIC_RAM].wait_request_completion = _starpu_mic_wait_request_completion;
+	_node_ops[STARPU_MIC_RAM].test_request_completion = _starpu_mic_test_request_completion;
+	/* TODO: MIC -> MIC */
+	_node_ops[STARPU_MIC_RAM].copy_data_to[STARPU_CPU_RAM] = _starpu_mic_copy_data_from_mic_to_cpu;
+	_node_ops[STARPU_CPU_RAM].copy_data_to[STARPU_MIC_RAM] = _starpu_mic_copy_data_from_cpu_to_mic;
+	_node_ops[STARPU_MIC_RAM].copy_interface_to[STARPU_CPU_RAM] = _starpu_mic_copy_interface_from_mic_to_cpu;
+	_node_ops[STARPU_CPU_RAM].copy_interface_to[STARPU_MIC_RAM] = _starpu_mic_copy_interface_from_cpu_to_mic;
+	_node_ops[STARPU_MIC_RAM].direct_access_supported = _starpu_mic_direct_access_supported;
+	_node_ops[STARPU_MIC_RAM].malloc_on_node = _starpu_mic_malloc_on_node;
+	_node_ops[STARPU_MIC_RAM].free_on_node = _starpu_mic_free_on_node;
+#endif
+
+#ifdef STARPU_USE_MPI_MASTER_SLAVE
+	_node_ops[STARPU_MPI_MS_RAM].wait_request_completion = _starpu_mpi_common_wait_request_completion;
+	_node_ops[STARPU_MPI_MS_RAM].test_request_completion = _starpu_mpi_common_test_event;
+	_node_ops[STARPU_MPI_MS_RAM].copy_data_to[STARPU_CPU_RAM] = _starpu_mpi_copy_data_from_mpi_to_cpu;
+	_node_ops[STARPU_MPI_MS_RAM].copy_data_to[STARPU_MPI_MS_RAM] = _starpu_mpi_copy_data_from_mpi_to_mpi;
+	_node_ops[STARPU_CPU_RAM].copy_data_to[STARPU_MPI_MS_RAM] = _starpu_mpi_copy_data_from_cpu_to_mpi;
+	_node_ops[STARPU_MPI_MS_RAM].copy_interface_to[STARPU_MPI_MS_RAM] = _starpu_mpi_copy_interface_from_mpi_to_mpi;
+	_node_ops[STARPU_MPI_MS_RAM].copy_interface_to[STARPU_CPU_RAM] = _starpu_mpi_copy_interface_from_mpi_to_cpu;
+	_node_ops[STARPU_CPU_RAM].copy_interface_to[STARPU_MPI_MS_RAM] = _starpu_mpi_copy_interface_from_cpu_to_mpi;
+	_node_ops[STARPU_MPI_MS_RAM].direct_access_supported = _starpu_mpi_direct_access_supported;
+	_node_ops[STARPU_MPI_MS_RAM].malloc_on_node = _starpu_mpi_malloc_on_node;
+	_node_ops[STARPU_MPI_MS_RAM].free_on_node = _starpu_mpi_free_on_node;
+#endif
+
+	_node_ops[STARPU_DISK_RAM].wait_request_completion = _starpu_disk_wait_request_completion;
+	_node_ops[STARPU_DISK_RAM].test_request_completion = _starpu_disk_test_request_completion;
+	_node_ops[STARPU_DISK_RAM].copy_data_to[STARPU_CPU_RAM] = _starpu_disk_copy_data_from_disk_to_cpu;
+	_node_ops[STARPU_DISK_RAM].copy_data_to[STARPU_DISK_RAM] = _starpu_disk_copy_data_from_disk_to_disk;
+	_node_ops[STARPU_CPU_RAM].copy_data_to[STARPU_DISK_RAM] = _starpu_disk_copy_data_from_cpu_to_disk;
+	_node_ops[STARPU_DISK_RAM].copy_interface_to[STARPU_DISK_RAM] = _starpu_disk_copy_interface_from_disk_to_disk;
+	_node_ops[STARPU_DISK_RAM].copy_interface_to[STARPU_CPU_RAM] = _starpu_disk_copy_interface_from_disk_to_cpu;
+	_node_ops[STARPU_CPU_RAM].copy_interface_to[STARPU_DISK_RAM] = _starpu_disk_copy_interface_from_cpu_to_disk;
+	_node_ops[STARPU_DISK_RAM].direct_access_supported = _starpu_disk_direct_access_supported;
+	_node_ops[STARPU_DISK_RAM].malloc_on_node = _starpu_disk_malloc_on_node;
+	_node_ops[STARPU_DISK_RAM].free_on_node = _starpu_disk_free_on_node;
+
+#ifdef STARPU_USE_SCC
+	_node_ops[STARPU_SCC_RAM].copy_data_to[STARPU_CPU_RAM] = _starpu_scc_copy_data_from_scc_to_cpu;
+	_node_ops[STARPU_SCC_RAM].copy_data_to[STARPU_SCC_RAM] = _starpu_scc_copy_data_from_scc_to_scc;
+	_node_ops[STARPU_CPU_RAM].copy_data_to[STARPU_SCC_RAM] = _starpu_scc_copy_data_from_cpu_to_scc;
+	_node_ops[STARPU_SCC_RAM].copy_interface_to[STARPU_SCC_RAM] = _starpu_scc_copy_interface_from_scc_to_scc;
+	_node_ops[STARPU_SCC_RAM].copy_interface_to[STARPU_CPU_RAM] = _starpu_scc_copy_interface_from_scc_to_cpu;
+	_node_ops[STARPU_CPU_RAM].copy_interface_to[STARPU_SCC_RAM] = _starpu_scc_copy_interface_from_cpu_to_scc;
+	_node_ops[STARPU_SCC_RAM].direct_access_supported = _starpu_scc_direct_access_supported;
+	_node_ops[STARPU_SCC_RAM].malloc_on_node = _starpu_scc_malloc_on_node;
+	_node_ops[STARPU_SCC_RAM].free_on_node = _starpu_scc_free_on_node;
+#endif
+}
+
+const char* _starpu_node_get_prefix(enum starpu_node_kind kind)
+{
+	switch (kind)
+	{
+		case STARPU_CPU_RAM:
+			return "NUMA";
+		case STARPU_CUDA_RAM:
+			return "CUDA";
+		case STARPU_OPENCL_RAM:
+			return "OpenCL";
+		case STARPU_DISK_RAM:
+			return "Disk";
+		case STARPU_MIC_RAM:
+			return "MIC";
+		case STARPU_MPI_MS_RAM:
+			return "MPI_MS";
+		case STARPU_SCC_RAM:
+			return "SCC_RAM";
+		case STARPU_SCC_SHM:
+			return "SCC_shared";
+		case STARPU_UNUSED:
+		default:
+			STARPU_ASSERT(0);
+			return "unknown";
+	}
+}

+ 50 - 0
src/datawizard/node_ops.h

@@ -0,0 +1,50 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2008-2011,2014,2017                      Université de Bordeaux
+ * Copyright (C) 2016,2017                                Inria
+ * Copyright (C) 2010,2013,2015,2017,2019                      CNRS
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __NODE_OPS_H__
+#define __NODE_OPS_H__
+
+#include <starpu.h>
+#include <common/config.h>
+#include <datawizard/copy_driver.h>
+
+typedef int (*copy_data_func_t)(starpu_data_handle_t handle, void *src_interface, unsigned src_node,
+				void *dst_interface, unsigned dst_node,
+				struct _starpu_data_request *req);
+
+typedef int (*copy_interface_t)(uintptr_t src_ptr, size_t src_offset, unsigned src_node,
+				uintptr_t dst_ptr, size_t dst_offset, unsigned dst_node,
+				size_t ssize, struct _starpu_async_channel *async_channel);
+
+struct _starpu_node_ops
+{
+	copy_data_func_t copy_data_to[STARPU_MPI_MS_RAM+1];
+	copy_interface_t copy_interface_to[STARPU_MPI_MS_RAM+1];
+	void (*wait_request_completion)(struct _starpu_async_channel *async_channel);
+	unsigned (*test_request_completion)(struct _starpu_async_channel *async_channel);
+	int (*direct_access_supported)(unsigned node, unsigned handling_node);
+	uintptr_t (*malloc_on_node)(unsigned dst_node, size_t size, int flags);
+	void (*free_on_node)(unsigned dst_node, uintptr_t addr, size_t size, int flags);
+};
+
+extern struct _starpu_node_ops _node_ops[STARPU_MPI_MS_RAM+1];
+
+void _starpu_node_ops_init();
+const char* _starpu_node_get_prefix(enum starpu_node_kind kind);
+
+#endif // __NODE_OPS_H__

+ 74 - 3
src/drivers/cpu/driver_cpu.c

@@ -1,9 +1,9 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011,2012,2014-2017,2019                 Inria
- * Copyright (C) 2008-2018                                Université de Bordeaux
+ * Copyright (C) 2008-2019                                Université de Bordeaux
  * Copyright (C) 2010                                     Mehdi Juhoor
- * Copyright (C) 2010-2017                                CNRS
+ * Copyright (C) 2010-2017,2019                           CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2011                                     Télécom-SudParis
  *
@@ -31,12 +31,18 @@
 #include <core/drivers.h>
 #include <core/idle_hook.h>
 #include <drivers/cpu/driver_cpu.h>
+#include <drivers/disk/driver_disk.h>
 #include <core/sched_policy.h>
 #include <datawizard/memory_manager.h>
+#include <datawizard/memory_nodes.h>
 #include <datawizard/malloc.h>
 #include <core/simgrid.h>
 #include <core/task.h>
+<<<<<<< HEAD
 #include <common/knobs.h>
+=======
+#include <core/disk.h>
+>>>>>>> master
 
 #ifdef STARPU_HAVE_HWLOC
 #include <hwloc.h>
@@ -53,6 +59,7 @@
 #endif
 
 
+#ifdef STARPU_USE_CPU
 /* Actually launch the job on a cpu worker.
  * Handle binding CPUs on cores.
  * In the case of a combined worker WORKER_TASK != J->TASK */
@@ -207,7 +214,7 @@ static int _starpu_cpu_driver_execute_task(struct _starpu_worker *cpu_worker, st
 		{
 			struct _starpu_combined_worker *combined_worker;
 			combined_worker = _starpu_get_combined_worker_struct(j->combined_workerid);
-			
+
 			cpu_worker->combined_workerid = j->combined_workerid;
 			cpu_worker->worker_size = combined_worker->worker_size;
 			perf_arch = &combined_worker->perf_arch;
@@ -434,3 +441,67 @@ struct _starpu_driver_ops _starpu_driver_cpu_ops =
 	.run_once = _starpu_cpu_driver_run_once,
 	.deinit = _starpu_cpu_driver_deinit
 };
+#endif /* STARPU_USE_CPU */
+
+int _starpu_cpu_copy_data(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_CPU_RAM);
+
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+	if (copy_methods->ram_to_ram)
+		copy_methods->ram_to_ram(src_interface, src_node, dst_interface, dst_node);
+	else
+		copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req ? &req->async_channel : NULL);
+	return ret;
+}
+
+int _starpu_cpu_copy_interface(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_CPU_RAM);
+
+	(void) async_channel;
+
+	memcpy((void *) (dst + dst_offset), (void *) (src + src_offset), size);
+	return 0;
+}
+
+int _starpu_cpu_direct_access_supported(unsigned node, unsigned handling_node)
+{
+	(void) node;
+	(void) handling_node;
+	return 1;
+}
+
+uintptr_t _starpu_cpu_malloc_on_node(unsigned dst_node, size_t size, int flags)
+{
+	uintptr_t addr = 0;
+	_starpu_malloc_flags_on_node(dst_node, (void**) &addr, size,
+#if defined(STARPU_USE_CUDA) && !defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
+				     /* without memcpy_peer, we can not
+				      * allocated pinned memory, since it
+				      * requires waiting for a task, and we
+				      * may be called with a spinlock held
+				      */
+				     flags & ~STARPU_MALLOC_PINNED
+#else
+				     flags
+#endif
+				     );
+	return addr;
+}
+
+void _starpu_cpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags)
+{
+	_starpu_free_flags_on_node(dst_node, (void*)addr, size,
+#if defined(STARPU_USE_CUDA) && !defined(STARPU_HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
+				   flags & ~STARPU_MALLOC_PINNED
+#else
+				   flags
+#endif
+				   );
+}

+ 8 - 5
src/drivers/cpu/driver_cpu.h

@@ -1,8 +1,8 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2012,2014                                Inria
- * Copyright (C) 2008-2011,2014,2018                      Université de Bordeaux
- * Copyright (C) 2010,2012,2013,2015,2017                 CNRS
+ * Copyright (C) 2008-2011,2014,2018-2019                 Université de Bordeaux
+ * Copyright (C) 2010,2012,2013,2015,2017,2019            CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,11 +21,14 @@
 
 #include <common/config.h>
 
-#ifdef STARPU_USE_CPU
-
 extern struct _starpu_driver_ops _starpu_driver_cpu_ops;
 void *_starpu_cpu_worker(void *);
 
-#endif /* !STARPU_USE_CPU */
+int _starpu_cpu_copy_data(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_cpu_copy_interface(uintptr_t src_ptr, size_t src_offset, unsigned src_node, uintptr_t dst_ptr, size_t dst_offset, unsigned dst_node, size_t ssize, struct _starpu_async_channel *async_channel);
+
+int _starpu_cpu_direct_access_supported(unsigned node, unsigned handling_node);
+uintptr_t _starpu_cpu_malloc_on_node(unsigned dst_node, size_t size, int flags);
+void _starpu_cpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags);
 
 #endif //  __DRIVER_CPU_H__

+ 340 - 3
src/drivers/cuda/driver_cuda.c

@@ -77,6 +77,7 @@ static cudaEvent_t task_events[STARPU_NMAXWORKERS][STARPU_MAX_PIPELINE];
 #endif /* STARPU_USE_CUDA */
 #ifdef STARPU_SIMGRID
 static unsigned task_finished[STARPU_NMAXWORKERS][STARPU_MAX_PIPELINE];
+static starpu_pthread_mutex_t cuda_alloc_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
 #endif /* STARPU_SIMGRID */
 
 static enum initialization cuda_device_init[STARPU_MAXCUDADEVS];
@@ -718,9 +719,9 @@ int _starpu_cuda_driver_init(struct _starpu_worker_set *worker_set)
 		const char *devname = "Simgrid";
 #else
 		/* get the device's name */
-		char devname[128];
-		strncpy(devname, props[devid].name, 127);
-		devname[127] = 0;
+		char devname[64];
+		strncpy(devname, props[devid].name, 63);
+		devname[63] = 0;
 #endif
 
 #if defined(STARPU_HAVE_BUSID) && !defined(STARPU_SIMGRID)
@@ -1215,6 +1216,342 @@ int _starpu_cuda_driver_deinit_from_worker(struct _starpu_worker *worker)
 	return _starpu_cuda_driver_deinit(worker->set);
 }
 
+#ifdef STARPU_USE_CUDA
+unsigned _starpu_cuda_test_request_completion(struct _starpu_async_channel *async_channel)
+{
+	cudaEvent_t event;
+	cudaError_t cures;
+	unsigned success;
+
+	event = (*async_channel).event.cuda_event;
+	cures = cudaEventQuery(event);
+	success = (cures == cudaSuccess);
+
+	if (success)
+		cudaEventDestroy(event);
+	else if (cures != cudaErrorNotReady)
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	return success;
+}
+
+void _starpu_cuda_wait_request_completion(struct _starpu_async_channel *async_channel)
+{
+	cudaEvent_t event;
+	cudaError_t cures;
+
+	event = (*async_channel).event.cuda_event;
+
+	cures = cudaEventSynchronize(event);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+
+	cures = cudaEventDestroy(event);
+	if (STARPU_UNLIKELY(cures))
+		STARPU_CUDA_REPORT_ERROR(cures);
+}
+
+int _starpu_cuda_copy_data_from_cuda_to_cuda(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CUDA_RAM && dst_kind == STARPU_CUDA_RAM);
+
+	int ret = 1;
+	cudaError_t cures;
+	cudaStream_t stream;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+/* CUDA - CUDA transfer */
+	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() || !(copy_methods->cuda_to_cuda_async || copy_methods->any_to_any))
+	{
+		STARPU_ASSERT(copy_methods->cuda_to_cuda || copy_methods->any_to_any);
+		/* this is not associated to a request so it's synchronous */
+		if (copy_methods->cuda_to_cuda)
+			copy_methods->cuda_to_cuda(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	}
+	else
+	{
+		req->async_channel.type = STARPU_CUDA_RAM;
+		cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
+		if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
+
+		stream = starpu_cuda_get_peer_transfer_stream(src_node, dst_node);
+		if (copy_methods->cuda_to_cuda_async)
+			ret = copy_methods->cuda_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
+		else
+		{
+			STARPU_ASSERT(copy_methods->any_to_any);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		}
+
+		cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
+		if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
+	}
+	return ret;
+}
+
+int _starpu_cuda_copy_data_from_cuda_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CUDA_RAM && dst_kind == STARPU_CPU_RAM);
+
+	int ret = 1;
+	cudaError_t cures;
+	cudaStream_t stream;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+
+	/* only the proper CUBLAS thread can initiate this directly ! */
+#if !defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
+	STARPU_ASSERT(starpu_worker_get_local_memory_node() == src_node);
+#endif
+	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() || !(copy_methods->cuda_to_ram_async || copy_methods->any_to_any))
+	{
+		/* this is not associated to a request so it's synchronous */
+		STARPU_ASSERT(copy_methods->cuda_to_ram || copy_methods->any_to_any);
+		if (copy_methods->cuda_to_ram)
+			copy_methods->cuda_to_ram(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	}
+	else
+	{
+		req->async_channel.type = STARPU_CUDA_RAM;
+		cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
+		if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
+
+		stream = starpu_cuda_get_out_transfer_stream(src_node);
+		if (copy_methods->cuda_to_ram_async)
+			ret = copy_methods->cuda_to_ram_async(src_interface, src_node, dst_interface, dst_node, stream);
+		else
+		{
+			STARPU_ASSERT(copy_methods->any_to_any);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		}
+
+		cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
+		if (STARPU_UNLIKELY(cures != cudaSuccess)) STARPU_CUDA_REPORT_ERROR(cures);
+	}
+	return ret;
+}
+
+int _starpu_cuda_copy_data_from_cpu_to_cuda(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_CUDA_RAM);
+
+	int ret = 1;
+	cudaError_t cures;
+	cudaStream_t stream;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+
+	/* STARPU_CPU_RAM -> CUBLAS_RAM */
+	/* only the proper CUBLAS thread can initiate this ! */
+#if !defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
+	STARPU_ASSERT(starpu_worker_get_local_memory_node() == dst_node);
+#endif
+	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_cuda_copy_disabled() ||
+	    !(copy_methods->ram_to_cuda_async || copy_methods->any_to_any))
+	{
+		/* this is not associated to a request so it's synchronous */
+		STARPU_ASSERT(copy_methods->ram_to_cuda || copy_methods->any_to_any);
+		if (copy_methods->ram_to_cuda)
+			copy_methods->ram_to_cuda(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	}
+	else
+	{
+		req->async_channel.type = STARPU_CUDA_RAM;
+		cures = cudaEventCreateWithFlags(&req->async_channel.event.cuda_event, cudaEventDisableTiming);
+		if (STARPU_UNLIKELY(cures != cudaSuccess))
+			STARPU_CUDA_REPORT_ERROR(cures);
+
+		stream = starpu_cuda_get_in_transfer_stream(dst_node);
+		if (copy_methods->ram_to_cuda_async)
+			ret = copy_methods->ram_to_cuda_async(src_interface, src_node, dst_interface, dst_node, stream);
+		else
+		{
+			STARPU_ASSERT(copy_methods->any_to_any);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		}
+
+		cures = cudaEventRecord(req->async_channel.event.cuda_event, stream);
+		if (STARPU_UNLIKELY(cures != cudaSuccess))
+			STARPU_CUDA_REPORT_ERROR(cures);
+	}
+	return ret;
+}
+
+int _starpu_cuda_copy_interface_from_cuda_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+
+	STARPU_ASSERT(src_kind == STARPU_CUDA_RAM && dst_kind == STARPU_CPU_RAM);
+
+	return starpu_cuda_copy_async_sync((void*) (src + src_offset), src_node,
+					   (void*) (dst + dst_offset), dst_node,
+					   size,
+					   async_channel?starpu_cuda_get_out_transfer_stream(src_node):NULL,
+					   cudaMemcpyDeviceToHost);
+}
+
+int _starpu_cuda_copy_interface_from_cuda_to_cuda(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+
+	STARPU_ASSERT(src_kind == STARPU_CUDA_RAM && dst_kind == STARPU_CUDA_RAM);
+
+	return starpu_cuda_copy_async_sync((void*) (src + src_offset), src_node,
+					   (void*) (dst + dst_offset), dst_node,
+					   size,
+					   async_channel?starpu_cuda_get_peer_transfer_stream(src_node, dst_node):NULL,
+					   cudaMemcpyDeviceToDevice);
+}
+
+int _starpu_cuda_copy_interface_from_cpu_to_cuda(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_CUDA_RAM);
+
+	return starpu_cuda_copy_async_sync((void*) (src + src_offset), src_node,
+					   (void*) (dst + dst_offset), dst_node,
+					   size,
+					   async_channel?starpu_cuda_get_in_transfer_stream(dst_node):NULL,
+					   cudaMemcpyHostToDevice);
+}
+
+#endif /* STARPU_USE_CUDA */
+
+int _starpu_cuda_direct_access_supported(unsigned node, unsigned handling_node)
+{
+	/* GPUs not always allow direct remote access: if CUDA4
+	 * is enabled, we allow two CUDA devices to communicate. */
+#ifdef STARPU_SIMGRID
+	(void) node;
+	if (starpu_node_get_kind(handling_node) == STARPU_CUDA_RAM)
+	{
+		msg_host_t host = _starpu_simgrid_get_memnode_host(handling_node);
+		const char* cuda_memcpy_peer = MSG_host_get_property_value(host, "memcpy_peer");
+		return cuda_memcpy_peer && atoll(cuda_memcpy_peer);
+	}
+	else
+		return 0;
+#elif defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
+	(void) node;
+	enum starpu_node_kind kind = starpu_node_get_kind(handling_node);
+	return kind == STARPU_CUDA_RAM;
+#else /* STARPU_HAVE_CUDA_MEMCPY_PEER */
+	/* Direct GPU-GPU transfers are not allowed in general */
+	(void) node;
+	(void) handling_node;
+	return 0;
+#endif /* STARPU_HAVE_CUDA_MEMCPY_PEER */
+}
+
+uintptr_t _starpu_cuda_malloc_on_node(unsigned dst_node, size_t size, int flags)
+{
+	uintptr_t addr = 0;
+	(void) flags;
+
+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
+
+#ifdef STARPU_SIMGRID
+	static uintptr_t last[STARPU_MAXNODES];
+#ifdef STARPU_DEVEL
+#warning TODO: record used memory, using a simgrid property to know the available memory
+#endif
+	/* Sleep for the allocation */
+	STARPU_PTHREAD_MUTEX_LOCK(&cuda_alloc_mutex);
+	if (_starpu_simgrid_cuda_malloc_cost())
+		MSG_process_sleep(0.000175);
+	if (!last[dst_node])
+		last[dst_node] = 1<<10;
+	addr = last[dst_node];
+	last[dst_node]+=size;
+	STARPU_ASSERT(last[dst_node] >= addr);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
+#else
+	unsigned devid = starpu_memory_node_get_devid(dst_node);
+#if defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
+	starpu_cuda_set_device(devid);
+#else
+	struct _starpu_worker *worker = _starpu_get_local_worker_key();
+	if (!worker || worker->arch != STARPU_CUDA_WORKER || worker->devid != devid)
+		STARPU_ASSERT_MSG(0, "CUDA peer access is not available with this version of CUDA");
+#endif
+	/* Check if there is free memory */
+	size_t cuda_mem_free, cuda_mem_total;
+	cudaError_t status;
+	status = cudaMemGetInfo(&cuda_mem_free, &cuda_mem_total);
+	if (status == cudaSuccess && cuda_mem_free < (size*2))
+	{
+		addr = 0;
+	}
+	else
+	{
+		status = cudaMalloc((void **)&addr, size);
+		if (!addr || (status != cudaSuccess))
+		{
+			if (STARPU_UNLIKELY(status != cudaErrorMemoryAllocation))
+				STARPU_CUDA_REPORT_ERROR(status);
+			addr = 0;
+		}
+	}
+#endif
+#endif
+	return addr;
+}
+
+void _starpu_cuda_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags)
+{
+	(void) size;
+	(void) flags;
+
+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
+#ifdef STARPU_SIMGRID
+	STARPU_PTHREAD_MUTEX_LOCK(&cuda_alloc_mutex);
+	/* Sleep for the free */
+	if (_starpu_simgrid_cuda_malloc_cost())
+		MSG_process_sleep(0.000750);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&cuda_alloc_mutex);
+	/* CUDA also synchronizes roughly everything on cudaFree */
+	_starpu_simgrid_sync_gpus();
+#else
+	cudaError_t err;
+	unsigned devid = starpu_memory_node_get_devid(dst_node);
+#if defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
+	starpu_cuda_set_device(devid);
+#else
+	struct _starpu_worker *worker = _starpu_get_local_worker_key();
+	if (!worker || worker->arch != STARPU_CUDA_WORKER || worker->devid != devid)
+		STARPU_ASSERT_MSG(0, "CUDA peer access is not available with this version of CUDA");
+#endif /* STARPU_HAVE_CUDA_MEMCPY_PEER */
+	err = cudaFree((void*)addr);
+#ifdef STARPU_OPENMP
+	/* When StarPU is used as Open Runtime support,
+	 * starpu_omp_shutdown() will usually be called from a
+	 * destructor, in which case cudaThreadExit() reports a
+	 * cudaErrorCudartUnloading here. There should not
+	 * be any remaining tasks running at this point so
+	 * we can probably ignore it without much consequences. */
+	if (STARPU_UNLIKELY(err != cudaSuccess && err != cudaErrorCudartUnloading))
+		STARPU_CUDA_REPORT_ERROR(err);
+#else
+	if (STARPU_UNLIKELY(err != cudaSuccess))
+		STARPU_CUDA_REPORT_ERROR(err);
+#endif /* STARPU_OPENMP */
+#endif /* STARPU_SIMGRID */
+#endif
+}
+
 struct _starpu_driver_ops _starpu_driver_cuda_ops =
 {
 	.init = _starpu_cuda_driver_init_from_worker,

+ 17 - 1
src/drivers/cuda/driver_cuda.h

@@ -3,7 +3,7 @@
  * Copyright (C) 2008-2014,2016,2017                      Université de Bordeaux
  * Copyright (C) 2012,2017                                Inria
  * Copyright (C) 2015                                     Mathieu Lirzin
- * Copyright (C) 2010,2012,2015,2017                      CNRS
+ * Copyright (C) 2010,2012,2015,2017,2019                 CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -29,6 +29,7 @@
 #endif
 
 #include <starpu.h>
+#include <core/workers.h>
 
 extern struct _starpu_driver_ops _starpu_driver_cuda_ops;
 
@@ -52,5 +53,20 @@ cudaStream_t starpu_cuda_get_out_transfer_stream(unsigned src_node);
 cudaStream_t starpu_cuda_get_peer_transfer_stream(unsigned src_node, unsigned dst_node);
 #endif
 
+unsigned _starpu_cuda_test_request_completion(struct _starpu_async_channel *async_channel);
+void _starpu_cuda_wait_request_completion(struct _starpu_async_channel *async_channel);
+
+int _starpu_cuda_copy_data_from_cpu_to_cuda(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_cuda_copy_data_from_cuda_to_cuda(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_cuda_copy_data_from_cuda_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+
+int _starpu_cuda_copy_interface_from_cuda_to_cuda(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_cuda_copy_interface_from_cuda_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_cuda_copy_interface_from_cpu_to_cuda(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+
+int _starpu_cuda_direct_access_supported(unsigned node, unsigned handling_node);
+uintptr_t _starpu_cuda_malloc_on_node(unsigned dst_node, size_t size, int flags);
+void _starpu_cuda_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags);
+
 #endif //  __DRIVER_CUDA_H__
 

+ 208 - 8
src/drivers/disk/driver_disk.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2013,2017                                CNRS
+ * Copyright (C) 2013,2017,2019                           CNRS
  * Copyright (C) 2013                                     Inria
  * Copyright (C) 2013                                     Université de Bordeaux
  * Copyright (C) 2013                                     Corentin Salingue
@@ -21,16 +21,15 @@
 #include <core/disk.h>
 #include <starpu_profiling.h>
 #include <drivers/disk/driver_disk.h>
+#include <datawizard/coherency.h>
 
 int _starpu_disk_copy_src_to_disk(void * src, unsigned src_node, void * dst, size_t dst_offset, unsigned dst_node, size_t size, void * async_channel)
 {
 	STARPU_ASSERT(starpu_node_get_kind(src_node) == STARPU_CPU_RAM);
-	
+
 	return _starpu_disk_write(src_node, dst_node, dst, src, dst_offset, size, async_channel);
-	
 }
 
-
 int _starpu_disk_copy_disk_to_src(void * src, size_t src_offset, unsigned src_node, void * dst, unsigned dst_node, size_t size, void * async_channel)
 {
 	STARPU_ASSERT(starpu_node_get_kind(dst_node) == STARPU_CPU_RAM);
@@ -38,13 +37,214 @@ int _starpu_disk_copy_disk_to_src(void * src, size_t src_offset, unsigned src_no
 	return _starpu_disk_read(src_node, dst_node, src, dst, src_offset, size, async_channel);
 }
 
-
 int _starpu_disk_copy_disk_to_disk(void * src, size_t src_offset, unsigned src_node, void * dst, size_t dst_offset, unsigned dst_node, size_t size, void * async_channel)
 {
 	STARPU_ASSERT(starpu_node_get_kind(src_node) == STARPU_DISK_RAM && starpu_node_get_kind(dst_node) == STARPU_DISK_RAM);
 
-       return _starpu_disk_copy(src_node, src, src_offset, 
-			       dst_node, dst, dst_offset,
-			       size, async_channel); 
+	return _starpu_disk_copy(src_node, src, src_offset, dst_node, dst, dst_offset, size, async_channel);
+}
+
+unsigned _starpu_disk_test_request_completion(struct _starpu_async_channel *async_channel)
+{
+	unsigned success = starpu_disk_test_request(async_channel);
+	if (async_channel->event.disk_event.ptr != NULL && success)
+	{
+		if (async_channel->event.disk_event.handle != NULL)
+		{
+			/* read is finished, we can already unpack */
+			async_channel->event.disk_event.handle->ops->unpack_data(async_channel->event.disk_event.handle, async_channel->event.disk_event.node, async_channel->event.disk_event.ptr, async_channel->event.disk_event.size);
+		}
+		else
+		{
+			/* write is finished, ptr was allocated in pack_data */
+			_starpu_free_flags_on_node(async_channel->event.disk_event.node, async_channel->event.disk_event.ptr, async_channel->event.disk_event.size, 0);
+		}
+	}
+	return success;
+}
+
+void _starpu_disk_wait_request_completion(struct _starpu_async_channel *async_channel)
+{
+	starpu_disk_wait_request(async_channel);
+	if (async_channel->event.disk_event.ptr != NULL)
+	{
+		if (async_channel->event.disk_event.handle != NULL)
+		{
+			/* read is finished, we can already unpack */
+			async_channel->event.disk_event.handle->ops->unpack_data(async_channel->event.disk_event.handle, async_channel->event.disk_event.node, async_channel->event.disk_event.ptr, async_channel->event.disk_event.size);
+		}
+		else
+		{
+			/* write is finished, ptr was allocated in pack_data */
+			_starpu_free_flags_on_node(async_channel->event.disk_event.node, async_channel->event.disk_event.ptr, async_channel->event.disk_event.size, 0);
+		}
+	}
+}
+
+int _starpu_disk_copy_data_from_disk_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_DISK_RAM && dst_kind == STARPU_CPU_RAM);
+
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+
+	if (req && !starpu_asynchronous_copy_disabled())
+	{
+		req->async_channel.type = STARPU_DISK_RAM;
+		req->async_channel.event.disk_event.requests = NULL;
+		req->async_channel.event.disk_event.ptr = NULL;
+		req->async_channel.event.disk_event.handle = NULL;
+	}
+	if(copy_methods->any_to_any)
+		ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req && !starpu_asynchronous_copy_disabled()  ? &req->async_channel : NULL);
+	else
+	{
+		void *obj = starpu_data_handle_to_pointer(handle, src_node);
+		void * ptr = NULL;
+		size_t size = 0;
+		ret = _starpu_disk_full_read(src_node, dst_node, obj, &ptr, &size, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);
+		if (ret == 0)
+		{
+			/* read is already finished, we can already unpack */
+			handle->ops->unpack_data(handle, dst_node, ptr, size);
+		}
+		else if (ret == -EAGAIN)
+		{
+			STARPU_ASSERT(req);
+			req->async_channel.event.disk_event.ptr = ptr;
+			req->async_channel.event.disk_event.node = dst_node;
+			req->async_channel.event.disk_event.size = size;
+			req->async_channel.event.disk_event.handle = handle;
+		}
+		STARPU_ASSERT(ret == 0 || ret == -EAGAIN);
+	}
+
+	return ret;
+}
+
+int _starpu_disk_copy_data_from_disk_to_disk(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_DISK_RAM && dst_kind == STARPU_DISK_RAM);
+
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+
+	if (req && !starpu_asynchronous_copy_disabled())
+	{
+		req->async_channel.type = STARPU_DISK_RAM;
+		req->async_channel.event.disk_event.requests = NULL;
+		req->async_channel.event.disk_event.ptr = NULL;
+		req->async_channel.event.disk_event.handle = NULL;
+	}
+	ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);
+	return ret;
+}
+
+int _starpu_disk_copy_data_from_cpu_to_disk(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_DISK_RAM);
+
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
 
+	if (req && !starpu_asynchronous_copy_disabled())
+	{
+		req->async_channel.type = STARPU_DISK_RAM;
+		req->async_channel.event.disk_event.requests = NULL;
+		req->async_channel.event.disk_event.ptr = NULL;
+		req->async_channel.event.disk_event.handle = NULL;
+	}
+
+	if(copy_methods->any_to_any)
+		ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);
+	else
+	{
+		void *obj = starpu_data_handle_to_pointer(handle, dst_node);
+		void * ptr = NULL;
+		starpu_ssize_t size = 0;
+		handle->ops->pack_data(handle, src_node, &ptr, &size);
+		ret = _starpu_disk_full_write(src_node, dst_node, obj, ptr, size, req && !starpu_asynchronous_copy_disabled() ? &req->async_channel : NULL);
+		if (ret == 0)
+		{
+			/* write is already finished, ptr was allocated in pack_data */
+			_starpu_free_flags_on_node(src_node, ptr, size, 0);
+		}
+		else if (ret == -EAGAIN)
+		{
+			STARPU_ASSERT(req);
+			req->async_channel.event.disk_event.ptr = ptr;
+			req->async_channel.event.disk_event.node = src_node;
+			req->async_channel.event.disk_event.size = size;
+		}
+		STARPU_ASSERT(ret == 0 || ret == -EAGAIN);
+	}
+
+	return ret;
+}
+
+int _starpu_disk_copy_interface_from_disk_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_DISK_RAM && dst_kind == STARPU_CPU_RAM);
+
+	return _starpu_disk_copy_disk_to_src((void*) src, src_offset, src_node,
+					     (void*) (dst + dst_offset), dst_node,
+					     size, async_channel);
+}
+
+int _starpu_disk_copy_interface_from_disk_to_disk(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_DISK_RAM && dst_kind == STARPU_DISK_RAM);
+
+	return _starpu_disk_copy_disk_to_disk((void*) src, src_offset, src_node,
+					      (void*) dst, dst_offset, dst_node,
+					      size, async_channel);
+}
+
+int _starpu_disk_copy_interface_from_cpu_to_disk(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_DISK_RAM);
+
+	return _starpu_disk_copy_src_to_disk((void*) (src + src_offset), src_node,
+					     (void*) dst, dst_offset, dst_node,
+					     size, async_channel);
+}
+
+int _starpu_disk_direct_access_supported(unsigned node, unsigned handling_node)
+{
+	/* Each worker can manage disks but disk <-> disk is not always allowed */
+	switch (starpu_node_get_kind(handling_node))
+	{
+		case STARPU_CPU_RAM:
+			return 1;
+		case STARPU_DISK_RAM:
+			return _starpu_disk_can_copy(node, handling_node);
+		default:
+			return 0;
+	}
+}
+
+uintptr_t _starpu_disk_malloc_on_node(unsigned dst_node, size_t size, int flags)
+{
+	(void) flags;
+	uintptr_t addr = 0;
+	addr = (uintptr_t) _starpu_disk_alloc(dst_node, size);
+	return addr;
+}
+
+void _starpu_disk_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags)
+{
+	(void) flags;
+	_starpu_disk_free(dst_node, (void *) addr , size);
 }

+ 16 - 1
src/drivers/disk/driver_disk.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2017                                     CNRS
+ * Copyright (C) 2017, 2019                               CNRS
  * Copyright (C) 2013                                     Inria
  * Copyright (C) 2013                                     Université de Bordeaux
  * Copyright (C) 2013                                     Corentin Salingue
@@ -26,4 +26,19 @@ int _starpu_disk_copy_disk_to_src(void * src, size_t src_offset, unsigned src_no
 
 int _starpu_disk_copy_disk_to_disk(void * src, size_t src_offset, unsigned src_node, void * dst, size_t dst_offset, unsigned dst_node, size_t size, void * async_channel);
 
+unsigned _starpu_disk_test_request_completion(struct _starpu_async_channel *async_channel);
+void _starpu_disk_wait_request_completion(struct _starpu_async_channel *async_channel);
+
+int _starpu_disk_copy_data_from_disk_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_disk_copy_data_from_disk_to_disk(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_disk_copy_data_from_cpu_to_disk(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+
+int _starpu_disk_copy_interface_from_disk_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_disk_copy_interface_from_disk_to_disk(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_disk_copy_interface_from_cpu_to_disk(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+
+int _starpu_disk_direct_access_supported(unsigned node, unsigned handling_node);
+uintptr_t _starpu_disk_malloc_on_node(unsigned dst_node, size_t size, int flags);
+void _starpu_disk_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags);
+
 #endif

+ 142 - 20
src/drivers/mic/driver_mic_source.c

@@ -471,29 +471,11 @@ int _starpu_mic_init_event(struct _starpu_mic_async_event *event, unsigned memor
 	return 0;
 }
 
-/* Wait the end of the asynchronous request */
-void _starpu_mic_wait_request_completion(struct _starpu_mic_async_event *event)
-{
-	if (event->signal != NULL)
-	{
-		const struct _starpu_mp_node *mp_node = _starpu_mic_src_get_mp_node_from_memory_node(event->memory_node);
-		scif_epd_t epd = mp_node->host_sink_dt_connection.mic_endpoint;
-
-		if (scif_fence_wait(epd, event->mark) < 0)
-			STARPU_MIC_SRC_REPORT_SCIF_ERROR(errno);
-
-		if (scif_unregister(epd, (off_t)(event->signal), 0x1000) < 0)
-			STARPU_MIC_SRC_REPORT_SCIF_ERROR(errno);
-
-		free(event->signal);
-		event->signal = NULL;
-	}
-}
-
 /* Test if a asynchronous request is end.
  * Return 1 if is end, 0 else. */
-int _starpu_mic_request_is_complete(struct _starpu_mic_async_event *event)
+unsigned _starpu_mic_test_request_completion(struct _starpu_async_channel *async_channel)
 {
+	struct _starpu_mic_async_event *event = &async_channel->event.mic_event;
 	if (event->signal != NULL && *(event->signal) != STARPU_MIC_REQUEST_COMPLETE)
 		return 0;
 
@@ -508,7 +490,25 @@ int _starpu_mic_request_is_complete(struct _starpu_mic_async_event *event)
 	return 1;
 }
 
+/* Wait the end of the asynchronous request */
+void _starpu_mic_wait_request_completion(struct _starpu_async_channel *async_channel)
+{
+	struct _starpu_mic_async_event *event = &async_channel->event.mic_event;
+	if (event->signal != NULL)
+	{
+		const struct _starpu_mp_node *mp_node = _starpu_mic_src_get_mp_node_from_memory_node(event->memory_node);
+		scif_epd_t epd = mp_node->host_sink_dt_connection.mic_endpoint;
+
+		if (scif_fence_wait(epd, event->mark) < 0)
+			STARPU_MIC_SRC_REPORT_SCIF_ERROR(errno);
+
+		if (scif_unregister(epd, (off_t)(event->signal), 0x1000) < 0)
+			STARPU_MIC_SRC_REPORT_SCIF_ERROR(errno);
 
+		free(event->signal);
+		event->signal = NULL;
+	}
+}
 
 void *_starpu_mic_src_worker(void *arg)
 {
@@ -562,3 +562,125 @@ void *_starpu_mic_src_worker(void *arg)
 	return NULL;
 
 }
+
+int _starpu_mic_copy_data_from_mic_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_MIC_RAM && dst_kind == STARPU_CPU_RAM);
+
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+	/* MIC -> RAM */
+	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mic_copy_disabled() || !(copy_methods->mic_to_ram_async || copy_methods->any_to_any))
+	{
+		/* this is not associated to a request so it's synchronous */
+		STARPU_ASSERT(copy_methods->mic_to_ram || copy_methods->any_to_any);
+		if (copy_methods->mic_to_ram)
+			ret = copy_methods->mic_to_ram(src_interface, src_node, dst_interface, dst_node);
+		else
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	}
+	else
+	{
+		req->async_channel.type = STARPU_MIC_RAM;
+		if (copy_methods->mic_to_ram_async)
+			ret = copy_methods->mic_to_ram_async(src_interface, src_node, dst_interface, dst_node);
+		else
+		{
+			STARPU_ASSERT(copy_methods->any_to_any);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		}
+		_starpu_mic_init_event(&(req->async_channel.event.mic_event), src_node);
+	}
+	return ret;
+}
+
+int _starpu_mic_copy_data_from_cpu_to_mic(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_MIC_RAM);
+
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+	/* RAM -> MIC */
+	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mic_copy_disabled() || !(copy_methods->ram_to_mic_async || copy_methods->any_to_any))
+	{
+		/* this is not associated to a request so it's synchronous */
+		STARPU_ASSERT(copy_methods->ram_to_mic || copy_methods->any_to_any);
+		if (copy_methods->ram_to_mic)
+			ret = copy_methods->ram_to_mic(src_interface, src_node, dst_interface, dst_node);
+		else
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	}
+	else
+	{
+		req->async_channel.type = STARPU_MIC_RAM;
+		if (copy_methods->ram_to_mic_async)
+			ret = copy_methods->ram_to_mic_async(src_interface, src_node, dst_interface, dst_node);
+		else
+		{
+			STARPU_ASSERT(copy_methods->any_to_any);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		}
+		_starpu_mic_init_event(&(req->async_channel.event.mic_event), dst_node);
+	}
+
+	return ret;
+}
+
+int _starpu_mic_copy_interface_from_mic_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_MIC_RAM && dst_kind == STARPU_CPU_RAM);
+
+	if (async_channel)
+		return _starpu_mic_copy_mic_to_ram_async((void*) (src + src_offset), src_node,
+							 (void*) (dst + dst_offset), dst_node,
+							 size);
+	else
+		return _starpu_mic_copy_mic_to_ram((void*) (src + src_offset), src_node,
+						   (void*) (dst + dst_offset), dst_node,
+						   size);
+}
+
+int _starpu_mic_copy_interface_from_cpu_to_mic(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_MIC_RAM);
+
+	if (async_channel)
+		return _starpu_mic_copy_ram_to_mic_async((void*) (src + src_offset), src_node,
+							 (void*) (dst + dst_offset), dst_node,
+							 size);
+	else
+		return _starpu_mic_copy_ram_to_mic((void*) (src + src_offset), src_node,
+						   (void*) (dst + dst_offset), dst_node,
+						   size);
+}
+
+int _starpu_mic_direct_access_supported(unsigned node, unsigned handling_node)
+{
+	(void) node;
+	(void) handling_node;
+	/* TODO: We don't handle direct MIC-MIC transfers yet */
+	return 0;
+}
+
+uintptr_t _starpu_mic_malloc_on_node(unsigned dst_node, size_t size, int flags)
+{
+	(void) flags;
+	uintptr_t addr = 0;
+	if (_starpu_mic_allocate_memory((void **)(&addr), size, dst_node))
+		addr = 0;
+	return addr;
+}
+
+void _starpu_mic_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags)
+{
+	(void) flags;
+	_starpu_mic_free_memory((void*) addr, size, dst_node);
+}

+ 13 - 3
src/drivers/mic/driver_mic_source.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
- * Copyright (C) 2015,2017                                CNRS
+ * Copyright (C) 2015,2017,2019                           CNRS
  * Copyright (C) 2012,2017                                Inria
  * Copyright (C) 2013,2014,2017                           Université de Bordeaux
  * Copyright (C) 2013                                     Thibaut Lambert
@@ -74,12 +74,22 @@ int _starpu_mic_copy_ram_to_mic_async(void *src, unsigned src_node STARPU_ATTRIB
 int _starpu_mic_copy_mic_to_ram_async(void *src, unsigned src_node, void *dst, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size);
 
 int _starpu_mic_init_event(struct _starpu_mic_async_event *event, unsigned memory_node);
-void _starpu_mic_wait_request_completion(struct _starpu_mic_async_event *event);
-int _starpu_mic_request_is_complete(struct _starpu_mic_async_event *event);
 
 void *_starpu_mic_src_worker(void *arg);
 
 #endif /* STARPU_USE_MIC */
 
+unsigned _starpu_mic_test_request_completion(struct _starpu_async_channel *async_channel);
+void _starpu_mic_wait_request_completion(struct _starpu_async_channel *async_channel);
+
+int _starpu_mic_copy_data_from_mic_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_mic_copy_data_from_cpu_to_mic(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+
+int _starpu_mic_copy_interface_from_mic_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_mic_copy_interface_from_cpu_to_mic(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+
+int _starpu_mic_direct_access_supported(unsigned node, unsigned handling_node);
+uintptr_t _starpu_mic_malloc_on_node(unsigned dst_node, size_t size, int flags);
+void _starpu_mic_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags);
 
 #endif /* __DRIVER_MIC_SOURCE_H__ */

+ 4 - 3
src/drivers/mpi/driver_mpi_common.c

@@ -21,7 +21,7 @@
 #include <core/workers.h>
 #include <core/perfmodel/perfmodel.h>
 #include <drivers/mp_common/source_common.h>
-#include "driver_mpi_common.h"
+#include <drivers/mpi/driver_mpi_common.h>
 
 #define NITER 32
 #define SIZE_BANDWIDTH (1024*1024)
@@ -366,7 +366,7 @@ static void _starpu_mpi_common_polling_node(struct _starpu_mp_node * node)
 /* - In device to device communications, the first ack received by host
  * is considered as the sender (but it cannot be, in fact, the sender)
  */
-int _starpu_mpi_common_test_event(struct _starpu_async_channel * event)
+unsigned _starpu_mpi_common_test_event(struct _starpu_async_channel * event)
 {
         if (event->event.mpi_ms_event.requests != NULL && !_starpu_mpi_ms_event_request_list_empty(event->event.mpi_ms_event.requests))
         {
@@ -411,7 +411,7 @@ int _starpu_mpi_common_test_event(struct _starpu_async_channel * event)
 /* - In device to device communications, the first ack received by host
  * is considered as the sender (but it cannot be, in fact, the sender)
  */
-void _starpu_mpi_common_wait_event(struct _starpu_async_channel * event)
+void _starpu_mpi_common_wait_request_completion(struct _starpu_async_channel * event)
 {
         if (event->event.mpi_ms_event.requests != NULL && !_starpu_mpi_ms_event_request_list_empty(event->event.mpi_ms_event.requests))
         {
@@ -550,3 +550,4 @@ void _starpu_mpi_common_measure_bandwidth_latency(double timing_dtod[STARPU_MAXM
         }
         free(buf);
 }
+

+ 3 - 4
src/drivers/mpi/driver_mpi_common.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2016,2017                                Inria
- * Copyright (C) 2017                                     CNRS
+ * Copyright (C) 2017,2019                                CNRS
  * Copyright (C) 2015                                     Mathieu Lirzin
  *
  * StarPU is free software; you can redistribute it and/or modify
@@ -47,14 +47,13 @@ void _starpu_mpi_common_mp_recv(const struct _starpu_mp_node *node, void *msg, i
 void _starpu_mpi_common_recv_from_device(const struct _starpu_mp_node *node, int src_devid, void *msg, int len, void * event);
 void _starpu_mpi_common_send_to_device(const struct _starpu_mp_node *node, int dst_devid, void *msg, int len, void * event);
 
-int _starpu_mpi_common_test_event(struct _starpu_async_channel * event);
-void _starpu_mpi_common_wait_event(struct _starpu_async_channel * event);
+unsigned _starpu_mpi_common_test_event(struct _starpu_async_channel * event);
+void _starpu_mpi_common_wait_request_completion(struct _starpu_async_channel * event);
 
 void _starpu_mpi_common_barrier(void);
 
 void _starpu_mpi_common_measure_bandwidth_latency(double bandwidth_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS], double latency_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS]);
 
-
 #endif  /* STARPU_USE_MPI_MASTER_SLAVE */
 
 #endif	/* __DRIVER_MPI_COMMON_H__ */

+ 166 - 0
src/drivers/mpi/driver_mpi_source.c

@@ -380,3 +380,169 @@ void *_starpu_mpi_src_worker(void *arg)
 
         return NULL;
 }
+
+int _starpu_mpi_copy_data_from_mpi_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_MPI_MS_RAM && dst_kind == STARPU_CPU_RAM);
+
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mpi_ms_copy_disabled() || !(copy_methods->mpi_ms_to_ram_async || copy_methods->any_to_any))
+	{
+		/* this is not associated to a request so it's synchronous */
+		STARPU_ASSERT(copy_methods->mpi_ms_to_ram || copy_methods->any_to_any);
+		if (copy_methods->mpi_ms_to_ram)
+			copy_methods->mpi_ms_to_ram(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	}
+	else
+	{
+		req->async_channel.type = STARPU_MPI_MS_RAM;
+		if(copy_methods->mpi_ms_to_ram_async)
+			ret = copy_methods->mpi_ms_to_ram_async(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		else
+		{
+			STARPU_ASSERT(copy_methods->any_to_any);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		}
+	}
+	return ret;
+}
+
+int _starpu_mpi_copy_data_from_mpi_to_mpi(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_MPI_MS_RAM && dst_kind == STARPU_MPI_MS_RAM);
+
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+
+	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mpi_ms_copy_disabled() || !(copy_methods->mpi_ms_to_mpi_ms_async || copy_methods->any_to_any))
+	{
+		/* this is not associated to a request so it's synchronous */
+		STARPU_ASSERT(copy_methods->mpi_ms_to_mpi_ms || copy_methods->any_to_any);
+		if (copy_methods->mpi_ms_to_mpi_ms)
+			copy_methods->mpi_ms_to_mpi_ms(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	}
+	else
+	{
+		req->async_channel.type = STARPU_MPI_MS_RAM;
+		if(copy_methods->mpi_ms_to_mpi_ms_async)
+			ret = copy_methods->mpi_ms_to_mpi_ms_async(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		else
+		{
+			STARPU_ASSERT(copy_methods->any_to_any);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		}
+	}
+	return ret;
+}
+
+int _starpu_mpi_copy_data_from_cpu_to_mpi(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_MPI_MS_RAM);
+
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+
+	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mpi_ms_copy_disabled() || !(copy_methods->ram_to_mpi_ms_async || copy_methods->any_to_any))
+	{
+		/* this is not associated to a request so it's synchronous */
+		STARPU_ASSERT(copy_methods->ram_to_mpi_ms || copy_methods->any_to_any);
+		if (copy_methods->ram_to_mpi_ms)
+			copy_methods->ram_to_mpi_ms(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	}
+	else
+	{
+		req->async_channel.type = STARPU_MPI_MS_RAM;
+		if(copy_methods->ram_to_mpi_ms_async)
+			ret = copy_methods->ram_to_mpi_ms_async(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		else
+		{
+			STARPU_ASSERT(copy_methods->any_to_any);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		}
+	}
+	return ret;
+}
+
+int _starpu_mpi_copy_interface_from_mpi_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_MPI_MS_RAM && dst_kind == STARPU_CPU_RAM);
+
+	if (async_channel)
+		return _starpu_mpi_copy_mpi_to_ram_async((void*) (src + src_offset), src_node,
+							 (void*) (dst + dst_offset), dst_node,
+							 size, async_channel);
+	else
+		return _starpu_mpi_copy_mpi_to_ram_sync((void*) (src + src_offset), src_node,
+							(void*) (dst + dst_offset), dst_node,
+							size);
+}
+
+int _starpu_mpi_copy_interface_from_mpi_to_mpi(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_MPI_MS_RAM && dst_kind == STARPU_MPI_MS_RAM);
+
+	if (async_channel)
+		return _starpu_mpi_copy_sink_to_sink_async((void*) (src + src_offset), src_node,
+							   (void*) (dst + dst_offset), dst_node,
+							   size, async_channel);
+	else
+		return _starpu_mpi_copy_sink_to_sink_sync((void*) (src + src_offset), src_node,
+							  (void*) (dst + dst_offset), dst_node,
+							  size);
+}
+
+int _starpu_mpi_copy_interface_from_cpu_to_mpi(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_MPI_MS_RAM);
+
+	if (async_channel)
+		return _starpu_mpi_copy_ram_to_mpi_async((void*) (src + src_offset), src_node,
+							 (void*) (dst + dst_offset), dst_node,
+							 size, async_channel);
+	else
+		return _starpu_mpi_copy_ram_to_mpi_sync((void*) (src + src_offset), src_node,
+							(void*) (dst + dst_offset), dst_node,
+							size);
+}
+
+int _starpu_mpi_direct_access_supported(unsigned node, unsigned handling_node)
+{
+	(void) node;
+	enum starpu_node_kind kind = starpu_node_get_kind(handling_node);
+	return kind == STARPU_MPI_MS_RAM;
+}
+
+uintptr_t _starpu_mpi_malloc_on_node(unsigned dst_node, size_t size, int flags)
+{
+	(void) flags;
+	uintptr_t addr = 0;
+	if (_starpu_mpi_src_allocate_memory((void **)(&addr), size, dst_node))
+		addr = 0;
+	return addr;
+}
+
+void _starpu_mpi_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags)
+{
+	(void) flags;
+	(void) size;
+	_starpu_mpi_source_free_memory((void*) addr, dst_node);
+}

+ 12 - 1
src/drivers/mpi/driver_mpi_source.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2016,2017                                Inria
- * Copyright (C) 2017                                     CNRS
+ * Copyright (C) 2017,2019                                CNRS
  * Copyright (C) 2017                                     Université de Bordeaux
  * Copyright (C) 2015                                     Mathieu Lirzin
  *
@@ -48,6 +48,17 @@ int _starpu_mpi_copy_mpi_to_ram_async(void *src, unsigned src_node, void *dst, u
 int _starpu_mpi_copy_ram_to_mpi_async(void *src, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst, unsigned dst_node, size_t size, void * event);
 int _starpu_mpi_copy_sink_to_sink_async(void *src, unsigned src_node, void *dst, unsigned dst_node, size_t size, void * event);
 
+int _starpu_mpi_copy_data_from_mpi_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_mpi_copy_data_from_mpi_to_mpi(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_mpi_copy_data_from_cpu_to_mpi(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+
+int _starpu_mpi_copy_interface_from_mpi_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_mpi_copy_interface_from_mpi_to_mpi(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_mpi_copy_interface_from_cpu_to_mpi(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+
+int _starpu_mpi_direct_access_supported(unsigned node, unsigned handling_node);
+uintptr_t _starpu_mpi_malloc_on_node(unsigned dst_node, size_t size, int flags);
+void _starpu_mpi_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags);
 
 starpu_mpi_ms_kernel_t _starpu_mpi_ms_src_get_kernel_from_codelet(struct starpu_codelet *cl, unsigned nimpl);
 void(* _starpu_mpi_ms_src_get_kernel_from_job(const struct _starpu_mp_node *node STARPU_ATTRIBUTE_UNUSED, struct _starpu_job *j))(void);

+ 248 - 24
src/drivers/opencl/driver_opencl.c

@@ -1,9 +1,9 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2011,2012,2014-2017,2019                 Inria
- * Copyright (C) 2010-2018                                Université de Bordeaux
+ * Copyright (C) 2010-2019                                Université de Bordeaux
  * Copyright (C) 2010                                     Mehdi Juhoor
- * Copyright (C) 2010-2017                                CNRS
+ * Copyright (C) 2010-2017,2019                           CNRS
  * Copyright (C) 2013                                     Thibaut Lambert
  * Copyright (C) 2011                                     Télécom-SudParis
  *
@@ -61,6 +61,7 @@ static cl_event task_events[STARPU_MAXOPENCLDEVS][STARPU_MAX_PIPELINE];
 #endif
 #ifdef STARPU_SIMGRID
 static unsigned task_finished[STARPU_MAXOPENCLDEVS][STARPU_MAX_PIPELINE];
+static starpu_pthread_mutex_t opencl_alloc_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
 #endif /* STARPU_SIMGRID */
 
 #define _STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err) do { if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err); } while(0)
@@ -416,36 +417,34 @@ cl_int starpu_opencl_copy_async_sync(uintptr_t src, size_t src_offset, unsigned
 	cl_int err;
 	int ret;
 
-	switch (_STARPU_MEMORY_NODE_TUPLE(src_kind,dst_kind))
+	if (src_kind == STARPU_OPENCL_RAM && dst_kind == STARPU_CPU_RAM)
 	{
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_CPU_RAM):
-		err = starpu_opencl_copy_opencl_to_ram(
-				(cl_mem) src, src_node,
-				(void*) (dst + dst_offset), dst_node,
-				size, src_offset, event, &ret);
+		err = starpu_opencl_copy_opencl_to_ram((cl_mem) src, src_node,
+						       (void*) (dst + dst_offset), dst_node,
+						       size, src_offset, event, &ret);
 		_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
 		return ret;
+	}
 
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_OPENCL_RAM):
-		err = starpu_opencl_copy_ram_to_opencl(
-				(void*) (src + src_offset), src_node,
-				(cl_mem) dst, dst_node,
-				size, dst_offset, event, &ret);
+	if (src_kind == STARPU_CPU_RAM && dst_kind == STARPU_OPENCL_RAM)
+	{
+		err = starpu_opencl_copy_ram_to_opencl((void*) (src + src_offset), src_node,
+						       (cl_mem) dst, dst_node,
+						       size, dst_offset, event, &ret);
 		_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
 		return ret;
+	}
 
-	case _STARPU_MEMORY_NODE_TUPLE(STARPU_OPENCL_RAM,STARPU_OPENCL_RAM):
-		err = starpu_opencl_copy_opencl_to_opencl(
-				(cl_mem) src, src_node, src_offset,
-				(cl_mem) dst, dst_node, dst_offset,
-				size, event, &ret);
+	if (src_kind == STARPU_OPENCL_RAM && (dst_kind == STARPU_CPU_RAM || dst_kind == STARPU_OPENCL_RAM))
+	{
+		err = starpu_opencl_copy_opencl_to_opencl((cl_mem) src, src_node, src_offset,
+							  (cl_mem) dst, dst_node, dst_offset,
+							  size, event, &ret);
 		_STARPU_OPENCL_CHECK_AND_REPORT_ERROR(err);
 		return ret;
-
-	default:
-		STARPU_ABORT();
-		break;
 	}
+
+	STARPU_ABORT();
 }
 
 #if 0
@@ -642,8 +641,8 @@ int _starpu_opencl_driver_init(struct _starpu_worker *worker)
 	const char *devname = "Simgrid";
 #else
 	/* get the device's name */
-	char devname[128];
-	_starpu_opencl_get_device_name(devid, devname, 128);
+	char devname[64];
+	_starpu_opencl_get_device_name(devid, devname, 64);
 #endif
 	snprintf(worker->name, sizeof(worker->name), "OpenCL %d (%s %.1f GiB)", devid, devname, size);
 	snprintf(worker->short_name, sizeof(worker->short_name), "OpenCL %d", devid);
@@ -1096,6 +1095,37 @@ static void _starpu_opencl_execute_job(struct starpu_task *task, struct _starpu_
 }
 
 #ifdef STARPU_USE_OPENCL
+unsigned _starpu_opencl_test_request_completion(struct _starpu_async_channel *async_channel)
+{
+	cl_int event_status;
+	cl_event opencl_event = (*async_channel).event.opencl_event;
+	if (opencl_event == NULL) STARPU_ABORT();
+	cl_int err = clGetEventInfo(opencl_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(event_status), &event_status, NULL);
+	if (STARPU_UNLIKELY(err != CL_SUCCESS))
+		STARPU_OPENCL_REPORT_ERROR(err);
+	if (event_status < 0)
+		STARPU_OPENCL_REPORT_ERROR(event_status);
+	if (event_status == CL_COMPLETE)
+	{
+		err = clReleaseEvent(opencl_event);
+		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
+	}
+	return (event_status == CL_COMPLETE);
+}
+
+void _starpu_opencl_wait_request_completion(struct _starpu_async_channel *async_channel)
+{
+	cl_int err;
+	if ((*async_channel).event.opencl_event == NULL)
+		STARPU_ABORT();
+	err = clWaitForEvents(1, &((*async_channel).event.opencl_event));
+	if (STARPU_UNLIKELY(err != CL_SUCCESS))
+		STARPU_OPENCL_REPORT_ERROR(err);
+	err = clReleaseEvent((*async_channel).event.opencl_event);
+	if (STARPU_UNLIKELY(err != CL_SUCCESS))
+		STARPU_OPENCL_REPORT_ERROR(err);
+}
+
 int _starpu_run_opencl(struct _starpu_worker *workerarg)
 {
 	_STARPU_DEBUG("Running OpenCL %u from the application\n", workerarg->devid);
@@ -1117,4 +1147,198 @@ struct _starpu_driver_ops _starpu_driver_opencl_ops =
 	.deinit = _starpu_opencl_driver_deinit
 };
 
+int _starpu_opencl_copy_data_from_opencl_to_opencl(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_OPENCL_RAM && dst_kind == STARPU_OPENCL_RAM);
+
+	int ret = 1;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+	/* STARPU_OPENCL_RAM -> STARPU_OPENCL_RAM */
+	STARPU_ASSERT(starpu_worker_get_local_memory_node() == dst_node || starpu_worker_get_local_memory_node() == src_node);
+	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() || !(copy_methods->opencl_to_opencl_async || copy_methods->any_to_any))
+	{
+		STARPU_ASSERT(copy_methods->opencl_to_opencl || copy_methods->any_to_any);
+		/* this is not associated to a request so it's synchronous */
+		if (copy_methods->opencl_to_opencl)
+			copy_methods->opencl_to_opencl(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	}
+	else
+	{
+		req->async_channel.type = STARPU_OPENCL_RAM;
+		if (copy_methods->opencl_to_opencl_async)
+			ret = copy_methods->opencl_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
+		else
+		{
+			STARPU_ASSERT(copy_methods->any_to_any);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		}
+	}
+	return ret;
+}
+
+int _starpu_opencl_copy_data_from_opencl_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_OPENCL_RAM && dst_kind == STARPU_CPU_RAM);
+
+	int ret = 1;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+	/* OpenCL -> RAM */
+	STARPU_ASSERT(starpu_worker_get_local_memory_node() == src_node);
+	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() || !(copy_methods->opencl_to_ram_async || copy_methods->any_to_any))
+	{
+		STARPU_ASSERT(copy_methods->opencl_to_ram || copy_methods->any_to_any);
+		/* this is not associated to a request so it's synchronous */
+		if (copy_methods->opencl_to_ram)
+			copy_methods->opencl_to_ram(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	}
+	else
+	{
+		req->async_channel.type = STARPU_OPENCL_RAM;
+		if (copy_methods->opencl_to_ram_async)
+			ret = copy_methods->opencl_to_ram_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
+		else
+		{
+			STARPU_ASSERT(copy_methods->any_to_any);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		}
+	}
+	return ret;
+}
+
+int _starpu_opencl_copy_data_from_cpu_to_opencl(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_OPENCL_RAM);
+
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+	/* STARPU_CPU_RAM -> STARPU_OPENCL_RAM */
+	STARPU_ASSERT(starpu_worker_get_local_memory_node() == dst_node);
+	if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_opencl_copy_disabled() || !(copy_methods->ram_to_opencl_async || copy_methods->any_to_any))
+	{
+		STARPU_ASSERT(copy_methods->ram_to_opencl || copy_methods->any_to_any);
+		/* this is not associated to a request so it's synchronous */
+		if (copy_methods->ram_to_opencl)
+			copy_methods->ram_to_opencl(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	}
+	else
+	{
+		req->async_channel.type = STARPU_OPENCL_RAM;
+		if (copy_methods->ram_to_opencl_async)
+			ret = copy_methods->ram_to_opencl_async(src_interface, src_node, dst_interface, dst_node, &(req->async_channel.event.opencl_event));
+		else
+		{
+			STARPU_ASSERT(copy_methods->any_to_any);
+			ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+		}
+	}
+	return ret;
+}
+
+int _starpu_opencl_copy_interface_from_opencl_to_opencl(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_OPENCL_RAM && dst_kind == STARPU_OPENCL_RAM);
+
+	return starpu_opencl_copy_async_sync(src, src_offset, src_node,
+					     dst, dst_offset, dst_node,
+					     size,
+					     &async_channel->event.opencl_event);
+}
+
+int _starpu_opencl_copy_interface_from_opencl_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_OPENCL_RAM && dst_kind == STARPU_CPU_RAM);
+
+	return starpu_opencl_copy_async_sync(src, src_offset, src_node,
+					     dst, dst_offset, dst_node,
+					     size,
+					     &async_channel->event.opencl_event);
+}
+
+int _starpu_opencl_copy_interface_from_cpu_to_opencl(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_OPENCL_RAM);
+
+	return starpu_opencl_copy_async_sync(src, src_offset, src_node,
+					     dst, dst_offset, dst_node,
+					     size,
+					     &async_channel->event.opencl_event);
+}
+
 #endif /* STARPU_USE_OPENCL */
+
+int _starpu_opencl_direct_access_supported(unsigned node, unsigned handling_node)
+{
+	(void)node;
+	(void)handling_node;
+	return 0;
+}
+
+uintptr_t _starpu_opencl_malloc_on_node(unsigned dst_node, size_t size, int flags)
+{
+	(void)flags;
+	uintptr_t addr = 0;
+#ifdef STARPU_SIMGRID
+	static uintptr_t last[STARPU_MAXNODES];
+	/* Sleep for the allocation */
+	STARPU_PTHREAD_MUTEX_LOCK(&opencl_alloc_mutex);
+	if (_starpu_simgrid_cuda_malloc_cost())
+		MSG_process_sleep(0.000175);
+	if (!last[dst_node])
+		last[dst_node] = 1<<10;
+	addr = last[dst_node];
+	last[dst_node]+=size;
+	STARPU_ASSERT(last[dst_node] >= addr);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&opencl_alloc_mutex);
+#else
+	int ret;
+	cl_mem ptr;
+
+	ret = starpu_opencl_allocate_memory(starpu_memory_node_get_devid(dst_node), &ptr, size, CL_MEM_READ_WRITE);
+	if (ret)
+	{
+		addr = 0;
+	}
+	else
+	{
+		addr = (uintptr_t)ptr;
+	}
+#endif
+	return addr;
+}
+
+void _starpu_opencl_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags)
+{
+	(void)flags;
+	(void)size;
+	(void)dst_node;
+#ifdef STARPU_SIMGRID
+	STARPU_PTHREAD_MUTEX_LOCK(&opencl_alloc_mutex);
+	/* Sleep for the free */
+	if (_starpu_simgrid_cuda_malloc_cost())
+		MSG_process_sleep(0.000750);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&opencl_alloc_mutex);
+#else
+	cl_int err;
+	err = clReleaseMemObject((void*)addr);
+	if (STARPU_UNLIKELY(err != CL_SUCCESS))
+		STARPU_OPENCL_REPORT_ERROR(err);
+#endif
+}

+ 15 - 0
src/drivers/opencl/driver_opencl.h

@@ -70,4 +70,19 @@ cl_int _starpu_opencl_copy_rect_ram_to_opencl(void *ptr, unsigned src_node, cl_m
                                               size_t host_row_pitch, size_t host_slice_pitch, cl_event *event);
 #endif
 
+unsigned _starpu_opencl_test_request_completion(struct _starpu_async_channel *async_channel);
+void _starpu_opencl_wait_request_completion(struct _starpu_async_channel *async_channel);
+
+int _starpu_opencl_copy_data_from_opencl_to_opencl(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_opencl_copy_data_from_opencl_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_opencl_copy_data_from_cpu_to_opencl(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+
+int _starpu_opencl_copy_interface_from_opencl_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_opencl_copy_interface_from_opencl_to_opencl(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_opencl_copy_interface_from_cpu_to_opencl(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+
+int _starpu_opencl_direct_access_supported(unsigned node, unsigned handling_node);
+uintptr_t _starpu_opencl_malloc_on_node(unsigned dst_node, size_t size, int flags);
+void _starpu_opencl_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags);
+
 #endif //  __DRIVER_OPENCL_H__

+ 2 - 2
src/drivers/opencl/driver_opencl_utils.c

@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2010-2018                                Université de Bordeaux
  * Copyright (C) 2011,2012,2016                           Inria
- * Copyright (C) 2010-2018                                CNRS
+ * Copyright (C) 2010-2019                                CNRS
  *
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
@@ -335,7 +335,7 @@ int _starpu_opencl_compile_or_load_opencl_from_string(const char *opencl_program
 		}
 		else
 		{
-			char binary_file_name[1024];
+			char binary_file_name[2048];
 			char *binary;
 			size_t binary_len;
 			FILE *fh;

+ 1 - 1
src/drivers/scc/driver_scc_common.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2012,2016,2017                           Inria
- * Copyright (C) 2015,2017                                CNRS
+ * Copyright (C) 2015,2017,2019                           CNRS
  * Copyright (C) 2013,2017                                Université de Bordeaux
  * Copyright (C) 2013                                     Thibaut Lambert
  *

+ 1 - 1
src/drivers/scc/driver_scc_common.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  * Copyright (C) 2012,2016,2017                           Inria
- * Copyright (C) 2015,2017                                CNRS
+ * Copyright (C) 2015,2017,2019                           CNRS
  * Copyright (C) 2013                                     Université de Bordeaux
  * Copyright (C) 2013                                     Thibaut Lambert
  *

+ 103 - 0
src/drivers/scc/driver_scc_source.c

@@ -325,3 +325,106 @@ void *_starpu_scc_src_worker(void *arg)
 
 	return NULL;
 }
+
+int _starpu_scc_copy_data_from_scc_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	(void) req;
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_SCC_RAM && dst_kind == STARPU_CPU_RAM);
+
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+	if (copy_methods->scc_sink_to_src)
+		ret = copy_methods->scc_sink_to_src(src_interface, src_node, dst_interface, dst_node);
+	else
+		ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	return ret;
+}
+
+int _starpu_scc_copy_data_from_scc_to_scc(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	(void) req;
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_SCC_RAM && dst_kind == STARPU_SCC_RAM);
+
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+	if (copy_methods->scc_sink_to_sink)
+		ret = copy_methods->scc_sink_to_sink(src_interface, src_node, dst_interface, dst_node);
+	else
+		ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	return ret;
+}
+
+int _starpu_scc_copy_data_from_cpu_to_scc(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req)
+{
+	(void) req;
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_SCC_RAM);
+
+	int ret = 0;
+	const struct starpu_data_copy_methods *copy_methods = handle->ops->copy_methods;
+	if (copy_methods->scc_src_to_sink)
+		ret = copy_methods->scc_src_to_sink(src_interface, src_node, dst_interface, dst_node);
+	else
+		ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+	return ret;
+}
+
+int _starpu_scc_copy_interface_from_scc_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_SCC_RAM && dst_kind == STARPU_CPU_RAM);
+
+	return _starpu_scc_copy_sink_to_src((void*) (src + src_offset), src_node,
+					    (void*) (dst + dst_offset), dst_node,
+					    size);
+}
+
+int _starpu_scc_copy_interface_from_scc_to_scc(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_SCC_RAM && dst_kind == STARPU_SCC_RAM);
+
+	return _starpu_scc_copy_sink_to_sink((void*) (src + src_offset), src_node,
+					     (void*) (dst + dst_offset), dst_node,
+					     size);
+}
+
+int _starpu_scc_copy_interface_from_cpu_to_scc(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel)
+{
+	int src_kind = starpu_node_get_kind(src_node);
+	int dst_kind = starpu_node_get_kind(dst_node);
+	STARPU_ASSERT(src_kind == STARPU_CPU_RAM && dst_kind == STARPU_SCC_RAM);
+
+	return _starpu_scc_copy_src_to_sink((void*) (src + src_offset), src_node,
+					    (void*) (dst + dst_offset), dst_node,
+					    size);
+}
+
+int _starpu_scc_direct_access_supported(unsigned node, unsigned handling_node)
+{
+	(void) node;
+	(void) handling_node;
+	return 1;
+}
+
+uintptr_t _starpu_scc_malloc_on_node(unsigned dst_node, size_t size, int flags)
+{
+	(void) flags;
+	uintptr_t addr = 0;
+	if (_starpu_scc_allocate_memory((void **)(&addr), size, dst_node))
+		addr = 0;
+	return addr;
+}
+
+void _starpu_scc_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags)
+{
+	(void) flags;
+	_starpu_scc_free_memory((void *) addr, dst_node);
+}

+ 12 - 0
src/drivers/scc/driver_scc_source.h

@@ -52,6 +52,18 @@ int _starpu_scc_copy_sink_to_sink(void *src, unsigned src_node, void *dst, unsig
 
 void *_starpu_scc_src_worker(void *arg);
 
+int _starpu_scc_copy_data_from_scc_to_cpu(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_scc_copy_data_from_scc_to_scc(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+int _starpu_scc_copy_data_from_cpu_to_scc(starpu_data_handle_t handle, void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, struct _starpu_data_request *req);
+
+int _starpu_scc_copy_interface_from_scc_to_cpu(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_scc_copy_interface_from_scc_to_scc(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+int _starpu_scc_copy_interface_from_cpu_to_scc(uintptr_t src, size_t src_offset, unsigned src_node, uintptr_t dst, size_t dst_offset, unsigned dst_node, size_t size, struct _starpu_async_channel *async_channel);
+
+int _starpu_scc_direct_access_supported(unsigned node, unsigned handling_node);
+uintptr_t _starpu_scc_malloc_on_node(unsigned dst_node, size_t size, int flags);
+void _starpu_scc_free_on_node(unsigned dst_node, uintptr_t addr, size_t size, int flags);
+
 #endif /* STARPU_USE_SCC */