8 years ago · cbee1918f5
--- a/configure.ac
+++ b/configure.ac
@@ -562,6 +562,23 @@ AM_CONDITIONAL([STARPU_USE_MIC], [test "x$enable_mic" = "xyes"])
 
																 ###############################################################################
															
 
																+###############################################################################
															
 
																+#                                                                             #
															
 
																+#                           NUMA memory nodes                                 #
															
 
																+#                                                                             #
															
 
																+###############################################################################
															
 
																+
															
 
																+AC_MSG_CHECKING(maximum number of NUMA nodes)
															
 
																+AC_ARG_ENABLE(maxnumanodes, [AS_HELP_STRING([--enable-maxnumanodes=<number>],
															
 
																+			[maximum number of NUMA nodes])],
															
 
																+			nmaxnumanodes=$enableval, nmaxnumanodes=2)
															
 
																+AC_MSG_RESULT($nmaxnumanodes)
															
 
																+AC_DEFINE_UNQUOTED(STARPU_MAXNUMANODES, [$nmaxnumanodes],
															
 
																+		[maximum number of NUMA nodes])
															
 
																+
															
 
																+
															
 
																+###############################################################################
															
 
																+
															
 
																 AC_PATH_PROGS([STARPU_MS_LIB], [lib])
															
 
																 AC_ARG_VAR([STARPU_MS_LIB], [Path to Microsoft's Visual Studio `lib' tool])
															
 
																 AM_CONDITIONAL([STARPU_HAVE_MS_LIB], [test "x$STARPU_MS_LIB" != "x"])
															
@@ -2138,8 +2155,8 @@ if test x$maxnodes = x0 ; then
 
																 	else
															
 
																 		# We have one memory node shared by all CPU workers, one node per GPU
															
 
																 		# and per MIC device
															
 
																-		# we add nodes to use 3 memory disks
															
 
																-		nodes=4
															
 
																+		# we add nodes to use 2 memory disks
															
 
																+		nodes=`expr $nmaxnumanodes + 2`
															
 
																 		if test x$enable_cuda = xyes ; then
															
 
																 			# we could have used nmaxcudadev + 1, but this would certainly give an
															
 
																 			# odd number.
															
--- a/doc/doxygen/chapters/210_check_list_performance.doxy
+++ b/doc/doxygen/chapters/210_check_list_performance.doxy
@@ -197,9 +197,13 @@ structures of StarPU by describing the shape of your machine and/or your
 
																 application at the configure step.
															
 
																 To reduce the memory footprint of the data internal structures of StarPU, one
															
 
																-can set the \ref enable-maxcpus "--enable-maxcpus", \ref enable-maxcudadev
															
 
																-"--enable-maxcudadev", \ref enable-maxopencldev "--enable-maxopencldev" and
															
 
																-\ref enable-maxnodes "--enable-maxnodes" configure parameters to give StarPU
															
 
																+can set the
															
 
																+\ref enable-maxcpus "--enable-maxcpus",
															
 
																+\ref enable-maxnumanodes "--enable-maxnumanodes",
															
 
																+\ref enable-maxcudadev "--enable-maxcudadev",
															
 
																+\ref enable-maxopencldev "--enable-maxopencldev" and
															
 
																+\ref enable-maxnodes "--enable-maxnodes"
															
 
																+configure parameters to give StarPU
															
 
																 the architecture of the machine it will run on, thus tuning the size of the
															
 
																 structures to the machine.
															
--- a/doc/doxygen/chapters/501_environment_variables.doxy
+++ b/doc/doxygen/chapters/501_environment_variables.doxy
@@ -845,6 +845,14 @@ available to the application in the main CPU memory. Setting it enables allocati
 
																 cache in main memory. Setting it to zero lets StarPU overflow memory.
															
 
																 </dd>
															
 
																+<dt>STARPU_LIMIT_CPU_NUMA_devid_MEM</dt>
															
 
																+<dd>
															
 
																+\anchor STARPU_LIMIT_CPU_NUMA_devid_MEM
															
 
																+\addindex __env__STARPU_LIMIT_CPU_NUMA_devid_MEM
															
 
																+This variable specifies the maximum number of megabytes that should be
															
 
																+available to the application on the NUMA node with the OS identifier <c>devid</c>.
															
 
																+</dd>
															
 
																+
															
 
																 <dt>STARPU_MINIMUM_AVAILABLE_MEM</dt>
															
 
																 <dd>
															
 
																 \anchor STARPU_MINIMUM_AVAILABLE_MEM
															
@@ -1110,6 +1118,19 @@ implements an advanced but centralized management of concurrent data
 
																 accesses (see \ref ConcurrentDataAccess).
															
 
																 </dd>
															
 
																+<dt>STARPU_USE_NUMA</dt>
															
 
																+<dd>
															
 
																+\anchor STARPU_USE_NUMA 
															
 
																+\addindex __env__STARPU_USE_NUMA
															
 
																+When set to a non-zero value, NUMA nodes are taken into account by StarPU. Otherwise, memory
															
 
																+is considered as only one node. This is experimental for now.
															
 
																+
															
 
																+When enabled, STARPU_MAIN_RAM refers to the NUMA node associated to the
															
 
																+first CPU worker if it exists, the NUMA node associated to the first GPU discovered otherwise.
															
 
																+If StarPU doesn't find any NUMA node after these steps, STARPU_MAIN_RAM is the first NUMA node
															
 
																+discovered by StarPU.
															
 
																+</dd>
															
 
																+
															
 
																 </dl>
															
 
																 \section ConfiguringTheHypervisor Configuring The Hypervisor
															
--- a/doc/doxygen/chapters/510_configure_options.doxy
+++ b/doc/doxygen/chapters/510_configure_options.doxy
@@ -130,6 +130,14 @@ Use at most <c>count</c> CPU cores.  This information is then
 
																 available as the macro ::STARPU_MAXCPUS.
															
 
																 </dd>
															
 
																+<dt>--enable-maxnumanodes=<c>count</c></dt>
															
 
																+<dd>
															
 
																+\anchor enable-maxnumanodes
															
 
																+\addindex __configure__--enable-maxnumanodes
															
 
																+Use at most <c>count</c> NUMA nodes.  This information is then
															
 
																+available as the macro ::STARPU_MAXNUMANODES.
															
 
																+</dd>
															
 
																+
															
 
																 <dt>--disable-cpu</dt>
															
 
																 <dd>
															
 
																 \anchor disable-cpu
															
--- a/doc/doxygen/chapters/api/data_management.doxy
+++ b/doc/doxygen/chapters/api/data_management.doxy
@@ -2,7 +2,7 @@
 
																  * This file is part of the StarPU Handbook.
															
 
																  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
															
 
																  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
															
 
																- * Copyright (C) 2011, 2012 INRIA
															
 
																+ * Copyright (C) 2011, 2012, 2017  INRIA
															
 
																  * See the file version.doxy for copying conditions.
															
 
																  */
															
@@ -104,6 +104,10 @@ data to StarPU, the specified memory node indicates where the piece of
 
																 data initially resides (we also call this memory node the home node of
															
 
																 a piece of data).
															
 
																+In the case of NUMA systems, functions starpu_memory_nodes_numa_devid_to_id()
															
 
																+and starpu_memory_nodes_numa_id_to_devid() can be used to convert between NUMA node
															
 
																+numbers as seen by the Operating System and NUMA node numbers as seen by StarPU.
															
 
																+
															
 
																 \fn void starpu_data_register(starpu_data_handle_t *handleptr, int home_node, void *data_interface, struct starpu_data_interface_ops *ops)
															
 
																 \ingroup API_Data_Management
															
 
																 Register a piece of data into the handle located at the
															
--- a/doc/doxygen/chapters/api/workers.doxy
+++ b/doc/doxygen/chapters/api/workers.doxy
@@ -250,6 +250,16 @@ Return the type of \p node as defined by
 
																 this function should be used in the allocation function to determine
															
 
																 on which device the memory needs to be allocated.
															
 
																+\fn int starpu_memory_nodes_numa_id_to_devid(int osid)
															
 
																+\ingroup API_Workers_Properties
															
 
																+This function returns the identifier of the memory node associated to the NUMA
															
 
																+node identified by \p osid by the Operating System.
															
 
																+
															
 
																+\fn int starpu_memory_nodes_numa_devid_to_id(unsigned id)
															
 
																+\ingroup API_Workers_Properties
															
 
																+This function returns the Operating System identifier of the memory node
															
 
																+whose StarPU identifier is \p id.
															
 
																+
															
 
																 \fn char *starpu_worker_get_type_as_string(enum starpu_worker_archtype type)
															
 
																 \ingroup API_Workers_Properties
															
 
																 Return worker \p type as a string.
															
--- a/examples/cpp/add_vectors_cpp11.cpp
+++ b/examples/cpp/add_vectors_cpp11.cpp
@@ -2,7 +2,7 @@
 
																  *
															
 
																  * Copyright (C) 2009, 2010-2011, 2013-2015  Université de Bordeaux
															
 
																  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
															
 
																- * Copyright (C) 2012 INRIA
															
 
																+ * Copyright (C) 2012, 2017  INRIA
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -78,6 +78,12 @@ int main(int argc, char **argv)
 
																 		return 77;
															
 
																 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
															
 
																+	if (starpu_memory_nodes_get_numa_count() > 1)
															
 
																+	{
															
 
																+		starpu_shutdown();
															
 
																+		return 77;
															
 
																+	}
															
 
																+
															
 
																 	// StarPU data registering
															
 
																 	starpu_data_handle_t spu_vec_A;
															
 
																 	starpu_data_handle_t spu_vec_B;
															
--- a/include/starpu_config.h.in
+++ b/include/starpu_config.h.in
@@ -88,6 +88,7 @@
 
																 #undef STARPU_MAXNODES
															
 
																 #undef STARPU_NMAXBUFS
															
 
																 #undef STARPU_MAXCPUS
															
 
																+#undef STARPU_MAXNUMANODES
															
 
																 #undef STARPU_MAXCUDADEVS
															
 
																 #undef STARPU_MAXOPENCLDEVS
															
 
																 #undef STARPU_MAXMICDEVS
															
--- a/include/starpu_data.h
+++ b/include/starpu_data.h
@@ -2,7 +2,7 @@
 
																  *
															
 
																  * Copyright (C) 2010-2017  Université de Bordeaux
															
 
																  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
															
 
																- * Copyright (C) 2016  Inria
															
 
																+ * Copyright (C) 2016, 2017  Inria
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -132,6 +132,10 @@ enum starpu_node_kind
 
																 unsigned starpu_worker_get_memory_node(unsigned workerid);
															
 
																 unsigned starpu_memory_nodes_get_count(void);
															
 
																+int starpu_memory_nodes_get_numa_count(void);
															
 
																+int starpu_memory_nodes_numa_id_to_devid(int osid);
															
 
																+int starpu_memory_nodes_numa_devid_to_id(unsigned id);
															
 
																+
															
 
																 enum starpu_node_kind starpu_node_get_kind(unsigned node);
															
 
																 void starpu_data_set_wt_mask(starpu_data_handle_t handle, uint32_t wt_mask);
															
--- a/include/starpu_data_interfaces.h
+++ b/include/starpu_data_interfaces.h
@@ -483,6 +483,8 @@ size_t starpu_data_get_size(starpu_data_handle_t handle);
 
																 starpu_data_handle_t starpu_data_lookup(const void *ptr);
															
 
																+int starpu_data_get_home_node(starpu_data_handle_t handle);
															
 
																+
															
 
																 #ifdef __cplusplus
															
 
																 }
															
 
																 #endif
															
--- a/src/core/disk.c
+++ b/src/core/disk.c
@@ -2,6 +2,7 @@
 
																  *
															
 
																  * Copyright (C) 2013  Corentin Salingue
															
 
																  * Copyright (C) 2015, 2016, 2017  CNRS
															
 
																+ * Copyright (C) 2017  Inria
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -77,16 +78,22 @@ int starpu_disk_register(struct starpu_disk_ops *func, void *parameter, starpu_s
 
																 {
															
 
																 	STARPU_ASSERT_MSG(size < 0 || size >= STARPU_DISK_SIZE_MIN, "Minimum disk size is %d Bytes ! (Here %d) \n", (int) STARPU_DISK_SIZE_MIN, (int) size);
															
 
																 	/* register disk */
															
 
																-	unsigned memory_node = _starpu_memory_node_register(STARPU_DISK_RAM, 0);
															
 
																+	unsigned disk_memnode = _starpu_memory_node_register(STARPU_DISK_RAM, 0);
															
 
																-	_starpu_register_bus(STARPU_MAIN_RAM, memory_node);
															
 
																-	_starpu_register_bus(memory_node, STARPU_MAIN_RAM);
															
 
																+        /* Connect the disk memory node to all numa memory nodes */
															
 
																+        int nb_numa_nodes = starpu_memory_nodes_get_numa_count();
															
 
																+        int numa_node;
															
 
																+        for (numa_node = 0; numa_node < nb_numa_nodes; numa_node++)
															
 
																+        {
															
 
																+                _starpu_register_bus(disk_memnode, numa_node);
															
 
																+                _starpu_register_bus(numa_node, disk_memnode);
															
 
																+        }
															
 
																 	/* connect disk */
															
 
																 	void *base = func->plug(parameter, size);
															
 
																 	/* remember it */
															
 
																-	int n STARPU_ATTRIBUTE_UNUSED = add_disk_in_list(memory_node,func,base);
															
 
																+	int n STARPU_ATTRIBUTE_UNUSED = add_disk_in_list(disk_memnode, func, base);
															
 
																 #ifdef STARPU_SIMGRID
															
 
																 	char name[16];
															
@@ -96,13 +103,13 @@ int starpu_disk_register(struct starpu_disk_ops *func, void *parameter, starpu_s
 
																 	_starpu_simgrid_memory_node_set_host(memory_node, host);
															
 
																 #endif
															
 
																-	int ret = func->bandwidth(memory_node);
															
 
																+	int ret = func->bandwidth(disk_memnode);
															
 
																 	/* have a problem with the disk */
															
 
																 	if (ret == 0)
															
 
																 		return -ENOENT;
															
 
																 	if (size >= 0)
															
 
																-		_starpu_memory_manager_set_global_memory_size(memory_node, size);
															
 
																-	return memory_node;
															
 
																+		_starpu_memory_manager_set_global_memory_size(disk_memnode, size);
															
 
																+	return disk_memnode;
															
 
																 }
															
 
																 void _starpu_disk_unregister(void)
															
--- a/src/core/perfmodel/perfmodel.h
+++ b/src/core/perfmodel/perfmodel.h
@@ -110,6 +110,10 @@ void _starpu_simgrid_get_platform_path(int version, char *path, size_t maxlen);
 
																 void _starpu_perfmodel_realloc(struct starpu_perfmodel *model, int nb);
															
 
																+#if defined(STARPU_HAVE_HWLOC)
															
 
																+hwloc_topology_t _starpu_perfmodel_get_hwtopology();
															
 
																+#endif
															
 
																+
															
 
																 #ifdef __cplusplus
															
 
																 }
															
 
																 #endif
															
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
--- a/src/core/simgrid.c
+++ b/src/core/simgrid.c
@@ -1,8 +1,8 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2012-2017  Université de Bordeaux
															
 
																- * Copyright (C) 2016  	    Inria
															
 
																- * Copyright (C) 2016, 2017  	    CNRS
															
 
																+ * Copyright (C) 2016, 2017  Inria
															
 
																+ * Copyright (C) 2016, 2017  CNRS
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -1039,8 +1039,19 @@ void _starpu_simgrid_count_ngpus(void)
 
																 			ngpus = 0;
															
 
																 			for (src2 = 1; src2 < STARPU_MAXNODES; src2++)
															
 
																 			{
															
 
																-				if (starpu_bus_get_id(src2, STARPU_MAIN_RAM) == -1)
															
 
																+				int numa;
															
 
																+				int nnumas = starpu_memory_nodes_get_numa_count();
															
 
																+				int found = 0;
															
 
																+				for (numa = 0; numa < nnumas; numa++)
															
 
																+					if (starpu_bus_get_id(src2, numa) != -1)
															
 
																+					{
															
 
																+						found = 1;
															
 
																+						break;
															
 
																+					}
															
 
																+					
															
 
																+				if (!found)
															
 
																 					continue;
															
 
																+
															
 
																 				msg_host_t srchost2 = _starpu_simgrid_get_memnode_host(src2);
															
 
																 				int routesize2;
															
 
																 #ifdef HAVE_SG_HOST_ROUTE
															
--- a/src/core/topology.c
+++ b/src/core/topology.c
@@ -30,6 +30,7 @@
 
																 #include <drivers/mpi/driver_mpi_common.h>
															
 
																 #include <drivers/mp_common/source_common.h>
															
 
																 #include <drivers/opencl/driver_opencl.h>
															
 
																+#include <drivers/opencl/driver_opencl_utils.h>
															
 
																 #include <profiling/profiling.h>
															
 
																 #include <datawizard/datastats.h>
															
 
																 #include <datawizard/memory_nodes.h>
															
@@ -54,11 +55,23 @@
 
																 #include <hwloc/cuda.h>
															
 
																 #endif
															
 
																+#if defined(STARPU_USE_OPENCL)
															
 
																+#include <hwloc/opencl.h>
															
 
																+#endif
															
 
																+
															
 
																 static unsigned topology_is_initialized = 0;
															
 
																 static int nobind;
															
 
																 /* For checking whether two workers share the same PU, indexed by PU number */
															
 
																 static int cpu_worker[STARPU_MAXCPUS];
															
 
																+static unsigned nb_numa_nodes = 0;
															
 
																+static int numa_memory_nodes_to_hwloclogid[STARPU_MAXNUMANODES]; /* indexed by StarPU numa node to convert in hwloc logid */
															
 
																+static int numa_memory_nodes_to_physicalid[STARPU_MAXNUMANODES]; /* indexed by StarPU numa node to convert in physical id */
															
 
																+static unsigned numa_bus_id[STARPU_MAXNUMANODES*STARPU_MAXNUMANODES];
															
 
																+static int _starpu_get_logical_numa_node_worker(unsigned workerid);
															
 
																+
															
 
																+#define STARPU_NUMA_UNINITIALIZED (-2)
															
 
																+#define STARPU_NUMA_MAIN_RAM (-1)
															
 
																 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_SCC) || defined(STARPU_SIMGRID) || defined(STARPU_USE_MPI_MASTER_SLAVE)
															
@@ -87,6 +100,124 @@ static struct _starpu_worker_set mic_worker_set[STARPU_MAXMICDEVS];
 
																 struct _starpu_worker_set mpi_worker_set[STARPU_MAXMPIDEVS];
															
 
																 #endif
															
 
																+int starpu_memory_nodes_get_numa_count(void)
															
 
																+{
															
 
																+	return nb_numa_nodes;
															
 
																+}
															
 
																+
															
 
																+#if defined(STARPU_HAVE_HWLOC)
															
 
																+static int numa_get_logical_id(hwloc_obj_t obj)
															
 
																+{
															
 
																+	STARPU_ASSERT(obj);
															
 
																+	while (obj->type != HWLOC_OBJ_NODE)
															
 
																+	{
															
 
																+		obj = obj->parent;
															
 
																+
															
 
																+		/* If we don't find a "node" obj before the root, this means
															
 
																+		 * hwloc does not know whether there are numa nodes or not, so
															
 
																+		 * we should not use a per-node sampling in that case. */
															
 
																+		if (!obj)
															
 
																+			return STARPU_NUMA_MAIN_RAM;
															
 
																+	}
															
 
																+	return obj->logical_index;
															
 
																+}
															
 
																+
															
 
																+static int numa_get_physical_id(hwloc_obj_t obj)
															
 
																+{
															
 
																+	STARPU_ASSERT(obj);
															
 
																+	while (obj->type != HWLOC_OBJ_NODE)
															
 
																+	{
															
 
																+		obj = obj->parent;
															
 
																+
															
 
																+		/* If we don't find a "node" obj before the root, this means
															
 
																+		 * hwloc does not know whether there are numa nodes or not, so
															
 
																+		 * we should not use a per-node sampling in that case. */
															
 
																+		if (!obj)
															
 
																+			return STARPU_NUMA_MAIN_RAM;
															
 
																+	}
															
 
																+	return obj->os_index;
															
 
																+}
															
 
																+#endif
															
 
																+
															
 
																+static int _starpu_get_logical_numa_node_worker(unsigned workerid)
															
 
																+{
															
 
																+#if defined(STARPU_HAVE_HWLOC)
															
 
																+	char * state;
															
 
																+	if ((state = starpu_getenv("STARPU_USE_NUMA")) && atoi(state))
															
 
																+	{
															
 
																+		struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
															
 
																+		struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config() ;
															
 
																+		struct _starpu_machine_topology *topology = &config->topology ;
															
 
																+
															
 
																+		hwloc_obj_t obj;
															
 
																+		switch(worker->arch) 	
															
 
																+		{
															
 
																+			case STARPU_CPU_WORKER:
															
 
																+				obj = hwloc_get_obj_by_type(topology->hwtopology, HWLOC_OBJ_PU, worker->bindid) ;
															
 
																+				break;
															
 
																+			default:
															
 
																+				STARPU_ABORT();
															
 
																+		}
															
 
																+
															
 
																+		return numa_get_logical_id(obj);
															
 
																+	}
															
 
																+	else		
															
 
																+#endif 
															
 
																+	{
															
 
																+		(void) workerid; /* unused */
															
 
																+		return STARPU_NUMA_MAIN_RAM;
															
 
																+	}
															
 
																+}
															
 
																+
															
 
																+static int _starpu_get_physical_numa_node_worker(unsigned workerid)
															
 
																+{
															
 
																+#if defined(STARPU_HAVE_HWLOC)
															
 
																+	char * state;
															
 
																+	if ((state = starpu_getenv("STARPU_USE_NUMA")) && atoi(state))
															
 
																+	{
															
 
																+		struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
															
 
																+		struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config() ;
															
 
																+		struct _starpu_machine_topology *topology = &config->topology ;
															
 
																+
															
 
																+		hwloc_obj_t obj;
															
 
																+		switch(worker->arch) 	
															
 
																+		{
															
 
																+			case STARPU_CPU_WORKER:
															
 
																+				obj = hwloc_get_obj_by_type(topology->hwtopology, HWLOC_OBJ_PU, worker->bindid) ;
															
 
																+				break;
															
 
																+			default:
															
 
																+				STARPU_ABORT();
															
 
																+		}
															
 
																+
															
 
																+		return numa_get_physical_id(obj);
															
 
																+	}
															
 
																+	else		
															
 
																+#endif 
															
 
																+	{
															
 
																+		(void) workerid; /* unused */
															
 
																+		return STARPU_NUMA_MAIN_RAM;
															
 
																+	}
															
 
																+}
															
 
																+
															
 
																+static int _starpu_numa_get_logical_id_from_pu(int pu)
															
 
																+{
															
 
																+#if defined(STARPU_HAVE_HWLOC)
															
 
																+	if (nb_numa_nodes > 1)
															
 
																+	{
															
 
																+		struct _starpu_machine_config *config = _starpu_get_machine_config();
															
 
																+		struct _starpu_machine_topology *topology = &config->topology;
															
 
																+
															
 
																+		hwloc_obj_t obj = hwloc_get_obj_by_type(topology->hwtopology, HWLOC_OBJ_PU, pu);
															
 
																+		return numa_get_logical_id(obj);
															
 
																+	}
															
 
																+	else
															
 
																+#endif
															
 
																+	{
															
 
																+		return -1;
															
 
																+	}
															
 
																+}
															
 
																+
															
 
																+
															
 
																 struct _starpu_worker *_starpu_get_worker_from_driver(struct starpu_driver *d)
															
 
																 {
															
 
																 	unsigned nworkers = starpu_worker_get_count();
															
@@ -846,6 +977,67 @@ _starpu_topology_get_nhwpu (struct _starpu_machine_config *config)
 
																 	return config->topology.nhwpus;
															
 
																 }
															
 
																+/* Return the number of NUMA nodes StarPU should consider.
																+ *
																+ * The OpenCL/CUDA drivers (when compiled in) and the topology are
																+ * initialised first. The result is 1 unless hwloc is available and the
																+ * STARPU_USE_NUMA environment variable parses (atoi) to a non-zero value, in
																+ * which case the hwloc NUMA node count is used (still at least 1).
																+ * Asserts that the result does not exceed STARPU_MAXNUMANODES. */
																+unsigned _starpu_topology_get_nnumanodes(struct _starpu_machine_config *config STARPU_ATTRIBUTE_UNUSED)
																+{
																+#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
																+	_starpu_opencl_init();
																+#endif
																+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
																+	_starpu_init_cuda();
																+#endif
																+	_starpu_init_topology(config);
																+
																+	/* Default: a single memory node */
																+	int count = 1;
																+#if defined(STARPU_HAVE_HWLOC)
																+	char *use_numa = starpu_getenv("STARPU_USE_NUMA");
																+	if (use_numa && atoi(use_numa))
																+	{
																+		int detected = hwloc_get_nbobjs_by_type(config->topology.hwtopology, HWLOC_OBJ_NODE);
																+		/* Keep the default when hwloc reports no NUMA node object */
																+		if (detected > 0)
																+			count = detected;
																+	}
																+#endif
																+
																+	STARPU_ASSERT_MSG(count <= STARPU_MAXNUMANODES, "Number of NUMA nodes discovered is higher than maximum accepted ! Use configure option --enable-maxnumanodes=xxx to increase the maximum value of supported NUMA nodes.\n");
																+	return count;
																+}
															
 
																+
															
 
																+//TODO: change this into an array lookup
															
 
																+/* Linear search of numa_memory_nodes_to_hwloclogid[] over the first
																+ * nb_numa_nodes entries: return the index whose stored hwloc logical id
																+ * equals `logid`, or -1 when none matches. */
																+int starpu_memory_nodes_numa_hwloclogid_to_id(int logid)
																+{
																+	unsigned id = 0;
																+	while (id < nb_numa_nodes)
																+	{
																+		if (numa_memory_nodes_to_hwloclogid[id] == logid)
																+			return id;
																+		id++;
																+	}
																+	/* Not registered */
																+	return -1;
																+}
															
 
																+
															
 
																+/* Return the hwloc logical id stored for entry `id` of
																+ * numa_memory_nodes_to_hwloclogid[]. Asserts id < STARPU_MAXNUMANODES. */
																+int starpu_memory_nodes_numa_id_to_hwloclogid(unsigned id)
																+{
																+	STARPU_ASSERT(id < STARPU_MAXNUMANODES);
																+
																+	int logid = numa_memory_nodes_to_hwloclogid[id];
																+	return logid;
																+}
															
 
																+
															
 
																+/* Return the physical (OS) id stored for entry `id` of
																+ * numa_memory_nodes_to_physicalid[]. Asserts id < STARPU_MAXNUMANODES.
																+ * NOTE(review): despite the "devid_to_id" name, this indexes the table by
																+ * `id` and returns the stored physical id -- confirm the naming against the
																+ * callers. */
																+int starpu_memory_nodes_numa_devid_to_id(unsigned id)
																+{
																+	STARPU_ASSERT(id < STARPU_MAXNUMANODES);
																+
																+	int physid = numa_memory_nodes_to_physicalid[id];
																+	return physid;
																+}
															
 
																+
															
 
																+//TODO: change this into an array lookup
															
 
																+/* Linear search of numa_memory_nodes_to_physicalid[] over the first
																+ * nb_numa_nodes entries: return the index whose stored physical id equals
																+ * `osid`, or -1 when none matches. */
																+int starpu_memory_nodes_numa_id_to_devid(int osid)
																+{
																+	unsigned idx = 0;
																+	while (idx < nb_numa_nodes)
																+	{
																+		if (numa_memory_nodes_to_physicalid[idx] == osid)
																+			return idx;
																+		idx++;
																+	}
																+	/* No entry carries this physical id */
																+	return -1;
																+}
															
 
																+
															
 
																 #ifdef STARPU_HAVE_HWLOC
															
 
																 void _starpu_topology_filter(hwloc_topology_t topology)
															
 
																 {
															
@@ -1751,35 +1943,294 @@ _starpu_bind_thread_on_cpus (
 
																 #endif
															
 
																 }
															
 
																-static void
															
 
																-_starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_config STARPU_ATTRIBUTE_UNUSED)
															
 
																+/* Give every CPU worker of `config` a bind id obtained from
																+ * _starpu_get_next_bindid(config, NULL, 0). Workers of any other
																+ * architecture are left untouched. */
																+static void _starpu_init_binding_cpu(struct _starpu_machine_config *config)
																+{
																+	unsigned w;
																+	for (w = 0; w < config->topology.nworkers; w++)
																+	{
																+		struct _starpu_worker *cur = &config->workers[w];
																+
																+		/* Only CPU workers are handled here */
																+		if (cur->arch != STARPU_CPU_WORKER)
																+			continue;
																+
																+		/* Dedicate a cpu core to that worker */
																+		cur->bindid = _starpu_get_next_bindid(config, NULL, 0);
																+	}
																+}
															
 
																+
															
 
																+//TODO : Check SIMGRID
															
 
																+/* Register the CPU RAM memory nodes and fill the two lookup tables
																+ * numa_memory_nodes_to_hwloclogid[] / numa_memory_nodes_to_physicalid[],
																+ * both indexed by the memory node id returned by
																+ * _starpu_memory_node_register(); nb_numa_nodes counts the registrations.
																+ *
																+ * When the STARPU_USE_NUMA environment variable is non-zero, nodes are
																+ * discovered in this order, stopping at the first step that finds some:
																+ *   1. NUMA nodes of the CPU workers;
																+ *   2. NUMA nodes hwloc attaches to CUDA, then OpenCL, devices
																+ *      (only when built with those drivers and hwloc).
																+ * Otherwise, or if nothing was found, every NUMA node reported by
																+ * _starpu_topology_get_nnumanodes() is registered (capped at
																+ * STARPU_MAXNUMANODES).
																+ *
																+ * Aborts if more distinct nodes than STARPU_MAXNUMANODES are met during
																+ * discovery, and asserts that at least one node ends up registered. */
																+static void _starpu_init_numa_node(struct _starpu_machine_config *config)
																 {
																-	/* launch one thread per CPU */
																-	unsigned ram_memory_node;
																+	nb_numa_nodes = 0;
																+
																+	unsigned i;
																+	for (i = 0; i < STARPU_MAXNUMANODES; i++)
																+	{
																+		numa_memory_nodes_to_hwloclogid[i] = STARPU_NUMA_UNINITIALIZED;
																+		numa_memory_nodes_to_physicalid[i] = STARPU_NUMA_UNINITIALIZED;
																+	}
																-	/* note that even if the CPU cpu are not used, we always have a RAM
																-	 * node */
																-	/* TODO : support NUMA  ;) */
																-	ram_memory_node = _starpu_memory_node_register(STARPU_CPU_RAM, 0);
																-	STARPU_ASSERT(ram_memory_node == STARPU_MAIN_RAM);
																+#ifdef STARPU_SIMGRID
																+	/* Declared at function scope: every registration site below uses them */
																+	char name[16];
																+	msg_host_t host;
																+#endif
																+
																+	char * state;
																+	/* NUMA mode activated */
																+	if ((state = starpu_getenv("STARPU_USE_NUMA")) && atoi(state))
																+	{
																+		/* Take all NUMA nodes used by CPU workers */
																+		unsigned worker;
																+		for (worker = 0; worker < config->topology.nworkers; worker++)
																+		{
																+			struct _starpu_worker *workerarg = &config->workers[worker];
																+			if (workerarg->arch == STARPU_CPU_WORKER)
																+			{
																+				int numa_logical_id = _starpu_get_logical_numa_node_worker(worker);
																+
																+				/* Convert logical id to StarPU id to check if this NUMA node is already saved or not */
																+				int numa_starpu_id = starpu_memory_nodes_numa_hwloclogid_to_id(numa_logical_id);
																+
																+				/* This shouldn't happen */
																+				if (numa_starpu_id == -1 && nb_numa_nodes == STARPU_MAXNUMANODES)
																+				{
																+					_STARPU_MSG("Warning: %u NUMA nodes available. Only %u enabled. Use configure option --enable-maxnumanodes=xxx to update the maximum value of supported NUMA nodes.\n", _starpu_topology_get_nnumanodes(config), STARPU_MAXNUMANODES);
																+					STARPU_ABORT();
																+				}
																+
																+				if (numa_starpu_id == -1)
																+				{
																+					int devid = numa_logical_id == STARPU_NUMA_MAIN_RAM ? 0 : numa_logical_id;
																+					int memnode = _starpu_memory_node_register(STARPU_CPU_RAM, devid);
																+					STARPU_ASSERT_MSG(memnode < STARPU_MAXNUMANODES, "Wrong Memory Node : %d (only %d available)", memnode, STARPU_MAXNUMANODES);
																+					numa_memory_nodes_to_hwloclogid[memnode] = numa_logical_id;
																+					int numa_physical_id = _starpu_get_physical_numa_node_worker(worker);
																+					numa_memory_nodes_to_physicalid[memnode] = numa_physical_id;
																+					nb_numa_nodes++;
																+#ifdef STARPU_SIMGRID
																+					snprintf(name, sizeof(name), "RAM%d", memnode);
																+					host = _starpu_simgrid_get_host_by_name(name);
																+					STARPU_ASSERT(host);
																+					_starpu_simgrid_memory_node_set_host(memnode, host);
																+#endif
																+				}
																+			}
																+		}
																+
																+		/* If we found NUMA nodes from CPU workers, it's good */
																+		if (nb_numa_nodes != 0)
																+			return;
																+
																+		_STARPU_DISP("No NUMA nodes found when checking CPU workers...\n");
																+
																+#if (defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)) && defined(STARPU_HAVE_HWLOC)
																+		_STARPU_DISP("Take NUMA nodes attached to CUDA and OpenCL devices...\n");
																+#endif
																+
																+#if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_HWLOC)
																+		for (i = 0; i < config->topology.ncudagpus; i++)
																+		{
																+			hwloc_obj_t obj = hwloc_cuda_get_device_osdev_by_index(config->topology.hwtopology, i);
																+
																+			/* Hwloc cannot recognize some devices */
																+			if (!obj)
																+				continue;
																+
																+			/* Walk up to the enclosing NUMA node, stopping at the root.
																+			 * The NULL test must live in the loop condition: a `continue`
																+			 * inside the walk would only re-test obj->type on NULL. */
																+			while (obj && obj->type != HWLOC_OBJ_NODE)
																+				obj = obj->parent;
																+
																+			/* If we don't find a "node" obj before the root, this means
																+			 * hwloc does not know whether there are numa nodes or not, so
																+			 * we should not use a per-node sampling in that case. */
																+			if (!obj)
																+				continue;
																+
																+			int numa_starpu_id = starpu_memory_nodes_numa_hwloclogid_to_id(obj->logical_index);
																+
																+			/* This shouldn't happen */
																+			if (numa_starpu_id == -1 && nb_numa_nodes == STARPU_MAXNUMANODES)
																+			{
																+				_STARPU_MSG("Warning: %u NUMA nodes available. Only %u enabled. Use configure option --enable-maxnumanodes=xxx to update the maximum value of supported NUMA nodes.\n", _starpu_topology_get_nnumanodes(config), STARPU_MAXNUMANODES);
																+				STARPU_ABORT();
																+			}
																+
																+			if (numa_starpu_id == -1)
																+			{
																+				int memnode = _starpu_memory_node_register(STARPU_CPU_RAM, obj->logical_index);
																+				STARPU_ASSERT_MSG(memnode < STARPU_MAXNUMANODES, "Wrong Memory Node : %d (only %d available)", memnode, STARPU_MAXNUMANODES);
																+				numa_memory_nodes_to_hwloclogid[memnode] = obj->logical_index;
																+				numa_memory_nodes_to_physicalid[memnode] = obj->os_index;
																+				nb_numa_nodes++;
																+#ifdef STARPU_SIMGRID
																+				snprintf(name, sizeof(name), "RAM%d", memnode);
																+				host = _starpu_simgrid_get_host_by_name(name);
																+				STARPU_ASSERT(host);
																+				_starpu_simgrid_memory_node_set_host(memnode, host);
																+#endif
																+			}
																+		}
																+#endif
																+#if defined(STARPU_USE_OPENCL) && defined(STARPU_HAVE_HWLOC)
																+		if (config->topology.nopenclgpus > 0)
																+		{
																+			cl_int err;
																+			cl_platform_id platform_id[_STARPU_OPENCL_PLATFORM_MAX];
																+			cl_uint nb_platforms;
																+			unsigned platform;
																+			unsigned nb_opencl_devices = 0, num = 0;
																+
																+			err = clGetPlatformIDs(_STARPU_OPENCL_PLATFORM_MAX, platform_id, &nb_platforms);
																+			if (STARPU_UNLIKELY(err != CL_SUCCESS))
																+				nb_platforms=0;
																+
																+			cl_device_type device_type = CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR;
																+			if (starpu_get_env_number("STARPU_OPENCL_ON_CPUS") > 0)
																+				device_type |= CL_DEVICE_TYPE_CPU;
																+			if (starpu_get_env_number("STARPU_OPENCL_ONLY_ON_CPUS") > 0)
																+				device_type = CL_DEVICE_TYPE_CPU;
																+
																+			for (platform = 0; platform < nb_platforms ; platform++)
																+			{
																+				err = clGetDeviceIDs(platform_id[platform], device_type, 0, NULL, &num);
																+				if (err != CL_SUCCESS)
																+					num = 0;
																+				nb_opencl_devices += num;
																+
																+				for (i = 0; i < num; i++)
																+				{
																+					hwloc_obj_t obj = hwloc_opencl_get_device_osdev_by_index(config->topology.hwtopology, platform, i);
																+
																+					/* Hwloc cannot recognize some devices */
																+					if (!obj)
																+						continue;
																+
																+					/* Walk up to the enclosing NUMA node, stopping at the
																+					 * root (NULL test kept in the loop condition so NULL
																+					 * is never dereferenced). */
																+					while (obj && obj->type != HWLOC_OBJ_NODE)
																+						obj = obj->parent;
																+
																+					/* If we don't find a "node" obj before the root, this means
																+					 * hwloc does not know whether there are numa nodes or not, so
																+					 * we should not use a per-node sampling in that case. */
																+					if (!obj)
																+						continue;
																+
																+					int numa_starpu_id = starpu_memory_nodes_numa_hwloclogid_to_id(obj->logical_index);
																+
																+					/* This shouldn't happen */
																+					if (numa_starpu_id == -1 && nb_numa_nodes == STARPU_MAXNUMANODES)
																+					{
																+						_STARPU_MSG("Warning: %u NUMA nodes available. Only %u enabled. Use configure option --enable-maxnumanodes=xxx to update the maximum value of supported NUMA nodes.\n", _starpu_topology_get_nnumanodes(config), STARPU_MAXNUMANODES);
																+						STARPU_ABORT();
																+					}
																+
																+					if (numa_starpu_id == -1)
																+					{
																+						int memnode = _starpu_memory_node_register(STARPU_CPU_RAM, obj->logical_index);
																+						STARPU_ASSERT_MSG(memnode < STARPU_MAXNUMANODES, "Wrong Memory Node : %d (only %d available)", memnode, STARPU_MAXNUMANODES);
																+						numa_memory_nodes_to_hwloclogid[memnode] = obj->logical_index;
																+						numa_memory_nodes_to_physicalid[memnode] = obj->os_index;
																+						nb_numa_nodes++;
																 #ifdef STARPU_SIMGRID
																-	char name[16];
																-	msg_host_t host = _starpu_simgrid_get_host_by_name("RAM");
																-	STARPU_ASSERT(host);
																-	_starpu_simgrid_memory_node_set_host(STARPU_MAIN_RAM, host);
																+						snprintf(name, sizeof(name), "RAM%d", memnode);
																+						host = _starpu_simgrid_get_host_by_name(name);
																+						STARPU_ASSERT(host);
																+						_starpu_simgrid_memory_node_set_host(memnode, host);
																 #endif
																+					}
																+				}
																+			}
																+		}
																+#endif
																+	}
																+
																+#if (defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)) && defined(STARPU_HAVE_HWLOC)
																+	//Found NUMA nodes from CUDA nodes
																+	if (nb_numa_nodes != 0)
																+		return;
																+
																+	/* In case, we do not find any NUMA nodes when checking NUMA nodes attached to GPUs, we take all of them */
																+	_STARPU_DISP("No NUMA nodes found when checking GPUs devices...\n");
																+#endif
																+
																+	_STARPU_DISP("Finally, take all NUMA nodes available... \n");
																+
																+	unsigned nnuma = _starpu_topology_get_nnumanodes(config);
																+	if (nnuma > STARPU_MAXNUMANODES)
																+	{
																+		_STARPU_MSG("Warning: %u NUMA nodes available. Only %u enabled. Use configure option --enable-maxnumanodes=xxx to update the maximum value of supported NUMA nodes.\n", _starpu_topology_get_nnumanodes(config), STARPU_MAXNUMANODES);
																+		nnuma = STARPU_MAXNUMANODES;
																+	}
																+
																+	unsigned numa;
																+	for (numa = 0; numa < nnuma; numa++)
																+	{
																+#if defined(STARPU_HAVE_HWLOC)
																+		if (nnuma > 1)
																+		{
																+			/* NOTE(review): obj is dereferenced without a NULL check;
																+			 * presumably hwloc always has nnuma NUMA objects here. */
																+			hwloc_obj_t obj = hwloc_get_obj_by_type(config->topology.hwtopology, HWLOC_OBJ_NUMANODE, numa);
																+			unsigned numa_logical_id = obj->logical_index;
																+			unsigned numa_physical_id = obj->os_index;
																+			int memnode = _starpu_memory_node_register(STARPU_CPU_RAM, 0);
																+			STARPU_ASSERT_MSG(memnode < STARPU_MAXNUMANODES, "Wrong Memory Node : %d (only %d available) \n", memnode, STARPU_MAXNUMANODES);
																+
																+			numa_memory_nodes_to_hwloclogid[memnode] = numa_logical_id;
																+			numa_memory_nodes_to_physicalid[memnode] = numa_physical_id;
																+			nb_numa_nodes++;
																+
																+#ifdef STARPU_SIMGRID
																+			snprintf(name, sizeof(name), "RAM%d", memnode);
																+			host = _starpu_simgrid_get_host_by_name(name);
																+			STARPU_ASSERT(host);
																+			_starpu_simgrid_memory_node_set_host(memnode, host);
																+#endif
																+		}
																+		else
																+#endif /* defined(STARPU_HAVE_HWLOC) */
																+		{
																+
																+			/* In this case, nnuma has only one node */
																+			int memnode = _starpu_memory_node_register(STARPU_CPU_RAM, 0);
																+			STARPU_ASSERT_MSG(memnode == STARPU_MAIN_RAM, "Wrong Memory Node : %d (expected %d) \n", memnode, STARPU_MAIN_RAM);
																+
																+			numa_memory_nodes_to_hwloclogid[memnode] = STARPU_NUMA_MAIN_RAM;
																+			numa_memory_nodes_to_physicalid[memnode] = STARPU_NUMA_MAIN_RAM;
																+			nb_numa_nodes++;
																+#ifdef STARPU_SIMGRID
																+			host = _starpu_simgrid_get_host_by_name("RAM");
																+			STARPU_ASSERT(host);
																+			_starpu_simgrid_memory_node_set_host(STARPU_MAIN_RAM, host);
																+#endif
																+		}
																+
																+	}
																+
																+	STARPU_ASSERT_MSG(nb_numa_nodes > 0, "No NUMA node found... We need at least one memory node !\n");
																+}
															
 
																+
															
 
																+/* Register one bus per ordered pair (src, dst) of distinct NUMA memory
																+ * nodes and store the id returned by _starpu_register_bus() at
																+ * numa_bus_id[src * nb_numa_nodes + dst]. Diagonal entries are not written. */
																+static void _starpu_init_numa_bus(void)
																+{
																+	unsigned src, dst;
																+	for (src = 0; src < nb_numa_nodes; src++)
																+	{
																+		for (dst = 0; dst < nb_numa_nodes; dst++)
																+		{
																+			/* No bus from a node to itself */
																+			if (src == dst)
																+				continue;
																+			numa_bus_id[src*nb_numa_nodes+dst] = _starpu_register_bus(src, dst);
																+		}
																+	}
																+}
															
 
																+
															
 
																+static void
															
 
																+_starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config, int no_mp_config STARPU_ATTRIBUTE_UNUSED)
															
 
																+{
															
 
																 	/* We will store all the busid of the different (src, dst)
															
 
																 	 * combinations in a matrix which we initialize here. */
															
 
																 	_starpu_initialize_busid_matrix();
															
 
																-	/* Each device is initialized,
															
 
																-	 * giving it a memory node and a core bind id.
															
 
																-	 */
															
 
																-	/* TODO: STARPU_MAXNUMANODES */
															
 
																-	unsigned numa_init[1] = { 1 };
															
 
																-	unsigned numa_memory_nodes[1] = { ram_memory_node };
															
 
																 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
															
 
																 	unsigned cuda_init[STARPU_MAXCUDADEVS] = { };
															
 
																 	unsigned cuda_memory_nodes[STARPU_MAXCUDADEVS];
															
@@ -1801,6 +2252,7 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
																 	unsigned mpi_memory_nodes[STARPU_MAXMPIDEVS];
															
 
																 	unsigned mpi_bindid[STARPU_MAXMPIDEVS];
															
 
																 #endif
															
 
																+
															
 
																 	unsigned bindid;
															
 
																 	for (bindid = 0; bindid < config->nbindid; bindid++)
															
@@ -1810,6 +2262,13 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
																 		config->bindid_workers[bindid].nworkers = 0;
															
 
																 	}
															
 
																+	/* Init CPU binding before NUMA nodes, because we use it to discover NUMA nodes */
															
 
																+	_starpu_init_binding_cpu(config);
															
 
																+
															
 
																+	/* Initialize NUMA nodes */
															
 
																+	_starpu_init_numa_node(config);
															
 
																+	_starpu_init_numa_bus();
															
 
																+
															
 
																 	unsigned worker;
															
 
																 	for (worker = 0; worker < config->topology.nworkers; worker++)
															
 
																 	{
															
@@ -1828,33 +2287,22 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
																 		{
															
 
																 			case STARPU_CPU_WORKER:
															
 
																 			{
															
 
																-				/* TODO: NUMA */
															
 
																-				int numaid = 0;
															
 
																-				/* "dedicate" a cpu core to that worker */
															
 
																-				if (numa_init[numaid])
															
 
																-				{
															
 
																-					memory_node = numa_memory_nodes[numaid];
															
 
																-				}
															
 
																-				else
															
 
																-				{
															
 
																-					numa_init[numaid] = 1;
															
 
																-					memory_node = numa_memory_nodes[numaid] = _starpu_memory_node_register(STARPU_CPU_RAM, numaid);
															
 
																-#ifdef STARPU_SIMGRID
															
 
																-					snprintf(name, sizeof(name), "RAM%d", numaid);
															
 
																-					host = _starpu_simgrid_get_host_by_name(name);
															
 
																-					STARPU_ASSERT(host);
															
 
																-					_starpu_simgrid_memory_node_set_host(memory_node, host);
															
 
																-#endif
															
 
																-				}
															
 
																-				workerarg->bindid = _starpu_get_next_bindid(config, NULL, 0);
															
 
																+				int numa_logical_id = _starpu_get_logical_numa_node_worker(worker);
															
 
																+				int numa_starpu_id =  starpu_memory_nodes_numa_hwloclogid_to_id(numa_logical_id);
															
 
																+				if (numa_starpu_id >= STARPU_MAXNUMANODES)
															
 
																+					numa_starpu_id = STARPU_MAIN_RAM;
															
 
																+
															
 
																+				workerarg->numa_memory_node = memory_node = numa_starpu_id;
															
 
																+
															
 
																 				_starpu_memory_node_add_nworkers(memory_node);
															
 
																-                                _starpu_worker_drives_memory_node(workerarg, STARPU_MAIN_RAM);
															
 
																-				_starpu_worker_drives_memory_node(workerarg, memory_node);
															
 
																+				_starpu_worker_drives_memory_node(workerarg, numa_starpu_id);
															
 
																 				break;
															
 
																 			}
															
 
																 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
															
 
																 			case STARPU_CUDA_WORKER:
															
 
																+			{
															
 
																+				unsigned numa;
															
 
																 #ifndef STARPU_SIMGRID
															
 
																 				if (may_bind_automatically[STARPU_CUDA_WORKER])
															
 
																 				{
															
@@ -1884,8 +2332,11 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
																 						workerarg->bindid = cuda_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
															
 
																 					memory_node = cuda_memory_nodes[devid] = _starpu_memory_node_register(STARPU_CUDA_RAM, devid);
															
 
																-					_starpu_cuda_bus_ids[0][devid+1] = _starpu_register_bus(STARPU_MAIN_RAM, memory_node);
															
 
																-					_starpu_cuda_bus_ids[devid+1][0] = _starpu_register_bus(memory_node, STARPU_MAIN_RAM);
															
 
																+					for (numa = 0; numa < nb_numa_nodes; numa++)
															
 
																+					{
															
 
																+						_starpu_cuda_bus_ids[numa][devid+STARPU_MAXNUMANODES] = _starpu_register_bus(numa, memory_node);
															
 
																+						_starpu_cuda_bus_ids[devid+STARPU_MAXNUMANODES][numa] = _starpu_register_bus(memory_node, numa);
															
 
																+					}
															
 
																 #ifdef STARPU_SIMGRID
															
 
																 					const char* cuda_memcpy_peer;
															
 
																 					snprintf(name, sizeof(name), "CUDA%u", devid);
															
@@ -1912,8 +2363,8 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
																 							if (workerarg2->arch == STARPU_CUDA_WORKER)
															
 
																 							{
															
 
																 								unsigned memory_node2 = starpu_worker_get_memory_node(worker2);
															
 
																-								_starpu_cuda_bus_ids[devid2][devid] = _starpu_register_bus(memory_node2, memory_node);
															
 
																-								_starpu_cuda_bus_ids[devid][devid2] = _starpu_register_bus(memory_node, memory_node2);
															
 
																+								_starpu_cuda_bus_ids[devid2+STARPU_MAXNUMANODES][devid+STARPU_MAXNUMANODES] = _starpu_register_bus(memory_node2, memory_node);
															
 
																+								_starpu_cuda_bus_ids[devid+STARPU_MAXNUMANODES][devid2+STARPU_MAXNUMANODES] = _starpu_register_bus(memory_node, memory_node2);
															
 
																 #ifndef STARPU_SIMGRID
															
 
																 #if defined(HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX) && HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX
															
 
																 								{
															
@@ -1931,8 +2382,8 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
																 											_STARPU_DEBUG("CUDA%u and CUDA%u are linked through %s, along %u GPUs\n", devid, devid2, name, data->ngpus);
															
 
																 										}
															
 
																 #endif
															
 
																-										starpu_bus_set_ngpus(_starpu_cuda_bus_ids[devid2][devid], data->ngpus);
															
 
																-										starpu_bus_set_ngpus(_starpu_cuda_bus_ids[devid][devid2], data->ngpus);
															
 
																+										starpu_bus_set_ngpus(_starpu_cuda_bus_ids[devid2+STARPU_MAXNUMANODES][devid+STARPU_MAXNUMANODES], data->ngpus);
															
 
																+										starpu_bus_set_ngpus(_starpu_cuda_bus_ids[devid+STARPU_MAXNUMANODES][devid2+STARPU_MAXNUMANODES], data->ngpus);
															
 
																 									}
															
 
																 								}
															
 
																 #endif
															
@@ -1943,13 +2394,19 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
																 				}
															
 
																 				_starpu_memory_node_add_nworkers(memory_node);
															
 
																-                                _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
															
 
																+				//This worker can manage transfers on NUMA nodes
															
 
																+				for (numa = 0; numa < nb_numa_nodes; numa++)
															
 
																+						_starpu_worker_drives_memory_node(&workerarg->set->workers[0], numa);
															
 
																+
															
 
																 				_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
															
 
																 				break;
															
 
																+			}
															
 
																 #endif
															
 
																 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
															
 
																 		        case STARPU_OPENCL_WORKER:
															
 
																+			{
															
 
																+				unsigned numa;
															
 
																 #ifndef STARPU_SIMGRID
															
 
																 				if (may_bind_automatically[STARPU_OPENCL_WORKER])
															
 
																 				{
															
@@ -1970,8 +2427,12 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
																 					opencl_init[devid] = 1;
															
 
																 					workerarg->bindid = opencl_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
															
 
																 					memory_node = opencl_memory_nodes[devid] = _starpu_memory_node_register(STARPU_OPENCL_RAM, devid);
															
 
																-					_starpu_register_bus(STARPU_MAIN_RAM, memory_node);
															
 
																-					_starpu_register_bus(memory_node, STARPU_MAIN_RAM);
															
 
																+
															
 
																+					for (numa = 0; numa < nb_numa_nodes; numa++)
															
 
																+					{
															
 
																+						_starpu_register_bus(numa, memory_node);
															
 
																+						_starpu_register_bus(memory_node, numa);
															
 
																+					}
															
 
																 #ifdef STARPU_SIMGRID
															
 
																 					snprintf(name, sizeof(name), "OpenCL%u", devid);
															
 
																 					host = _starpu_simgrid_get_host_by_name(name);
															
@@ -1981,13 +2442,19 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
																 				}
															
 
																 				_starpu_memory_node_add_nworkers(memory_node);
															
 
																-                                _starpu_worker_drives_memory_node(workerarg, STARPU_MAIN_RAM);
															
 
																+				//This worker can manage transfers on NUMA nodes
															
 
																+				for (numa = 0; numa < nb_numa_nodes; numa++)
															
 
																+						_starpu_worker_drives_memory_node(workerarg, numa);
															
 
																+
															
 
																 				_starpu_worker_drives_memory_node(workerarg, memory_node);
															
 
																 				break;
															
 
																+			}
															
 
																 #endif
															
 
																 #ifdef STARPU_USE_MIC
															
 
																 		        case STARPU_MIC_WORKER:
															
 
																+			{
															
 
																+				unsigned numa;
															
 
																 				if (mic_init[devid])
															
 
																 				{
															
 
																 					memory_node = mic_memory_nodes[devid];
															
@@ -2004,21 +2471,30 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
																 					//}
															
 
																 					mic_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
															
 
																 					memory_node = mic_memory_nodes[devid] = _starpu_memory_node_register(STARPU_MIC_RAM, devid);
															
 
																-					_starpu_register_bus(STARPU_MAIN_RAM, memory_node);
															
 
																-					_starpu_register_bus(memory_node, STARPU_MAIN_RAM);
															
 
																+
															
 
																+					for (numa = 0; numa < nb_numa_nodes; numa++)
															
 
																+					{
															
 
																+						_starpu_register_bus(numa, memory_node);
															
 
																+						_starpu_register_bus(memory_node, numa);
															
 
																+					}
															
 
																 				}
															
 
																 				workerarg->bindid = mic_bindid[devid];
															
 
																 				_starpu_memory_node_add_nworkers(memory_node);
															
 
																-                                _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
															
 
																+				//This worker can manage transfers on NUMA nodes
															
 
																+				for (numa = 0; numa < nb_numa_nodes; numa++)
															
 
																+						_starpu_worker_drives_memory_node(&workerarg->set->workers[0], numa);
															
 
																+
															
 
																 				_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
															
 
																 				break;
															
 
																+			}
															
 
																 #endif /* STARPU_USE_MIC */
															
 
																 #ifdef STARPU_USE_SCC
															
 
																 			case STARPU_SCC_WORKER:
															
 
																 			{
															
 
																+				unsigned numa;
															
 
																 				/* Node 0 represents the SCC shared memory when we're on SCC. */
															
 
																 				struct _starpu_memory_node_descr *descr = _starpu_memory_node_get_description();
															
 
																 				descr->nodes[ram_memory_node] = STARPU_SCC_SHM;
															
@@ -2026,7 +2502,10 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
																 				memory_node = ram_memory_node;
															
 
																 				_starpu_memory_node_add_nworkers(memory_node);
															
 
																-                                _starpu_worker_drives_memory_node(workerarg, STARPU_MAIN_RAM);
															
 
																+				//This worker can manage transfers on NUMA nodes
															
 
																+				for (numa = 0; numa < nb_numa_nodes; numa++)
															
 
																+						_starpu_worker_drives_memory_node(workerarg, numa);
															
 
																+
															
 
																 				_starpu_worker_drives_memory_node(workerarg, memory_node);
															
 
																 			}
															
 
																 				break;
															
@@ -2035,6 +2514,7 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
																 #ifdef STARPU_USE_MPI_MASTER_SLAVE
															
 
																 			case STARPU_MPI_MS_WORKER:
															
 
																 			{
															
 
																+				unsigned numa;
															
 
																 				if (mpi_init[devid])
															
 
																 				{
															
 
																 					memory_node = mpi_memory_nodes[devid];
															
@@ -2044,11 +2524,18 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
																 					mpi_init[devid] = 1;
															
 
																 					mpi_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
															
 
																 					memory_node = mpi_memory_nodes[devid] = _starpu_memory_node_register(STARPU_MPI_MS_RAM, devid);
															
 
																-					_starpu_register_bus(STARPU_MAIN_RAM, memory_node);
															
 
																-					_starpu_register_bus(memory_node, STARPU_MAIN_RAM);
															
 
																+		
															
 
																+					for (numa = 0; numa < nb_numa_nodes; numa++)
															
 
																+					{	
															
 
																+						_starpu_register_bus(numa, memory_node);
															
 
																+						_starpu_register_bus(memory_node, numa);
															
 
																+					}
															
 
																 				}
															
 
																-                                _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
															
 
																+				//This worker can manage transfers on NUMA nodes
															
 
																+				for (numa = 0; numa < nb_numa_nodes; numa++)
															
 
																+						_starpu_worker_drives_memory_node(&workerarg->set->workers[0], numa);
															
 
																+
															
 
																 				_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
															
 
																 #ifndef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
															
 
																                                 /* MPI driver thread can manage all slave memories if we disable the MPI multiple thread */
															
@@ -2154,7 +2641,7 @@ _starpu_build_topology (struct _starpu_machine_config *config, int no_mp_config)
 
																 	_starpu_memory_nodes_init();
															
 
																 	_starpu_datastats_init();
															
 
																-	_starpu_init_workers_binding(config, no_mp_config);
															
 
																+	_starpu_init_workers_binding_and_memory(config, no_mp_config);
															
 
																 	config->cpus_nodeid = -1;
															
 
																 	config->cuda_nodeid = -1;
															
@@ -2293,3 +2780,4 @@ starpu_topology_print (FILE *output)
 
																 		fprintf(output, "\n");
															
 
																 	}
															
 
																 }
															
 
																+
															
--- a/src/core/topology.h
+++ b/src/core/topology.h
@@ -1,7 +1,8 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009-2010, 2012, 2014-2016  Université de Bordeaux
															
 
																+ * Copyright (C) 2009-2010, 2012, 2014-2017  Université de Bordeaux
															
 
																  * Copyright (C) 2010, 2015, 2017  CNRS
															
 
																+ * Copyright (C) 2017  Inria
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -51,6 +52,9 @@ unsigned _starpu_topology_get_nhwcpu(struct _starpu_machine_config *config);
 
																 /* returns the number of logical cpus */
															
 
																 unsigned _starpu_topology_get_nhwpu(struct _starpu_machine_config *config);
															
 
																+/* returns the number of NUMA nodes */
															
 
																+unsigned _starpu_topology_get_nnumanodes(struct _starpu_machine_config *config);
															
 
																+
															
 
																 #ifdef STARPU_HAVE_HWLOC
															
 
																 /* Small convenient function to filter hwloc topology depending on HWLOC API version */
															
 
																 void _starpu_topology_filter(hwloc_topology_t topology);
															
@@ -68,4 +72,7 @@ void _starpu_bind_thread_on_cpus(struct _starpu_machine_config *config STARPU_AT
 
																 struct _starpu_worker *_starpu_get_worker_from_driver(struct starpu_driver *d);
															
 
																+int starpu_memory_nodes_get_numa_count(void);
															
 
																+int starpu_memory_nodes_numa_id_to_hwloclogid(unsigned id);
															
 
																+	
															
 
																 #endif // __TOPOLOGY_H__
															
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -1599,8 +1599,13 @@ void starpu_shutdown(void)
 
																 	/* tell all workers to shutdown */
															
 
																 	_starpu_kill_all_workers(&_starpu_config);
															
 
																-
															
 
																-	_starpu_free_all_automatically_allocated_buffers(STARPU_MAIN_RAM);
															
 
																+	
															
 
																+	unsigned i;
															
 
																+	unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
															
 
																+	for (i=0; i<nb_numa_nodes; i++)
															
 
																+	{
															
 
																+		_starpu_free_all_automatically_allocated_buffers(i);
															
 
																+	}
															
 
																 	{
															
 
																 	     int stats = starpu_get_env_number("STARPU_STATS");
															
--- a/src/core/workers.h
+++ b/src/core/workers.h
@@ -84,6 +84,7 @@ LIST_TYPE(_starpu_worker,
 
																 	starpu_pthread_cond_t started_cond; /* indicate when the worker is ready */
															
 
																 	starpu_pthread_cond_t ready_cond; /* indicate when the worker is ready */
															
 
																 	unsigned memory_node; /* which memory node is the worker associated with ? */
															
 
																+	unsigned numa_memory_node; /* which numa memory node is the worker associated with? (logical index) */
															
 
																 	/* condition variable used for passive waiting operations on worker
															
 
																 	 * STARPU_PTHREAD_COND_BROADCAST must be used instead of STARPU_PTHREAD_COND_SIGNAL,
															
 
																 	 * since the condition is shared for multiple purpose */
															
--- a/src/datawizard/coherency.c
+++ b/src/datawizard/coherency.c
@@ -1,7 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures. *
															
 
																  * Copyright (C) 2009-2017  Université de Bordeaux
															
 
																  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
															
 
																- * Copyright (C) 2014  INRIA
															
 
																+ * Copyright (C) 2014, 2017  INRIA
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -320,6 +320,29 @@ static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned
 
																 	return 0;
															
 
																 }
															
 
																+/* Return the NUMA node that minimizes 1/bandwidth(src, numa) + 1/bandwidth(numa, dst), i.e. the cheapest intermediate hop between src and dst. For now nodes are compared by slowness (inverse bandwidth); comparing latency might be a better criterion. */
															
 
																+static unsigned chose_best_numa_between_src_and_dest(int src, int dst)
															
 
																+{
															
 
																+	double timing_best; /* slowness of best_numa; only read once best_numa >= 0 (see the short-circuit below) */
															
 
																+	int best_numa = -1; /* -1 until a first candidate has been recorded */
															
 
																+	unsigned numa;
															
 
																+	const unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
															
 
																+	for(numa = 0; numa < nb_numa_nodes; numa++)
															
 
																+	{
															
 
																+		double actual = 1.0/starpu_transfer_bandwidth(src, numa) + 1.0/starpu_transfer_bandwidth(numa, dst);
															
 
																+
															
 
																+		/* Compare slowness (sum of the inverse bandwidths of both hops): keep the lowest; on ties the first node seen wins because the comparison is strict */
															
 
																+		if (best_numa < 0 || actual < timing_best)
															
 
																+		{
															
 
																+			best_numa = numa;
															
 
																+			timing_best = actual;
															
 
																+		}
															
 
																+	}
															
 
																+	STARPU_ASSERT(best_numa >= 0); /* still -1 only if the loop never ran, i.e. nb_numa_nodes == 0 */
															
 
																+	
															
 
																+	return best_numa;
															
 
																+}
															
 
																+
															
 
																 /* Determines the path of a request : each hop is defined by (src,dst) and the
															
 
																  * node that handles the hop. The returned value indicates the number of hops,
															
 
																  * and the max_len is the maximum number of hops (ie. the size of the
															
@@ -362,9 +385,11 @@ static int determine_request_path(starpu_data_handle_t handle,
 
																 		STARPU_ASSERT(max_len >= 2);
															
 
																 		STARPU_ASSERT(src_node >= 0);
															
 
																+		unsigned numa = chose_best_numa_between_src_and_dest(src_node, dst_node);
															
 
																+
															
 
																 		/* GPU -> RAM */
															
 
																 		src_nodes[0] = src_node;
															
 
																-		dst_nodes[0] = STARPU_MAIN_RAM;
															
 
																+		dst_nodes[0] = numa;
															
 
																 		if (starpu_node_get_kind(src_node) == STARPU_DISK_RAM)
															
 
																 			/* Disks don't have their own driver thread */
															
@@ -380,7 +405,7 @@ static int determine_request_path(starpu_data_handle_t handle,
 
																 		}
															
 
																 		/* RAM -> GPU */
															
 
																-		src_nodes[1] = STARPU_MAIN_RAM;
															
 
																+		src_nodes[1] = numa;
															
 
																 		dst_nodes[1] = dst_node;
															
 
																 		if (starpu_node_get_kind(dst_node) == STARPU_DISK_RAM)
															
@@ -573,7 +598,7 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 
																 		/* if the data is in write only mode (and not SCRATCH or REDUX), there is no need for a source, data will be initialized by the task itself */
															
 
																 		if (mode & STARPU_W)
															
 
																 			dst_replicate->initialized = 1;
															
 
																-		if (requesting_node == STARPU_MAIN_RAM && !nwait)
															
 
																+		if (starpu_node_get_kind(requesting_node) == STARPU_CPU_RAM && !nwait)
															
 
																 		{
															
 
																 			/* And this is the main RAM, really no need for a
															
 
																 			 * request, just allocate */
															
--- a/src/datawizard/copy_driver.c
+++ b/src/datawizard/copy_driver.c
@@ -864,10 +864,9 @@ void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_
 
																                 _starpu_mpi_common_wait_event(async_channel);
															
 
																                 break;
															
 
																 #endif
															
 
																-	case STARPU_MAIN_RAM:
															
 
																+	case STARPU_CPU_RAM:
															
 
																 		starpu_disk_wait_request(async_channel);
															
 
																 		break;
															
 
																-	case STARPU_CPU_RAM:
															
 
																 	default:
															
 
																 		STARPU_ABORT();
															
 
																 	}
															
--- a/src/datawizard/data_request.c
+++ b/src/datawizard/data_request.c
@@ -147,7 +147,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 
																 	if (handling_node == -1)
															
 
																 		handling_node = STARPU_MAIN_RAM;
															
 
																 	r->handling_node = handling_node;
															
 
																-	STARPU_ASSERT(handling_node == STARPU_MAIN_RAM || _starpu_memory_node_get_nworkers(handling_node));
															
 
																+	STARPU_ASSERT(starpu_node_get_kind(handling_node) == STARPU_CPU_RAM || _starpu_memory_node_get_nworkers(handling_node));
															
 
																 	r->completed = 0;
															
 
																 	r->prefetch = is_prefetch;
															
 
																 	r->prio = prio;
															
@@ -276,7 +276,7 @@ void _starpu_post_data_request(struct _starpu_data_request *r)
 
																 	unsigned handling_node = r->handling_node;
															
 
																 	/* We don't have a worker for disk nodes, these should have been posted to a main RAM node */
															
 
																 	STARPU_ASSERT(starpu_node_get_kind(handling_node) != STARPU_DISK_RAM);
															
 
																-	STARPU_ASSERT(handling_node == STARPU_MAIN_RAM || _starpu_memory_node_get_nworkers(handling_node));
															
 
																+	STARPU_ASSERT(starpu_node_get_kind(handling_node) == STARPU_CPU_RAM || _starpu_memory_node_get_nworkers(handling_node));
															
 
																 //	_STARPU_DEBUG("POST REQUEST\n");
															
--- a/src/datawizard/datawizard.c
+++ b/src/datawizard/datawizard.c
@@ -22,6 +22,7 @@
 
																 #include <datawizard/memory_nodes.h>
															
 
																 #include <core/workers.h>
															
 
																 #include <core/progress_hook.h>
															
 
																+#include <core/topology.h>
															
 
																 #ifdef STARPU_SIMGRID
															
 
																 #include <core/simgrid.h>
															
 
																 #endif
															
@@ -71,8 +72,16 @@ int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests)
 
																         unsigned memnode;
															
 
																 	if (!worker)
															
 
																+	{
															
 
																 		/* Call from main application, only make RAM requests progress */
															
 
																-		return ___starpu_datawizard_progress(STARPU_MAIN_RAM, may_alloc, push_requests);
															
 
																+		int ret = 0;
															
 
																+		int nnumas = starpu_memory_nodes_get_numa_count();
															
 
																+		int numa;
															
 
																+		for (numa = 0; numa < nnumas; numa++)
															
 
																+			ret |=  ___starpu_datawizard_progress(numa, may_alloc, push_requests);
															
 
																+
															
 
																+		return ret;
															
 
																+	}
															
 
																 	if (worker->set)
															
 
																 		/* Runing one of the workers of a worker set. The reference for
															
 
																 		 * driving memory is its worker 0 (see registrations in topology.c) */
															
--- a/src/datawizard/datawizard.h
+++ b/src/datawizard/datawizard.h
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009, 2010, 2014  Université de Bordeaux
															
 
																+ * Copyright (C) 2009, 2010, 2014, 2017  Université de Bordeaux
															
 
																  * Copyright (C) 2010, 2013  CNRS
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
--- a/src/datawizard/filters.c
+++ b/src/datawizard/filters.c
@@ -188,14 +188,18 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 
																 		/* This is lazy allocation, allocate it now in main RAM, so as
															
 
																 		 * to have somewhere to gather pieces later */
															
 
																 		/* FIXME: mark as unevictable! */
															
 
																-		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[STARPU_MAIN_RAM], 0);
															
 
																+		int home_node = initial_handle->home_node;
															
 
																+		if (home_node < 0 || (starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
															
 
																+			home_node = STARPU_MAIN_RAM;
															
 
																+		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], 0);
															
 
																 #ifdef STARPU_DEVEL
															
 
																 #warning we should reclaim memory if allocation failed
															
 
																 #endif
															
 
																 		STARPU_ASSERT(!ret);
															
 
																 	}
															
 
																-	_starpu_data_unregister_ram_pointer(initial_handle);
															
 
																+	for (node = 0; node < STARPU_MAXNODES; node++)
															
 
																+		_starpu_data_unregister_ram_pointer(initial_handle, node);
															
 
																 	if (nparts && !inherit_state)
															
 
																 	{
															
@@ -324,10 +328,14 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 
																 		 * store it in the handle */
															
 
																 		child->footprint = _starpu_compute_data_footprint(child);
															
 
																-		void *ptr;
															
 
																-		ptr = starpu_data_handle_to_pointer(child, STARPU_MAIN_RAM);
															
 
																-		if (ptr != NULL)
															
 
																-			_starpu_data_register_ram_pointer(child, ptr);
															
 
																+		for (node = 0; node < STARPU_MAXNODES; node++)
															
 
																+		{
															
 
																+			if (starpu_node_get_kind(node) != STARPU_CPU_RAM)
															
 
																+				continue;
															
 
																+			void *ptr = starpu_data_handle_to_pointer(child, node);
															
 
																+			if (ptr != NULL)
															
 
																+				_starpu_data_register_ram_pointer(child, ptr);
															
 
																+		}
															
 
																 		_STARPU_TRACE_HANDLE_DATA_REGISTER(child);
															
 
																 	}
															
@@ -428,7 +436,8 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 
																 			child_handle->unregister_hook(child_handle);
															
 
																 		}
															
 
																-		_starpu_data_unregister_ram_pointer(child_handle);
															
 
																+		for (node = 0; node < STARPU_MAXNODES; node++)
															
 
																+			_starpu_data_unregister_ram_pointer(child_handle, node);
															
 
																 		if (child_handle->per_worker)
															
 
																 		{
															
@@ -444,9 +453,14 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 
																 		_starpu_memory_stats_free(child_handle);
															
 
																 	}
															
 
																-	ptr = starpu_data_handle_to_pointer(root_handle, STARPU_MAIN_RAM);
															
 
																-	if (ptr != NULL)
															
 
																-		_starpu_data_register_ram_pointer(root_handle, ptr);
															
 
																+	for (node = 0; node < STARPU_MAXNODES; node++)
															
 
																+	{
															
 
																+		if (starpu_node_get_kind(node) != STARPU_CPU_RAM)
															
 
																+			continue;
															
 
																+		ptr = starpu_data_handle_to_pointer(root_handle, node);
															
 
																+		if (ptr != NULL)
															
 
																+			_starpu_data_register_ram_pointer(root_handle, ptr);
															
 
																+	}
															
 
																 	/* the gathering_node should now have a valid copy of all the children.
															
 
																 	 * For all nodes, if the node had all copies and none was locally
															
--- a/src/datawizard/interfaces/bcsr_interface.c
+++ b/src/datawizard/interfaces/bcsr_interface.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009-2016  Université de Bordeaux
															
 
																+ * Copyright (C) 2009-2017  Université de Bordeaux
															
 
																  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -130,7 +130,7 @@ void starpu_bcsr_data_register(starpu_data_handle_t *handleptr, int home_node,
 
																 		.elemsize = elemsize
															
 
																 	};
															
 
																 #ifndef STARPU_SIMGRID
															
 
																-	if (home_node == STARPU_MAIN_RAM)
															
 
																+	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
															
 
																 	{
															
 
																 		STARPU_ASSERT_ACCESSIBLE(nzval);
															
 
																 		STARPU_ASSERT_ACCESSIBLE(nzval + nnz*elemsize*r*c - 1);
															
@@ -260,9 +260,13 @@ uintptr_t starpu_bcsr_get_local_nzval(starpu_data_handle_t handle)
 
																 uint32_t *starpu_bcsr_get_local_colind(starpu_data_handle_t handle)
															
 
																 {
															
 
																+	int node = handle->home_node;
															
 
																+	if (node < 0 || (starpu_node_get_kind(node) != STARPU_CPU_RAM))
															
 
																+		node = STARPU_MAIN_RAM;
															
 
																+
															
 
																 	/* XXX 0 */
															
 
																 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
															
 
																-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
															
 
																+		starpu_data_get_interface_on_node(handle, node);
															
 
																 #ifdef STARPU_DEBUG
															
 
																 	STARPU_ASSERT_MSG(data_interface->id == STARPU_BCSR_INTERFACE_ID, "Error. The given data is not a bcsr.");
															
@@ -273,9 +277,13 @@ uint32_t *starpu_bcsr_get_local_colind(starpu_data_handle_t handle)
 
																 uint32_t *starpu_bcsr_get_local_rowptr(starpu_data_handle_t handle)
															
 
																 {
															
 
																+	int node = handle->home_node;
															
 
																+	if (node < 0 || (starpu_node_get_kind(node) != STARPU_CPU_RAM))
															
 
																+		node = STARPU_MAIN_RAM;
															
 
																+
															
 
																 	/* XXX 0 */
															
 
																 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
															
 
																-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
															
 
																+		starpu_data_get_interface_on_node(handle, node);
															
 
																 #ifdef STARPU_DEBUG
															
 
																 	STARPU_ASSERT_MSG(data_interface->id == STARPU_BCSR_INTERFACE_ID, "Error. The given data is not a bcsr.");
															
--- a/src/datawizard/interfaces/block_interface.c
+++ b/src/datawizard/interfaces/block_interface.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009-2016  Université de Bordeaux
															
 
																+ * Copyright (C) 2009-2017  Université de Bordeaux
															
 
																  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -163,7 +163,7 @@ void starpu_block_data_register(starpu_data_handle_t *handleptr, int home_node,
 
																 		.elemsize = elemsize
															
 
																 	};
															
 
																 #ifndef STARPU_SIMGRID
															
 
																-	if (home_node == STARPU_MAIN_RAM)
															
 
																+	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
															
 
																 	{
															
 
																 		STARPU_ASSERT_ACCESSIBLE(ptr);
															
 
																 		STARPU_ASSERT_ACCESSIBLE(ptr + (nz-1)*ldz*elemsize + (ny-1)*ldy*elemsize + nx*elemsize - 1);
															
--- a/src/datawizard/interfaces/coo_interface.c
+++ b/src/datawizard/interfaces/coo_interface.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2013-2016  Université Bordeaux
															
 
																+ * Copyright (C) 2013-2017  Université Bordeaux
															
 
																  * Copyright (C) 2012 INRIA
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -18,6 +18,7 @@
 
																 #include <starpu.h>
															
 
																 #include <common/fxt.h>
															
 
																 #include <datawizard/memalloc.h>
															
 
																+#include <datawizard/memory_nodes.h>
															
 
																 static int
															
 
																 copy_any_to_any(void *src_interface, unsigned src_node,
															
@@ -236,7 +237,7 @@ starpu_coo_data_register(starpu_data_handle_t *handleptr, int home_node,
 
																 		.elemsize = elemsize,
															
 
																 	};
															
 
																 #ifndef STARPU_SIMGRID
															
 
																-	if (home_node == STARPU_MAIN_RAM)
															
 
																+	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
															
 
																 	{
															
 
																 		STARPU_ASSERT_ACCESSIBLE(columns);
															
 
																 		STARPU_ASSERT_ACCESSIBLE((uintptr_t) columns + n_values*sizeof(uint32_t) - 1);
															
--- a/src/datawizard/interfaces/csr_interface.c
+++ b/src/datawizard/interfaces/csr_interface.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009-2016  Université de Bordeaux
															
 
																+ * Copyright (C) 2009-2017  Université de Bordeaux
															
 
																  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
															
 
																  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
															
 
																  *
															
@@ -112,7 +112,7 @@ void starpu_csr_data_register(starpu_data_handle_t *handleptr, int home_node,
 
																 		.elemsize = elemsize
															
 
																 	};
															
 
																 #ifndef STARPU_SIMGRID
															
 
																-	if (home_node == STARPU_MAIN_RAM)
															
 
																+	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
															
 
																 	{
															
 
																 		STARPU_ASSERT_ACCESSIBLE(nzval);
															
 
																 		STARPU_ASSERT_ACCESSIBLE(nzval + nnz*elemsize - 1);
															
--- a/src/datawizard/interfaces/data_interface.c
+++ b/src/datawizard/interfaces/data_interface.c
@@ -369,12 +369,14 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 
																 	/* now the data is available ! */
															
 
																 	_starpu_spin_unlock(&handle->header_lock);
															
 
																-
															
 
																-
															
 
																-	ptr = starpu_data_handle_to_pointer(handle, STARPU_MAIN_RAM);
															
 
																-	if (ptr != NULL)
															
 
																+	for (node = 0; node < STARPU_MAXNODES; node++)
															
 
																 	{
															
 
																-		_starpu_data_register_ram_pointer(handle, ptr);
															
 
																+		if (starpu_node_get_kind(node) != STARPU_CPU_RAM)
															
 
																+			continue;
															
 
																+
															
 
																+		ptr = starpu_data_handle_to_pointer(handle, node);
															
 
																+		if (ptr != NULL)
															
 
																+			_starpu_data_register_ram_pointer(handle, ptr);
															
 
																 	}
															
 
																 }
															
@@ -521,13 +523,17 @@ struct starpu_data_interface_ops* starpu_data_get_interface_ops(starpu_data_hand
 
																  * Stop monitoring a piece of data
															
 
																  */
															
 
																-void _starpu_data_unregister_ram_pointer(starpu_data_handle_t handle)
															
 
																+void _starpu_data_unregister_ram_pointer(starpu_data_handle_t handle, unsigned node)
															
 
																 {
															
 
																-	const void *ram_ptr = starpu_data_handle_to_pointer(handle, STARPU_MAIN_RAM);
															
 
																+	if (starpu_node_get_kind(node) != STARPU_CPU_RAM)
															
 
																+		return;
															
 
																+
															
 
																 #ifdef STARPU_OPENMP
															
 
																 	if (handle->removed_from_context_hash)
															
 
																 		return;
															
 
																 #endif
															
 
																+	const void *ram_ptr = starpu_data_handle_to_pointer(handle, node);
															
 
																+
															
 
																 	if (ram_ptr != NULL)
															
 
																 	{
															
 
																 		/* Remove the PTR -> HANDLE mapping.  If a mapping from PTR
															
@@ -757,7 +763,10 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 
																 			_STARPU_DEBUG("Conversion needed\n");
															
 
																 			void *buffers[1];
															
 
																 			struct starpu_multiformat_interface *format_interface;
															
 
																-			format_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
															
 
																+			home_node = handle->home_node;
															
 
																+			if (home_node < 0 || (starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
															
 
																+				home_node = STARPU_MAIN_RAM;
															
 
																+			format_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, home_node);
															
 
																 			struct starpu_codelet *cl = NULL;
															
 
																 			enum starpu_node_kind node_kind = starpu_node_get_kind(handle->mf_node);
															
@@ -850,16 +859,19 @@ retry_busy:
 
																 	size_t size = _starpu_data_get_size(handle);
															
 
																-	_starpu_data_unregister_ram_pointer(handle);
															
 
																-
															
 
																 	/* Destroy the data now */
															
 
																 	unsigned node;
															
 
																 	for (node = 0; node < STARPU_MAXNODES; node++)
															
 
																 	{
															
 
																 		struct _starpu_data_replicate *local = &handle->per_node[node];
															
 
																+		if (local->allocated)
															
 
																+		{
															
 
																+			_starpu_data_unregister_ram_pointer(handle, node);
															
 
																+
															
 
																 		/* free the data copy in a lazy fashion */
															
 
																-		if (local->allocated && local->automatically_allocated)
															
 
																-			_starpu_request_mem_chunk_removal(handle, local, node, size);
															
 
																+			if (local->automatically_allocated)
															
 
																+				_starpu_request_mem_chunk_removal(handle, local, node, size);
															
 
																+		}
															
 
																 	}
															
 
																 	if (handle->per_worker)
															
 
																 	{
															
@@ -976,8 +988,7 @@ static void _starpu_data_invalidate(void *data)
 
																 		if (local->mc && local->allocated && local->automatically_allocated)
															
 
																 		{
															
 
																-			if (node == STARPU_MAIN_RAM)
															
 
																-				_starpu_data_unregister_ram_pointer(handle);
															
 
																+			_starpu_data_unregister_ram_pointer(handle, node);
															
 
																 			/* free the data copy in a lazy fashion */
															
 
																 			_starpu_request_mem_chunk_removal(handle, local, node, size);
															
--- a/src/datawizard/interfaces/data_interface.h
+++ b/src/datawizard/interfaces/data_interface.h
@@ -78,7 +78,7 @@ extern void _starpu_data_register_ram_pointer(starpu_data_handle_t handle,
 
																 						void *ptr)
															
 
																 	STARPU_ATTRIBUTE_INTERNAL;
															
 
																-extern void _starpu_data_unregister_ram_pointer(starpu_data_handle_t handle)
															
 
																+extern void _starpu_data_unregister_ram_pointer(starpu_data_handle_t handle, unsigned node)
															
 
																 	STARPU_ATTRIBUTE_INTERNAL;
															
 
																 #define _starpu_data_is_multiformat_handle(handle) handle->ops->is_multiformat
															
--- a/src/datawizard/interfaces/matrix_interface.c
+++ b/src/datawizard/interfaces/matrix_interface.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010-2016  Université de Bordeaux
															
 
																+ * Copyright (C) 2010-2017  Université de Bordeaux
															
 
																  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -174,7 +174,7 @@ void starpu_matrix_data_register(starpu_data_handle_t *handleptr, int home_node,
 
																                 .offset = 0
															
 
																 	};
															
 
																 #ifndef STARPU_SIMGRID
															
 
																-	if (home_node == STARPU_MAIN_RAM)
															
 
																+	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
															
 
																 	{
															
 
																 		STARPU_ASSERT_ACCESSIBLE(ptr);
															
 
																 		STARPU_ASSERT_ACCESSIBLE(ptr + (ny-1)*ld*elemsize + nx*elemsize - 1);
															
--- a/src/datawizard/interfaces/variable_interface.c
+++ b/src/datawizard/interfaces/variable_interface.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2010-2016  Université de Bordeaux
															
 
																+ * Copyright (C) 2010-2017  Université de Bordeaux
															
 
																  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
@@ -113,7 +113,7 @@ void starpu_variable_data_register(starpu_data_handle_t *handleptr, int home_nod
 
																 		.elemsize = elemsize
															
 
																 	};
															
 
																 #ifndef STARPU_SIMGRID
															
 
																-	if (home_node == STARPU_MAIN_RAM)
															
 
																+	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
															
 
																 	{
															
 
																 		STARPU_ASSERT_ACCESSIBLE(ptr);
															
 
																 		STARPU_ASSERT_ACCESSIBLE(ptr + elemsize - 1);
															
--- a/src/datawizard/interfaces/vector_interface.c
+++ b/src/datawizard/interfaces/vector_interface.c
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2009-2016  Université de Bordeaux
															
 
																+ * Copyright (C) 2009-2017  Université de Bordeaux
															
 
																  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
															
 
																  * Copyright (C) 2017  Inria
															
 
																  *
															
@@ -122,7 +122,7 @@ void starpu_vector_data_register(starpu_data_handle_t *handleptr, int home_node,
 
																                 .offset = 0
															
 
																 	};
															
 
																 #if (!defined(STARPU_SIMGRID) && !defined(STARPU_OPENMP))
															
 
																-	if (home_node == STARPU_MAIN_RAM)
															
 
																+	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
															
 
																 	{
															
 
																 		STARPU_ASSERT_ACCESSIBLE(ptr);
															
 
																 		STARPU_ASSERT_ACCESSIBLE(ptr + nx*elemsize - 1);
															
--- a/src/datawizard/malloc.c
+++ b/src/datawizard/malloc.c
@@ -2,6 +2,7 @@
 
																  *
															
 
																  * Copyright (C) 2009-2010, 2012-2017  Université de Bordeaux
															
 
																  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
															
 
																+ * Copyright (C) 2017  Inria
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -112,8 +113,14 @@ static struct starpu_codelet malloc_pinned_cl =
 
																 };
															
 
																 #endif
															
 
																+/* Allocation in CPU RAM */
															
 
																 int starpu_malloc_flags(void **A, size_t dim, int flags)
															
 
																 {
															
 
																+	return _starpu_malloc_flags_on_node(STARPU_MAIN_RAM, A, dim, flags);
															
 
																+}
															
 
																+
															
 
																+int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int flags)
															
 
																+{
															
 
																 	int ret=0;
															
 
																 	STARPU_ASSERT(A);
															
@@ -121,14 +128,14 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 
																 	if (flags & STARPU_MALLOC_COUNT)
															
 
																 	{
															
 
																 		if (!(flags & STARPU_MALLOC_NORECLAIM))
															
 
																-			while (starpu_memory_allocate(STARPU_MAIN_RAM, dim, flags) != 0)
															
 
																+			while (starpu_memory_allocate(dst_node, dim, flags) != 0)
															
 
																 			{
															
 
																 				size_t freed;
															
 
																 				size_t reclaim = 2 * dim;
															
 
																 				_STARPU_DEBUG("There is not enough memory left, we are going to reclaim %ld\n", (long)reclaim);
															
 
																-				_STARPU_TRACE_START_MEMRECLAIM(STARPU_MAIN_RAM,0);
															
 
																-				freed = _starpu_memory_reclaim_generic(STARPU_MAIN_RAM, 0, reclaim);
															
 
																-				_STARPU_TRACE_END_MEMRECLAIM(STARPU_MAIN_RAM,0);
															
 
																+				_STARPU_TRACE_START_MEMRECLAIM(dst_node,0);
															
 
																+				freed = _starpu_memory_reclaim_generic(dst_node, 0, reclaim);
															
 
																+				_STARPU_TRACE_END_MEMRECLAIM(dst_node,0);
															
 
																 				if (freed < dim && !(flags & STARPU_MEMORY_WAIT))
															
 
																 				{
															
 
																 					// We could not reclaim enough memory
															
@@ -137,9 +144,9 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 
																 				}
															
 
																 			}
															
 
																 		else if (flags & STARPU_MEMORY_WAIT)
															
 
																-			starpu_memory_allocate(STARPU_MAIN_RAM, dim, flags);
															
 
																+			starpu_memory_allocate(dst_node, dim, flags);
															
 
																 		else
															
 
																-			starpu_memory_allocate(STARPU_MAIN_RAM, dim, flags | STARPU_MEMORY_OVERFLOW);
															
 
																+			starpu_memory_allocate(dst_node, dim, flags | STARPU_MEMORY_OVERFLOW);
															
 
																 	}
															
 
																 	if (flags & STARPU_MALLOC_PINNED && disable_pinning <= 0 && STARPU_RUNNING_ON_VALGRIND == 0)
															
@@ -298,6 +305,18 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 
																 		_starpu_scc_allocate_shared_memory(A, dim);
															
 
																 #endif
															
 
																 	}
															
 
																+#ifdef STARPU_HAVE_HWLOC
															
 
																+	else if (starpu_memory_nodes_get_numa_count() > 1) {
															
 
																+		/* Several NUMA nodes: allocate the buffer bound to the hwloc NUMA object mapped from dst_node. */
															
 
																+		struct _starpu_machine_config *config = _starpu_get_machine_config();
															
 
																+		hwloc_topology_t hwtopology = config->topology.hwtopology;
															
 
																+		hwloc_obj_t numa_node_obj = hwloc_get_obj_by_type(hwtopology, HWLOC_OBJ_NODE, starpu_memory_nodes_numa_id_to_hwloclogid(dst_node));
															
 
																+		/* hwloc takes the binding policy and its own flags as separate arguments; StarPU's malloc flags are not hwloc flags. */
															
 
																+		*A = numa_node_obj ? hwloc_alloc_membind_nodeset(hwtopology, dim, numa_node_obj->nodeset, HWLOC_MEMBIND_BIND, HWLOC_MEMBIND_NOCPUBIND) : NULL;
															
 
																+		if (!*A)
															
 
																+			ret = -ENOMEM;
															
 
																+	}
															
 
																+#endif /* STARPU_HAVE_HWLOC */
															
 
																 	else
															
 
																 #ifdef STARPU_HAVE_POSIX_MEMALIGN
															
 
																 	if (_malloc_align != sizeof(void*))
															
@@ -333,7 +352,7 @@ end:
 
																 	}
															
 
																 	else if (flags & STARPU_MALLOC_COUNT)
															
 
																 	{
															
 
																-		starpu_memory_deallocate(STARPU_MAIN_RAM, dim);
															
 
																+		starpu_memory_deallocate(dst_node, dim);
															
 
																 	}
															
 
																 	return ret;
															
@@ -383,6 +402,11 @@ static struct starpu_codelet free_pinned_cl =
 
																 int starpu_free_flags(void *A, size_t dim, int flags)
															
 
																 {
															
 
																+	return _starpu_free_flags_on_node(STARPU_MAIN_RAM, A, dim, flags);
															
 
																+}
															
 
																+
															
 
																+int _starpu_free_flags_on_node(unsigned dst_node, void *A, size_t dim, int flags)
															
 
																+{
															
 
																 	if (flags & STARPU_MALLOC_PINNED && disable_pinning <= 0 && STARPU_RUNNING_ON_VALGRIND == 0)
															
 
																 	{
															
 
																 		if (_starpu_can_submit_cuda_task())
															
@@ -470,6 +494,13 @@ int starpu_free_flags(void *A, size_t dim, int flags)
 
																 		_starpu_scc_free_shared_memory(A);
															
 
																 #endif
															
 
																 	}
															
 
																+#ifdef STARPU_HAVE_HWLOC
															
 
																+	else if (starpu_memory_nodes_get_numa_count() > 1) {
															
 
																+		struct _starpu_machine_config *config = _starpu_get_machine_config();
															
 
																+		hwloc_topology_t hwtopology = config->topology.hwtopology;
															
 
																+		hwloc_free(hwtopology, A, dim);
															
 
																+	}
															
 
																+#endif /* STARPU_HAVE_HWLOC */
															
 
																 	else
															
 
																 		free(A);
															
@@ -478,7 +509,7 @@ out:
 
																 #endif
															
 
																 	if (flags & STARPU_MALLOC_COUNT)
															
 
																 	{
															
 
																-		starpu_memory_deallocate(STARPU_MAIN_RAM, dim);
															
 
																+		starpu_memory_deallocate(dst_node, dim);
															
 
																 	}
															
 
																 	return 0;
															
@@ -516,7 +547,7 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size, int flags)
 
																 	{
															
 
																 		case STARPU_CPU_RAM:
															
 
																 		{
															
 
																-			starpu_malloc_flags((void**) &addr, size,
															
 
																+			_starpu_malloc_flags_on_node(dst_node, (void**) &addr, size,			
															
 
																 #if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
															
 
																 					/* without memcpy_peer, we can not
															
 
																 					 * allocated pinned memory, since it
															
@@ -646,7 +677,7 @@ _starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int f
 
																 	switch(kind)
															
 
																 	{
															
 
																 		case STARPU_CPU_RAM:
															
 
																-			starpu_free_flags((void*)addr, size,
															
 
																+			_starpu_free_flags_on_node(dst_node, (void*)addr, size,
															
 
																 #if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
															
 
																 					flags & ~STARPU_MALLOC_PINNED
															
 
																 #else
															
--- a/src/datawizard/malloc.h
+++ b/src/datawizard/malloc.h
@@ -1,6 +1,6 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																- * Copyright (C) 2013  Université de Bordeaux
															
 
																+ * Copyright (C) 2013, 2017  Université de Bordeaux
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -22,4 +22,6 @@ void _starpu_malloc_shutdown(unsigned dst_node);
 
																 void _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size);
															
 
																+int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int flags);
															
 
																+int _starpu_free_flags_on_node(unsigned dst_node, void *A, size_t dim, int flags);
															
 
																 #endif
															
--- a/src/datawizard/memalloc.c
+++ b/src/datawizard/memalloc.c
@@ -2,7 +2,7 @@
 
																  *
															
 
																  * Copyright (C) 2009-2017  Université de Bordeaux
															
 
																  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
															
 
																- * Copyright (C) 2016  Inria
															
 
																+ * Copyright (C) 2016, 2017  Inria
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -21,6 +21,7 @@
 
																 #include <datawizard/memalloc.h>
															
 
																 #include <datawizard/footprint.h>
															
 
																 #include <core/disk.h>
															
 
																+#include <core/topology.h>
															
 
																 #include <starpu.h>
															
 
																 #include <common/uthash.h>
															
@@ -382,8 +383,8 @@ static size_t free_memory_on_node(struct _starpu_mem_chunk *mc, unsigned node)
 
																 			data_interface = mc->chunk_interface;
															
 
																 		STARPU_ASSERT(data_interface);
															
 
																-		if (handle && node == STARPU_MAIN_RAM)
															
 
																-			_starpu_data_unregister_ram_pointer(handle);
															
 
																+		if (handle && (starpu_node_get_kind(node) == STARPU_CPU_RAM))
															
 
																+			_starpu_data_unregister_ram_pointer(handle, node);
															
 
																 		_STARPU_TRACE_START_FREE(node, mc->size);
															
 
																 		mc->ops->free_data_on_node(data_interface, node);
															
@@ -443,8 +444,7 @@ static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_re
 
																 	struct _starpu_data_replicate *old_replicate = mc->replicate;
															
 
																 	if (old_replicate)
															
 
																 	{
															
 
																-		if (node == STARPU_MAIN_RAM)
															
 
																-			_starpu_data_unregister_ram_pointer(old_replicate->handle);
															
 
																+		_starpu_data_unregister_ram_pointer(old_replicate->handle, node);
															
 
																 		old_replicate->allocated = 0;
															
 
																 		old_replicate->automatically_allocated = 0;
															
 
																 		old_replicate->initialized = 0;
															
@@ -1486,11 +1486,11 @@ int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_
 
																 	replicate->allocated = 1;
															
 
																 	replicate->automatically_allocated = 1;
															
 
																-	if (replicate->relaxed_coherency == 0 && dst_node == STARPU_MAIN_RAM)
															
 
																+	if (replicate->relaxed_coherency == 0 && (starpu_node_get_kind(dst_node) == STARPU_CPU_RAM))
															
 
																 	{
															
 
																 		/* We are allocating the buffer in main memory, also register it
															
 
																 		 * for the gcc plugin.  */
															
 
																-		void *ptr = starpu_data_handle_to_pointer(handle, STARPU_MAIN_RAM);
															
 
																+		void *ptr = starpu_data_handle_to_pointer(handle, dst_node);
															
 
																 		if (ptr != NULL)
															
 
																 		{
															
 
																 			_starpu_data_register_ram_pointer(handle, ptr);
															
@@ -1617,7 +1617,7 @@ get_better_disk_can_accept_size(starpu_data_handle_t handle, unsigned node)
 
																 	int target = -1;
															
 
																 	unsigned nnodes = starpu_memory_nodes_get_count();
															
 
																 	unsigned int i;
															
 
																-	double time_disk = 0;
															
 
																+	double time_disk = 0.0;
															
 
																 	for (i = 0; i < nnodes; i++)
															
 
																 	{
															
@@ -1628,13 +1628,17 @@ get_better_disk_can_accept_size(starpu_data_handle_t handle, unsigned node)
 
																 			/* if we can write on the disk */
															
 
																 			if (_starpu_get_disk_flag(i) != STARPU_DISK_NO_RECLAIM)
															
 
																 			{
															
 
																-				/* only time can change between disk <-> main_ram
															
 
																-				 * and not between main_ram <-> worker if we compare diks*/
															
 
																-				double time_tmp = starpu_transfer_predict(i, STARPU_MAIN_RAM, _starpu_data_get_size(handle));
															
 
																-				if (target == -1 || time_disk > time_tmp)
															
 
																+				unsigned numa;
															
 
																+				unsigned nnumas = starpu_memory_nodes_get_numa_count();
															
 
																+				for (numa = 0; numa < nnumas; numa++)
															
 
																 				{
															
 
																-					target = i;
															
 
																-					time_disk = time_tmp;
															
 
																+					/* TODO : check if starpu_transfer_predict(node, i,...) is the same */
															
 
																+					double time_tmp = starpu_transfer_predict(node, numa, _starpu_data_get_size(handle)) + starpu_transfer_predict(i, numa, _starpu_data_get_size(handle));
															
 
																+					if (target == -1 || time_disk > time_tmp)
															
 
																+					{
															
 
																+						target = i;
															
 
																+						time_disk = time_tmp;
															
 
																+					}
															
 
																 				}
															
 
																 			}
															
 
																 		}
															
@@ -1642,6 +1646,9 @@ get_better_disk_can_accept_size(starpu_data_handle_t handle, unsigned node)
 
																 	return target;
															
 
																 }
															
 
																+#ifdef STARPU_DEVEL
															
 
																+#  warning TODO: better choose NUMA node
															
 
																+#endif
															
 
																 static unsigned
															
 
																 choose_target(starpu_data_handle_t handle, unsigned node)
															
@@ -1650,14 +1657,20 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 
																 	size_t size_handle = _starpu_data_get_size(handle);
															
 
																 	if (handle->home_node != -1)
															
 
																 		/* try to push on RAM if we can before to push on disk */
															
 
																-		if(starpu_node_get_kind(handle->home_node) == STARPU_DISK_RAM && node != STARPU_MAIN_RAM)
															
 
																+		if(starpu_node_get_kind(handle->home_node) == STARPU_DISK_RAM && (starpu_node_get_kind(node) != STARPU_CPU_RAM))
															
 
																 		{
															
 
																-			if (handle->per_node[STARPU_MAIN_RAM].allocated ||
															
 
																-			    _starpu_memory_manager_test_allocate_size(STARPU_MAIN_RAM, size_handle) == 1)
															
 
																+ 	                unsigned i;
															
 
																+			unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
															
 
																+			for (i=0; i<nb_numa_nodes; i++)
															
 
																 			{
															
 
																-				target = STARPU_MAIN_RAM;
															
 
																+				if (handle->per_node[i].allocated || 
															
 
																+				    _starpu_memory_manager_test_allocate_size(i, size_handle) == 1)
															
 
																+				{
															
 
																+					target = i;
															
 
																+					break;
															
 
																+				}
															
 
																 			}
															
 
																-			else
															
 
																+			if (target == -1)
															
 
																 			{
															
 
																 				target = get_better_disk_can_accept_size(handle, node);
															
 
																 			}
															
@@ -1672,19 +1685,26 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 
																 	{
															
 
																 		/* handle->home_node == -1 */
															
 
																 		/* no place for datas in RAM, we push on disk */
															
 
																-		if (node == STARPU_MAIN_RAM)
															
 
																+		if (starpu_node_get_kind(node) == STARPU_CPU_RAM)
															
 
																 		{
															
 
																 			target = get_better_disk_can_accept_size(handle, node);
															
 
																-		}
															
 
																+		} else {
															
 
																 		/* node != 0 */
															
 
																 		/* try to push data to RAM if we can before to push on disk*/
															
 
																-		else if (handle->per_node[STARPU_MAIN_RAM].allocated ||
															
 
																-			 _starpu_memory_manager_test_allocate_size(STARPU_MAIN_RAM, size_handle) == 1)
															
 
																-		{
															
 
																-			target = STARPU_MAIN_RAM;
															
 
																+			unsigned i;
															
 
																+			unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
															
 
																+			for (i=0; i<nb_numa_nodes; i++)
															
 
																+			{
															
 
																+				if (handle->per_node[i].allocated || 
															
 
																+				    _starpu_memory_manager_test_allocate_size(i, size_handle) == 1)
															
 
																+				{
															
 
																+					target = i;
															
 
																+					break;
															
 
																+				}
															
 
																+			}
															
 
																 		}
															
 
																 		/* no place in RAM */
															
 
																-		else
															
 
																+		if (target == -1)
															
 
																 		{
															
 
																 			target = get_better_disk_can_accept_size(handle, node);
															
 
																 		}
															
--- a/src/datawizard/memory_nodes.c
+++ b/src/datawizard/memory_nodes.c
@@ -77,7 +77,7 @@ void _starpu_memory_node_get_name(unsigned node, char *name, int size)
 
																 	switch (_starpu_descr.nodes[node])
															
 
																 	{
															
 
																 	case STARPU_CPU_RAM:
															
 
																-		prefix = "RAM";
															
 
																+		prefix = "NUMA";
															
 
																 		break;
															
 
																 	case STARPU_CUDA_RAM:
															
 
																 		prefix = "CUDA";
															
--- a/src/datawizard/user_interactions.c
+++ b/src/datawizard/user_interactions.c
@@ -257,13 +257,19 @@ int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, int node,
 
																 int starpu_data_acquire_cb(starpu_data_handle_t handle,
															
 
																 			   enum starpu_data_access_mode mode, void (*callback)(void *), void *arg)
															
 
																 {
															
 
																-	return starpu_data_acquire_on_node_cb(handle, STARPU_MAIN_RAM, mode, callback, arg);
															
 
																+	int home_node = handle->home_node;
															
 
																+	if (home_node < 0)
															
 
																+		home_node = STARPU_MAIN_RAM;
															
 
																+	return starpu_data_acquire_on_node_cb(handle, home_node, mode, callback, arg);
															
 
																 }
															
 
																 int starpu_data_acquire_cb_sequential_consistency(starpu_data_handle_t handle,
															
 
																 						  enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency)
															
 
																 {
															
 
																-	return starpu_data_acquire_on_node_cb_sequential_consistency(handle, STARPU_MAIN_RAM, mode, callback, arg, sequential_consistency);
															
 
																+	int home_node = handle->home_node;
															
 
																+	if (home_node < 0)
															
 
																+		home_node = STARPU_MAIN_RAM;
															
 
																+	return starpu_data_acquire_on_node_cb_sequential_consistency(handle, home_node, mode, callback, arg, sequential_consistency);
															
 
																 }
															
@@ -372,7 +378,10 @@ int starpu_data_acquire_on_node(starpu_data_handle_t handle, int node, enum star
 
																 int starpu_data_acquire(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
															
 
																 {
															
 
																-	return starpu_data_acquire_on_node(handle, STARPU_MAIN_RAM, mode);
															
 
																+	int home_node = handle->home_node;
															
 
																+	if (home_node < 0)
															
 
																+		home_node = STARPU_MAIN_RAM;
															
 
																+	return starpu_data_acquire_on_node(handle, home_node, mode);
															
 
																 }
															
 
																 int starpu_data_acquire_on_node_try(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode)
															
@@ -445,7 +454,10 @@ void starpu_data_release_on_node(starpu_data_handle_t handle, int node)
 
																 void starpu_data_release(starpu_data_handle_t handle)
															
 
																 {
															
 
																-	starpu_data_release_on_node(handle, STARPU_MAIN_RAM);
															
 
																+	int home_node = handle->home_node;
															
 
																+	if (home_node < 0)
															
 
																+		home_node = STARPU_MAIN_RAM;
															
 
																+	starpu_data_release_on_node(handle, home_node);
															
 
																 }
															
 
																 static void _prefetch_data_on_node(void *arg)
															
--- a/src/drivers/cpu/driver_cpu.c
+++ b/src/drivers/cpu/driver_cpu.c
@@ -153,35 +153,43 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 
																 static size_t _starpu_cpu_get_global_mem_size(int nodeid STARPU_ATTRIBUTE_UNUSED, struct _starpu_machine_config *config STARPU_ATTRIBUTE_UNUSED)
															
 
																 {
															
 
																 	size_t global_mem;
															
 
																-	starpu_ssize_t limit;
															
 
																+	starpu_ssize_t limit = -1;
															
 
																-	limit = starpu_get_env_number("STARPU_LIMIT_CPU_MEM");
															
 
																-#ifdef STARPU_DEVEL
															
 
																-#  warning TODO: take into account NUMA node and check STARPU_LIMIT_CPU_numanode_MEM
															
 
																-#endif
															
 
																+	char name[32];
															
 
																 #if defined(STARPU_HAVE_HWLOC)
															
 
																 	struct _starpu_machine_topology *topology = &config->topology;
															
 
																-#if 0
															
 
																-	/* Do not limit ourself to a single NUMA node yet, as we don't have real NUMA support for now */
															
 
																-        int depth_node = hwloc_get_type_depth(topology->hwtopology, HWLOC_OBJ_NODE);
															
 
																-
															
 
																-	if (depth_node == HWLOC_TYPE_DEPTH_UNKNOWN)
															
 
																-	     global_mem = hwloc_get_root_obj(topology->hwtopology)->memory.total_memory;
															
 
																+	int nnumas = starpu_memory_nodes_get_numa_count();
															
 
																+	if (nnumas > 1)
															
 
																+	{
															
 
																+		int depth_node = hwloc_get_type_depth(topology->hwtopology, HWLOC_OBJ_NODE);
															
 
																+
															
 
																+		if (depth_node == HWLOC_TYPE_DEPTH_UNKNOWN)
															
 
																+		     global_mem = hwloc_get_root_obj(topology->hwtopology)->memory.total_memory;
															
 
																+		else {
															
 
																+		     hwloc_obj_t obj = hwloc_get_obj_by_depth(topology->hwtopology, depth_node, nodeid);
															
 
																+		     global_mem = obj->memory.local_memory;
															
 
																+		     sprintf(name, "STARPU_LIMIT_CPU_NUMA_%d_MEM", obj->os_index);
															
 
																+		     limit = starpu_get_env_number(name);
															
 
																+		}
															
 
																+	}
															
 
																 	else
															
 
																-	     global_mem = hwloc_get_obj_by_depth(topology->hwtopology, depth_node, nodeid)->memory.local_memory;
															
 
																-#else
															
 
																-	global_mem = hwloc_get_root_obj(topology->hwtopology)->memory.total_memory;
															
 
																-#endif
															
 
																+	{
															
 
																+		/* Do not limit ourself to a single NUMA node */
															
 
																+		global_mem = hwloc_get_root_obj(topology->hwtopology)->memory.total_memory;
															
 
																+	}
															
 
																 #else /* STARPU_HAVE_HWLOC */
															
 
																 #ifdef STARPU_DEVEL
															
 
																-#  warning use sysinfo when available to get global size
															
 
																+#  warning TODO: use sysinfo when available to get global size
															
 
																 #endif
															
 
																 	global_mem = 0;
															
 
																 #endif
															
 
																+	if (limit == -1)
															
 
																+		limit = starpu_get_env_number("STARPU_LIMIT_CPU_MEM");
															
 
																+
															
 
																 	if (limit < 0)
															
 
																 		// No limit is defined, we return the global memory size
															
 
																 		return global_mem;
															
@@ -198,9 +206,7 @@ int _starpu_cpu_driver_init(struct _starpu_worker *cpu_worker)
 
																 	int devid = cpu_worker->devid;
															
 
																 	_starpu_driver_start(cpu_worker, _STARPU_FUT_CPU_KEY, 1);
															
 
																-	/* FIXME: when we have NUMA support, properly turn node number into NUMA node number */
															
 
																-	_starpu_memory_manager_set_global_memory_size(cpu_worker->memory_node, _starpu_cpu_get_global_mem_size(cpu_worker->memory_node, cpu_worker->config));
															
 
																-
															
 
																+	_starpu_memory_manager_set_global_memory_size(cpu_worker->memory_node, _starpu_cpu_get_global_mem_size(cpu_worker->numa_memory_node, cpu_worker->config));
															
 
																 	snprintf(cpu_worker->name, sizeof(cpu_worker->name), "CPU %d", devid);
															
 
																 	snprintf(cpu_worker->short_name, sizeof(cpu_worker->short_name), "CPU %d", devid);
															
 
																 	starpu_pthread_setname(cpu_worker->short_name);
															
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -53,7 +53,7 @@
 
																 static int ncudagpus = -1;
															
 
																 static size_t global_mem[STARPU_MAXCUDADEVS];
															
 
																-int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+1][STARPU_MAXCUDADEVS+1];
															
 
																+int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES][STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES];
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 static cudaStream_t streams[STARPU_NMAXWORKERS];
															
 
																 static cudaStream_t out_transfer_streams[STARPU_MAXCUDADEVS];
															
@@ -163,7 +163,7 @@ cudaStream_t starpu_cuda_get_local_in_transfer_stream()
 
																 	int worker = starpu_worker_get_id_check();
															
 
																 	int devid = starpu_worker_get_devid(worker);
															
 
																 	cudaStream_t stream;
															
 
																-
															
 
																+	
															
 
																 	stream = in_transfer_streams[devid];
															
 
																 	STARPU_ASSERT(stream);
															
 
																 	return stream;
															
@@ -323,7 +323,7 @@ static void init_device_context(unsigned devid, unsigned memnode)
 
																 					{
															
 
																 						_STARPU_DEBUG("Enabled GPU-Direct %d -> %d\n", worker->devid, devid);
															
 
																 						/* direct copies are made from the destination, see link_supports_direct_transfers */
															
 
																-						starpu_bus_set_direct(_starpu_cuda_bus_ids[worker->devid][devid], 1);
															
 
																+						starpu_bus_set_direct(_starpu_cuda_bus_ids[worker->devid+STARPU_MAXNUMANODES][devid+STARPU_MAXNUMANODES], 1);
															
 
																 					}
															
 
																 				}
															
 
																 			}
															
--- a/src/drivers/cuda/driver_cuda.h
+++ b/src/drivers/cuda/driver_cuda.h
@@ -32,7 +32,7 @@ extern struct _starpu_driver_ops _starpu_driver_cuda_ops;
 
																 void _starpu_cuda_init(void);
															
 
																 unsigned _starpu_get_cuda_device_count(void);
															
 
																-extern int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+1][STARPU_MAXCUDADEVS+1];
															
 
																+extern int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES][STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES];
															
 
																 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
															
 
																 void _starpu_cuda_discover_devices (struct _starpu_machine_config *);
															
--- a/src/drivers/mp_common/source_common.c
+++ b/src/drivers/mp_common/source_common.c
@@ -950,7 +950,6 @@ static void _starpu_src_common_worker_internal_work(struct _starpu_worker_set *
 
																         starpu_pthread_wait_reset(&worker_set->workers[0].wait);
															
 
																 #endif
															
 
																-
															
 
																 	/* Test if async transfers are completed */
															
 
																 	for (i = 0; i < worker_set->nworkers; i++)
															
 
																 	{
															
--- a/src/drivers/mpi/driver_mpi_common.c
+++ b/src/drivers/mpi/driver_mpi_common.c
@@ -469,7 +469,7 @@ void _starpu_mpi_common_barrier(void)
 
																 /* Compute bandwidth and latency between source and sink nodes
															
 
																  * Source node has to have the entire set of times at the end
															
 
																  */
															
 
																-void _starpu_mpi_common_measure_bandwidth_latency(double bandwidth_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS], double latency_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS])
															
 
																+void _starpu_mpi_common_measure_bandwidth_latency(double timing_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS], double latency_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS])
															
 
																 {
															
 
																         int ret;
															
 
																         unsigned iter;
															
@@ -506,7 +506,7 @@ void _starpu_mpi_common_measure_bandwidth_latency(double bandwidth_dtod[STARPU_M
 
																                                         STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "Bandwidth of MPI Master/Slave cannot be measured !");
															
 
																                                 }
															
 
																                                 end = starpu_timing_now();
															
 
																-                                bandwidth_dtod[sender][receiver] = (NITER*SIZE_BANDWIDTH)/(end - start);
															
 
																+                                timing_dtod[sender][receiver] = (end - start)/NITER/SIZE_BANDWIDTH;
															
 
																                                 /* measure latency sender to receiver */
															
 
																                                 start = starpu_timing_now();
															
@@ -546,14 +546,14 @@ void _starpu_mpi_common_measure_bandwidth_latency(double bandwidth_dtod[STARPU_M
 
																                 /* if we are the sender, we send the data */
															
 
																                 if (sender == id_proc)
															
 
																                 {
															
 
																-                        MPI_Send(bandwidth_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, src_node_id, 42, MPI_COMM_WORLD);
															
 
																+                        MPI_Send(timing_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, src_node_id, 42, MPI_COMM_WORLD);
															
 
																                         MPI_Send(latency_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, src_node_id, 42, MPI_COMM_WORLD);
															
 
																                 }
															
 
																                 /* the master node receives the data */
															
 
																                 if (src_node_id == id_proc)
															
 
																                 {
															
 
																-                        MPI_Recv(bandwidth_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, sender, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
															
 
																+                        MPI_Recv(timing_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, sender, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
															
 
																                         MPI_Recv(latency_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, sender, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
															
 
																                 }
															
--- a/src/util/openmp_runtime_support.c
+++ b/src/util/openmp_runtime_support.c
@@ -2415,8 +2415,12 @@ void starpu_omp_atomic_fallback_inline_end(void)
 
																 void starpu_omp_vector_annotate(starpu_data_handle_t handle, uint32_t slice_base)
															
 
																 {
															
 
																+	/* FIXME Oli: rather iterate over all nodes? */
															
 
																+	int node = starpu_data_get_home_node(handle);
															
 
																+	if (node < 0 || (starpu_node_get_kind(node) != STARPU_CPU_RAM))
															
 
																+		node = STARPU_MAIN_RAM;
															
 
																 	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
															
 
																-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
															
 
																+		starpu_data_get_interface_on_node(handle, node);
															
 
																 	assert(vector_interface->id == STARPU_VECTOR_INTERFACE_ID);
															
 
																 	vector_interface->slice_base = slice_base;
															
 
																 }
															
--- a/tests/datawizard/interfaces/test_interfaces.c
+++ b/tests/datawizard/interfaces/test_interfaces.c
@@ -794,6 +794,8 @@ handle_to_pointer(void)
 
																 	{
															
 
																 		if (starpu_node_get_kind(node) != STARPU_CPU_RAM)
															
 
																 			continue;
															
 
																+		if (!starpu_data_test_if_allocated_on_node(handle, node))
															
 
																+			continue;
															
 
																 		ptr = handle->ops->handle_to_pointer(handle, node);
															
 
																 		if (starpu_data_lookup(ptr) != handle)
															
--- a/tests/datawizard/nowhere.c
+++ b/tests/datawizard/nowhere.c
@@ -1,6 +1,7 @@
 
																 /* StarPU --- Runtime system for heterogeneous multicore architectures.
															
 
																  *
															
 
																  * Copyright (C) 2015-2016  Université de Bordeaux
															
 
																+ * Copyright (C) 2017  Inria
															
 
																  *
															
 
																  * StarPU is free software; you can redistribute it and/or modify
															
 
																  * it under the terms of the GNU Lesser General Public License as published by
															
@@ -84,6 +85,13 @@ int main(int argc, char **argv)
 
																 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
															
 
																 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
															
 
																+	if (starpu_memory_nodes_get_numa_count() > 1)
															
 
																+	{
															
 
																+		/* FIXME: assumes only one RAM node */
															
 
																+		starpu_shutdown();
															
 
																+		return STARPU_TEST_SKIPPED;
															
 
																+	}
															
 
																+
															
 
																 	starpu_variable_data_register(&handle_x, STARPU_MAIN_RAM, (uintptr_t)&x, sizeof(x));
															
 
																 	starpu_variable_data_register(&handle_y, STARPU_MAIN_RAM, (uintptr_t)&y, sizeof(y));
															
--- a/tests/datawizard/specific_node.c
+++ b/tests/datawizard/specific_node.c
@@ -34,10 +34,8 @@ starpu_data_handle_t data_handle;
 
																 unsigned data;
															
 
																-void specific_kernel(STARPU_ATTRIBUTE_UNUSED void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
															
 
																+void specific_kernel(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
															
 
																 {
															
 
																-	/* We do not protect this variable because it is only accessed when the
															
 
																-	 * "data_handle" piece of data is accessed. */
															
 
																 	unsigned *dataptr = (unsigned*) STARPU_VARIABLE_GET_PTR(descr[0]);
															
 
																 	STARPU_ASSERT(dataptr == &data);
															
@@ -55,6 +53,12 @@ static struct starpu_codelet specific_cl =
 
																 	.nodes = {STARPU_MAIN_RAM},
															
 
																 };
															
 
																+void cpu_codelet_unsigned_inc(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
															
 
																+{
															
 
																+	unsigned *dataptr = (unsigned*) STARPU_VARIABLE_GET_PTR(descr[0]);
															
 
																+	(*dataptr)++;
															
 
																+}
															
 
																+
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 void cuda_codelet_unsigned_inc(void *descr[], STARPU_ATTRIBUTE_UNUSED void *cl_arg);
															
 
																 #endif
															
@@ -64,7 +68,7 @@ void opencl_codelet_unsigned_inc(void *buffers[], void *args);
 
																 static struct starpu_codelet cl =
															
 
																 {
															
 
																-	.cpu_funcs = {specific_kernel},
															
 
																+	.cpu_funcs = {cpu_codelet_unsigned_inc},
															
 
																 #ifdef STARPU_USE_CUDA
															
 
																 	.cuda_funcs = {cuda_codelet_unsigned_inc},
															
 
																 	.cuda_flags = {STARPU_CUDA_ASYNC},
															
--- a/tests/disk/mem_reclaim.c
+++ b/tests/disk/mem_reclaim.c
@@ -60,7 +60,16 @@ int main(int argc, char **argv)
 
																 }
															
 
																 #else
															
 
																-const struct starpu_data_copy_methods my_vector_copy_data_methods_s;
															
 
																+static int (*any_to_any)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
															
 
																+
															
 
																+/* We need a ram-to-ram copy for NUMA machine, use any_to_any for that */
															
 
																+static int ram_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node) {
															
 
																+	return any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
															
 
																+}
															
 
																+
															
 
																+const struct starpu_data_copy_methods my_vector_copy_data_methods_s = {
															
 
																+	.ram_to_ram = ram_to_ram
															
 
																+};
															
 
																 struct starpu_data_interface_ops starpu_interface_my_vector_ops;
															
 
																 void starpu_my_vector_data_register(starpu_data_handle_t *handleptr, int home_node,
															
@@ -218,6 +227,7 @@ int main(void)
 
																 	setenv("STARPU_LIMIT_CPU_MEM", MEMSIZE_STR, 1);
															
 
																 	/* Build an vector-like interface which doesn't have the any_to_any helper, to force making use of pack/unpack */
															
 
																+	any_to_any = starpu_interface_vector_ops.copy_methods->any_to_any;
															
 
																 	memcpy(&starpu_interface_my_vector_ops, &starpu_interface_vector_ops, sizeof(starpu_interface_my_vector_ops));
															
 
																 	starpu_interface_my_vector_ops.copy_methods = &my_vector_copy_data_methods_s;