8 years ago · cbee1918f5
--- a/configure.ac
+++ b/configure.ac
@@ -562,6 +562,23 @@ AM_CONDITIONAL([STARPU_USE_MIC], [test "x$enable_mic" = "xyes"])
 
				 
			
 
				 ###############################################################################
			
 
				 
			
 
				+###############################################################################
			
 
				+#                                                                             #
			
 
				+#                           NUMA memory nodes                                 #
			
 
				+#                                                                             #
			
 
				+###############################################################################
			
 
				+
			
 
				+AC_MSG_CHECKING(maximum number of NUMA nodes)
			
 
				+AC_ARG_ENABLE(maxnumanodes, [AS_HELP_STRING([--enable-maxnumanodes=<number>],
			
 
				+			[maximum number of NUMA nodes])],
			
 
				+			nmaxnumanodes=$enableval, nmaxnumanodes=2)
			
 
				+AC_MSG_RESULT($nmaxnumanodes)
			
 
				+AC_DEFINE_UNQUOTED(STARPU_MAXNUMANODES, [$nmaxnumanodes],
			
 
				+		[maximum number of NUMA nodes])
			
 
				+
			
 
				+
			
 
				+###############################################################################
			
 
				+
			
 
				 AC_PATH_PROGS([STARPU_MS_LIB], [lib])
			
 
				 AC_ARG_VAR([STARPU_MS_LIB], [Path to Microsoft's Visual Studio `lib' tool])
			
 
				 AM_CONDITIONAL([STARPU_HAVE_MS_LIB], [test "x$STARPU_MS_LIB" != "x"])
			
@@ -2138,8 +2155,8 @@ if test x$maxnodes = x0 ; then
 
				 	else
			
 
				 		# We have one memory node shared by all CPU workers, one node per GPU
			
 
				 		# and per MIC device
			
 
				-		# we add nodes to use 3 memory disks
			
 
				-		nodes=4
			
 
				+		# we add nodes to use 2 memory disks
			
 
				+		nodes=`expr $nmaxnumanodes + 2`
			
 
				 		if test x$enable_cuda = xyes ; then
			
 
				 			# we could have used nmaxcudadev + 1, but this would certainly give an
			
 
				 			# odd number.
			
--- a/doc/doxygen/chapters/210_check_list_performance.doxy
+++ b/doc/doxygen/chapters/210_check_list_performance.doxy
@@ -197,9 +197,13 @@ structures of StarPU by describing the shape of your machine and/or your
 
				 application at the configure step.
			
 
				 
			
 
				 To reduce the memory footprint of the data internal structures of StarPU, one
			
 
				-can set the \ref enable-maxcpus "--enable-maxcpus", \ref enable-maxcudadev
			
 
				-"--enable-maxcudadev", \ref enable-maxopencldev "--enable-maxopencldev" and
			
 
				-\ref enable-maxnodes "--enable-maxnodes" configure parameters to give StarPU
			
 
				+can set the
			
 
				+\ref enable-maxcpus "--enable-maxcpus",
			
 
				+\ref enable-maxnumanodes "--enable-maxnumanodes",
			
 
				+\ref enable-maxcudadev "--enable-maxcudadev",
			
 
				+\ref enable-maxopencldev "--enable-maxopencldev" and
			
 
				+\ref enable-maxnodes "--enable-maxnodes"
			
 
				+configure parameters to give StarPU
			
 
				 the architecture of the machine it will run on, thus tuning the size of the
			
 
				 structures to the machine.
			
 
				 
			
--- a/doc/doxygen/chapters/501_environment_variables.doxy
+++ b/doc/doxygen/chapters/501_environment_variables.doxy
@@ -845,6 +845,14 @@ available to the application in the main CPU memory. Setting it enables allocati
 
				 cache in main memory. Setting it to zero lets StarPU overflow memory.
			
 
				 </dd>
			
 
				 
			
 
				+<dt>STARPU_LIMIT_CPU_NUMA_devid_MEM</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_LIMIT_CPU_NUMA_devid_MEM
			
 
				+\addindex __env__STARPU_LIMIT_CPU_NUMA_devid_MEM
			
 
				+This variable specifies the maximum number of megabytes that should be
			
 
				+available to the application on the NUMA node with the OS identifier <c>devid</c>.
			
 
				+</dd>
			
 
				+
			
 
				 <dt>STARPU_MINIMUM_AVAILABLE_MEM</dt>
			
 
				 <dd>
			
 
				 \anchor STARPU_MINIMUM_AVAILABLE_MEM
			
@@ -1110,6 +1118,19 @@ implements an advanced but centralized management of concurrent data
 
				 accesses (see \ref ConcurrentDataAccess).
			
 
				 </dd>
			
 
				 
			
 
				+<dt>STARPU_USE_NUMA</dt>
			
 
				+<dd>
			
 
				+\anchor STARPU_USE_NUMA 
			
 
				+\addindex __env__STARPU_USE_NUMA
			
 
				+When defined, NUMA nodes are taking into account by StarPU. Otherwise, memory
			
 
				+is considered as only one node. This is experimental for now.
			
 
				+
			
 
				+When enabled, STARPU_MAIN_MEMORY is a pointer to the NUMA node associated to the
			
 
				+first CPU worker if it exists, the NUMA node associated to the first GPU discovered otherwise.
			
 
				+If StarPU doesn't find any NUMA node after these step, STARPU_MAIN_MEMORY is the first NUMA node
			
 
				+discovered by StarPU.
			
 
				+</dd>
			
 
				+
			
 
				 </dl>
			
 
				 
			
 
				 \section ConfiguringTheHypervisor Configuring The Hypervisor
			
--- a/doc/doxygen/chapters/510_configure_options.doxy
+++ b/doc/doxygen/chapters/510_configure_options.doxy
@@ -130,6 +130,14 @@ Use at most <c>count</c> CPU cores.  This information is then
 
				 available as the macro ::STARPU_MAXCPUS.
			
 
				 </dd>
			
 
				 
			
 
				+<dt>--enable-maxnumanodes=<c>count</c></dt>
			
 
				+<dd>
			
 
				+\anchor enable-maxnumanodes
			
 
				+\addindex __configure__--enable-maxnumanodes
			
 
				+Use at most <c>count</c> NUMA nodes.  This information is then
			
 
				+available as the macro ::STARPU_MAXNUMANODES.
			
 
				+</dd>
			
 
				+
			
 
				 <dt>--disable-cpu</dt>
			
 
				 <dd>
			
 
				 \anchor disable-cpu
			
--- a/doc/doxygen/chapters/api/data_management.doxy
+++ b/doc/doxygen/chapters/api/data_management.doxy
@@ -2,7 +2,7 @@
 
				  * This file is part of the StarPU Handbook.
			
 
				  * Copyright (C) 2009--2011  Universit@'e de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
			
 
				- * Copyright (C) 2011, 2012 INRIA
			
 
				+ * Copyright (C) 2011, 2012, 2017  INRIA
			
 
				  * See the file version.doxy for copying conditions.
			
 
				  */
			
 
				 
			
@@ -104,6 +104,10 @@ data to StarPU, the specified memory node indicates where the piece of
 
				 data initially resides (we also call this memory node the home node of
			
 
				 a piece of data).
			
 
				 
			
 
				+In the case of NUMA systems, functions starpu_memory_nodes_numa_devid_to_id()
			
 
				+and starpu_memory_nodes_numa_id_to_devid() can be used to convert from NUMA node
			
 
				+numbers as seen by the Operating System and NUMA node numbers as seen by StarPU.
			
 
				+
			
 
				 \fn void starpu_data_register(starpu_data_handle_t *handleptr, int home_node, void *data_interface, struct starpu_data_interface_ops *ops)
			
 
				 \ingroup API_Data_Management
			
 
				 Register a piece of data into the handle located at the
			
--- a/doc/doxygen/chapters/api/workers.doxy
+++ b/doc/doxygen/chapters/api/workers.doxy
@@ -250,6 +250,16 @@ Return the type of \p node as defined by
 
				 this function should be used in the allocation function to determine
			
 
				 on which device the memory needs to be allocated.
			
 
				 
			
 
				+\fn int starpu_memory_nodes_numa_id_to_devid(int osid)
			
 
				+\ingroup API_Workers_Properties
			
 
				+This function returns the identifier of the memory node associated to the NUMA
			
 
				+node identified by \p osid by the Operating System.
			
 
				+
			
 
				+\fn int starpu_memory_nodes_numa_devid_to_id(unsigned id);
			
 
				+\ingroup API_Workers_Properties
			
 
				+This function returns the Operating System identifier of the memory node
			
 
				+whose StarPU identifier is \p id.
			
 
				+
			
 
				 \fn char *starpu_worker_get_type_as_string(enum starpu_worker_archtype type)
			
 
				 \ingroup API_Workers_Properties
			
 
				 Return worker \p type as a string.
			
--- a/examples/cpp/add_vectors_cpp11.cpp
+++ b/examples/cpp/add_vectors_cpp11.cpp
@@ -2,7 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009, 2010-2011, 2013-2015  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
			
 
				- * Copyright (C) 2012 INRIA
			
 
				+ * Copyright (C) 2012, 2017  INRIA
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -78,6 +78,12 @@ int main(int argc, char **argv)
 
				 		return 77;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				+	if (starpu_memory_nodes_get_numa_count() > 1)
			
 
				+	{
			
 
				+		starpu_shutdown();
			
 
				+		return 77;
			
 
				+	}
			
 
				+
			
 
				 	// StarPU data registering
			
 
				 	starpu_data_handle_t spu_vec_A;
			
 
				 	starpu_data_handle_t spu_vec_B;
			
--- a/include/starpu_config.h.in
+++ b/include/starpu_config.h.in
@@ -88,6 +88,7 @@
 
				 #undef STARPU_MAXNODES
			
 
				 #undef STARPU_NMAXBUFS
			
 
				 #undef STARPU_MAXCPUS
			
 
				+#undef STARPU_MAXNUMANODES
			
 
				 #undef STARPU_MAXCUDADEVS
			
 
				 #undef STARPU_MAXOPENCLDEVS
			
 
				 #undef STARPU_MAXMICDEVS
			
--- a/include/starpu_data.h
+++ b/include/starpu_data.h
@@ -2,7 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2010-2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015  CNRS
			
 
				- * Copyright (C) 2016  Inria
			
 
				+ * Copyright (C) 2016, 2017  Inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -132,6 +132,10 @@ enum starpu_node_kind
 
				 
			
 
				 unsigned starpu_worker_get_memory_node(unsigned workerid);
			
 
				 unsigned starpu_memory_nodes_get_count(void);
			
 
				+int starpu_memory_nodes_get_numa_count(void);
			
 
				+int starpu_memory_nodes_numa_id_to_devid(int osid);
			
 
				+int starpu_memory_nodes_numa_devid_to_id(unsigned id);
			
 
				+
			
 
				 enum starpu_node_kind starpu_node_get_kind(unsigned node);
			
 
				 
			
 
				 void starpu_data_set_wt_mask(starpu_data_handle_t handle, uint32_t wt_mask);
			
--- a/include/starpu_data_interfaces.h
+++ b/include/starpu_data_interfaces.h
@@ -483,6 +483,8 @@ size_t starpu_data_get_size(starpu_data_handle_t handle);
 
				 
			
 
				 starpu_data_handle_t starpu_data_lookup(const void *ptr);
			
 
				 
			
 
				+int starpu_data_get_home_node(starpu_data_handle_t handle);
			
 
				+
			
 
				 #ifdef __cplusplus
			
 
				 }
			
 
				 #endif
			
--- a/src/core/disk.c
+++ b/src/core/disk.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2013  Corentin Salingue
			
 
				  * Copyright (C) 2015, 2016, 2017  CNRS
			
 
				+ * Copyright (C) 2017  Inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -77,16 +78,22 @@ int starpu_disk_register(struct starpu_disk_ops *func, void *parameter, starpu_s
 
				 {
			
 
				 	STARPU_ASSERT_MSG(size < 0 || size >= STARPU_DISK_SIZE_MIN, "Minimum disk size is %d Bytes ! (Here %d) \n", (int) STARPU_DISK_SIZE_MIN, (int) size);
			
 
				 	/* register disk */
			
 
				-	unsigned memory_node = _starpu_memory_node_register(STARPU_DISK_RAM, 0);
			
 
				+	unsigned disk_memnode = _starpu_memory_node_register(STARPU_DISK_RAM, 0);
			
 
				 
			
 
				-	_starpu_register_bus(STARPU_MAIN_RAM, memory_node);
			
 
				-	_starpu_register_bus(memory_node, STARPU_MAIN_RAM);
			
 
				+        /* Connect the disk memory node to all numa memory nodes */
			
 
				+        int nb_numa_nodes = starpu_memory_nodes_get_numa_count();
			
 
				+        int numa_node;
			
 
				+        for (numa_node = 0; numa_node < nb_numa_nodes; numa_node++)
			
 
				+        {
			
 
				+                _starpu_register_bus(disk_memnode, numa_node);
			
 
				+                _starpu_register_bus(numa_node, disk_memnode);
			
 
				+        }
			
 
				 
			
 
				 	/* connect disk */
			
 
				 	void *base = func->plug(parameter, size);
			
 
				 
			
 
				 	/* remember it */
			
 
				-	int n STARPU_ATTRIBUTE_UNUSED = add_disk_in_list(memory_node,func,base);
			
 
				+	int n STARPU_ATTRIBUTE_UNUSED = add_disk_in_list(disk_memnode, func, base);
			
 
				 
			
 
				 #ifdef STARPU_SIMGRID
			
 
				 	char name[16];
			
@@ -96,13 +103,13 @@ int starpu_disk_register(struct starpu_disk_ops *func, void *parameter, starpu_s
 
				 	_starpu_simgrid_memory_node_set_host(memory_node, host);
			
 
				 #endif
			
 
				 
			
 
				-	int ret = func->bandwidth(memory_node);
			
 
				+	int ret = func->bandwidth(disk_memnode);
			
 
				 	/* have a problem with the disk */
			
 
				 	if (ret == 0)
			
 
				 		return -ENOENT;
			
 
				 	if (size >= 0)
			
 
				-		_starpu_memory_manager_set_global_memory_size(memory_node, size);
			
 
				-	return memory_node;
			
 
				+		_starpu_memory_manager_set_global_memory_size(disk_memnode, size);
			
 
				+	return disk_memnode;
			
 
				 }
			
 
				 
			
 
				 void _starpu_disk_unregister(void)
			
--- a/src/core/perfmodel/perfmodel.h
+++ b/src/core/perfmodel/perfmodel.h
@@ -110,6 +110,10 @@ void _starpu_simgrid_get_platform_path(int version, char *path, size_t maxlen);
 
				 
			
 
				 void _starpu_perfmodel_realloc(struct starpu_perfmodel *model, int nb);
			
 
				 
			
 
				+#if defined(STARPU_HAVE_HWLOC)
			
 
				+hwloc_topology_t _starpu_perfmodel_get_hwtopology();
			
 
				+#endif
			
 
				+
			
 
				 #ifdef __cplusplus
			
 
				 }
			
 
				 #endif
			
--- a/src/core/perfmodel/perfmodel_bus.c
+++ b/src/core/perfmodel/perfmodel_bus.c
--- a/src/core/simgrid.c
+++ b/src/core/simgrid.c
@@ -1,8 +1,8 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2012-2017  Université de Bordeaux
			
 
				- * Copyright (C) 2016  	    Inria
			
 
				- * Copyright (C) 2016, 2017  	    CNRS
			
 
				+ * Copyright (C) 2016, 2017  Inria
			
 
				+ * Copyright (C) 2016, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -1039,8 +1039,19 @@ void _starpu_simgrid_count_ngpus(void)
 
				 			ngpus = 0;
			
 
				 			for (src2 = 1; src2 < STARPU_MAXNODES; src2++)
			
 
				 			{
			
 
				-				if (starpu_bus_get_id(src2, STARPU_MAIN_RAM) == -1)
			
 
				+				int numa;
			
 
				+				int nnumas = starpu_memory_nodes_get_numa_count();
			
 
				+				int found = 0;
			
 
				+				for (numa = 0; numa < nnumas; numa++)
			
 
				+					if (starpu_bus_get_id(src2, numa) != -1)
			
 
				+					{
			
 
				+						found = 1;
			
 
				+						break;
			
 
				+					}
			
 
				+					
			
 
				+				if (!found)
			
 
				 					continue;
			
 
				+
			
 
				 				msg_host_t srchost2 = _starpu_simgrid_get_memnode_host(src2);
			
 
				 				int routesize2;
			
 
				 #ifdef HAVE_SG_HOST_ROUTE
			
--- a/src/core/topology.c
+++ b/src/core/topology.c
@@ -30,6 +30,7 @@
 
				 #include <drivers/mpi/driver_mpi_common.h>
			
 
				 #include <drivers/mp_common/source_common.h>
			
 
				 #include <drivers/opencl/driver_opencl.h>
			
 
				+#include <drivers/opencl/driver_opencl_utils.h>
			
 
				 #include <profiling/profiling.h>
			
 
				 #include <datawizard/datastats.h>
			
 
				 #include <datawizard/memory_nodes.h>
			
@@ -54,11 +55,23 @@
 
				 #include <hwloc/cuda.h>
			
 
				 #endif
			
 
				 
			
 
				+#if defined(STARPU_USE_OPENCL)
			
 
				+#include <hwloc/opencl.h>
			
 
				+#endif
			
 
				+
			
 
				 static unsigned topology_is_initialized = 0;
			
 
				 static int nobind;
			
 
				 
			
 
				 /* For checking whether two workers share the same PU, indexed by PU number */
			
 
				 static int cpu_worker[STARPU_MAXCPUS];
			
 
				+static unsigned nb_numa_nodes = 0;
			
 
				+static int numa_memory_nodes_to_hwloclogid[STARPU_MAXNUMANODES]; /* indexed by StarPU numa node to convert in hwloc logid */
			
 
				+static int numa_memory_nodes_to_physicalid[STARPU_MAXNUMANODES]; /* indexed by StarPU numa node to convert in physical id */
			
 
				+static unsigned numa_bus_id[STARPU_MAXNUMANODES*STARPU_MAXNUMANODES];
			
 
				+static int _starpu_get_logical_numa_node_worker(unsigned workerid);
			
 
				+
			
 
				+#define STARPU_NUMA_UNINITIALIZED (-2)
			
 
				+#define STARPU_NUMA_MAIN_RAM (-1)
			
 
				 
			
 
				 #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_SCC) || defined(STARPU_SIMGRID) || defined(STARPU_USE_MPI_MASTER_SLAVE)
			
 
				 
			
@@ -87,6 +100,124 @@ static struct _starpu_worker_set mic_worker_set[STARPU_MAXMICDEVS];
 
				 struct _starpu_worker_set mpi_worker_set[STARPU_MAXMPIDEVS];
			
 
				 #endif
			
 
				 
			
 
				+int starpu_memory_nodes_get_numa_count(void)
			
 
				+{
			
 
				+	return nb_numa_nodes;
			
 
				+}
			
 
				+
			
 
				+#if defined(STARPU_HAVE_HWLOC)
			
 
				+static int numa_get_logical_id(hwloc_obj_t obj)
			
 
				+{
			
 
				+	STARPU_ASSERT(obj);
			
 
				+	while (obj->type != HWLOC_OBJ_NODE)
			
 
				+	{
			
 
				+		obj = obj->parent;
			
 
				+
			
 
				+		/* If we don't find a "node" obj before the root, this means
			
 
				+		 * hwloc does not know whether there are numa nodes or not, so
			
 
				+		 * we should not use a per-node sampling in that case. */
			
 
				+		if (!obj)
			
 
				+			return STARPU_NUMA_MAIN_RAM;
			
 
				+	}
			
 
				+	return obj->logical_index;
			
 
				+}
			
 
				+
			
 
				+static int numa_get_physical_id(hwloc_obj_t obj)
			
 
				+{
			
 
				+	STARPU_ASSERT(obj);
			
 
				+	while (obj->type != HWLOC_OBJ_NODE)
			
 
				+	{
			
 
				+		obj = obj->parent;
			
 
				+
			
 
				+		/* If we don't find a "node" obj before the root, this means
			
 
				+		 * hwloc does not know whether there are numa nodes or not, so
			
 
				+		 * we should not use a per-node sampling in that case. */
			
 
				+		if (!obj)
			
 
				+			return STARPU_NUMA_MAIN_RAM;
			
 
				+	}
			
 
				+	return obj->os_index;
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+static int _starpu_get_logical_numa_node_worker(unsigned workerid)
			
 
				+{
			
 
				+#if defined(STARPU_HAVE_HWLOC)
			
 
				+	char * state;
			
 
				+	if ((state = starpu_getenv("STARPU_USE_NUMA")) && atoi(state))
			
 
				+	{
			
 
				+		struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
			
 
				+		struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config() ;
			
 
				+		struct _starpu_machine_topology *topology = &config->topology ;
			
 
				+
			
 
				+		hwloc_obj_t obj;
			
 
				+		switch(worker->arch) 	
			
 
				+		{
			
 
				+			case STARPU_CPU_WORKER:
			
 
				+				obj = hwloc_get_obj_by_type(topology->hwtopology, HWLOC_OBJ_PU, worker->bindid) ;
			
 
				+				break;
			
 
				+			default:
			
 
				+				STARPU_ABORT();
			
 
				+		}
			
 
				+
			
 
				+		return numa_get_logical_id(obj);
			
 
				+	}
			
 
				+	else		
			
 
				+#endif 
			
 
				+	{
			
 
				+		(void) workerid; /* unused */
			
 
				+		return STARPU_NUMA_MAIN_RAM;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static int _starpu_get_physical_numa_node_worker(unsigned workerid)
			
 
				+{
			
 
				+#if defined(STARPU_HAVE_HWLOC)
			
 
				+	char * state;
			
 
				+	if ((state = starpu_getenv("STARPU_USE_NUMA")) && atoi(state))
			
 
				+	{
			
 
				+		struct _starpu_worker *worker = _starpu_get_worker_struct(workerid);
			
 
				+		struct _starpu_machine_config *config = (struct _starpu_machine_config *)_starpu_get_machine_config() ;
			
 
				+		struct _starpu_machine_topology *topology = &config->topology ;
			
 
				+
			
 
				+		hwloc_obj_t obj;
			
 
				+		switch(worker->arch) 	
			
 
				+		{
			
 
				+			case STARPU_CPU_WORKER:
			
 
				+				obj = hwloc_get_obj_by_type(topology->hwtopology, HWLOC_OBJ_PU, worker->bindid) ;
			
 
				+				break;
			
 
				+			default:
			
 
				+				STARPU_ABORT();
			
 
				+		}
			
 
				+
			
 
				+		return numa_get_physical_id(obj);
			
 
				+	}
			
 
				+	else		
			
 
				+#endif 
			
 
				+	{
			
 
				+		(void) workerid; /* unused */
			
 
				+		return STARPU_NUMA_MAIN_RAM;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static int _starpu_numa_get_logical_id_from_pu(int pu)
			
 
				+{
			
 
				+#if defined(STARPU_HAVE_HWLOC)
			
 
				+	if (nb_numa_nodes > 1)
			
 
				+	{
			
 
				+		struct _starpu_machine_config *config = _starpu_get_machine_config();
			
 
				+		struct _starpu_machine_topology *topology = &config->topology;
			
 
				+
			
 
				+		hwloc_obj_t obj = hwloc_get_obj_by_type(topology->hwtopology, HWLOC_OBJ_PU, pu);
			
 
				+		return numa_get_logical_id(obj);
			
 
				+	}
			
 
				+	else
			
 
				+#endif
			
 
				+	{
			
 
				+		return -1;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+
			
 
				 struct _starpu_worker *_starpu_get_worker_from_driver(struct starpu_driver *d)
			
 
				 {
			
 
				 	unsigned nworkers = starpu_worker_get_count();
			
@@ -846,6 +977,67 @@ _starpu_topology_get_nhwpu (struct _starpu_machine_config *config)
 
				 	return config->topology.nhwpus;
			
 
				 }
			
 
				 
			
 
				+unsigned _starpu_topology_get_nnumanodes(struct _starpu_machine_config *config STARPU_ATTRIBUTE_UNUSED)
			
 
				+{
			
 
				+#if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
			
 
				+        _starpu_opencl_init();
			
 
				+#endif
			
 
				+#if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
			
 
				+        _starpu_init_cuda();
			
 
				+#endif
			
 
				+        _starpu_init_topology(config);
			
 
				+
			
 
				+	int res;
			
 
				+#if defined(STARPU_HAVE_HWLOC)
			
 
				+	char * state;
			
 
				+	if ((state = starpu_getenv("STARPU_USE_NUMA")) && atoi(state))
			
 
				+	{
			
 
				+		struct _starpu_machine_topology *topology = &config->topology ;
			
 
				+		int nnumanodes = hwloc_get_nbobjs_by_type(topology->hwtopology, HWLOC_OBJ_NODE) ;
			
 
				+		res = nnumanodes > 0 ? nnumanodes : 1 ;
			
 
				+	}
			
 
				+	else
			
 
				+#endif 
			
 
				+	{	
			
 
				+		res = 1;
			
 
				+	}
			
 
				+
			
 
				+	STARPU_ASSERT_MSG(res <= STARPU_MAXNUMANODES, "Number of NUMA nodes discovered is higher than maximum accepted ! Use configure option --enable-maxnumanodes=xxx to increase the maximum value of supported NUMA nodes.\n");
			
 
				+	return res;
			
 
				+}
			
 
				+
			
 
				+//TODO change this in an array
			
 
				+int starpu_memory_nodes_numa_hwloclogid_to_id(int logid)
			
 
				+{
			
 
				+	unsigned n;
			
 
				+	for (n = 0; n < nb_numa_nodes; n++)
			
 
				+		if (numa_memory_nodes_to_hwloclogid[n] == logid)
			
 
				+			return n;
			
 
				+	return -1;
			
 
				+}
			
 
				+
			
 
				+int starpu_memory_nodes_numa_id_to_hwloclogid(unsigned id)
			
 
				+{
			
 
				+	STARPU_ASSERT(id < STARPU_MAXNUMANODES);
			
 
				+	return numa_memory_nodes_to_hwloclogid[id];
			
 
				+}
			
 
				+
			
 
				+int starpu_memory_nodes_numa_devid_to_id(unsigned id)
			
 
				+{
			
 
				+	STARPU_ASSERT(id < STARPU_MAXNUMANODES);
			
 
				+	return numa_memory_nodes_to_physicalid[id];
			
 
				+}
			
 
				+
			
 
				+//TODO change this in an array
			
 
				+int starpu_memory_nodes_numa_id_to_devid(int osid)
			
 
				+{
			
 
				+	unsigned n;
			
 
				+	for (n = 0; n < nb_numa_nodes; n++)
			
 
				+		if (numa_memory_nodes_to_physicalid[n] == osid)
			
 
				+			return n;
			
 
				+	return -1;
			
 
				+}
			
 
				+
			
 
				 #ifdef STARPU_HAVE_HWLOC
			
 
				 void _starpu_topology_filter(hwloc_topology_t topology)
			
 
				 {
			
@@ -1751,35 +1943,294 @@ _starpu_bind_thread_on_cpus (
 
				 #endif
			
 
				 }
			
 
				 
			
 
				-static void
			
 
				-_starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_config STARPU_ATTRIBUTE_UNUSED)
			
 
				+static void _starpu_init_binding_cpu(struct _starpu_machine_config *config)
			
 
				+{
			
 
				+	unsigned worker;
			
 
				+	for (worker = 0; worker < config->topology.nworkers; worker++)
			
 
				+	{
			
 
				+		struct _starpu_worker *workerarg = &config->workers[worker];
			
 
				+
			
 
				+		switch (workerarg->arch)
			
 
				+		{
			
 
				+			case STARPU_CPU_WORKER:
			
 
				+			{
			
 
				+				/* Dedicate a cpu core to that worker */
			
 
				+				workerarg->bindid = _starpu_get_next_bindid(config, NULL, 0);
			
 
				+				break;
			
 
				+			}
			
 
				+			default:
			
 
				+				/* Do nothing */
			
 
				+				break;
			
 
				+		}
			
 
				+
			
 
				+
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+//TODO : Check SIMGRID
			
 
				+static void _starpu_init_numa_node(struct _starpu_machine_config *config)
			
 
				 {
			
 
				-	/* launch one thread per CPU */
			
 
				-	unsigned ram_memory_node;
			
 
				+	nb_numa_nodes = 0;
			
 
				+
			
 
				+	unsigned i;
			
 
				+	for (i = 0; i < STARPU_MAXNUMANODES; i++)
			
 
				+	{
			
 
				+		numa_memory_nodes_to_hwloclogid[i] = STARPU_NUMA_UNINITIALIZED;
			
 
				+		numa_memory_nodes_to_physicalid[i] = STARPU_NUMA_UNINITIALIZED;
			
 
				+	}
			
 
				 
			
 
				-	/* note that even if the CPU cpu are not used, we always have a RAM
			
 
				-	 * node */
			
 
				-	/* TODO : support NUMA  ;) */
			
 
				-	ram_memory_node = _starpu_memory_node_register(STARPU_CPU_RAM, 0);
			
 
				-	STARPU_ASSERT(ram_memory_node == STARPU_MAIN_RAM);
			
 
				 
			
 
				+	char * state;
			
 
				+	/* NUMA mode activated */
			
 
				+	if ((state = starpu_getenv("STARPU_USE_NUMA")) && atoi(state))
			
 
				+	{
			
 
				+		/* Take all NUMA nodes used by CPU workers */
			
 
				+		unsigned worker;
			
 
				+		for (worker = 0; worker < config->topology.nworkers; worker++)
			
 
				+		{
			
 
				+			struct _starpu_worker *workerarg = &config->workers[worker];
			
 
				+			if (workerarg->arch == STARPU_CPU_WORKER)
			
 
				+			{
			
 
				+				int numa_logical_id = _starpu_get_logical_numa_node_worker(worker);
			
 
				+
			
 
				+				/* Convert logical id to StarPU id to check if this NUMA node is already saved or not */
			
 
				+				int numa_starpu_id = starpu_memory_nodes_numa_hwloclogid_to_id(numa_logical_id);
			
 
				+
			
 
				+				/* This shouldn't happen */
			
 
				+				if (numa_starpu_id == -1 && nb_numa_nodes == STARPU_MAXNUMANODES)
			
 
				+				{
			
 
				+					_STARPU_MSG("Warning: %u NUMA nodes available. Only %u enabled. Use configure option --enable-maxnumanodes=xxx to update the maximum value of supported NUMA nodes.\n", _starpu_topology_get_nnumanodes(config), STARPU_MAXNUMANODES);
			
 
				+					STARPU_ABORT();
			
 
				+				}
			
 
				+
			
 
				+				if (numa_starpu_id == -1)
			
 
				+				{
			
 
				+					int devid = numa_logical_id == STARPU_NUMA_MAIN_RAM ? 0 : numa_logical_id;
			
 
				+					int memnode = _starpu_memory_node_register(STARPU_CPU_RAM, devid);
			
 
				+					STARPU_ASSERT_MSG(memnode < STARPU_MAXNUMANODES, "Wrong Memory Node : %d (only %d available)", memnode, STARPU_MAXNUMANODES);
			
 
				+					numa_memory_nodes_to_hwloclogid[memnode] = numa_logical_id;
			
 
				+					int numa_physical_id = _starpu_get_physical_numa_node_worker(worker);
			
 
				+					numa_memory_nodes_to_physicalid[memnode] = numa_physical_id;
			
 
				+					nb_numa_nodes++;
			
 
				+#ifdef STARPU_SIMGRID
			
 
				+					snprintf(name, sizeof(name), "RAM%d", memnode);
			
 
				+					host = _starpu_simgrid_get_host_by_name(name);
			
 
				+					STARPU_ASSERT(host);
			
 
				+					_starpu_simgrid_memory_node_set_host(memnode, host);
			
 
				+#endif
			
 
				+				}
			
 
				+			}
			
 
				+		}
			
 
				+
			
 
				+		/* If we found NUMA nodes from CPU workers, it's good */
			
 
				+		if (nb_numa_nodes != 0)
			
 
				+			return;
			
 
				+
			
 
				+		_STARPU_DISP("No NUMA nodes found when checking CPU workers...\n");
			
 
				+
			
 
				+#if (defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)) && defined(STARPU_HAVE_HWLOC)
			
 
				+		_STARPU_DISP("Take NUMA nodes attached to CUDA and OpenCL devices...\n");
			
 
				+#endif
			
 
				+
			
 
				+#if defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_HWLOC)
			
 
				+		for (i = 0; i < config->topology.ncudagpus; i++)
			
 
				+		{
			
 
				+			hwloc_obj_t obj = hwloc_cuda_get_device_osdev_by_index(config->topology.hwtopology, i);
			
 
				+
			
 
				+			/* Hwloc cannot recognize some devices */
			
 
				+			if (!obj)
			
 
				+				continue;
			
 
				+
			
 
				+			while (obj->type != HWLOC_OBJ_NODE)
			
 
				+			{
			
 
				+				obj = obj->parent;
			
 
				+
			
 
				+				/* If we don't find a "node" obj before the root, this means
			
 
				+				 * hwloc does not know whether there are numa nodes or not, so
			
 
				+				 * we should not use a per-node sampling in that case. */
			
 
				+				if (!obj)
			
 
				+					continue;
			
 
				+			}
			
 
				+			int numa_starpu_id = starpu_memory_nodes_numa_hwloclogid_to_id(obj->logical_index);
			
 
				+
			
 
				+			/* This shouldn't happen */
			
 
				+			if (numa_starpu_id == -1 && nb_numa_nodes == STARPU_MAXNUMANODES)
			
 
				+			{
			
 
				+				_STARPU_MSG("Warning: %u NUMA nodes available. Only %u enabled. Use configure option --enable-maxnumanodes=xxx to update the maximum value of supported NUMA nodes.\n", _starpu_topology_get_nnumanodes(config), STARPU_MAXNUMANODES);
			
 
				+				STARPU_ABORT();
			
 
				+			}
			
 
				+
			
 
				+			if (numa_starpu_id == -1)
			
 
				+			{
			
 
				+				int memnode = _starpu_memory_node_register(STARPU_CPU_RAM, obj->logical_index);
			
 
				+				STARPU_ASSERT_MSG(memnode < STARPU_MAXNUMANODES, "Wrong Memory Node : %d (only %d available)", memnode, STARPU_MAXNUMANODES);
			
 
				+				numa_memory_nodes_to_hwloclogid[memnode] = obj->logical_index;
			
 
				+				numa_memory_nodes_to_physicalid[memnode] = obj->os_index;
			
 
				+				nb_numa_nodes++;
			
 
				+#ifdef STARPU_SIMGRID
			
 
				+				snprintf(name, sizeof(name), "RAM%d", memnode);
			
 
				+				host = _starpu_simgrid_get_host_by_name(name);
			
 
				+				STARPU_ASSERT(host);
			
 
				+				_starpu_simgrid_memory_node_set_host(memnode, host);
			
 
				+#endif
			
 
				+			}
			
 
				+		}	
			
 
				+#endif
			
 
				+#if defined(STARPU_USE_OPENCL) && defined(STARPU_HAVE_HWLOC)
			
 
				+		if (config->topology.nopenclgpus > 0)
			
 
				+		{
			
 
				+			cl_int err;
			
 
				+			cl_platform_id platform_id[_STARPU_OPENCL_PLATFORM_MAX];
			
 
				+			cl_uint nb_platforms;
			
 
				+			unsigned platform;
			
 
				+			unsigned nb_opencl_devices = 0, num = 0;
			
 
				+
			
 
				+			err = clGetPlatformIDs(_STARPU_OPENCL_PLATFORM_MAX, platform_id, &nb_platforms);
			
 
				+			if (STARPU_UNLIKELY(err != CL_SUCCESS)) 
			
 
				+				nb_platforms=0;
			
 
				+
			
 
				+			cl_device_type device_type = CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR;
			
 
				+			if (starpu_get_env_number("STARPU_OPENCL_ON_CPUS") > 0)
			
 
				+				device_type |= CL_DEVICE_TYPE_CPU;
			
 
				+			if (starpu_get_env_number("STARPU_OPENCL_ONLY_ON_CPUS") > 0)
			
 
				+				device_type = CL_DEVICE_TYPE_CPU;
			
 
				+
			
 
				+			for (platform = 0; platform < nb_platforms ; platform++)
			
 
				+			{
			
 
				+				err = clGetDeviceIDs(platform_id[platform], device_type, 0, NULL, &num);
			
 
				+				if (err != CL_SUCCESS)
			
 
				+					num = 0;
			
 
				+				nb_opencl_devices += num;
			
 
				+
			
 
				+				for (i = 0; i < num; i++)
			
 
				+				{
			
 
				+					hwloc_obj_t obj = hwloc_opencl_get_device_osdev_by_index(config->topology.hwtopology, platform, i);
			
 
				+
			
 
				+					/* Hwloc cannot recognize some devices */
			
 
				+					if (!obj)
			
 
				+						continue;
			
 
				+
			
 
				+					while (obj->type != HWLOC_OBJ_NODE)
			
 
				+					{
			
 
				+						obj = obj->parent;
			
 
				+
			
 
				+						/* If we don't find a "node" obj before the root, this means
			
 
				+						 * hwloc does not know whether there are numa nodes or not, so
			
 
				+						 * we should not use a per-node sampling in that case. */
			
 
				+						if (!obj)
			
 
				+							continue;
			
 
				+					}
			
 
				+					int numa_starpu_id = starpu_memory_nodes_numa_hwloclogid_to_id(obj->logical_index);
			
 
				+
			
 
				+					/* This shouldn't happen */
			
 
				+					if (numa_starpu_id == -1 && nb_numa_nodes == STARPU_MAXNUMANODES)
			
 
				+					{
			
 
				+						_STARPU_MSG("Warning: %u NUMA nodes available. Only %u enabled. Use configure option --enable-maxnumanodes=xxx to update the maximum value of supported NUMA nodes.\n", _starpu_topology_get_nnumanodes(config), STARPU_MAXNUMANODES);
			
 
				+						STARPU_ABORT();
			
 
				+					}
			
 
				+
			
 
				+					if (numa_starpu_id == -1)
			
 
				+					{
			
 
				+						int memnode = _starpu_memory_node_register(STARPU_CPU_RAM, obj->logical_index);
			
 
				+						STARPU_ASSERT_MSG(memnode < STARPU_MAXNUMANODES, "Wrong Memory Node : %d (only %d available)", memnode, STARPU_MAXNUMANODES);
			
 
				+						numa_memory_nodes_to_hwloclogid[memnode] = obj->logical_index;
			
 
				+						numa_memory_nodes_to_physicalid[memnode] = obj->os_index;	
			
 
				+						nb_numa_nodes++;
			
 
				 #ifdef STARPU_SIMGRID
			
 
				-	char name[16];
			
 
				-	msg_host_t host = _starpu_simgrid_get_host_by_name("RAM");
			
 
				-	STARPU_ASSERT(host);
			
 
				-	_starpu_simgrid_memory_node_set_host(STARPU_MAIN_RAM, host);
			
 
				+						snprintf(name, sizeof(name), "RAM%d", memnode);
			
 
				+						host = _starpu_simgrid_get_host_by_name(name);
			
 
				+						STARPU_ASSERT(host);
			
 
				+						_starpu_simgrid_memory_node_set_host(memnode, host);
			
 
				 #endif
			
 
				+					}
			
 
				+				}	
			
 
				+			}
			
 
				+		}
			
 
				+#endif
			
 
				+	}
			
 
				+	
			
 
				+#if (defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)) && defined(STARPU_HAVE_HWLOC)
			
 
				+	//Found NUMA nodes from CUDA nodes
			
 
				+	if (nb_numa_nodes != 0)
			
 
				+		return;
			
 
				+
			
 
				+	/* In case, we do not find any NUMA nodes when checking NUMA nodes attached to GPUs, we take all of them */
			
 
				+	_STARPU_DISP("No NUMA nodes found when checking GPUs devices...\n");
			
 
				+#endif
			
 
				+
			
 
				+	_STARPU_DISP("Finally, take all NUMA nodes available... \n");
			
 
				+
			
 
				+	unsigned nnuma = _starpu_topology_get_nnumanodes(config);
			
 
				+	if (nnuma > STARPU_MAXNUMANODES)
			
 
				+	{
			
 
				+		_STARPU_MSG("Warning: %u NUMA nodes available. Only %u enabled. Use configure option --enable-maxnumanodes=xxx to update the maximum value of supported NUMA nodes.\n", _starpu_topology_get_nnumanodes(config), STARPU_MAXNUMANODES);
			
 
				+		nnuma = STARPU_MAXNUMANODES;		
			
 
				+	}
			
 
				+
			
 
				+	unsigned numa;
			
 
				+	for (numa = 0; numa < nnuma; numa++)
			
 
				+	{
			
 
				+#if defined(STARPU_HAVE_HWLOC)
			
 
				+		if (nnuma > 1)
			
 
				+		{
			
 
				+			hwloc_obj_t obj = hwloc_get_obj_by_type(config->topology.hwtopology, HWLOC_OBJ_NUMANODE, numa);
			
 
				+			unsigned numa_logical_id = obj->logical_index;
			
 
				+			unsigned numa_physical_id = obj->os_index;
			
 
				 
			
 
				+			int memnode = _starpu_memory_node_register(STARPU_CPU_RAM, 0);
			
 
				+			STARPU_ASSERT_MSG(memnode < STARPU_MAXNUMANODES, "Wrong Memory Node : %d (only %d available) \n", memnode, STARPU_MAXNUMANODES);
			
 
				+
			
 
				+			numa_memory_nodes_to_hwloclogid[memnode] = numa_logical_id;
			
 
				+			numa_memory_nodes_to_physicalid[memnode] = numa_physical_id;
			
 
				+			nb_numa_nodes++;								
			
 
				+
			
 
				+#ifdef STARPU_SIMGRID
			
 
				+			snprintf(name, sizeof(name), "RAM%d", memnode);
			
 
				+			host = _starpu_simgrid_get_host_by_name(name);
			
 
				+			STARPU_ASSERT(host);
			
 
				+			_starpu_simgrid_memory_node_set_host(memnode, host);
			
 
				+#endif
			
 
				+		}
			
 
				+		else
			
 
				+#endif /* defined(STARPU_HAVE_HWLOC) */
			
 
				+		{
			
 
				+
			
 
				+			/* In this case, nnuma has only one node */
			
 
				+			int memnode = _starpu_memory_node_register(STARPU_CPU_RAM, 0);
			
 
				+			STARPU_ASSERT_MSG(memnode == STARPU_MAIN_RAM, "Wrong Memory Node : %d (expected %d) \n", memnode, STARPU_MAIN_RAM);
			
 
				+
			
 
				+			numa_memory_nodes_to_hwloclogid[memnode] = STARPU_NUMA_MAIN_RAM;
			
 
				+			numa_memory_nodes_to_physicalid[memnode] = STARPU_NUMA_MAIN_RAM;
			
 
				+			nb_numa_nodes++;								
			
 
				+#ifdef STARPU_SIMGRID
			
 
				+			char name[16];
			
 
				+			msg_host_t host = _starpu_simgrid_get_host_by_name("RAM");
			
 
				+			STARPU_ASSERT(host);
			
 
				+			_starpu_simgrid_memory_node_set_host(STARPU_MAIN_RAM, host);
			
 
				+#endif
			
 
				+		}
			
 
				+
			
 
				+	}	
			
 
				+	
			
 
				+	STARPU_ASSERT_MSG(nb_numa_nodes > 0, "No NUMA node found... We need at least one memory node !\n");	
			
 
				+}
			
 
				+
			
 
				+static void _starpu_init_numa_bus()
			
 
				+{
			
 
				+	unsigned i, j;
			
 
				+	for (i = 0; i < nb_numa_nodes; i++)
			
 
				+		for (j = 0; j < nb_numa_nodes; j++)
			
 
				+			if (i != j)
			
 
				+				numa_bus_id[i*nb_numa_nodes+j] = _starpu_register_bus(i, j);
			
 
				+}
			
 
				+
			
 
				+static void
			
 
				+_starpu_init_workers_binding_and_memory (struct _starpu_machine_config *config, int no_mp_config STARPU_ATTRIBUTE_UNUSED)
			
 
				+{
			
 
				 	/* We will store all the busid of the different (src, dst)
			
 
				 	 * combinations in a matrix which we initialize here. */
			
 
				 	_starpu_initialize_busid_matrix();
			
 
				 
			
 
				-	/* Each device is initialized,
			
 
				-	 * giving it a memory node and a core bind id.
			
 
				-	 */
			
 
				-	/* TODO: STARPU_MAXNUMANODES */
			
 
				-	unsigned numa_init[1] = { 1 };
			
 
				-	unsigned numa_memory_nodes[1] = { ram_memory_node };
			
 
				 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
			
 
				 	unsigned cuda_init[STARPU_MAXCUDADEVS] = { };
			
 
				 	unsigned cuda_memory_nodes[STARPU_MAXCUDADEVS];
			
@@ -1801,6 +2252,7 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
				 	unsigned mpi_memory_nodes[STARPU_MAXMPIDEVS];
			
 
				 	unsigned mpi_bindid[STARPU_MAXMPIDEVS];
			
 
				 #endif
			
 
				+
			
 
				 	unsigned bindid;
			
 
				 
			
 
				 	for (bindid = 0; bindid < config->nbindid; bindid++)
			
@@ -1810,6 +2262,13 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
				 		config->bindid_workers[bindid].nworkers = 0;
			
 
				 	}
			
 
				 
			
 
				+	/* Init CPU binding before NUMA nodes, because we use it to discover NUMA nodes */
			
 
				+	_starpu_init_binding_cpu(config);
			
 
				+
			
 
				+	/* Initialize NUMA nodes */
			
 
				+	_starpu_init_numa_node(config);
			
 
				+	_starpu_init_numa_bus();
			
 
				+
			
 
				 	unsigned worker;
			
 
				 	for (worker = 0; worker < config->topology.nworkers; worker++)
			
 
				 	{
			
@@ -1828,33 +2287,22 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
				 		{
			
 
				 			case STARPU_CPU_WORKER:
			
 
				 			{
			
 
				-				/* TODO: NUMA */
			
 
				-				int numaid = 0;
			
 
				-				/* "dedicate" a cpu core to that worker */
			
 
				-				if (numa_init[numaid])
			
 
				-				{
			
 
				-					memory_node = numa_memory_nodes[numaid];
			
 
				-				}
			
 
				-				else
			
 
				-				{
			
 
				-					numa_init[numaid] = 1;
			
 
				-					memory_node = numa_memory_nodes[numaid] = _starpu_memory_node_register(STARPU_CPU_RAM, numaid);
			
 
				-#ifdef STARPU_SIMGRID
			
 
				-					snprintf(name, sizeof(name), "RAM%d", numaid);
			
 
				-					host = _starpu_simgrid_get_host_by_name(name);
			
 
				-					STARPU_ASSERT(host);
			
 
				-					_starpu_simgrid_memory_node_set_host(memory_node, host);
			
 
				-#endif
			
 
				-				}
			
 
				-				workerarg->bindid = _starpu_get_next_bindid(config, NULL, 0);
			
 
				+				int numa_logical_id = _starpu_get_logical_numa_node_worker(worker);
			
 
				+				int numa_starpu_id =  starpu_memory_nodes_numa_hwloclogid_to_id(numa_logical_id);
			
 
				+				if (numa_starpu_id >= STARPU_MAXNUMANODES)
			
 
				+					numa_starpu_id = STARPU_MAIN_RAM;
			
 
				+
			
 
				+				workerarg->numa_memory_node = memory_node = numa_starpu_id;
			
 
				+
			
 
				 				_starpu_memory_node_add_nworkers(memory_node);
			
 
				 
			
 
				-                                _starpu_worker_drives_memory_node(workerarg, STARPU_MAIN_RAM);
			
 
				-				_starpu_worker_drives_memory_node(workerarg, memory_node);
			
 
				+				_starpu_worker_drives_memory_node(workerarg, numa_starpu_id);
			
 
				 				break;
			
 
				 			}
			
 
				 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
			
 
				 			case STARPU_CUDA_WORKER:
			
 
				+			{
			
 
				+				unsigned numa;
			
 
				 #ifndef STARPU_SIMGRID
			
 
				 				if (may_bind_automatically[STARPU_CUDA_WORKER])
			
 
				 				{
			
@@ -1884,8 +2332,11 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
				 						workerarg->bindid = cuda_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
			
 
				 					memory_node = cuda_memory_nodes[devid] = _starpu_memory_node_register(STARPU_CUDA_RAM, devid);
			
 
				 
			
 
				-					_starpu_cuda_bus_ids[0][devid+1] = _starpu_register_bus(STARPU_MAIN_RAM, memory_node);
			
 
				-					_starpu_cuda_bus_ids[devid+1][0] = _starpu_register_bus(memory_node, STARPU_MAIN_RAM);
			
 
				+					for (numa = 0; numa < nb_numa_nodes; numa++)
			
 
				+					{
			
 
				+						_starpu_cuda_bus_ids[numa][devid+STARPU_MAXNUMANODES] = _starpu_register_bus(numa, memory_node);
			
 
				+						_starpu_cuda_bus_ids[devid+STARPU_MAXNUMANODES][numa] = _starpu_register_bus(memory_node, numa);
			
 
				+					}
			
 
				 #ifdef STARPU_SIMGRID
			
 
				 					const char* cuda_memcpy_peer;
			
 
				 					snprintf(name, sizeof(name), "CUDA%u", devid);
			
@@ -1912,8 +2363,8 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
				 							if (workerarg2->arch == STARPU_CUDA_WORKER)
			
 
				 							{
			
 
				 								unsigned memory_node2 = starpu_worker_get_memory_node(worker2);
			
 
				-								_starpu_cuda_bus_ids[devid2][devid] = _starpu_register_bus(memory_node2, memory_node);
			
 
				-								_starpu_cuda_bus_ids[devid][devid2] = _starpu_register_bus(memory_node, memory_node2);
			
 
				+								_starpu_cuda_bus_ids[devid2+STARPU_MAXNUMANODES][devid+STARPU_MAXNUMANODES] = _starpu_register_bus(memory_node2, memory_node);
			
 
				+								_starpu_cuda_bus_ids[devid+STARPU_MAXNUMANODES][devid2+STARPU_MAXNUMANODES] = _starpu_register_bus(memory_node, memory_node2);
			
 
				 #ifndef STARPU_SIMGRID
			
 
				 #if defined(HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX) && HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX
			
 
				 								{
			
@@ -1931,8 +2382,8 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
				 											_STARPU_DEBUG("CUDA%u and CUDA%u are linked through %s, along %u GPUs\n", devid, devid2, name, data->ngpus);
			
 
				 										}
			
 
				 #endif
			
 
				-										starpu_bus_set_ngpus(_starpu_cuda_bus_ids[devid2][devid], data->ngpus);
			
 
				-										starpu_bus_set_ngpus(_starpu_cuda_bus_ids[devid][devid2], data->ngpus);
			
 
				+										starpu_bus_set_ngpus(_starpu_cuda_bus_ids[devid2+STARPU_MAXNUMANODES][devid+STARPU_MAXNUMANODES], data->ngpus);
			
 
				+										starpu_bus_set_ngpus(_starpu_cuda_bus_ids[devid+STARPU_MAXNUMANODES][devid2+STARPU_MAXNUMANODES], data->ngpus);
			
 
				 									}
			
 
				 								}
			
 
				 #endif
			
@@ -1943,13 +2394,19 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
				 				}
			
 
				 				_starpu_memory_node_add_nworkers(memory_node);
			
 
				 
			
 
				-                                _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
			
 
				+				//This worker can manage transfers on NUMA nodes
			
 
				+				for (numa = 0; numa < nb_numa_nodes; numa++)
			
 
				+						_starpu_worker_drives_memory_node(&workerarg->set->workers[0], numa);
			
 
				+
			
 
				 				_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
			
 
				 				break;
			
 
				+			}
			
 
				 #endif
			
 
				 
			
 
				 #if defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
			
 
				 		        case STARPU_OPENCL_WORKER:
			
 
				+			{
			
 
				+				unsigned numa;
			
 
				 #ifndef STARPU_SIMGRID
			
 
				 				if (may_bind_automatically[STARPU_OPENCL_WORKER])
			
 
				 				{
			
@@ -1970,8 +2427,12 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
				 					opencl_init[devid] = 1;
			
 
				 					workerarg->bindid = opencl_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
			
 
				 					memory_node = opencl_memory_nodes[devid] = _starpu_memory_node_register(STARPU_OPENCL_RAM, devid);
			
 
				-					_starpu_register_bus(STARPU_MAIN_RAM, memory_node);
			
 
				-					_starpu_register_bus(memory_node, STARPU_MAIN_RAM);
			
 
				+
			
 
				+					for (numa = 0; numa < nb_numa_nodes; numa++)
			
 
				+					{
			
 
				+						_starpu_register_bus(numa, memory_node);
			
 
				+						_starpu_register_bus(memory_node, numa);
			
 
				+					}
			
 
				 #ifdef STARPU_SIMGRID
			
 
				 					snprintf(name, sizeof(name), "OpenCL%u", devid);
			
 
				 					host = _starpu_simgrid_get_host_by_name(name);
			
@@ -1981,13 +2442,19 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
				 				}
			
 
				 				_starpu_memory_node_add_nworkers(memory_node);
			
 
				 
			
 
				-                                _starpu_worker_drives_memory_node(workerarg, STARPU_MAIN_RAM);
			
 
				+				//This worker can manage transfers on NUMA nodes
			
 
				+				for (numa = 0; numa < nb_numa_nodes; numa++)
			
 
				+						_starpu_worker_drives_memory_node(workerarg, numa);
			
 
				+
			
 
				 				_starpu_worker_drives_memory_node(workerarg, memory_node);
			
 
				 				break;
			
 
				+			}
			
 
				 #endif
			
 
				 
			
 
				 #ifdef STARPU_USE_MIC
			
 
				 		        case STARPU_MIC_WORKER:
			
 
				+			{
			
 
				+				unsigned numa;
			
 
				 				if (mic_init[devid])
			
 
				 				{
			
 
				 					memory_node = mic_memory_nodes[devid];
			
@@ -2004,21 +2471,30 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
				 					//}
			
 
				 					mic_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
			
 
				 					memory_node = mic_memory_nodes[devid] = _starpu_memory_node_register(STARPU_MIC_RAM, devid);
			
 
				-					_starpu_register_bus(STARPU_MAIN_RAM, memory_node);
			
 
				-					_starpu_register_bus(memory_node, STARPU_MAIN_RAM);
			
 
				+
			
 
				+					for (numa = 0; numa < nb_numa_nodes; numa++)
			
 
				+					{
			
 
				+						_starpu_register_bus(numa, memory_node);
			
 
				+						_starpu_register_bus(memory_node, numa);
			
 
				+					}
			
 
				 
			
 
				 				}
			
 
				 				workerarg->bindid = mic_bindid[devid];
			
 
				 				_starpu_memory_node_add_nworkers(memory_node);
			
 
				 
			
 
				-                                _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
			
 
				+				//This worker can manage transfers on NUMA nodes
			
 
				+				for (numa = 0; numa < nb_numa_nodes; numa++)
			
 
				+						_starpu_worker_drives_memory_node(&workerarg->set->workers[0], numa);
			
 
				+
			
 
				 				_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
			
 
				 				break;
			
 
				+			}
			
 
				 #endif /* STARPU_USE_MIC */
			
 
				 
			
 
				 #ifdef STARPU_USE_SCC
			
 
				 			case STARPU_SCC_WORKER:
			
 
				 			{
			
 
				+				unsigned numa;
			
 
				 				/* Node 0 represents the SCC shared memory when we're on SCC. */
			
 
				 				struct _starpu_memory_node_descr *descr = _starpu_memory_node_get_description();
			
 
				 				descr->nodes[ram_memory_node] = STARPU_SCC_SHM;
			
@@ -2026,7 +2502,10 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
				 				memory_node = ram_memory_node;
			
 
				 				_starpu_memory_node_add_nworkers(memory_node);
			
 
				 
			
 
				-                                _starpu_worker_drives_memory_node(workerarg, STARPU_MAIN_RAM);
			
 
				+				//This worker can manage transfers on NUMA nodes
			
 
				+				for (numa = 0; numa < nb_numa_nodes; numa++)
			
 
				+						_starpu_worker_drives_memory_node(workerarg, numa);
			
 
				+
			
 
				 				_starpu_worker_drives_memory_node(workerarg, memory_node);
			
 
				 			}
			
 
				 				break;
			
@@ -2035,6 +2514,7 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
				 #ifdef STARPU_USE_MPI_MASTER_SLAVE
			
 
				 			case STARPU_MPI_MS_WORKER:
			
 
				 			{
			
 
				+				unsigned numa;
			
 
				 				if (mpi_init[devid])
			
 
				 				{
			
 
				 					memory_node = mpi_memory_nodes[devid];
			
@@ -2044,11 +2524,18 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
				 					mpi_init[devid] = 1;
			
 
				 					mpi_bindid[devid] = _starpu_get_next_bindid(config, preferred_binding, npreferred);
			
 
				 					memory_node = mpi_memory_nodes[devid] = _starpu_memory_node_register(STARPU_MPI_MS_RAM, devid);
			
 
				-					_starpu_register_bus(STARPU_MAIN_RAM, memory_node);
			
 
				-					_starpu_register_bus(memory_node, STARPU_MAIN_RAM);
			
 
				+		
			
 
				+					for (numa = 0; numa < nb_numa_nodes; numa++)
			
 
				+					{	
			
 
				+						_starpu_register_bus(numa, memory_node);
			
 
				+						_starpu_register_bus(memory_node, numa);
			
 
				+					}
			
 
				 
			
 
				 				}
			
 
				-                                _starpu_worker_drives_memory_node(&workerarg->set->workers[0], STARPU_MAIN_RAM);
			
 
				+				//This worker can manage transfers on NUMA nodes
			
 
				+				for (numa = 0; numa < nb_numa_nodes; numa++)
			
 
				+						_starpu_worker_drives_memory_node(&workerarg->set->workers[0], numa);
			
 
				+
			
 
				 				_starpu_worker_drives_memory_node(&workerarg->set->workers[0], memory_node);
			
 
				 #ifndef STARPU_MPI_MASTER_SLAVE_MULTIPLE_THREAD
			
 
				                                 /* MPI driver thread can manage all slave memories if we disable the MPI multiple thread */
			
@@ -2154,7 +2641,7 @@ _starpu_build_topology (struct _starpu_machine_config *config, int no_mp_config)
 
				 	_starpu_memory_nodes_init();
			
 
				 	_starpu_datastats_init();
			
 
				 
			
 
				-	_starpu_init_workers_binding(config, no_mp_config);
			
 
				+	_starpu_init_workers_binding_and_memory(config, no_mp_config);
			
 
				 
			
 
				 	config->cpus_nodeid = -1;
			
 
				 	config->cuda_nodeid = -1;
			
@@ -2293,3 +2780,4 @@ starpu_topology_print (FILE *output)
 
				 		fprintf(output, "\n");
			
 
				 	}
			
 
				 }
			
 
				+
			
--- a/src/core/topology.h
+++ b/src/core/topology.h
@@ -1,7 +1,8 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009-2010, 2012, 2014-2016  Université de Bordeaux
			
 
				+ * Copyright (C) 2009-2010, 2012, 2014-2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2015, 2017  CNRS
			
 
				+ * Copyright (C) 2017  Inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -51,6 +52,9 @@ unsigned _starpu_topology_get_nhwcpu(struct _starpu_machine_config *config);
 
				 /* returns the number of logical cpus */
			
 
				 unsigned _starpu_topology_get_nhwpu(struct _starpu_machine_config *config);
			
 
				 
			
 
				+/* returns the number of NUMA nodes */
			
 
				+unsigned _starpu_topology_get_nnumanodes(struct _starpu_machine_config *config);
			
 
				+
			
 
				 #ifdef STARPU_HAVE_HWLOC
			
 
				 /* Small convenient function to filter hwloc topology depending on HWLOC API version */
			
 
				 void _starpu_topology_filter(hwloc_topology_t topology);
			
@@ -68,4 +72,7 @@ void _starpu_bind_thread_on_cpus(struct _starpu_machine_config *config STARPU_AT
 
				 
			
 
				 struct _starpu_worker *_starpu_get_worker_from_driver(struct starpu_driver *d);
			
 
				 
			
 
				+int starpu_memory_nodes_get_numa_count(void);
			
 
				+int starpu_memory_nodes_numa_id_to_hwloclogid(unsigned id);
			
 
				+	
			
 
				 #endif // __TOPOLOGY_H__
			
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -1599,8 +1599,13 @@ void starpu_shutdown(void)
 
				 
			
 
				 	/* tell all workers to shutdown */
			
 
				 	_starpu_kill_all_workers(&_starpu_config);
			
 
				-
			
 
				-	_starpu_free_all_automatically_allocated_buffers(STARPU_MAIN_RAM);
			
 
				+	
			
 
				+	unsigned i;
			
 
				+	unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
			
 
				+	for (i=0; i<nb_numa_nodes; i++)
			
 
				+	{
			
 
				+		_starpu_free_all_automatically_allocated_buffers(i);
			
 
				+	}
			
 
				 
			
 
				 	{
			
 
				 	     int stats = starpu_get_env_number("STARPU_STATS");
			
--- a/src/core/workers.h
+++ b/src/core/workers.h
@@ -84,6 +84,7 @@ LIST_TYPE(_starpu_worker,
 
				 	starpu_pthread_cond_t started_cond; /* indicate when the worker is ready */
			
 
				 	starpu_pthread_cond_t ready_cond; /* indicate when the worker is ready */
			
 
				 	unsigned memory_node; /* which memory node is the worker associated with ? */
			
 
				+	unsigned numa_memory_node; /* which numa memory node is the worker associated with? (logical index) */
			
 
				 	/* condition variable used for passive waiting operations on worker
			
 
				 	 * STARPU_PTHREAD_COND_BROADCAST must be used instead of STARPU_PTHREAD_COND_SIGNAL,
			
 
				 	 * since the condition is shared for multiple purpose */
			
--- a/src/datawizard/coherency.c
+++ b/src/datawizard/coherency.c
@@ -1,7 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures. *
			
 
				  * Copyright (C) 2009-2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
			
 
				- * Copyright (C) 2014  INRIA
			
 
				+ * Copyright (C) 2014, 2017  INRIA
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -320,6 +320,29 @@ static int link_supports_direct_transfers(starpu_data_handle_t handle, unsigned
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+/* Now, we use slowness/bandwidth to compare numa nodes, is it better to use latency ? */
			
 
				+static unsigned chose_best_numa_between_src_and_dest(int src, int dst)
			
 
				+{
			
 
				+	double timing_best;
			
 
				+	int best_numa = -1;
			
 
				+	unsigned numa;
			
 
				+	const unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
			
 
				+	for(numa = 0; numa < nb_numa_nodes; numa++)
			
 
				+	{
			
 
				+		double actual = 1.0/starpu_transfer_bandwidth(src, numa) + 1.0/starpu_transfer_bandwidth(numa, dst);
			
 
				+
			
 
				+		/* Compare slowness : take the lowest */
			
 
				+		if (best_numa < 0 || actual < timing_best)
			
 
				+		{
			
 
				+			best_numa = numa;
			
 
				+			timing_best = actual;
			
 
				+		}
			
 
				+	}
			
 
				+	STARPU_ASSERT(best_numa >= 0);
			
 
				+	
			
 
				+	return best_numa;
			
 
				+}
			
 
				+
			
 
				 /* Determines the path of a request : each hop is defined by (src,dst) and the
			
 
				  * node that handles the hop. The returned value indicates the number of hops,
			
 
				  * and the max_len is the maximum number of hops (ie. the size of the
			
@@ -362,9 +385,11 @@ static int determine_request_path(starpu_data_handle_t handle,
 
				 		STARPU_ASSERT(max_len >= 2);
			
 
				 		STARPU_ASSERT(src_node >= 0);
			
 
				 
			
 
				+		unsigned numa = chose_best_numa_between_src_and_dest(src_node, dst_node);
			
 
				+
			
 
				 		/* GPU -> RAM */
			
 
				 		src_nodes[0] = src_node;
			
 
				-		dst_nodes[0] = STARPU_MAIN_RAM;
			
 
				+		dst_nodes[0] = numa;
			
 
				 
			
 
				 		if (starpu_node_get_kind(src_node) == STARPU_DISK_RAM)
			
 
				 			/* Disks don't have their own driver thread */
			
@@ -380,7 +405,7 @@ static int determine_request_path(starpu_data_handle_t handle,
 
				 		}
			
 
				 
			
 
				 		/* RAM -> GPU */
			
 
				-		src_nodes[1] = STARPU_MAIN_RAM;
			
 
				+		src_nodes[1] = numa;
			
 
				 		dst_nodes[1] = dst_node;
			
 
				 
			
 
				 		if (starpu_node_get_kind(dst_node) == STARPU_DISK_RAM)
			
@@ -573,7 +598,7 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 
				 		/* if the data is in write only mode (and not SCRATCH or REDUX), there is no need for a source, data will be initialized by the task itself */
			
 
				 		if (mode & STARPU_W)
			
 
				 			dst_replicate->initialized = 1;
			
 
				-		if (requesting_node == STARPU_MAIN_RAM && !nwait)
			
 
				+		if (starpu_node_get_kind(requesting_node) == STARPU_CPU_RAM && !nwait)
			
 
				 		{
			
 
				 			/* And this is the main RAM, really no need for a
			
 
				 			 * request, just allocate */
			
--- a/src/datawizard/copy_driver.c
+++ b/src/datawizard/copy_driver.c
@@ -864,10 +864,9 @@ void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_
 
				                 _starpu_mpi_common_wait_event(async_channel);
			
 
				                 break;
			
 
				 #endif
			
 
				-	case STARPU_MAIN_RAM:
			
 
				+	case STARPU_CPU_RAM:
			
 
				 		starpu_disk_wait_request(async_channel);
			
 
				 		break;
			
 
				-	case STARPU_CPU_RAM:
			
 
				 	default:
			
 
				 		STARPU_ABORT();
			
 
				 	}
			
--- a/src/datawizard/data_request.c
+++ b/src/datawizard/data_request.c
@@ -147,7 +147,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 
				 	if (handling_node == -1)
			
 
				 		handling_node = STARPU_MAIN_RAM;
			
 
				 	r->handling_node = handling_node;
			
 
				-	STARPU_ASSERT(handling_node == STARPU_MAIN_RAM || _starpu_memory_node_get_nworkers(handling_node));
			
 
				+	STARPU_ASSERT(starpu_node_get_kind(handling_node) == STARPU_CPU_RAM || _starpu_memory_node_get_nworkers(handling_node));
			
 
				 	r->completed = 0;
			
 
				 	r->prefetch = is_prefetch;
			
 
				 	r->prio = prio;
			
@@ -276,7 +276,7 @@ void _starpu_post_data_request(struct _starpu_data_request *r)
 
				 	unsigned handling_node = r->handling_node;
			
 
				 	/* We don't have a worker for disk nodes, these should have been posted to a main RAM node */
			
 
				 	STARPU_ASSERT(starpu_node_get_kind(handling_node) != STARPU_DISK_RAM);
			
 
				-	STARPU_ASSERT(handling_node == STARPU_MAIN_RAM || _starpu_memory_node_get_nworkers(handling_node));
			
 
				+	STARPU_ASSERT(starpu_node_get_kind(handling_node) == STARPU_CPU_RAM || _starpu_memory_node_get_nworkers(handling_node));
			
 
				 
			
 
				 //	_STARPU_DEBUG("POST REQUEST\n");
			
 
				 
			
--- a/src/datawizard/datawizard.c
+++ b/src/datawizard/datawizard.c
@@ -22,6 +22,7 @@
 
				 #include <datawizard/memory_nodes.h>
			
 
				 #include <core/workers.h>
			
 
				 #include <core/progress_hook.h>
			
 
				+#include <core/topology.h>
			
 
				 #ifdef STARPU_SIMGRID
			
 
				 #include <core/simgrid.h>
			
 
				 #endif
			
@@ -71,8 +72,16 @@ int __starpu_datawizard_progress(unsigned may_alloc, unsigned push_requests)
 
				         unsigned memnode;
			
 
				 
			
 
				 	if (!worker)
			
 
				+	{
			
 
				 		/* Call from main application, only make RAM requests progress */
			
 
				-		return ___starpu_datawizard_progress(STARPU_MAIN_RAM, may_alloc, push_requests);
			
 
				+		int ret = 0;
			
 
				+		int nnumas = starpu_memory_nodes_get_numa_count();
			
 
				+		int numa;
			
 
				+		for (numa = 0; numa < nnumas; numa++)
			
 
				+			ret |=  ___starpu_datawizard_progress(numa, may_alloc, push_requests);
			
 
				+
			
 
				+		return ret;
			
 
				+	}
			
 
				 	if (worker->set)
			
 
				 		/* Runing one of the workers of a worker set. The reference for
			
 
				 		 * driving memory is its worker 0 (see registrations in topology.c) */
			
--- a/src/datawizard/datawizard.h
+++ b/src/datawizard/datawizard.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009, 2010, 2014  Université de Bordeaux
			
 
				+ * Copyright (C) 2009, 2010, 2014, 2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2013  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
--- a/src/datawizard/filters.c
+++ b/src/datawizard/filters.c
@@ -188,14 +188,18 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 
				 		/* This is lazy allocation, allocate it now in main RAM, so as
			
 
				 		 * to have somewhere to gather pieces later */
			
 
				 		/* FIXME: mark as unevictable! */
			
 
				-		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[STARPU_MAIN_RAM], 0);
			
 
				+		int home_node = initial_handle->home_node;
			
 
				+		if (home_node < 0 || (starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
			
 
				+			home_node = STARPU_MAIN_RAM;
			
 
				+		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], 0);
			
 
				 #ifdef STARPU_DEVEL
			
 
				 #warning we should reclaim memory if allocation failed
			
 
				 #endif
			
 
				 		STARPU_ASSERT(!ret);
			
 
				 	}
			
 
				 
			
 
				-	_starpu_data_unregister_ram_pointer(initial_handle);
			
 
				+	for (node = 0; node < STARPU_MAXNODES; node++)
			
 
				+		_starpu_data_unregister_ram_pointer(initial_handle, node);
			
 
				 
			
 
				 	if (nparts && !inherit_state)
			
 
				 	{
			
@@ -324,10 +328,14 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 
				 		 * store it in the handle */
			
 
				 		child->footprint = _starpu_compute_data_footprint(child);
			
 
				 
			
 
				-		void *ptr;
			
 
				-		ptr = starpu_data_handle_to_pointer(child, STARPU_MAIN_RAM);
			
 
				-		if (ptr != NULL)
			
 
				-			_starpu_data_register_ram_pointer(child, ptr);
			
 
				+		for (node = 0; node < STARPU_MAXNODES; node++)
			
 
				+		{
			
 
				+			if (starpu_node_get_kind(node) != STARPU_CPU_RAM)
			
 
				+				continue;
			
 
				+			void *ptr = starpu_data_handle_to_pointer(child, node);
			
 
				+			if (ptr != NULL)
			
 
				+				_starpu_data_register_ram_pointer(child, ptr);
			
 
				+		}
			
 
				 
			
 
				 		_STARPU_TRACE_HANDLE_DATA_REGISTER(child);
			
 
				 	}
			
@@ -428,7 +436,8 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 
				 			child_handle->unregister_hook(child_handle);
			
 
				 		}
			
 
				 
			
 
				-		_starpu_data_unregister_ram_pointer(child_handle);
			
 
				+		for (node = 0; node < STARPU_MAXNODES; node++)
			
 
				+			_starpu_data_unregister_ram_pointer(child_handle, node);
			
 
				 
			
 
				 		if (child_handle->per_worker)
			
 
				 		{
			
@@ -444,9 +453,14 @@ void starpu_data_unpartition(starpu_data_handle_t root_handle, unsigned gatherin
 
				 		_starpu_memory_stats_free(child_handle);
			
 
				 	}
			
 
				 
			
 
				-	ptr = starpu_data_handle_to_pointer(root_handle, STARPU_MAIN_RAM);
			
 
				-	if (ptr != NULL)
			
 
				-		_starpu_data_register_ram_pointer(root_handle, ptr);
			
 
				+	for (node = 0; node < STARPU_MAXNODES; node++)
			
 
				+	{
			
 
				+		if (starpu_node_get_kind(node) != STARPU_CPU_RAM)
			
 
				+			continue;
			
 
				+		ptr = starpu_data_handle_to_pointer(root_handle, node);
			
 
				+		if (ptr != NULL)
			
 
				+			_starpu_data_register_ram_pointer(root_handle, ptr);
			
 
				+	}
			
 
				 
			
 
				 	/* the gathering_node should now have a valid copy of all the children.
			
 
				 	 * For all nodes, if the node had all copies and none was locally
			
--- a/src/datawizard/interfaces/bcsr_interface.c
+++ b/src/datawizard/interfaces/bcsr_interface.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009-2016  Université de Bordeaux
			
 
				+ * Copyright (C) 2009-2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -130,7 +130,7 @@ void starpu_bcsr_data_register(starpu_data_handle_t *handleptr, int home_node,
 
				 		.elemsize = elemsize
			
 
				 	};
			
 
				 #ifndef STARPU_SIMGRID
			
 
				-	if (home_node == STARPU_MAIN_RAM)
			
 
				+	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
			
 
				 	{
			
 
				 		STARPU_ASSERT_ACCESSIBLE(nzval);
			
 
				 		STARPU_ASSERT_ACCESSIBLE(nzval + nnz*elemsize*r*c - 1);
			
@@ -260,9 +260,13 @@ uintptr_t starpu_bcsr_get_local_nzval(starpu_data_handle_t handle)
 
				 
			
 
				 uint32_t *starpu_bcsr_get_local_colind(starpu_data_handle_t handle)
			
 
				 {
			
 
				+	int node = handle->home_node;
			
 
				+	if (node < 0 || (starpu_node_get_kind(node) != STARPU_CPU_RAM))
			
 
				+		node = STARPU_MAIN_RAM;
			
 
				+
			
 
				 	/* XXX 0 */
			
 
				 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
			
 
				-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
			
 
				+		starpu_data_get_interface_on_node(handle, node);
			
 
				 
			
 
				 #ifdef STARPU_DEBUG
			
 
				 	STARPU_ASSERT_MSG(data_interface->id == STARPU_BCSR_INTERFACE_ID, "Error. The given data is not a bcsr.");
			
@@ -273,9 +277,13 @@ uint32_t *starpu_bcsr_get_local_colind(starpu_data_handle_t handle)
 
				 
			
 
				 uint32_t *starpu_bcsr_get_local_rowptr(starpu_data_handle_t handle)
			
 
				 {
			
 
				+	int node = handle->home_node;
			
 
				+	if (node < 0 || (starpu_node_get_kind(node) != STARPU_CPU_RAM))
			
 
				+		node = STARPU_MAIN_RAM;
			
 
				+
			
 
				 	/* XXX 0 */
			
 
				 	struct starpu_bcsr_interface *data_interface = (struct starpu_bcsr_interface *)
			
 
				-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
			
 
				+		starpu_data_get_interface_on_node(handle, node);
			
 
				 
			
 
				 #ifdef STARPU_DEBUG
			
 
				 	STARPU_ASSERT_MSG(data_interface->id == STARPU_BCSR_INTERFACE_ID, "Error. The given data is not a bcsr.");
			
--- a/src/datawizard/interfaces/block_interface.c
+++ b/src/datawizard/interfaces/block_interface.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009-2016  Université de Bordeaux
			
 
				+ * Copyright (C) 2009-2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -163,7 +163,7 @@ void starpu_block_data_register(starpu_data_handle_t *handleptr, int home_node,
 
				 		.elemsize = elemsize
			
 
				 	};
			
 
				 #ifndef STARPU_SIMGRID
			
 
				-	if (home_node == STARPU_MAIN_RAM)
			
 
				+	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
			
 
				 	{
			
 
				 		STARPU_ASSERT_ACCESSIBLE(ptr);
			
 
				 		STARPU_ASSERT_ACCESSIBLE(ptr + (nz-1)*ldz*elemsize + (ny-1)*ldy*elemsize + nx*elemsize - 1);
			
--- a/src/datawizard/interfaces/coo_interface.c
+++ b/src/datawizard/interfaces/coo_interface.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2013-2016  Université Bordeaux
			
 
				+ * Copyright (C) 2013-2017  Université Bordeaux
			
 
				  * Copyright (C) 2012 INRIA
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -18,6 +18,7 @@
 
				 #include <starpu.h>
			
 
				 #include <common/fxt.h>
			
 
				 #include <datawizard/memalloc.h>
			
 
				+#include <datawizard/memory_nodes.h>
			
 
				 
			
 
				 static int
			
 
				 copy_any_to_any(void *src_interface, unsigned src_node,
			
@@ -236,7 +237,7 @@ starpu_coo_data_register(starpu_data_handle_t *handleptr, int home_node,
 
				 		.elemsize = elemsize,
			
 
				 	};
			
 
				 #ifndef STARPU_SIMGRID
			
 
				-	if (home_node == STARPU_MAIN_RAM)
			
 
				+	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
			
 
				 	{
			
 
				 		STARPU_ASSERT_ACCESSIBLE(columns);
			
 
				 		STARPU_ASSERT_ACCESSIBLE((uintptr_t) columns + n_values*sizeof(uint32_t) - 1);
			
--- a/src/datawizard/interfaces/csr_interface.c
+++ b/src/datawizard/interfaces/csr_interface.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009-2016  Université de Bordeaux
			
 
				+ * Copyright (C) 2009-2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
			
 
				  *
			
@@ -112,7 +112,7 @@ void starpu_csr_data_register(starpu_data_handle_t *handleptr, int home_node,
 
				 		.elemsize = elemsize
			
 
				 	};
			
 
				 #ifndef STARPU_SIMGRID
			
 
				-	if (home_node == STARPU_MAIN_RAM)
			
 
				+	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
			
 
				 	{
			
 
				 		STARPU_ASSERT_ACCESSIBLE(nzval);
			
 
				 		STARPU_ASSERT_ACCESSIBLE(nzval + nnz*elemsize - 1);
			
--- a/src/datawizard/interfaces/data_interface.c
+++ b/src/datawizard/interfaces/data_interface.c
@@ -369,12 +369,14 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 
				 	/* now the data is available ! */
			
 
				 	_starpu_spin_unlock(&handle->header_lock);
			
 
				 
			
 
				-
			
 
				-
			
 
				-	ptr = starpu_data_handle_to_pointer(handle, STARPU_MAIN_RAM);
			
 
				-	if (ptr != NULL)
			
 
				+	for (node = 0; node < STARPU_MAXNODES; node++)
			
 
				 	{
			
 
				-		_starpu_data_register_ram_pointer(handle, ptr);
			
 
				+		if (starpu_node_get_kind(node) != STARPU_CPU_RAM)
			
 
				+			continue;
			
 
				+
			
 
				+		ptr = starpu_data_handle_to_pointer(handle, node);
			
 
				+		if (ptr != NULL)
			
 
				+			_starpu_data_register_ram_pointer(handle, ptr);
			
 
				 	}
			
 
				 }
			
 
				 
			
@@ -521,13 +523,17 @@ struct starpu_data_interface_ops* starpu_data_get_interface_ops(starpu_data_hand
 
				  * Stop monitoring a piece of data
			
 
				  */
			
 
				 
			
 
				-void _starpu_data_unregister_ram_pointer(starpu_data_handle_t handle)
			
 
				+void _starpu_data_unregister_ram_pointer(starpu_data_handle_t handle, unsigned node)
			
 
				 {
			
 
				-	const void *ram_ptr = starpu_data_handle_to_pointer(handle, STARPU_MAIN_RAM);
			
 
				+	if (starpu_node_get_kind(node) != STARPU_CPU_RAM)
			
 
				+		return;
			
 
				+
			
 
				 #ifdef STARPU_OPENMP
			
 
				 	if (handle->removed_from_context_hash)
			
 
				 		return;
			
 
				 #endif
			
 
				+	const void *ram_ptr = starpu_data_handle_to_pointer(handle, node);
			
 
				+
			
 
				 	if (ram_ptr != NULL)
			
 
				 	{
			
 
				 		/* Remove the PTR -> HANDLE mapping.  If a mapping from PTR
			
@@ -757,7 +763,10 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 
				 			_STARPU_DEBUG("Conversion needed\n");
			
 
				 			void *buffers[1];
			
 
				 			struct starpu_multiformat_interface *format_interface;
			
 
				-			format_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
			
 
				+			home_node = handle->home_node;
			
 
				+			if (home_node < 0 || (starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
			
 
				+				home_node = STARPU_MAIN_RAM;
			
 
				+			format_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, home_node);
			
 
				 			struct starpu_codelet *cl = NULL;
			
 
				 			enum starpu_node_kind node_kind = starpu_node_get_kind(handle->mf_node);
			
 
				 
			
@@ -850,16 +859,19 @@ retry_busy:
 
				 
			
 
				 	size_t size = _starpu_data_get_size(handle);
			
 
				 
			
 
				-	_starpu_data_unregister_ram_pointer(handle);
			
 
				-
			
 
				 	/* Destroy the data now */
			
 
				 	unsigned node;
			
 
				 	for (node = 0; node < STARPU_MAXNODES; node++)
			
 
				 	{
			
 
				 		struct _starpu_data_replicate *local = &handle->per_node[node];
			
 
				+		if (local->allocated)
			
 
				+		{
			
 
				+			_starpu_data_unregister_ram_pointer(handle, node);
			
 
				+
			
 
				 		/* free the data copy in a lazy fashion */
			
 
				-		if (local->allocated && local->automatically_allocated)
			
 
				-			_starpu_request_mem_chunk_removal(handle, local, node, size);
			
 
				+			if (local->automatically_allocated)
			
 
				+				_starpu_request_mem_chunk_removal(handle, local, node, size);
			
 
				+		}
			
 
				 	}
			
 
				 	if (handle->per_worker)
			
 
				 	{
			
@@ -976,8 +988,7 @@ static void _starpu_data_invalidate(void *data)
 
				 
			
 
				 		if (local->mc && local->allocated && local->automatically_allocated)
			
 
				 		{
			
 
				-			if (node == STARPU_MAIN_RAM)
			
 
				-				_starpu_data_unregister_ram_pointer(handle);
			
 
				+			_starpu_data_unregister_ram_pointer(handle, node);
			
 
				 
			
 
				 			/* free the data copy in a lazy fashion */
			
 
				 			_starpu_request_mem_chunk_removal(handle, local, node, size);
			
--- a/src/datawizard/interfaces/data_interface.h
+++ b/src/datawizard/interfaces/data_interface.h
@@ -78,7 +78,7 @@ extern void _starpu_data_register_ram_pointer(starpu_data_handle_t handle,
 
				 						void *ptr)
			
 
				 	STARPU_ATTRIBUTE_INTERNAL;
			
 
				 
			
 
				-extern void _starpu_data_unregister_ram_pointer(starpu_data_handle_t handle)
			
 
				+extern void _starpu_data_unregister_ram_pointer(starpu_data_handle_t handle, unsigned node)
			
 
				 	STARPU_ATTRIBUTE_INTERNAL;
			
 
				 
			
 
				 #define _starpu_data_is_multiformat_handle(handle) handle->ops->is_multiformat
			
--- a/src/datawizard/interfaces/matrix_interface.c
+++ b/src/datawizard/interfaces/matrix_interface.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2016  Université de Bordeaux
			
 
				+ * Copyright (C) 2010-2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2016, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -174,7 +174,7 @@ void starpu_matrix_data_register(starpu_data_handle_t *handleptr, int home_node,
 
				                 .offset = 0
			
 
				 	};
			
 
				 #ifndef STARPU_SIMGRID
			
 
				-	if (home_node == STARPU_MAIN_RAM)
			
 
				+	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
			
 
				 	{
			
 
				 		STARPU_ASSERT_ACCESSIBLE(ptr);
			
 
				 		STARPU_ASSERT_ACCESSIBLE(ptr + (ny-1)*ld*elemsize + nx*elemsize - 1);
			
--- a/src/datawizard/interfaces/variable_interface.c
+++ b/src/datawizard/interfaces/variable_interface.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2010-2016  Université de Bordeaux
			
 
				+ * Copyright (C) 2010-2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
@@ -113,7 +113,7 @@ void starpu_variable_data_register(starpu_data_handle_t *handleptr, int home_nod
 
				 		.elemsize = elemsize
			
 
				 	};
			
 
				 #ifndef STARPU_SIMGRID
			
 
				-	if (home_node == STARPU_MAIN_RAM)
			
 
				+	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
			
 
				 	{
			
 
				 		STARPU_ASSERT_ACCESSIBLE(ptr);
			
 
				 		STARPU_ASSERT_ACCESSIBLE(ptr + elemsize - 1);
			
--- a/src/datawizard/interfaces/vector_interface.c
+++ b/src/datawizard/interfaces/vector_interface.c
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2009-2016  Université de Bordeaux
			
 
				+ * Copyright (C) 2009-2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2017  CNRS
			
 
				  * Copyright (C) 2017  Inria
			
 
				  *
			
@@ -122,7 +122,7 @@ void starpu_vector_data_register(starpu_data_handle_t *handleptr, int home_node,
 
				                 .offset = 0
			
 
				 	};
			
 
				 #if (!defined(STARPU_SIMGRID) && !defined(STARPU_OPENMP))
			
 
				-	if (home_node == STARPU_MAIN_RAM)
			
 
				+	if (home_node >= 0 && starpu_node_get_kind(home_node) == STARPU_CPU_RAM)
			
 
				 	{
			
 
				 		STARPU_ASSERT_ACCESSIBLE(ptr);
			
 
				 		STARPU_ASSERT_ACCESSIBLE(ptr + nx*elemsize - 1);
			
--- a/src/datawizard/malloc.c
+++ b/src/datawizard/malloc.c
@@ -2,6 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009-2010, 2012-2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
			
 
				+ * Copyright (C) 2017  Inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -112,8 +113,14 @@ static struct starpu_codelet malloc_pinned_cl =
 
				 };
			
 
				 #endif
			
 
				 
			
 
				+/* Allocation in CPU RAM */
			
 
				 int starpu_malloc_flags(void **A, size_t dim, int flags)
			
 
				 {
			
 
				+	return _starpu_malloc_flags_on_node(STARPU_MAIN_RAM, A, dim, flags);
			
 
				+}
			
 
				+
			
 
				+int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int flags)
			
 
				+{
			
 
				 	int ret=0;
			
 
				 
			
 
				 	STARPU_ASSERT(A);
			
@@ -121,14 +128,14 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 
				 	if (flags & STARPU_MALLOC_COUNT)
			
 
				 	{
			
 
				 		if (!(flags & STARPU_MALLOC_NORECLAIM))
			
 
				-			while (starpu_memory_allocate(STARPU_MAIN_RAM, dim, flags) != 0)
			
 
				+			while (starpu_memory_allocate(dst_node, dim, flags) != 0)
			
 
				 			{
			
 
				 				size_t freed;
			
 
				 				size_t reclaim = 2 * dim;
			
 
				 				_STARPU_DEBUG("There is not enough memory left, we are going to reclaim %ld\n", (long)reclaim);
			
 
				-				_STARPU_TRACE_START_MEMRECLAIM(STARPU_MAIN_RAM,0);
			
 
				-				freed = _starpu_memory_reclaim_generic(STARPU_MAIN_RAM, 0, reclaim);
			
 
				-				_STARPU_TRACE_END_MEMRECLAIM(STARPU_MAIN_RAM,0);
			
 
				+				_STARPU_TRACE_START_MEMRECLAIM(dst_node,0);
			
 
				+				freed = _starpu_memory_reclaim_generic(dst_node, 0, reclaim);
			
 
				+				_STARPU_TRACE_END_MEMRECLAIM(dst_node,0);
			
 
				 				if (freed < dim && !(flags & STARPU_MEMORY_WAIT))
			
 
				 				{
			
 
				 					// We could not reclaim enough memory
			
@@ -137,9 +144,9 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 
				 				}
			
 
				 			}
			
 
				 		else if (flags & STARPU_MEMORY_WAIT)
			
 
				-			starpu_memory_allocate(STARPU_MAIN_RAM, dim, flags);
			
 
				+			starpu_memory_allocate(dst_node, dim, flags);
			
 
				 		else
			
 
				-			starpu_memory_allocate(STARPU_MAIN_RAM, dim, flags | STARPU_MEMORY_OVERFLOW);
			
 
				+			starpu_memory_allocate(dst_node, dim, flags | STARPU_MEMORY_OVERFLOW);
			
 
				 	}
			
 
				 
			
 
				 	if (flags & STARPU_MALLOC_PINNED && disable_pinning <= 0 && STARPU_RUNNING_ON_VALGRIND == 0)
			
@@ -298,6 +305,18 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 
				 		_starpu_scc_allocate_shared_memory(A, dim);
			
 
				 #endif
			
 
				 	}
			
 
				+#ifdef STARPU_HAVE_HWLOC
			
 
				+	if (starpu_memory_nodes_get_numa_count() > 1) {
			
 
				+		struct _starpu_machine_config *config = _starpu_get_machine_config();
			
 
				+		hwloc_topology_t hwtopology = config->topology.hwtopology;
			
 
				+		hwloc_obj_t numa_node_obj = hwloc_get_obj_by_type(hwtopology, HWLOC_OBJ_NODE, starpu_memory_nodes_numa_id_to_hwloclogid(dst_node));
			
 
				+		hwloc_bitmap_t nodeset = numa_node_obj->nodeset;
			
 
				+		*A = hwloc_alloc_membind_nodeset(hwtopology, dim, nodeset, HWLOC_MEMBIND_BIND | HWLOC_MEMBIND_NOCPUBIND, flags);
			
 
				+		//fprintf(stderr, "Allocation %lu bytes on NUMA node %d [%p]\n", (unsigned long) dim, starpu_memnode_get_numaphysid(dst_node), *A);
			
 
				+		if (!*A)
			
 
				+			ret = -ENOMEM;
			
 
				+	}
			
 
				+#endif /* STARPU_HAVE_HWLOC */
			
 
				 	else
			
 
				 #ifdef STARPU_HAVE_POSIX_MEMALIGN
			
 
				 	if (_malloc_align != sizeof(void*))
			
@@ -333,7 +352,7 @@ end:
 
				 	}
			
 
				 	else if (flags & STARPU_MALLOC_COUNT)
			
 
				 	{
			
 
				-		starpu_memory_deallocate(STARPU_MAIN_RAM, dim);
			
 
				+		starpu_memory_deallocate(dst_node, dim);
			
 
				 	}
			
 
				 
			
 
				 	return ret;
			
@@ -383,6 +402,11 @@ static struct starpu_codelet free_pinned_cl =
 
				 
			
 
				 int starpu_free_flags(void *A, size_t dim, int flags)
			
 
				 {
			
 
				+	return _starpu_free_flags_on_node(STARPU_MAIN_RAM, A, dim, flags);
			
 
				+}
			
 
				+
			
 
				+int _starpu_free_flags_on_node(unsigned dst_node, void *A, size_t dim, int flags)
			
 
				+{
			
 
				 	if (flags & STARPU_MALLOC_PINNED && disable_pinning <= 0 && STARPU_RUNNING_ON_VALGRIND == 0)
			
 
				 	{
			
 
				 		if (_starpu_can_submit_cuda_task())
			
@@ -470,6 +494,13 @@ int starpu_free_flags(void *A, size_t dim, int flags)
 
				 		_starpu_scc_free_shared_memory(A);
			
 
				 #endif
			
 
				 	}
			
 
				+#ifdef STARPU_HAVE_HWLOC
			
 
				+	else if (starpu_memory_nodes_get_numa_count() > 1) {
			
 
				+		struct _starpu_machine_config *config = _starpu_get_machine_config();
			
 
				+		hwloc_topology_t hwtopology = config->topology.hwtopology;
			
 
				+		hwloc_free(hwtopology, A, dim);
			
 
				+	}
			
 
				+#endif /* STARPU_HAVE_HWLOC */
			
 
				 	else
			
 
				 		free(A);
			
 
				 
			
@@ -478,7 +509,7 @@ out:
 
				 #endif
			
 
				 	if (flags & STARPU_MALLOC_COUNT)
			
 
				 	{
			
 
				-		starpu_memory_deallocate(STARPU_MAIN_RAM, dim);
			
 
				+		starpu_memory_deallocate(dst_node, dim);
			
 
				 	}
			
 
				 
			
 
				 	return 0;
			
@@ -516,7 +547,7 @@ _starpu_malloc_on_node(unsigned dst_node, size_t size, int flags)
 
				 	{
			
 
				 		case STARPU_CPU_RAM:
			
 
				 		{
			
 
				-			starpu_malloc_flags((void**) &addr, size,
			
 
				+			_starpu_malloc_flags_on_node(dst_node, (void**) &addr, size,			
			
 
				 #if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
			
 
				 					/* without memcpy_peer, we can not
			
 
				 					 * allocated pinned memory, since it
			
@@ -646,7 +677,7 @@ _starpu_free_on_node_flags(unsigned dst_node, uintptr_t addr, size_t size, int f
 
				 	switch(kind)
			
 
				 	{
			
 
				 		case STARPU_CPU_RAM:
			
 
				-			starpu_free_flags((void*)addr, size,
			
 
				+			_starpu_free_flags_on_node(dst_node, (void*)addr, size,
			
 
				 #if defined(STARPU_USE_CUDA) && !defined(HAVE_CUDA_MEMCPY_PEER) && !defined(STARPU_SIMGRID)
			
 
				 					flags & ~STARPU_MALLOC_PINNED
			
 
				 #else
			
--- a/src/datawizard/malloc.h
+++ b/src/datawizard/malloc.h
@@ -1,6 +1,6 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				- * Copyright (C) 2013  Université de Bordeaux
			
 
				+ * Copyright (C) 2013, 2017  Université de Bordeaux
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -22,4 +22,6 @@ void _starpu_malloc_shutdown(unsigned dst_node);
 
				 
			
 
				 void _starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size);
			
 
				 
			
 
				+int _starpu_malloc_flags_on_node(unsigned dst_node, void **A, size_t dim, int flags);
			
 
				+int _starpu_free_flags_on_node(unsigned dst_node, void *A, size_t dim, int flags);
			
 
				 #endif
			
--- a/src/datawizard/memalloc.c
+++ b/src/datawizard/memalloc.c
@@ -2,7 +2,7 @@
 
				  *
			
 
				  * Copyright (C) 2009-2017  Université de Bordeaux
			
 
				  * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
			
 
				- * Copyright (C) 2016  Inria
			
 
				+ * Copyright (C) 2016, 2017  Inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -21,6 +21,7 @@
 
				 #include <datawizard/memalloc.h>
			
 
				 #include <datawizard/footprint.h>
			
 
				 #include <core/disk.h>
			
 
				+#include <core/topology.h>
			
 
				 #include <starpu.h>
			
 
				 #include <common/uthash.h>
			
 
				 
			
@@ -382,8 +383,8 @@ static size_t free_memory_on_node(struct _starpu_mem_chunk *mc, unsigned node)
 
				 			data_interface = mc->chunk_interface;
			
 
				 		STARPU_ASSERT(data_interface);
			
 
				 
			
 
				-		if (handle && node == STARPU_MAIN_RAM)
			
 
				-			_starpu_data_unregister_ram_pointer(handle);
			
 
				+		if (handle && (starpu_node_get_kind(node) == STARPU_CPU_RAM))
			
 
				+			_starpu_data_unregister_ram_pointer(handle, node);
			
 
				 
			
 
				 		_STARPU_TRACE_START_FREE(node, mc->size);
			
 
				 		mc->ops->free_data_on_node(data_interface, node);
			
@@ -443,8 +444,7 @@ static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_re
 
				 	struct _starpu_data_replicate *old_replicate = mc->replicate;
			
 
				 	if (old_replicate)
			
 
				 	{
			
 
				-		if (node == STARPU_MAIN_RAM)
			
 
				-			_starpu_data_unregister_ram_pointer(old_replicate->handle);
			
 
				+		_starpu_data_unregister_ram_pointer(old_replicate->handle, node);
			
 
				 		old_replicate->allocated = 0;
			
 
				 		old_replicate->automatically_allocated = 0;
			
 
				 		old_replicate->initialized = 0;
			
@@ -1486,11 +1486,11 @@ int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_
 
				 	replicate->allocated = 1;
			
 
				 	replicate->automatically_allocated = 1;
			
 
				 
			
 
				-	if (replicate->relaxed_coherency == 0 && dst_node == STARPU_MAIN_RAM)
			
 
				+	if (replicate->relaxed_coherency == 0 && (starpu_node_get_kind(dst_node) == STARPU_CPU_RAM))
			
 
				 	{
			
 
				 		/* We are allocating the buffer in main memory, also register it
			
 
				 		 * for the gcc plugin.  */
			
 
				-		void *ptr = starpu_data_handle_to_pointer(handle, STARPU_MAIN_RAM);
			
 
				+		void *ptr = starpu_data_handle_to_pointer(handle, dst_node);
			
 
				 		if (ptr != NULL)
			
 
				 		{
			
 
				 			_starpu_data_register_ram_pointer(handle, ptr);
			
@@ -1617,7 +1617,7 @@ get_better_disk_can_accept_size(starpu_data_handle_t handle, unsigned node)
 
				 	int target = -1;
			
 
				 	unsigned nnodes = starpu_memory_nodes_get_count();
			
 
				 	unsigned int i;
			
 
				-	double time_disk = 0;
			
 
				+	double time_disk = 0.0;
			
 
				 
			
 
				 	for (i = 0; i < nnodes; i++)
			
 
				 	{
			
@@ -1628,13 +1628,17 @@ get_better_disk_can_accept_size(starpu_data_handle_t handle, unsigned node)
 
				 			/* if we can write on the disk */
			
 
				 			if (_starpu_get_disk_flag(i) != STARPU_DISK_NO_RECLAIM)
			
 
				 			{
			
 
				-				/* only time can change between disk <-> main_ram
			
 
				-				 * and not between main_ram <-> worker if we compare diks*/
			
 
				-				double time_tmp = starpu_transfer_predict(i, STARPU_MAIN_RAM, _starpu_data_get_size(handle));
			
 
				-				if (target == -1 || time_disk > time_tmp)
			
 
				+				unsigned numa;
			
 
				+				unsigned nnumas = starpu_memory_nodes_get_numa_count();
			
 
				+				for (numa = 0; numa < nnumas; numa++)
			
 
				 				{
			
 
				-					target = i;
			
 
				-					time_disk = time_tmp;
			
 
				+					/* TODO : check if starpu_transfer_predict(node, i,...) is the same */
			
 
				+					double time_tmp = starpu_transfer_predict(node, numa, _starpu_data_get_size(handle)) + starpu_transfer_predict(i, numa, _starpu_data_get_size(handle));
			
 
				+					if (target == -1 || time_disk > time_tmp)
			
 
				+					{
			
 
				+						target = i;
			
 
				+						time_disk = time_tmp;
			
 
				+					}
			
 
				 				}
			
 
				 			}
			
 
				 		}
			
@@ -1642,6 +1646,9 @@ get_better_disk_can_accept_size(starpu_data_handle_t handle, unsigned node)
 
				 	return target;
			
 
				 }
			
 
				 
			
 
				+#ifdef STARPU_DEVEL
			
 
				+#  warning TODO: better choose NUMA node
			
 
				+#endif
			
 
				 
			
 
				 static unsigned
			
 
				 choose_target(starpu_data_handle_t handle, unsigned node)
			
@@ -1650,14 +1657,20 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 
				 	size_t size_handle = _starpu_data_get_size(handle);
			
 
				 	if (handle->home_node != -1)
			
 
				 		/* try to push on RAM if we can before to push on disk */
			
 
				-		if(starpu_node_get_kind(handle->home_node) == STARPU_DISK_RAM && node != STARPU_MAIN_RAM)
			
 
				+		if(starpu_node_get_kind(handle->home_node) == STARPU_DISK_RAM && (starpu_node_get_kind(node) != STARPU_CPU_RAM))
			
 
				 		{
			
 
				-			if (handle->per_node[STARPU_MAIN_RAM].allocated ||
			
 
				-			    _starpu_memory_manager_test_allocate_size(STARPU_MAIN_RAM, size_handle) == 1)
			
 
				+ 	                unsigned i;
			
 
				+			unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
			
 
				+			for (i=0; i<nb_numa_nodes; i++)
			
 
				 			{
			
 
				-				target = STARPU_MAIN_RAM;
			
 
				+				if (handle->per_node[i].allocated || 
			
 
				+				    _starpu_memory_manager_test_allocate_size(i, size_handle) == 1)
			
 
				+				{
			
 
				+					target = i;
			
 
				+					break;
			
 
				+				}
			
 
				 			}
			
 
				-			else
			
 
				+			if (target == -1)
			
 
				 			{
			
 
				 				target = get_better_disk_can_accept_size(handle, node);
			
 
				 			}
			
@@ -1672,19 +1685,26 @@ choose_target(starpu_data_handle_t handle, unsigned node)
 
				 	{
			
 
				 		/* handle->home_node == -1 */
			
 
				 		/* no place for datas in RAM, we push on disk */
			
 
				-		if (node == STARPU_MAIN_RAM)
			
 
				+		if (starpu_node_get_kind(node) == STARPU_CPU_RAM)
			
 
				 		{
			
 
				 			target = get_better_disk_can_accept_size(handle, node);
			
 
				-		}
			
 
				+		} else {
			
 
				 		/* node != 0 */
			
 
				 		/* try to push data to RAM if we can before to push on disk*/
			
 
				-		else if (handle->per_node[STARPU_MAIN_RAM].allocated ||
			
 
				-			 _starpu_memory_manager_test_allocate_size(STARPU_MAIN_RAM, size_handle) == 1)
			
 
				-		{
			
 
				-			target = STARPU_MAIN_RAM;
			
 
				+			unsigned i;
			
 
				+			unsigned nb_numa_nodes = starpu_memory_nodes_get_numa_count();
			
 
				+			for (i=0; i<nb_numa_nodes; i++)
			
 
				+			{
			
 
				+				if (handle->per_node[i].allocated || 
			
 
				+				    _starpu_memory_manager_test_allocate_size(i, size_handle) == 1)
			
 
				+				{
			
 
				+					target = i;
			
 
				+					break;
			
 
				+				}
			
 
				+			}
			
 
				 		}
			
 
				 		/* no place in RAM */
			
 
				-		else
			
 
				+		if (target == -1)
			
 
				 		{
			
 
				 			target = get_better_disk_can_accept_size(handle, node);
			
 
				 		}
			
--- a/src/datawizard/memory_nodes.c
+++ b/src/datawizard/memory_nodes.c
@@ -77,7 +77,7 @@ void _starpu_memory_node_get_name(unsigned node, char *name, int size)
 
				 	switch (_starpu_descr.nodes[node])
			
 
				 	{
			
 
				 	case STARPU_CPU_RAM:
			
 
				-		prefix = "RAM";
			
 
				+		prefix = "NUMA";
			
 
				 		break;
			
 
				 	case STARPU_CUDA_RAM:
			
 
				 		prefix = "CUDA";
			
--- a/src/datawizard/user_interactions.c
+++ b/src/datawizard/user_interactions.c
@@ -257,13 +257,19 @@ int starpu_data_acquire_on_node_cb(starpu_data_handle_t handle, int node,
 
				 int starpu_data_acquire_cb(starpu_data_handle_t handle,
			
 
				 			   enum starpu_data_access_mode mode, void (*callback)(void *), void *arg)
			
 
				 {
			
 
				-	return starpu_data_acquire_on_node_cb(handle, STARPU_MAIN_RAM, mode, callback, arg);
			
 
				+	int home_node = handle->home_node;
			
 
				+	if (home_node < 0)
			
 
				+		home_node = STARPU_MAIN_RAM;
			
 
				+	return starpu_data_acquire_on_node_cb(handle, home_node, mode, callback, arg);
			
 
				 }
			
 
				 
			
 
				 int starpu_data_acquire_cb_sequential_consistency(starpu_data_handle_t handle,
			
 
				 						  enum starpu_data_access_mode mode, void (*callback)(void *), void *arg, int sequential_consistency)
			
 
				 {
			
 
				-	return starpu_data_acquire_on_node_cb_sequential_consistency(handle, STARPU_MAIN_RAM, mode, callback, arg, sequential_consistency);
			
 
				+	int home_node = handle->home_node;
			
 
				+	if (home_node < 0)
			
 
				+		home_node = STARPU_MAIN_RAM;
			
 
				+	return starpu_data_acquire_on_node_cb_sequential_consistency(handle, home_node, mode, callback, arg, sequential_consistency);
			
 
				 }
			
 
				 
			
 
				 
			
@@ -372,7 +378,10 @@ int starpu_data_acquire_on_node(starpu_data_handle_t handle, int node, enum star
 
				 
			
 
				 int starpu_data_acquire(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
			
 
				 {
			
 
				-	return starpu_data_acquire_on_node(handle, STARPU_MAIN_RAM, mode);
			
 
				+	int home_node = handle->home_node;
			
 
				+	if (home_node < 0)
			
 
				+		home_node = STARPU_MAIN_RAM;
			
 
				+	return starpu_data_acquire_on_node(handle, home_node, mode);
			
 
				 }
			
 
				 
			
 
				 int starpu_data_acquire_on_node_try(starpu_data_handle_t handle, int node, enum starpu_data_access_mode mode)
			
@@ -445,7 +454,10 @@ void starpu_data_release_on_node(starpu_data_handle_t handle, int node)
 
				 
			
 
				 void starpu_data_release(starpu_data_handle_t handle)
			
 
				 {
			
 
				-	starpu_data_release_on_node(handle, STARPU_MAIN_RAM);
			
 
				+	int home_node = handle->home_node;
			
 
				+	if (home_node < 0)
			
 
				+		home_node = STARPU_MAIN_RAM;
			
 
				+	starpu_data_release_on_node(handle, home_node);
			
 
				 }
			
 
				 
			
 
				 static void _prefetch_data_on_node(void *arg)
			
--- a/src/drivers/cpu/driver_cpu.c
+++ b/src/drivers/cpu/driver_cpu.c
@@ -153,35 +153,43 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 
				 static size_t _starpu_cpu_get_global_mem_size(int nodeid STARPU_ATTRIBUTE_UNUSED, struct _starpu_machine_config *config STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	size_t global_mem;
			
 
				-	starpu_ssize_t limit;
			
 
				+	starpu_ssize_t limit = -1;
			
 
				 
			
 
				-	limit = starpu_get_env_number("STARPU_LIMIT_CPU_MEM");
			
 
				-#ifdef STARPU_DEVEL
			
 
				-#  warning TODO: take into account NUMA node and check STARPU_LIMIT_CPU_numanode_MEM
			
 
				-#endif
			
 
				+	char name[32];
			
 
				 
			
 
				 #if defined(STARPU_HAVE_HWLOC)
			
 
				 	struct _starpu_machine_topology *topology = &config->topology;
			
 
				 
			
 
				-#if 0
			
 
				-	/* Do not limit ourself to a single NUMA node yet, as we don't have real NUMA support for now */
			
 
				-        int depth_node = hwloc_get_type_depth(topology->hwtopology, HWLOC_OBJ_NODE);
			
 
				-
			
 
				-	if (depth_node == HWLOC_TYPE_DEPTH_UNKNOWN)
			
 
				-	     global_mem = hwloc_get_root_obj(topology->hwtopology)->memory.total_memory;
			
 
				+	int nnumas = starpu_memory_nodes_get_numa_count();
			
 
				+	if (nnumas > 1)
			
 
				+	{
			
 
				+		int depth_node = hwloc_get_type_depth(topology->hwtopology, HWLOC_OBJ_NODE);
			
 
				+
			
 
				+		if (depth_node == HWLOC_TYPE_DEPTH_UNKNOWN)
			
 
				+		     global_mem = hwloc_get_root_obj(topology->hwtopology)->memory.total_memory;
			
 
				+		else {
			
 
				+		     hwloc_obj_t obj = hwloc_get_obj_by_depth(topology->hwtopology, depth_node, nodeid);
			
 
				+		     global_mem = obj->memory.local_memory;
			
 
				+		     sprintf(name, "STARPU_LIMIT_CPU_NUMA_%d_MEM", obj->os_index);
			
 
				+		     limit = starpu_get_env_number(name);
			
 
				+		}
			
 
				+	}
			
 
				 	else
			
 
				-	     global_mem = hwloc_get_obj_by_depth(topology->hwtopology, depth_node, nodeid)->memory.local_memory;
			
 
				-#else
			
 
				-	global_mem = hwloc_get_root_obj(topology->hwtopology)->memory.total_memory;
			
 
				-#endif
			
 
				+	{
			
 
				+		/* Do not limit ourself to a single NUMA node */
			
 
				+		global_mem = hwloc_get_root_obj(topology->hwtopology)->memory.total_memory;
			
 
				+	}
			
 
				 
			
 
				 #else /* STARPU_HAVE_HWLOC */
			
 
				 #ifdef STARPU_DEVEL
			
 
				-#  warning use sysinfo when available to get global size
			
 
				+#  warning TODO: use sysinfo when available to get global size
			
 
				 #endif
			
 
				 	global_mem = 0;
			
 
				 #endif
			
 
				 
			
 
				+	if (limit == -1)
			
 
				+		limit = starpu_get_env_number("STARPU_LIMIT_CPU_MEM");
			
 
				+
			
 
				 	if (limit < 0)
			
 
				 		// No limit is defined, we return the global memory size
			
 
				 		return global_mem;
			
@@ -198,9 +206,7 @@ int _starpu_cpu_driver_init(struct _starpu_worker *cpu_worker)
 
				 	int devid = cpu_worker->devid;
			
 
				 
			
 
				 	_starpu_driver_start(cpu_worker, _STARPU_FUT_CPU_KEY, 1);
			
 
				-	/* FIXME: when we have NUMA support, properly turn node number into NUMA node number */
			
 
				-	_starpu_memory_manager_set_global_memory_size(cpu_worker->memory_node, _starpu_cpu_get_global_mem_size(cpu_worker->memory_node, cpu_worker->config));
			
 
				-
			
 
				+	_starpu_memory_manager_set_global_memory_size(cpu_worker->memory_node, _starpu_cpu_get_global_mem_size(cpu_worker->numa_memory_node, cpu_worker->config));
			
 
				 	snprintf(cpu_worker->name, sizeof(cpu_worker->name), "CPU %d", devid);
			
 
				 	snprintf(cpu_worker->short_name, sizeof(cpu_worker->short_name), "CPU %d", devid);
			
 
				 	starpu_pthread_setname(cpu_worker->short_name);
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -53,7 +53,7 @@
 
				 static int ncudagpus = -1;
			
 
				 
			
 
				 static size_t global_mem[STARPU_MAXCUDADEVS];
			
 
				-int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+1][STARPU_MAXCUDADEVS+1];
			
 
				+int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES][STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES];
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 static cudaStream_t streams[STARPU_NMAXWORKERS];
			
 
				 static cudaStream_t out_transfer_streams[STARPU_MAXCUDADEVS];
			
@@ -163,7 +163,7 @@ cudaStream_t starpu_cuda_get_local_in_transfer_stream()
 
				 	int worker = starpu_worker_get_id_check();
			
 
				 	int devid = starpu_worker_get_devid(worker);
			
 
				 	cudaStream_t stream;
			
 
				-
			
 
				+	
			
 
				 	stream = in_transfer_streams[devid];
			
 
				 	STARPU_ASSERT(stream);
			
 
				 	return stream;
			
@@ -323,7 +323,7 @@ static void init_device_context(unsigned devid, unsigned memnode)
 
				 					{
			
 
				 						_STARPU_DEBUG("Enabled GPU-Direct %d -> %d\n", worker->devid, devid);
			
 
				 						/* direct copies are made from the destination, see link_supports_direct_transfers */
			
 
				-						starpu_bus_set_direct(_starpu_cuda_bus_ids[worker->devid][devid], 1);
			
 
				+						starpu_bus_set_direct(_starpu_cuda_bus_ids[worker->devid+STARPU_MAXNUMANODES][devid+STARPU_MAXNUMANODES], 1);
			
 
				 					}
			
 
				 				}
			
 
				 			}
			
--- a/src/drivers/cuda/driver_cuda.h
+++ b/src/drivers/cuda/driver_cuda.h
@@ -32,7 +32,7 @@ extern struct _starpu_driver_ops _starpu_driver_cuda_ops;
 
				 
			
 
				 void _starpu_cuda_init(void);
			
 
				 unsigned _starpu_get_cuda_device_count(void);
			
 
				-extern int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+1][STARPU_MAXCUDADEVS+1];
			
 
				+extern int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES][STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES];
			
 
				 
			
 
				 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
			
 
				 void _starpu_cuda_discover_devices (struct _starpu_machine_config *);
			
--- a/src/drivers/mp_common/source_common.c
+++ b/src/drivers/mp_common/source_common.c
@@ -950,7 +950,6 @@ static void _starpu_src_common_worker_internal_work(struct _starpu_worker_set *
 
				         starpu_pthread_wait_reset(&worker_set->workers[0].wait);
			
 
				 #endif
			
 
				 
			
 
				-
			
 
				 	/* Test if async transfers are completed */
			
 
				 	for (i = 0; i < worker_set->nworkers; i++)
			
 
				 	{
			
--- a/src/drivers/mpi/driver_mpi_common.c
+++ b/src/drivers/mpi/driver_mpi_common.c
@@ -469,7 +469,7 @@ void _starpu_mpi_common_barrier(void)
 
				 /* Compute bandwidth and latency between source and sink nodes
			
 
				  * Source node has to have the entire set of times at the end
			
 
				  */
			
 
				-void _starpu_mpi_common_measure_bandwidth_latency(double bandwidth_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS], double latency_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS])
			
 
				+void _starpu_mpi_common_measure_bandwidth_latency(double timing_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS], double latency_dtod[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS])
			
 
				 {
			
 
				         int ret;
			
 
				         unsigned iter;
			
@@ -506,7 +506,7 @@ void _starpu_mpi_common_measure_bandwidth_latency(double bandwidth_dtod[STARPU_M
 
				                                         STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "Bandwidth of MPI Master/Slave cannot be measured !");
			
 
				                                 }
			
 
				                                 end = starpu_timing_now();
			
 
				-                                bandwidth_dtod[sender][receiver] = (NITER*SIZE_BANDWIDTH)/(end - start);
			
 
				+                                timing_dtod[sender][receiver] = (end - start)/NITER/SIZE_BANDWIDTH;
			
 
				 
			
 
				                                 /* measure latency sender to receiver */
			
 
				                                 start = starpu_timing_now();
			
@@ -546,14 +546,14 @@ void _starpu_mpi_common_measure_bandwidth_latency(double bandwidth_dtod[STARPU_M
 
				                 /* if we are the sender, we send the data */
			
 
				                 if (sender == id_proc)
			
 
				                 {
			
 
				-                        MPI_Send(bandwidth_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, src_node_id, 42, MPI_COMM_WORLD);
			
 
				+                        MPI_Send(timing_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, src_node_id, 42, MPI_COMM_WORLD);
			
 
				                         MPI_Send(latency_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, src_node_id, 42, MPI_COMM_WORLD);
			
 
				                 }
			
 
				 
			
 
				                 /* the master node receives the data */
			
 
				                 if (src_node_id == id_proc)
			
 
				                 {
			
 
				-                        MPI_Recv(bandwidth_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, sender, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
			
 
				+                        MPI_Recv(timing_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, sender, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
			
 
				                         MPI_Recv(latency_dtod[sender], STARPU_MAXMPIDEVS, MPI_DOUBLE, sender, 42, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
			
 
				                 }
			
 
				 
			
--- a/src/util/openmp_runtime_support.c
+++ b/src/util/openmp_runtime_support.c
@@ -2415,8 +2415,12 @@ void starpu_omp_atomic_fallback_inline_end(void)
 
				 
			
 
				 void starpu_omp_vector_annotate(starpu_data_handle_t handle, uint32_t slice_base)
			
 
				 {
			
 
				+	/* FIXME Oli: rather iterate over all nodes? */
			
 
				+	int node = starpu_data_get_home_node(handle);
			
 
				+	if (node < 0 || (starpu_node_get_kind(node) != STARPU_CPU_RAM))
			
 
				+		node = STARPU_MAIN_RAM;
			
 
				 	struct starpu_vector_interface *vector_interface = (struct starpu_vector_interface *)
			
 
				-		starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
			
 
				+		starpu_data_get_interface_on_node(handle, node);
			
 
				 	assert(vector_interface->id == STARPU_VECTOR_INTERFACE_ID);
			
 
				 	vector_interface->slice_base = slice_base;
			
 
				 }
			
--- a/tests/datawizard/interfaces/test_interfaces.c
+++ b/tests/datawizard/interfaces/test_interfaces.c
@@ -794,6 +794,8 @@ handle_to_pointer(void)
 
				 	{
			
 
				 		if (starpu_node_get_kind(node) != STARPU_CPU_RAM)
			
 
				 			continue;
			
 
				+		if (!starpu_data_test_if_allocated_on_node(handle, node))
			
 
				+			continue;
			
 
				 
			
 
				 		ptr = handle->ops->handle_to_pointer(handle, node);
			
 
				 		if (starpu_data_lookup(ptr) != handle)
			
--- a/tests/datawizard/nowhere.c
+++ b/tests/datawizard/nowhere.c
@@ -1,6 +1,7 @@
 
				 /* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				  *
			
 
				  * Copyright (C) 2015-2016  Université de Bordeaux
			
 
				+ * Copyright (C) 2017  Inria
			
 
				  *
			
 
				  * StarPU is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU Lesser General Public License as published by
			
@@ -84,6 +85,13 @@ int main(int argc, char **argv)
 
				 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				+	if (starpu_memory_nodes_get_numa_count() > 1)
			
 
				+	{
			
 
				+		/* FIXME: assumes only one RAM node */
			
 
				+		starpu_shutdown();
			
 
				+		return STARPU_TEST_SKIPPED;
			
 
				+	}
			
 
				+
			
 
				 	starpu_variable_data_register(&handle_x, STARPU_MAIN_RAM, (uintptr_t)&x, sizeof(x));
			
 
				 	starpu_variable_data_register(&handle_y, STARPU_MAIN_RAM, (uintptr_t)&y, sizeof(y));
			
 
				 
			
--- a/tests/datawizard/specific_node.c
+++ b/tests/datawizard/specific_node.c
@@ -34,10 +34,8 @@ starpu_data_handle_t data_handle;
 
				 
			
 
				 unsigned data;
			
 
				 
			
 
				-void specific_kernel(STARPU_ATTRIBUTE_UNUSED void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
			
 
				+void specific_kernel(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
			
 
				 {
			
 
				-	/* We do not protect this variable because it is only accessed when the
			
 
				-	 * "data_handle" piece of data is accessed. */
			
 
				 	unsigned *dataptr = (unsigned*) STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				 
			
 
				 	STARPU_ASSERT(dataptr == &data);
			
@@ -55,6 +53,12 @@ static struct starpu_codelet specific_cl =
 
				 	.nodes = {STARPU_MAIN_RAM},
			
 
				 };
			
 
				 
			
 
				+void cpu_codelet_unsigned_inc(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
			
 
				+{
			
 
				+	unsigned *dataptr = (unsigned*) STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				+	(*dataptr)++;
			
 
				+}
			
 
				+
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 void cuda_codelet_unsigned_inc(void *descr[], STARPU_ATTRIBUTE_UNUSED void *cl_arg);
			
 
				 #endif
			
@@ -64,7 +68,7 @@ void opencl_codelet_unsigned_inc(void *buffers[], void *args);
 
				 
			
 
				 static struct starpu_codelet cl =
			
 
				 {
			
 
				-	.cpu_funcs = {specific_kernel},
			
 
				+	.cpu_funcs = {cpu_codelet_unsigned_inc},
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 	.cuda_funcs = {cuda_codelet_unsigned_inc},
			
 
				 	.cuda_flags = {STARPU_CUDA_ASYNC},
			
--- a/tests/disk/mem_reclaim.c
+++ b/tests/disk/mem_reclaim.c
@@ -60,7 +60,16 @@ int main(int argc, char **argv)
 
				 }
			
 
				 #else
			
 
				 
			
 
				-const struct starpu_data_copy_methods my_vector_copy_data_methods_s;
			
 
				+static int (*any_to_any)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
			
 
				+
			
 
				+/* We need a ram-to-ram copy for NUMA machine, use any_to_any for that */
			
 
				+static int ram_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node) {
			
 
				+	return any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
			
 
				+}
			
 
				+
			
 
				+const struct starpu_data_copy_methods my_vector_copy_data_methods_s = {
			
 
				+	.ram_to_ram = ram_to_ram
			
 
				+};
			
 
				 struct starpu_data_interface_ops starpu_interface_my_vector_ops;
			
 
				 
			
 
				 void starpu_my_vector_data_register(starpu_data_handle_t *handleptr, int home_node,
			
@@ -218,6 +227,7 @@ int main(void)
 
				 	setenv("STARPU_LIMIT_CPU_MEM", MEMSIZE_STR, 1);
			
 
				 
			
 
				 	/* Build an vector-like interface which doesn't have the any_to_any helper, to force making use of pack/unpack */
			
 
				+	any_to_any = starpu_interface_vector_ops.copy_methods->any_to_any;
			
 
				 	memcpy(&starpu_interface_my_vector_ops, &starpu_interface_vector_ops, sizeof(starpu_interface_my_vector_ops));
			
 
				 	starpu_interface_my_vector_ops.copy_methods = &my_vector_copy_data_methods_s;